author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d | /arch/x86_64/kernel |
tag | v2.6.12-rc2 (Linux-2.6.12-rc2) | |
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'arch/x86_64/kernel')
53 files changed, 20636 insertions, 0 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
new file mode 100644
index 00000000000..0a3318e08ab
--- /dev/null
+++ b/arch/x86_64/kernel/Makefile
@@ -0,0 +1,45 @@
+#
+# Makefile for the linux kernel.
+#
+
+extra-y := head.o head64.o init_task.o vmlinux.lds
+EXTRA_AFLAGS := -traditional
+obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \
+        ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
+        x8664_ksyms.o i387.o syscall.o vsyscall.o \
+        setup64.o bootflag.o e820.o reboot.o quirks.o
+
+obj-$(CONFIG_X86_MCE) += mce.o
+obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
+obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/
+obj-$(CONFIG_ACPI_BOOT) += acpi/
+obj-$(CONFIG_X86_MSR) += msr.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+obj-$(CONFIG_X86_CPUID) += cpuid.o
+obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
+obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
+        genapic.o genapic_cluster.o genapic_flat.o
+obj-$(CONFIG_PM) += suspend.o
+obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
+obj-$(CONFIG_CPU_FREQ) += cpufreq/
+obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
+obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o
+obj-$(CONFIG_SWIOTLB) += swiotlb.o
+obj-$(CONFIG_KPROBES) += kprobes.o
+
+obj-$(CONFIG_MODULES) += module.o
+
+obj-y += topology.o
+obj-y += intel_cacheinfo.o
+
+CFLAGS_vsyscall.o := $(PROFILING) -g0
+
+bootflag-y += ../../i386/kernel/bootflag.o
+cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o
+topology-y += ../../i386/mach-default/topology.o
+swiotlb-$(CONFIG_SWIOTLB) += ../../ia64/lib/swiotlb.o
+microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o
+intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o
+quirks-y += ../../i386/kernel/quirks.o
diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile
new file mode 100644
index 00000000000..d2c2ee5f9a8
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_ACPI_BOOT) := boot.o
+boot-$(CONFIG_ACPI_BOOT) := ../../../i386/kernel/acpi/boot.o
+obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
new file mode 100644
index 00000000000..7a275de6df2
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -0,0 +1,132 @@
+/*
+ * acpi.c - Architecture-Specific Low-Level ACPI Support
+ *
+ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
+ * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
+ * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
+ * Copyright (C) 2003 Pavel Machek, SuSE Labs
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/bootmem.h>
+#include <linux/irq.h>
+#include <linux/acpi.h>
+#include <asm/mpspec.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/io_apic.h>
+#include <asm/proto.h>
+#include <asm/tlbflush.h>
+
+
+/* --------------------------------------------------------------------------
+                              Low-Level Sleep Support
+   -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI_SLEEP
+
+/* address in low memory of the wakeup routine. */
+unsigned long acpi_wakeup_address = 0;
+unsigned long acpi_video_flags;
+extern char wakeup_start, wakeup_end;
+
+extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
+
+static pgd_t low_ptr;
+
+static void init_low_mapping(void)
+{
+        pgd_t *slot0 = pgd_offset(current->mm, 0UL);
+        low_ptr = *slot0;
+        set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
+        flush_tlb_all();
+}
+
+/**
+ * acpi_save_state_mem - save kernel state
+ *
+ * Create an identity mapped page table and copy the wakeup routine to
+ * low memory.
+ */
+int acpi_save_state_mem (void)
+{
+        init_low_mapping();
+
+        memcpy((void *) acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start);
+        acpi_copy_wakeup_routine(acpi_wakeup_address);
+
+        return 0;
+}
+
+/*
+ * acpi_restore_state
+ */
+void acpi_restore_state_mem (void)
+{
+        set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
+        flush_tlb_all();
+}
+
+/**
+ * acpi_reserve_bootmem - do _very_ early ACPI initialisation
+ *
+ * We allocate a page in low memory for the wakeup
+ * routine for when we come back from a sleep state. The
+ * runtime allocator allows specification of <16M pages, but not
+ * <1M pages.
+ */
+void __init acpi_reserve_bootmem(void)
+{
+        acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
+        if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
+                printk(KERN_CRIT "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
+}
+
+static int __init acpi_sleep_setup(char *str)
+{
+        while ((str != NULL) && (*str != '\0')) {
+                if (strncmp(str, "s3_bios", 7) == 0)
+                        acpi_video_flags = 1;
+                if (strncmp(str, "s3_mode", 7) == 0)
+                        acpi_video_flags |= 2;
+                str = strchr(str, ',');
+                if (str != NULL)
+                        str += strspn(str, ", \t");
+        }
+        return 1;
+}
+
+__setup("acpi_sleep=", acpi_sleep_setup);
+
+#endif /*CONFIG_ACPI_SLEEP*/
+
+void acpi_pci_link_exit(void) {}
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
new file mode 100644
index 00000000000..a4c630034cd
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -0,0 +1,527 @@
+.text
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+
+# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
+#
+# wakeup_code runs in real mode, and at unknown address (determined at run-time).
+# Therefore it must only use relative jumps/calls.
+#
+# Do we need to deal with A20?
It is okay: ACPI specs says A20 must be enabled +# +# If physical address of wakeup_code is 0x12345, BIOS should call us with +# cs = 0x1234, eip = 0x05 +# + + +ALIGN + .align 16 +ENTRY(wakeup_start) +wakeup_code: + wakeup_code_start = . + .code16 + +# Running in *copy* of this code, somewhere in low 1MB. + + movb $0xa1, %al ; outb %al, $0x80 + cli + cld + # setup data segment + movw %cs, %ax + movw %ax, %ds # Make ds:0 point to wakeup_start + movw %ax, %ss + mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board + + pushl $0 # Kill any dangerous flags + popfl + + movl real_magic - wakeup_code, %eax + cmpl $0x12345678, %eax + jne bogus_real_magic + + testl $1, video_flags - wakeup_code + jz 1f + lcall $0xc000,$3 + movw %cs, %ax + movw %ax, %ds # Bios might have played with that + movw %ax, %ss +1: + + testl $2, video_flags - wakeup_code + jz 1f + mov video_mode - wakeup_code, %ax + call mode_seta +1: + + movw $0xb800, %ax + movw %ax,%fs + movw $0x0e00 + 'L', %fs:(0x10) + + movb $0xa2, %al ; outb %al, $0x80 + + lidt %ds:idt_48a - wakeup_code + xorl %eax, %eax + movw %ds, %ax # (Convert %ds:gdt to a linear ptr) + shll $4, %eax + addl $(gdta - wakeup_code), %eax + movl %eax, gdt_48a +2 - wakeup_code + lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is + # appropriate + + movl $1, %eax # protected mode (PE) bit + lmsw %ax # This is it! + jmp 1f +1: + + .byte 0x66, 0xea # prefix + jmpi-opcode + .long wakeup_32 - __START_KERNEL_map + .word __KERNEL_CS + + .code32 +wakeup_32: +# Running in this code, but at low address; paging is not yet turned on. + movb $0xa5, %al ; outb %al, $0x80 + + /* Check if extended functions are implemented */ + movl $0x80000000, %eax + cpuid + cmpl $0x80000000, %eax + jbe bogus_cpu + wbinvd + mov $0x80000001, %eax + cpuid + btl $29, %edx + jnc bogus_cpu + movl %edx,%edi + + movw $__KERNEL_DS, %ax + movw %ax, %ds + movw %ax, %es + movw %ax, %fs + movw %ax, %gs + + movw $__KERNEL_DS, %ax + movw %ax, %ss + + mov $(wakeup_stack - __START_KERNEL_map), %esp + movl saved_magic - __START_KERNEL_map, %eax + cmpl $0x9abcdef0, %eax + jne bogus_32_magic + + /* + * Prepare for entering 64bits mode + */ + + /* Enable PAE mode and PGE */ + xorl %eax, %eax + btsl $5, %eax + btsl $7, %eax + movl %eax, %cr4 + + /* Setup early boot stage 4 level pagetables */ + movl $(wakeup_level4_pgt - __START_KERNEL_map), %eax + movl %eax, %cr3 + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + /* Fool rdmsr and reset %eax to avoid dependences */ + xorl %eax, %eax + /* Enable Long Mode */ + btsl $_EFER_LME, %eax + /* Enable System Call */ + btsl $_EFER_SCE, %eax + + /* No Execute supported? 
*/ + btl $20,%edi + jnc 1f + btsl $_EFER_NX, %eax +1: + + /* Make changes effective */ + wrmsr + wbinvd + + xorl %eax, %eax + btsl $31, %eax /* Enable paging and in turn activate Long Mode */ + btsl $0, %eax /* Enable protected mode */ + btsl $1, %eax /* Enable MP */ + btsl $4, %eax /* Enable ET */ + btsl $5, %eax /* Enable NE */ + btsl $16, %eax /* Enable WP */ + btsl $18, %eax /* Enable AM */ + + /* Make changes effective */ + movl %eax, %cr0 + /* At this point: + CR4.PAE must be 1 + CS.L must be 0 + CR3 must point to PML4 + Next instruction must be a branch + This must be on identity-mapped page + */ + jmp reach_compatibility_mode +reach_compatibility_mode: + movw $0x0e00 + 'i', %ds:(0xb8012) + movb $0xa8, %al ; outb %al, $0x80; + + /* + * At this point we're in long mode but in 32bit compatibility mode + * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn + * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we load + * the new gdt/idt that has __KERNEL_CS with CS.L = 1. + */ + + movw $0x0e00 + 'n', %ds:(0xb8014) + movb $0xa9, %al ; outb %al, $0x80 + + /* Load new GDT with the 64bit segment using 32bit descriptor */ + movl $(pGDT32 - __START_KERNEL_map), %eax + lgdt (%eax) + + movl $(wakeup_jumpvector - __START_KERNEL_map), %eax + /* Finally jump in 64bit mode */ + ljmp *(%eax) + +wakeup_jumpvector: + .long wakeup_long64 - __START_KERNEL_map + .word __KERNEL_CS + +.code64 + + /* Hooray, we are in Long 64-bit mode (but still running in low memory) */ +wakeup_long64: + /* + * We must switch to a new descriptor in kernel space for the GDT + * because soon the kernel won't have access anymore to the userspace + * addresses where we're currently running on. We have to do that here + * because in 32bit we couldn't load a 64bit linear address. + */ + lgdt cpu_gdt_descr - __START_KERNEL_map + + movw $0x0e00 + 'u', %ds:(0xb8016) + + nop + nop + movw $__KERNEL_DS, %ax + movw %ax, %ss + movw %ax, %ds + movw %ax, %es + movw %ax, %fs + movw %ax, %gs + movq saved_esp, %rsp + + movw $0x0e00 + 'x', %ds:(0xb8018) + movq saved_ebx, %rbx + movq saved_edi, %rdi + movq saved_esi, %rsi + movq saved_ebp, %rbp + + movw $0x0e00 + '!', %ds:(0xb801a) + movq saved_eip, %rax + jmp *%rax + +.code32 + + .align 64 +gdta: + .word 0, 0, 0, 0 # dummy + + .word 0, 0, 0, 0 # unused + + .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) + .word 0 # base address = 0 + .word 0x9B00 # code read/exec. ??? Why I need 0x9B00 (as opposed to 0x9A00 in order for this to work?) + .word 0x00CF # granularity = 4096, 386 + # (+5th nibble of limit) + + .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) + .word 0 # base address = 0 + .word 0x9200 # data read/write + .word 0x00CF # granularity = 4096, 386 + # (+5th nibble of limit) +# this is 64bit descriptor for code + .word 0xFFFF + .word 0 + .word 0x9A00 # code read/exec + .word 0x00AF # as above, but it is long mode and with D=0 + +idt_48a: + .word 0 # idt limit = 0 + .word 0, 0 # idt base = 0L + +gdt_48a: + .word 0x8000 # gdt limit=2048, + # 256 GDT entries + .word 0, 0 # gdt base (filled in later) + + +real_save_gdt: .word 0 + .quad 0 +real_magic: .quad 0 +video_mode: .quad 0 +video_flags: .quad 0 + +bogus_real_magic: + movb $0xba,%al ; outb %al,$0x80 + jmp bogus_real_magic + +bogus_32_magic: + movb $0xb3,%al ; outb %al,$0x80 + jmp bogus_32_magic + +bogus_31_magic: + movb $0xb1,%al ; outb %al,$0x80 + jmp bogus_31_magic + +bogus_cpu: + movb $0xbc,%al ; outb %al,$0x80 + jmp bogus_cpu + + +/* This code uses an extended set of video mode numbers. 
These include: + * Aliases for standard modes + * NORMAL_VGA (-1) + * EXTENDED_VGA (-2) + * ASK_VGA (-3) + * Video modes numbered by menu position -- NOT RECOMMENDED because of lack + * of compatibility when extending the table. These are between 0x00 and 0xff. + */ +#define VIDEO_FIRST_MENU 0x0000 + +/* Standard BIOS video modes (BIOS number + 0x0100) */ +#define VIDEO_FIRST_BIOS 0x0100 + +/* VESA BIOS video modes (VESA number + 0x0200) */ +#define VIDEO_FIRST_VESA 0x0200 + +/* Video7 special modes (BIOS number + 0x0900) */ +#define VIDEO_FIRST_V7 0x0900 + +# Setting of user mode (AX=mode ID) => CF=success +mode_seta: + movw %ax, %bx +#if 0 + cmpb $0xff, %ah + jz setalias + + testb $VIDEO_RECALC>>8, %ah + jnz _setrec + + cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah + jnc setres + + cmpb $VIDEO_FIRST_SPECIAL>>8, %ah + jz setspc + + cmpb $VIDEO_FIRST_V7>>8, %ah + jz setv7 +#endif + + cmpb $VIDEO_FIRST_VESA>>8, %ah + jnc check_vesaa +#if 0 + orb %ah, %ah + jz setmenu +#endif + + decb %ah +# jz setbios Add bios modes later + +setbada: clc + ret + +check_vesaa: + subb $VIDEO_FIRST_VESA>>8, %bh + orw $0x4000, %bx # Use linear frame buffer + movw $0x4f02, %ax # VESA BIOS mode set call + int $0x10 + cmpw $0x004f, %ax # AL=4f if implemented + jnz _setbada # AH=0 if OK + + stc + ret + +_setbada: jmp setbada + + .code64 +bogus_magic: + movw $0x0e00 + 'B', %ds:(0xb8018) + jmp bogus_magic + +bogus_magic2: + movw $0x0e00 + '2', %ds:(0xb8018) + jmp bogus_magic2 + + +wakeup_stack_begin: # Stack grows down + +.org 0xff0 +wakeup_stack: # Just below end of page + +ENTRY(wakeup_end) + +## +# acpi_copy_wakeup_routine +# +# Copy the above routine to low memory. +# +# Parameters: +# %rdi: place to copy wakeup routine to +# +# Returned address is location of code in low memory (past data and stack) +# +ENTRY(acpi_copy_wakeup_routine) + pushq %rax + pushq %rcx + pushq %rdx + + sgdt saved_gdt + sidt saved_idt + sldt saved_ldt + str saved_tss + + movq %cr3, %rdx + movq %rdx, saved_cr3 + movq %cr4, %rdx + movq %rdx, saved_cr4 + movq %cr0, %rdx + movq %rdx, saved_cr0 + sgdt real_save_gdt - wakeup_start (,%rdi) + movl $MSR_EFER, %ecx + rdmsr + movl %eax, saved_efer + movl %edx, saved_efer2 + + movl saved_video_mode, %edx + movl %edx, video_mode - wakeup_start (,%rdi) + movl acpi_video_flags, %edx + movl %edx, video_flags - wakeup_start (,%rdi) + movq $0x12345678, real_magic - wakeup_start (,%rdi) + movq $0x123456789abcdef0, %rdx + movq %rdx, saved_magic + + movl saved_magic - __START_KERNEL_map, %eax + cmpl $0x9abcdef0, %eax + jne bogus_32_magic + + # make sure %cr4 is set correctly (features, etc) + movl saved_cr4 - __START_KERNEL_map, %eax + movq %rax, %cr4 + + movl saved_cr0 - __START_KERNEL_map, %eax + movq %rax, %cr0 + jmp 1f # Flush pipelines +1: + # restore the regs we used + popq %rdx + popq %rcx + popq %rax +ENTRY(do_suspend_lowlevel_s4bios) + ret + + .align 2 + .p2align 4,,15 +.globl do_suspend_lowlevel + .type do_suspend_lowlevel,@function +do_suspend_lowlevel: +.LFB5: + subq $8, %rsp + xorl %eax, %eax + call save_processor_state + + movq %rsp, saved_context_esp(%rip) + movq %rax, saved_context_eax(%rip) + movq %rbx, saved_context_ebx(%rip) + movq %rcx, saved_context_ecx(%rip) + movq %rdx, saved_context_edx(%rip) + movq %rbp, saved_context_ebp(%rip) + movq %rsi, saved_context_esi(%rip) + movq %rdi, saved_context_edi(%rip) + movq %r8, saved_context_r08(%rip) + movq %r9, saved_context_r09(%rip) + movq %r10, saved_context_r10(%rip) + movq %r11, saved_context_r11(%rip) + movq %r12, saved_context_r12(%rip) + movq %r13, 
saved_context_r13(%rip) + movq %r14, saved_context_r14(%rip) + movq %r15, saved_context_r15(%rip) + pushfq ; popq saved_context_eflags(%rip) + + movq $.L97, saved_eip(%rip) + + movq %rsp,saved_esp + movq %rbp,saved_ebp + movq %rbx,saved_ebx + movq %rdi,saved_edi + movq %rsi,saved_esi + + addq $8, %rsp + movl $3, %edi + xorl %eax, %eax + jmp acpi_enter_sleep_state +.L97: + .p2align 4,,7 +.L99: + .align 4 + movl $24, %eax + movw %ax, %ds + movq saved_context+58(%rip), %rax + movq %rax, %cr4 + movq saved_context+50(%rip), %rax + movq %rax, %cr3 + movq saved_context+42(%rip), %rax + movq %rax, %cr2 + movq saved_context+34(%rip), %rax + movq %rax, %cr0 + pushq saved_context_eflags(%rip) ; popfq + movq saved_context_esp(%rip), %rsp + movq saved_context_ebp(%rip), %rbp + movq saved_context_eax(%rip), %rax + movq saved_context_ebx(%rip), %rbx + movq saved_context_ecx(%rip), %rcx + movq saved_context_edx(%rip), %rdx + movq saved_context_esi(%rip), %rsi + movq saved_context_edi(%rip), %rdi + movq saved_context_r08(%rip), %r8 + movq saved_context_r09(%rip), %r9 + movq saved_context_r10(%rip), %r10 + movq saved_context_r11(%rip), %r11 + movq saved_context_r12(%rip), %r12 + movq saved_context_r13(%rip), %r13 + movq saved_context_r14(%rip), %r14 + movq saved_context_r15(%rip), %r15 + + xorl %eax, %eax + addq $8, %rsp + jmp restore_processor_state +.LFE5: +.Lfe5: + .size do_suspend_lowlevel,.Lfe5-do_suspend_lowlevel + +.data +ALIGN +ENTRY(saved_ebp) .quad 0 +ENTRY(saved_esi) .quad 0 +ENTRY(saved_edi) .quad 0 +ENTRY(saved_ebx) .quad 0 + +ENTRY(saved_eip) .quad 0 +ENTRY(saved_esp) .quad 0 + +ENTRY(saved_magic) .quad 0 + +ALIGN +# saved registers +saved_gdt: .quad 0,0 +saved_idt: .quad 0,0 +saved_ldt: .quad 0 +saved_tss: .quad 0 + +saved_cr0: .quad 0 +saved_cr3: .quad 0 +saved_cr4: .quad 0 +saved_efer: .quad 0 +saved_efer2: .quad 0 diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c new file mode 100644 index 00000000000..4baa99fe1e5 --- /dev/null +++ b/arch/x86_64/kernel/aperture.c @@ -0,0 +1,286 @@ +/* + * Firmware replacement code. + * + * Work around broken BIOSes that don't set an aperture or only set the + * aperture in the AGP bridge. + * If all fails map the aperture over some low memory. This is cheaper than + * doing bounce buffering. The memory is lost. This is done at early boot + * because only the bootmem allocator can allocate 32+MB. + * + * Copyright 2002 Andi Kleen, SuSE Labs. + * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $ + */ +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/mmzone.h> +#include <linux/pci_ids.h> +#include <linux/pci.h> +#include <linux/bitops.h> +#include <asm/e820.h> +#include <asm/io.h> +#include <asm/proto.h> +#include <asm/pci-direct.h> + +int iommu_aperture; +int iommu_aperture_disabled __initdata = 0; +int iommu_aperture_allowed __initdata = 0; + +int fallback_aper_order __initdata = 1; /* 64MB */ +int fallback_aper_force __initdata = 0; + +int fix_aperture __initdata = 1; + +/* This code runs before the PCI subsystem is initialized, so just + access the northbridge directly. 
*/ + +#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16)) + +static u32 __init allocate_aperture(void) +{ +#ifdef CONFIG_DISCONTIGMEM + pg_data_t *nd0 = NODE_DATA(0); +#else + pg_data_t *nd0 = &contig_page_data; +#endif + u32 aper_size; + void *p; + + if (fallback_aper_order > 7) + fallback_aper_order = 7; + aper_size = (32 * 1024 * 1024) << fallback_aper_order; + + /* + * Aperture has to be naturally aligned. This means an 2GB aperture won't + * have much chances to find a place in the lower 4GB of memory. + * Unfortunately we cannot move it up because that would make the + * IOMMU useless. + */ + p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); + if (!p || __pa(p)+aper_size > 0xffffffff) { + printk("Cannot allocate aperture memory hole (%p,%uK)\n", + p, aper_size>>10); + if (p) + free_bootmem_node(nd0, (unsigned long)p, aper_size); + return 0; + } + printk("Mapping aperture over %d KB of RAM @ %lx\n", + aper_size >> 10, __pa(p)); + return (u32)__pa(p); +} + +static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size) +{ + if (!aper_base) + return 0; + if (aper_size < 64*1024*1024) { + printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20); + return 0; + } + if (aper_base + aper_size >= 0xffffffff) { + printk("Aperture from %s beyond 4GB. Ignoring.\n",name); + return 0; + } + if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) { + printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name); + return 0; + } + return 1; +} + +/* Find a PCI capability */ +static __u32 __init find_cap(int num, int slot, int func, int cap) +{ + u8 pos; + int bytes; + if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) + return 0; + pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + u8 id; + pos &= ~3; + id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); + if (id == 0xff) + break; + if (id == cap) + return pos; + pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); + } + return 0; +} + +/* Read a standard AGPv3 bridge header */ +static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) +{ + u32 apsize; + u32 apsizereg; + int nbits; + u32 aper_low, aper_hi; + u64 aper; + + printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); + apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); + if (apsizereg == 0xffffffff) { + printk("APSIZE in AGP bridge unreadable\n"); + return 0; + } + + apsize = apsizereg & 0xfff; + /* Some BIOS use weird encodings not in the AGPv3 table. */ + if (apsize & 0xff) + apsize |= 0xf00; + nbits = hweight16(apsize); + *order = 7 - nbits; + if ((int)*order < 0) /* < 32MB */ + *order = 0; + + aper_low = read_pci_config(num,slot,func, 0x10); + aper_hi = read_pci_config(num,slot,func,0x14); + aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); + + printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", + aper, 32 << *order, apsizereg); + + if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order)) + return 0; + return (u32)aper; +} + +/* Look for an AGP bridge. Windows only expects the aperture in the + AGP bridge and some BIOS forget to initialize the Northbridge too. + Work around this here. + + Do an PCI bus scan by hand because we're running before the PCI + subsystem. + + All K8 AGP bridges are AGPv3 compliant, so we can do this scan + generically. 
It's probably overkill to always scan all slots because + the AGP bridges should be always an own bus on the HT hierarchy, + but do it here for future safety. */ +static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) +{ + int num, slot, func; + + /* Poor man's PCI discovery */ + for (num = 0; num < 32; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class, cap; + u8 type; + class = read_pci_config(num,slot,func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + switch (class >> 16) { + case PCI_CLASS_BRIDGE_HOST: + case PCI_CLASS_BRIDGE_OTHER: /* needed? */ + /* AGP bridge? */ + cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); + if (!cap) + break; + *valid_agp = 1; + return read_agp(num,slot,func,cap,order); + } + + /* No multi-function device? */ + type = read_pci_config_byte(num,slot,func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } + printk("No AGP bridge found\n"); + return 0; +} + +void __init iommu_hole_init(void) +{ + int fix, num; + u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0; + u64 aper_base, last_aper_base = 0; + int valid_agp = 0; + + if (iommu_aperture_disabled || !fix_aperture) + return; + + printk("Checking aperture...\n"); + + fix = 0; + for (num = 24; num < 32; num++) { + char name[30]; + if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) + continue; + + iommu_aperture = 1; + + aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; + aper_base <<= 25; + + printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, + aper_base, aper_size>>20); + + sprintf(name, "northbridge cpu %d", num-24); + + if (!aperture_valid(name, aper_base, aper_size)) { + fix = 1; + break; + } + + if ((last_aper_order && aper_order != last_aper_order) || + (last_aper_base && aper_base != last_aper_base)) { + fix = 1; + break; + } + last_aper_order = aper_order; + last_aper_base = aper_base; + } + + if (!fix && !fallback_aper_force) + return; + + if (!fallback_aper_force) + aper_alloc = search_agp_bridge(&aper_order, &valid_agp); + + if (aper_alloc) { + /* Got the aperture from the AGP bridge */ + } else if ((!no_iommu && end_pfn >= 0xffffffff>>PAGE_SHIFT) || + force_iommu || + valid_agp || + fallback_aper_force) { + printk("Your BIOS doesn't leave a aperture memory hole\n"); + printk("Please enable the IOMMU option in the BIOS setup\n"); + printk("This costs you %d MB of RAM\n", + 32 << fallback_aper_order); + + aper_order = fallback_aper_order; + aper_alloc = allocate_aperture(); + if (!aper_alloc) { + /* Could disable AGP and IOMMU here, but it's probably + not worth it. But the later users cannot deal with + bad apertures and turning on the aperture over memory + causes very strange problems, so it's better to + panic early. */ + panic("Not enough memory for aperture"); + } + } else { + return; + } + + /* Fix up the north bridges */ + for (num = 24; num < 32; num++) { + if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) + continue; + + /* Don't enable translation yet. That is done later. 
+ Assume this BIOS didn't initialise the GART so + just overwrite all previous bits */ + write_pci_config(0, num, 3, 0x90, aper_order<<1); + write_pci_config(0, num, 3, 0x94, aper_alloc>>25); + } +} diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c new file mode 100644 index 00000000000..9826f03dc9a --- /dev/null +++ b/arch/x86_64/kernel/apic.c @@ -0,0 +1,1088 @@ +/* + * Local APIC handling, local APIC timers + * + * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively. + * Maciej W. Rozycki : Various updates and fixes. + * Mikael Pettersson : Power Management for UP-APIC. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. + */ + +#include <linux/config.h> +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/irq.h> +#include <linux/delay.h> +#include <linux/bootmem.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/mc146818rtc.h> +#include <linux/kernel_stat.h> +#include <linux/sysdev.h> + +#include <asm/atomic.h> +#include <asm/smp.h> +#include <asm/mtrr.h> +#include <asm/mpspec.h> +#include <asm/pgalloc.h> +#include <asm/mach_apic.h> + +int apic_verbosity; + +int disable_apic_timer __initdata; + +/* Using APIC to generate smp_local_timer_interrupt? */ +int using_apic_timer = 0; + +static DEFINE_PER_CPU(int, prof_multiplier) = 1; +static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; +static DEFINE_PER_CPU(int, prof_counter) = 1; + +static void apic_pm_activate(void); + +void enable_NMI_through_LVT0 (void * dummy) +{ + unsigned int v, ver; + + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); + v = APIC_DM_NMI; /* unmask and set to NMI */ + apic_write_around(APIC_LVT0, v); +} + +int get_maxlvt(void) +{ + unsigned int v, ver, maxlvt; + + v = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(v); + maxlvt = GET_APIC_MAXLVT(v); + return maxlvt; +} + +void clear_local_APIC(void) +{ + int maxlvt; + unsigned int v; + + maxlvt = get_maxlvt(); + + /* + * Masking an LVT entry on a P6 can trigger a local APIC error + * if the vector is zero. Mask LVTERR first to prevent this. + */ + if (maxlvt >= 3) { + v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ + apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); + } + /* + * Careful: we have to set masks only first to deassert + * any level-triggered sources. + */ + v = apic_read(APIC_LVTT); + apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT1); + apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); + if (maxlvt >= 4) { + v = apic_read(APIC_LVTPC); + apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); + } + + /* + * Clean APIC state for other OSs: + */ + apic_write_around(APIC_LVTT, APIC_LVT_MASKED); + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + apic_write_around(APIC_LVT1, APIC_LVT_MASKED); + if (maxlvt >= 3) + apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); + if (maxlvt >= 4) + apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); + v = GET_APIC_VERSION(apic_read(APIC_LVR)); + if (APIC_INTEGRATED(v)) { /* !82489DX */ + if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } +} + +void __init connect_bsp_APIC(void) +{ + if (pic_mode) { + /* + * Do not trust the local APIC being empty at bootup. 
+ */ + clear_local_APIC(); + /* + * PIC mode, enable APIC mode in the IMCR, i.e. + * connect BSP's local APIC to INT and NMI lines. + */ + apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n"); + outb(0x70, 0x22); + outb(0x01, 0x23); + } +} + +void disconnect_bsp_APIC(void) +{ + if (pic_mode) { + /* + * Put the board back into PIC mode (has an effect + * only on certain older boards). Note that APIC + * interrupts, including IPIs, won't work beyond + * this point! The only exception are INIT IPIs. + */ + apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n"); + outb(0x70, 0x22); + outb(0x00, 0x23); + } +} + +void disable_local_APIC(void) +{ + unsigned int value; + + clear_local_APIC(); + + /* + * Disable APIC (implies clearing of registers + * for 82489DX!). + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write_around(APIC_SPIV, value); +} + +/* + * This is to verify that we're looking at a real local APIC. + * Check these against your board if the CPUs aren't getting + * started for no apparent reason. + */ +int __init verify_local_APIC(void) +{ + unsigned int reg0, reg1; + + /* + * The version register is read-only in a real APIC. + */ + reg0 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); + apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); + reg1 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); + + /* + * The two version reads above should print the same + * numbers. If the second one is different, then we + * poke at a non-APIC. + */ + if (reg1 != reg0) + return 0; + + /* + * Check if the version looks reasonably. + */ + reg1 = GET_APIC_VERSION(reg0); + if (reg1 == 0x00 || reg1 == 0xff) + return 0; + reg1 = get_maxlvt(); + if (reg1 < 0x02 || reg1 == 0xff) + return 0; + + /* + * The ID register is read/write in a real APIC. + */ + reg0 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); + apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); + reg1 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); + apic_write(APIC_ID, reg0); + if (reg1 != (reg0 ^ APIC_ID_MASK)) + return 0; + + /* + * The next two are just to see if we have sane values. + * They're only really relevant if we're in Virtual Wire + * compatibility mode, but most boxes are anymore. + */ + reg0 = apic_read(APIC_LVT0); + apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0); + reg1 = apic_read(APIC_LVT1); + apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); + + return 1; +} + +void __init sync_Arb_IDs(void) +{ + /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ + unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); + if (ver >= 0x14) /* P4 or higher */ + return; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); + apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG + | APIC_DM_INIT); +} + +extern void __error_in_apic_c (void); + +/* + * An initial setup of the virtual wire mode. + */ +void __init init_bsp_APIC(void) +{ + unsigned int value, ver; + + /* + * Don't do the setup now if we have a SMP BIOS as the + * through-I/O-APIC virtual wire mode might be active. + */ + if (smp_found_config || !cpu_has_apic) + return; + + value = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(value); + + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + + /* + * Enable APIC. 
+ */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= APIC_SPIV_FOCUS_DISABLED; + value |= SPURIOUS_APIC_VECTOR; + apic_write_around(APIC_SPIV, value); + + /* + * Set up the virtual wire mode. + */ + apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + value = APIC_DM_NMI; + if (!APIC_INTEGRATED(ver)) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + apic_write_around(APIC_LVT1, value); +} + +void __init setup_local_APIC (void) +{ + unsigned int value, ver, maxlvt; + + /* Pound the ESR really hard over the head with a big hammer - mbligh */ + if (esr_disable) { + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + } + + value = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(value); + + if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) + __error_in_apic_c(); + + /* + * Double-check whether this APIC is really registered. + * This is meaningless in clustered apic mode, so we skip it. + */ + if (!apic_id_registered()) + BUG(); + + /* + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ + init_apic_ldr(); + + /* + * Set Task Priority to 'accept all'. We never change this + * later on. + */ + value = apic_read(APIC_TASKPRI); + value &= ~APIC_TPRI_MASK; + apic_write_around(APIC_TASKPRI, value); + + /* + * Now that we are all set up, enable the APIC + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + /* + * Enable APIC + */ + value |= APIC_SPIV_APIC_ENABLED; + + /* + * Some unknown Intel IO/APIC (or APIC) errata is biting us with + * certain networking cards. If high frequency interrupts are + * happening on a particular IOAPIC pin, plus the IOAPIC routing + * entry is masked/unmasked at a high rate as well then sooner or + * later IOAPIC line gets 'stuck', no more interrupts are received + * from the device. If focus CPU is disabled then the hang goes + * away, oh well :-( + * + * [ This bug can be reproduced easily with a level-triggered + * PCI Ne2000 networking cards and PII/PIII processors, dual + * BX chipset. ] + */ + /* + * Actually disabling the focus CPU check just makes the hang less + * frequent as it makes the interrupt distributon model be more + * like LRU than MRU (the short-term load is more even across CPUs). + * See also the comment in end_level_ioapic_irq(). --macro + */ +#if 1 + /* Enable focus processor (bit==0) */ + value &= ~APIC_SPIV_FOCUS_DISABLED; +#else + /* Disable focus processor (bit==1) */ + value |= APIC_SPIV_FOCUS_DISABLED; +#endif + /* + * Set spurious IRQ vector + */ + value |= SPURIOUS_APIC_VECTOR; + apic_write_around(APIC_SPIV, value); + + /* + * Set up LVT0, LVT1: + * + * set up through-local-APIC on the BP's LINT0. This is not + * strictly necessary in pure symmetric-IO mode, but sometimes + * we delegate interrupts to the 8259A. + */ + /* + * TODO: set up through-local-APIC from through-I/O-APIC? --macro + */ + value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; + if (!smp_processor_id() && (pic_mode || !value)) { + value = APIC_DM_EXTINT; + apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id()); + } else { + value = APIC_DM_EXTINT | APIC_LVT_MASKED; + apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id()); + } + apic_write_around(APIC_LVT0, value); + + /* + * only the BP should see the LINT1 NMI signal, obviously. 
+ */ + if (!smp_processor_id()) + value = APIC_DM_NMI; + else + value = APIC_DM_NMI | APIC_LVT_MASKED; + if (!APIC_INTEGRATED(ver)) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + apic_write_around(APIC_LVT1, value); + + if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */ + unsigned oldvalue; + maxlvt = get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + value = ERROR_APIC_VECTOR; // enables sending errors + apic_write_around(APIC_LVTERR, value); + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, + "ESR value after enabling vector: %08x, after %08x\n", + oldvalue, value); + } else { + if (esr_disable) + /* + * Something untraceble is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh + */ + apic_printk(APIC_DEBUG, "Leaving ESR disabled.\n"); + else + apic_printk(APIC_DEBUG, "No ESR for 82489DX.\n"); + } + + nmi_watchdog_default(); + if (nmi_watchdog == NMI_LOCAL_APIC) + setup_apic_nmi_watchdog(); + apic_pm_activate(); +} + +#ifdef CONFIG_PM + +static struct { + /* 'active' is true if the local APIC was enabled by us and + not the BIOS; this signifies that we are also responsible + for disabling it before entering apm/acpi suspend */ + int active; + /* r/w apic fields */ + unsigned int apic_id; + unsigned int apic_taskpri; + unsigned int apic_ldr; + unsigned int apic_dfr; + unsigned int apic_spiv; + unsigned int apic_lvtt; + unsigned int apic_lvtpc; + unsigned int apic_lvt0; + unsigned int apic_lvt1; + unsigned int apic_lvterr; + unsigned int apic_tmict; + unsigned int apic_tdcr; + unsigned int apic_thmr; +} apic_pm_state; + +static int lapic_suspend(struct sys_device *dev, u32 state) +{ + unsigned long flags; + + if (!apic_pm_state.active) + return 0; + + apic_pm_state.apic_id = apic_read(APIC_ID); + apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); + apic_pm_state.apic_ldr = apic_read(APIC_LDR); + apic_pm_state.apic_dfr = apic_read(APIC_DFR); + apic_pm_state.apic_spiv = apic_read(APIC_SPIV); + apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); + apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); + apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); + apic_pm_state.apic_tmict = apic_read(APIC_TMICT); + apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); + local_save_flags(flags); + local_irq_disable(); + disable_local_APIC(); + local_irq_restore(flags); + return 0; +} + +static int lapic_resume(struct sys_device *dev) +{ + unsigned int l, h; + unsigned long flags; + + if (!apic_pm_state.active) + return 0; + + /* XXX: Pavel needs this for S3 resume, but can't explain why */ + set_fixmap_nocache(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); + + local_irq_save(flags); + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; + wrmsr(MSR_IA32_APICBASE, l, h); + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); + apic_write(APIC_ID, apic_pm_state.apic_id); + apic_write(APIC_DFR, apic_pm_state.apic_dfr); + apic_write(APIC_LDR, apic_pm_state.apic_ldr); + apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); + apic_write(APIC_SPIV, 
apic_pm_state.apic_spiv); + apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); + apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); + apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); + apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); + apic_write(APIC_TMICT, apic_pm_state.apic_tmict); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + local_irq_restore(flags); + return 0; +} + +static struct sysdev_class lapic_sysclass = { + set_kset_name("lapic"), + .resume = lapic_resume, + .suspend = lapic_suspend, +}; + +static struct sys_device device_lapic = { + .id = 0, + .cls = &lapic_sysclass, +}; + +static void __init apic_pm_activate(void) +{ + apic_pm_state.active = 1; +} + +static int __init init_lapic_sysfs(void) +{ + int error; + if (!cpu_has_apic) + return 0; + /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + error = sysdev_class_register(&lapic_sysclass); + if (!error) + error = sysdev_register(&device_lapic); + return error; +} +device_initcall(init_lapic_sysfs); + +#else /* CONFIG_PM */ + +static void apic_pm_activate(void) { } + +#endif /* CONFIG_PM */ + +static int __init apic_set_verbosity(char *str) +{ + if (strcmp("debug", str) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", str) == 0) + apic_verbosity = APIC_VERBOSE; + else + printk(KERN_WARNING "APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug", str); + + return 0; +} + +__setup("apic=", apic_set_verbosity); + +/* + * Detect and enable local APICs on non-SMP boards. + * Original code written by Keir Fraser. + * On AMD64 we trust the BIOS - if it says no APIC it is likely + * not correctly set up (usually the APIC timer won't work etc.) + */ + +static int __init detect_init_APIC (void) +{ + if (!cpu_has_apic) { + printk(KERN_INFO "No local APIC present\n"); + return -1; + } + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + boot_cpu_id = 0; + return 0; +} + +void __init init_apic_mappings(void) +{ + unsigned long apic_phys; + + /* + * If no local APIC can be found then set up a fake all + * zeroes page to simulate the local APIC and another + * one for the IO-APIC. + */ + if (!smp_found_config && detect_init_APIC()) { + apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + apic_phys = __pa(apic_phys); + } else + apic_phys = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + if (boot_cpu_id == -1U) + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); + +#ifdef CONFIG_X86_IO_APIC + { + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + int i; + + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_ioapics[i].mpc_apicaddr; + } else { + ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; + } + } +#endif +} + +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. 
During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. + */ + +#define APIC_DIVISOR 16 + +static void __setup_APIC_LVTT(unsigned int clocks) +{ + unsigned int lvtt_value, tmp_value, ver; + + ver = GET_APIC_VERSION(apic_read(APIC_LVR)); + lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + if (!APIC_INTEGRATED(ver)) + lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + apic_write_around(APIC_LVTT, lvtt_value); + + /* + * Divide PICLK by 16 + */ + tmp_value = apic_read(APIC_TDCR); + apic_write_around(APIC_TDCR, (tmp_value + & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) + | APIC_TDR_DIV_16); + + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); +} + +static void setup_APIC_timer(unsigned int clocks) +{ + unsigned long flags; + + local_irq_save(flags); + + /* For some reasons this doesn't work on Simics, so fake it for now */ + if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) { + __setup_APIC_LVTT(clocks); + return; + } + + /* wait for irq slice */ + if (vxtime.hpet_address) { + int trigger = hpet_readl(HPET_T0_CMP); + while (hpet_readl(HPET_COUNTER) >= trigger) + /* do nothing */ ; + while (hpet_readl(HPET_COUNTER) < trigger) + /* do nothing */ ; + } else { + int c1, c2; + outb_p(0x00, 0x43); + c2 = inb_p(0x40); + c2 |= inb_p(0x40) << 8; + do { + c1 = c2; + outb_p(0x00, 0x43); + c2 = inb_p(0x40); + c2 |= inb_p(0x40) << 8; + } while (c2 - c1 < 300); + } + + __setup_APIC_LVTT(clocks); + + local_irq_restore(flags); +} + +/* + * In this function we calibrate APIC bus clocks to the external + * timer. Unfortunately we cannot use jiffies and the timer irq + * to calibrate, since some later bootup code depends on getting + * the first irq? Ugh. + * + * We want to do the calibration only once since we + * want to have local timer irqs syncron. CPUs connected + * by the same APIC bus have the very same bus frequency. + * And we want to have irqs off anyways, no accidental + * APIC irq that way. + */ + +#define TICK_COUNT 100000000 + +static int __init calibrate_APIC_clock(void) +{ + int apic, apic_start, tsc, tsc_start; + int result; + /* + * Put whatever arbitrary (but long enough) timeout + * value into the APIC clock, we just want to get the + * counter running for calibration. + */ + __setup_APIC_LVTT(1000000000); + + apic_start = apic_read(APIC_TMCCT); + rdtscl(tsc_start); + + do { + apic = apic_read(APIC_TMCCT); + rdtscl(tsc); + } while ((tsc - tsc_start) < TICK_COUNT && (apic - apic_start) < TICK_COUNT); + + result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start); + + printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", + result / 1000 / 1000, result / 1000 % 1000); + + return result * APIC_DIVISOR / HZ; +} + +static unsigned int calibration_result; + +void __init setup_boot_APIC_clock (void) +{ + if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); + return; + } + + printk(KERN_INFO "Using local APIC timer interrupts.\n"); + using_apic_timer = 1; + + local_irq_disable(); + + calibration_result = calibrate_APIC_clock(); + /* + * Now set up the timer for real. + */ + setup_APIC_timer(calibration_result); + + local_irq_enable(); +} + +void __init setup_secondary_APIC_clock(void) +{ + local_irq_disable(); /* FIXME: Do we need this? 
--RR */ + setup_APIC_timer(calibration_result); + local_irq_enable(); +} + +void __init disable_APIC_timer(void) +{ + if (using_apic_timer) { + unsigned long v; + + v = apic_read(APIC_LVTT); + apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + } +} + +void enable_APIC_timer(void) +{ + if (using_apic_timer) { + unsigned long v; + + v = apic_read(APIC_LVTT); + apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); + } +} + +/* + * the frequency of the profiling timer can be changed + * by writing a multiplier value into /proc/profile. + */ +int setup_profiling_timer(unsigned int multiplier) +{ + int i; + + /* + * Sanity check. [at least 500 APIC cycles should be + * between APIC interrupts as a rule of thumb, to avoid + * irqs flooding us] + */ + if ( (!multiplier) || (calibration_result/multiplier < 500)) + return -EINVAL; + + /* + * Set the new multiplier for each CPU. CPUs don't start using the + * new values until the next timer interrupt in which they do process + * accounting. At that time they also adjust their APIC timers + * accordingly. + */ + for (i = 0; i < NR_CPUS; ++i) + per_cpu(prof_multiplier, i) = multiplier; + + return 0; +} + +#undef APIC_DIVISOR + +/* + * Local timer interrupt handler. It does both profiling and + * process statistics/rescheduling. + * + * We do profiling in every local tick, statistics/rescheduling + * happen only every 'profiling multiplier' ticks. The default + * multiplier is 1 and it can be changed by writing the new multiplier + * value into /proc/profile. + */ + +void smp_local_timer_interrupt(struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + + profile_tick(CPU_PROFILING, regs); + if (--per_cpu(prof_counter, cpu) <= 0) { + /* + * The multiplier may have changed since the last time we got + * to this point as a result of the user writing to + * /proc/profile. In this case we need to adjust the APIC + * timer accordingly. + * + * Interrupts are already masked off at this point. + */ + per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); + if (per_cpu(prof_counter, cpu) != + per_cpu(prof_old_multiplier, cpu)) { + __setup_APIC_LVTT(calibration_result/ + per_cpu(prof_counter, cpu)); + per_cpu(prof_old_multiplier, cpu) = + per_cpu(prof_counter, cpu); + } + +#ifdef CONFIG_SMP + update_process_times(user_mode(regs)); +#endif + } + + /* + * We take the 'long' return path, and there every subsystem + * grabs the appropriate locks (kernel lock/ irq lock). + * + * we might want to decouple profiling from the 'long path', + * and do the profiling totally in assembly. + * + * Currently this isn't too much of an issue (performance wise), + * we can take more than 100K local irqs per second on a 100 MHz P5. + */ +} + +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ +void smp_apic_timer_interrupt(struct pt_regs *regs) +{ + /* + * the NMI deadlock-detector uses this. + */ + add_pda(apic_timer_irqs, 1); + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. 
+ */ + irq_enter(); + smp_local_timer_interrupt(regs); + irq_exit(); +} + +/* + * oem_force_hpet_timer -- force HPET mode for some boxes. + * + * Thus far, the major user of this is IBM's Summit2 series: + * + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. Use available data to take a good guess. + * If in doubt, go HPET. + */ +__init int oem_force_hpet_timer(void) +{ + int i, clusters, zeros; + unsigned id; + DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); + + bitmap_empty(clustermap, NUM_APIC_CLUSTERS); + + for (i = 0; i < NR_CPUS; i++) { + id = bios_cpu_apicid[i]; + if (id != BAD_APICID) + __set_bit(APIC_CLUSTERID(id), clustermap); + } + + /* Problem: Partially populated chassis may not have CPUs in some of + * the APIC clusters they have been allocated. Only present CPUs have + * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since + * clusters are allocated sequentially, count zeros only if they are + * bounded by ones. + */ + clusters = 0; + zeros = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { + if (test_bit(i, clustermap)) { + clusters += 1 + zeros; + zeros = 0; + } else + ++zeros; + } + + /* + * If clusters > 2, then should be multi-chassis. Return 1 for HPET. + * Else return 0 to use TSC. + * May have to revisit this when multi-core + hyperthreaded CPUs come + * out, but AFAIK this will work even for them. + */ + return (clusters > 2); +} + +/* + * This interrupt should _never_ happen with our APIC/SMP architecture + */ +asmlinkage void smp_spurious_interrupt(void) +{ + unsigned int v; + irq_enter(); + /* + * Check if this really is a spurious interrupt and ACK it + * if it is a vectored one. Just in case... + * Spurious interrupts should not be ACKed. + */ + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) + ack_APIC_irq(); + +#if 0 + static unsigned long last_warning; + static unsigned long skipped; + + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ + if (time_before(last_warning+30*HZ,jiffies)) { + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n", + smp_processor_id(), skipped); + last_warning = jiffies; + skipped = 0; + } else { + skipped++; + } +#endif + irq_exit(); +} + +/* + * This interrupt should never happen with our APIC/SMP architecture + */ + +asmlinkage void smp_error_interrupt(void) +{ + unsigned int v, v1; + + irq_enter(); + /* First tickle the hardware, only then report what went on. -- REW */ + v = apic_read(APIC_ESR); + apic_write(APIC_ESR, 0); + v1 = apic_read(APIC_ESR); + ack_APIC_irq(); + atomic_inc(&irq_err_count); + + /* Here is what the APIC error bits mean: + 0: Send CS error + 1: Receive CS error + 2: Send accept error + 3: Receive accept error + 4: Reserved + 5: Send illegal vector + 6: Received illegal vector + 7: Illegal register address + */ + printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", + smp_processor_id(), v , v1); + irq_exit(); +} + +int disable_apic; + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. 
+ */ +int __init APIC_init_uniprocessor (void) +{ + if (disable_apic) { + printk(KERN_INFO "Apic disabled\n"); + return -1; + } + if (!cpu_has_apic) { + disable_apic = 1; + printk(KERN_INFO "Apic disabled by BIOS\n"); + return -1; + } + + verify_local_APIC(); + + connect_bsp_APIC(); + + phys_cpu_present_map = physid_mask_of_physid(0); + apic_write_around(APIC_ID, boot_cpu_id); + + setup_local_APIC(); + +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); + else + nr_ioapics = 0; +#endif + setup_boot_APIC_clock(); + + return 0; +} + +static __init int setup_disableapic(char *str) +{ + disable_apic = 1; + return 0; +} + +static __init int setup_nolapic(char *str) +{ + disable_apic = 1; + return 0; +} + +static __init int setup_noapictimer(char *str) +{ + disable_apic_timer = 1; + return 0; +} + +/* dummy parsing: see setup.c */ + +__setup("disableapic", setup_disableapic); +__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */ + +__setup("noapictimer", setup_noapictimer); + +/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */ diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c new file mode 100644 index 00000000000..35b4c3fcbb3 --- /dev/null +++ b/arch/x86_64/kernel/asm-offsets.c @@ -0,0 +1,69 @@ +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ + +#include <linux/sched.h> +#include <linux/stddef.h> +#include <linux/errno.h> +#include <linux/hardirq.h> +#include <linux/suspend.h> +#include <asm/pda.h> +#include <asm/processor.h> +#include <asm/segment.h> +#include <asm/thread_info.h> +#include <asm/ia32.h> + +#define DEFINE(sym, val) \ + asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + +#define BLANK() asm volatile("\n->" : : ) + +int main(void) +{ +#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) + ENTRY(state); + ENTRY(flags); + ENTRY(thread); + ENTRY(pid); + BLANK(); +#undef ENTRY +#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) + ENTRY(flags); + ENTRY(addr_limit); + ENTRY(preempt_count); + BLANK(); +#undef ENTRY +#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) + ENTRY(kernelstack); + ENTRY(oldrsp); + ENTRY(pcurrent); + ENTRY(irqrsp); + ENTRY(irqcount); + ENTRY(cpunumber); + ENTRY(irqstackptr); + BLANK(); +#undef ENTRY +#ifdef CONFIG_IA32_EMULATION +#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) + ENTRY(eax); + ENTRY(ebx); + ENTRY(ecx); + ENTRY(edx); + ENTRY(esi); + ENTRY(edi); + ENTRY(ebp); + ENTRY(esp); + ENTRY(eip); + BLANK(); +#undef ENTRY + DEFINE(IA32_RT_SIGFRAME_sigcontext, + offsetof (struct rt_sigframe32, uc.uc_mcontext)); + BLANK(); +#endif + DEFINE(pbe_address, offsetof(struct pbe, address)); + DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); + DEFINE(pbe_next, offsetof(struct pbe, next)); + return 0; +} diff --git a/arch/x86_64/kernel/cpufreq/Kconfig b/arch/x86_64/kernel/cpufreq/Kconfig new file mode 100644 index 00000000000..81f1562e539 --- /dev/null +++ b/arch/x86_64/kernel/cpufreq/Kconfig @@ -0,0 +1,96 @@ +# +# CPU Frequency scaling +# + +menu "CPU Frequency scaling" + +source "drivers/cpufreq/Kconfig" + +if CPU_FREQ + +comment "CPUFreq processor drivers" + +config X86_POWERNOW_K8 + tristate "AMD Opteron/Athlon64 PowerNow!" 
+ select CPU_FREQ_TABLE + help + This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors. + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. + +config X86_POWERNOW_K8_ACPI + bool + depends on X86_POWERNOW_K8 && ACPI_PROCESSOR + depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m) + default y + +config X86_SPEEDSTEP_CENTRINO + tristate "Intel Enhanced SpeedStep" + select CPU_FREQ_TABLE + depends on ACPI_PROCESSOR + help + This adds the CPUFreq driver for Enhanced SpeedStep enabled + mobile CPUs. This means Intel Pentium M (Centrino) CPUs + or 64bit enabled Intel Xeons. + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. + +config X86_SPEEDSTEP_CENTRINO_ACPI + bool + depends on X86_SPEEDSTEP_CENTRINO + default y + +config X86_ACPI_CPUFREQ + tristate "ACPI Processor P-States driver" + depends on ACPI_PROCESSOR + help + This driver adds a CPUFreq driver which utilizes the ACPI + Processor Performance States. + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. + +comment "shared options" + +config X86_ACPI_CPUFREQ_PROC_INTF + bool "/proc/acpi/processor/../performance interface (deprecated)" + depends on PROC_FS + depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K8_ACPI + help + This enables the deprecated /proc/acpi/processor/../performance + interface. While it is helpful for debugging, the generic, + cross-architecture cpufreq interfaces should be used. + + If in doubt, say N. + +config X86_P4_CLOCKMOD + tristate "Intel Pentium 4 clock modulation" + depends on EMBEDDED + help + This adds the clock modulation driver for Intel Pentium 4 / XEON + processors. When enabled it will lower CPU temperature by skipping + clocks. + + This driver should be only used in exceptional + circumstances when very low power is needed because it causes severe + slowdowns and noticeable latencies. Normally Speedstep should be used + instead. + + For details, take a look at <file:Documentation/cpu-freq/>. + + Unless you are absolutely sure say N. + + +config X86_SPEEDSTEP_LIB + tristate + default X86_P4_CLOCKMOD + +endif + +endmenu + diff --git a/arch/x86_64/kernel/cpufreq/Makefile b/arch/x86_64/kernel/cpufreq/Makefile new file mode 100644 index 00000000000..d8b59387922 --- /dev/null +++ b/arch/x86_64/kernel/cpufreq/Makefile @@ -0,0 +1,17 @@ +# +# Reuse the i386 cpufreq drivers +# + +SRCDIR := ../../../i386/kernel/cpu/cpufreq + +obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o +obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o +obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o +obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o +obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o + +powernow-k8-objs := ${SRCDIR}/powernow-k8.o +speedstep-centrino-objs := ${SRCDIR}/speedstep-centrino.o +acpi-cpufreq-objs := ${SRCDIR}/acpi-cpufreq.o +p4-clockmod-objs := ${SRCDIR}/p4-clockmod.o +speedstep-lib-objs := ${SRCDIR}/speedstep-lib.o diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c new file mode 100644 index 00000000000..657003e461e --- /dev/null +++ b/arch/x86_64/kernel/e820.c @@ -0,0 +1,513 @@ +/* + * Handle the memory map. + * The functions here do the job until bootmem takes over. 
+ * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $ + */ +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/ioport.h> +#include <linux/string.h> +#include <asm/page.h> +#include <asm/e820.h> +#include <asm/proto.h> +#include <asm/bootsetup.h> + +extern char _end[]; + +/* + * PFN of last memory page. + */ +unsigned long end_pfn; + +/* + * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. + * The direct mapping extends to end_pfn_map, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. + */ +unsigned long end_pfn_map; + +/* + * Last pfn which the user wants to use. + */ +unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT; + +extern struct resource code_resource, data_resource; + +/* Check for some hardcoded bad areas that early boot is not allowed to touch */ +static inline int bad_addr(unsigned long *addrp, unsigned long size) +{ + unsigned long addr = *addrp, last = addr + size; + + /* various gunk below that needed for SMP startup */ + if (addr < 0x8000) { + *addrp = 0x8000; + return 1; + } + + /* direct mapping tables of the kernel */ + if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { + *addrp = table_end << PAGE_SHIFT; + return 1; + } + + /* initrd */ +#ifdef CONFIG_BLK_DEV_INITRD + if (LOADER_TYPE && INITRD_START && last >= INITRD_START && + addr < INITRD_START+INITRD_SIZE) { + *addrp = INITRD_START + INITRD_SIZE; + return 1; + } +#endif + /* kernel code + 640k memory hole (later should not be needed, but + be paranoid for now) */ + if (last >= 640*1024 && addr < __pa_symbol(&_end)) { + *addrp = __pa_symbol(&_end); + return 1; + } + /* XXX ramdisk image here? */ + return 0; +} + +int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size < start) + continue; + return 1; + } + return 0; +} + +/* + * Find a free area in a specific range. + */ +unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long addr = ei->addr, last; + if (ei->type != E820_RAM) + continue; + if (addr < start) + addr = start; + if (addr > ei->addr + ei->size) + continue; + while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size) + ; + last = addr + size; + if (last > ei->addr + ei->size) + continue; + if (last > end) + continue; + return addr; + } + return -1UL; +} + +/* + * Free bootmem based on the e820 table for a node. 
+ */ +void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long last, addr; + + if (ei->type != E820_RAM || + ei->addr+ei->size <= start || + ei->addr > end) + continue; + + addr = round_up(ei->addr, PAGE_SIZE); + if (addr < start) + addr = start; + + last = round_down(ei->addr + ei->size, PAGE_SIZE); + if (last >= end) + last = end; + + if (last > addr && last-addr >= PAGE_SIZE) + free_bootmem_node(pgdat, addr, last-addr); + } +} + +/* + * Find the highest page frame number we have available + */ +unsigned long __init e820_end_of_ram(void) +{ + int i; + unsigned long end_pfn = 0; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long start, end; + + start = round_up(ei->addr, PAGE_SIZE); + end = round_down(ei->addr + ei->size, PAGE_SIZE); + if (start >= end) + continue; + if (ei->type == E820_RAM) { + if (end > end_pfn<<PAGE_SHIFT) + end_pfn = end>>PAGE_SHIFT; + } else { + if (end > end_pfn_map<<PAGE_SHIFT) + end_pfn_map = end>>PAGE_SHIFT; + } + } + + if (end_pfn > end_pfn_map) + end_pfn_map = end_pfn; + if (end_pfn_map > MAXMEM>>PAGE_SHIFT) + end_pfn_map = MAXMEM>>PAGE_SHIFT; + if (end_pfn > end_user_pfn) + end_pfn = end_user_pfn; + if (end_pfn > end_pfn_map) + end_pfn = end_pfn_map; + + return end_pfn; +} + +/* + * Mark e820 reserved areas as busy for the resource manager. + */ +void __init e820_reserve_resources(void) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct resource *res; + if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) + continue; + res = alloc_bootmem_low(sizeof(struct resource)); + switch (e820.map[i].type) { + case E820_RAM: res->name = "System RAM"; break; + case E820_ACPI: res->name = "ACPI Tables"; break; + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; + default: res->name = "reserved"; + } + res->start = e820.map[i].addr; + res->end = res->start + e820.map[i].size - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + request_resource(&iomem_resource, res); + if (e820.map[i].type == E820_RAM) { + /* + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. + */ + request_resource(res, &code_resource); + request_resource(res, &data_resource); + } + } +} + +/* + * Add a memory region to the kernel e820 map. + */ +void __init add_memory_region(unsigned long start, unsigned long size, int type) +{ + int x = e820.nr_map; + + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; + } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; +} + +void __init e820_print_map(char *who) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + printk(" %s: %016Lx - %016Lx ", who, + (unsigned long long) e820.map[i].addr, + (unsigned long long) (e820.map[i].addr + e820.map[i].size)); + switch (e820.map[i].type) { + case E820_RAM: printk("(usable)\n"); + break; + case E820_RESERVED: + printk("(reserved)\n"); + break; + case E820_ACPI: + printk("(ACPI data)\n"); + break; + case E820_NVS: + printk("(ACPI NVS)\n"); + break; + default: printk("type %u\n", e820.map[i].type); + break; + } + } +} + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps. 
+ * + */ +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +{ + struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ + }; + static struct change_member change_point_list[2*E820MAX] __initdata; + static struct change_member *change_point[2*E820MAX] __initdata; + static struct e820entry *overlap_list[E820MAX] __initdata; + static struct e820entry new_bios[E820MAX] __initdata; + struct change_member *change_tmp; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx, still_changing; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr; + int i; + + /* + Visually we're performing the following (1,2,3,4 = memory types)... + + Sample memory map (w/overlaps): + ____22__________________ + ______________________4_ + ____1111________________ + _44_____________________ + 11111111________________ + ____________________33__ + ___________44___________ + __________33333_________ + ______________22________ + ___________________2222_ + _________111111111______ + _____________________11_ + _________________4______ + + Sanitized equivalent (no overlap): + 1_______________________ + _44_____________________ + ___1____________________ + ____22__________________ + ______11________________ + _________1______________ + __________3_____________ + ___________44___________ + _____________33_________ + _______________2________ + ________________1_______ + _________________4______ + ___________________2____ + ____________________33__ + ______________________4_ + */ + + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) + return -1; + + old_nr = *pnr_map; + + /* bail out if we find any unreasonable addresses in bios map */ + for (i=0; i<old_nr; i++) + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + return -1; + + /* create pointers for initial change-point information (for sorting) */ + for (i=0; i < 2*old_nr; i++) + change_point[i] = &change_point_list[i]; + + /* record all known change-points (starting and ending addresses) */ + chgidx = 0; + for (i=0; i < old_nr; i++) { + change_point[chgidx]->addr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + + /* sort change-point list by memory addresses (low -> high) */ + still_changing = 1; + while (still_changing) { + still_changing = 0; + for (i=1; i < 2*old_nr; i++) { + /* if <current_addr> > <last_addr>, swap */ + /* or, if current=<start_addr> & last=<end_addr>, swap */ + if ((change_point[i]->addr < change_point[i-1]->addr) || + ((change_point[i]->addr == change_point[i-1]->addr) && + (change_point[i]->addr == change_point[i]->pbios->addr) && + (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) + ) + { + change_tmp = change_point[i]; + change_point[i] = change_point[i-1]; + change_point[i-1] = change_tmp; + still_changing=1; + } + } + } + + /* create a new bios memory map, removing overlaps */ + overlap_entries=0; /* number of entries in the overlap table */ + new_bios_entry=0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ + for (chgidx=0; chgidx < 2*old_nr; chgidx++) + { + /* keep track of all overlapping bios entries */ + if 
(change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) + { + /* add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++]=change_point[chgidx]->pbios; + } + else + { + /* remove entry from list (order independent, so swap with last) */ + for (i=0; i<overlap_entries; i++) + { + if (overlap_list[i] == change_point[chgidx]->pbios) + overlap_list[i] = overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* if there are overlapping entries, decide which "type" to use */ + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + current_type = 0; + for (i=0; i<overlap_entries; i++) + if (overlap_list[i]->type > current_type) + current_type = overlap_list[i]->type; + /* continue building up new bios map based on this information */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* move forward only if the new size was non-zero */ + if (new_bios[new_bios_entry].size != 0) + if (++new_bios_entry >= E820MAX) + break; /* no more space left for new bios entries */ + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr=change_point[chgidx]->addr; + } + last_type = current_type; + } + } + new_nr = new_bios_entry; /* retain count for new bios entries */ + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + *pnr_map = new_nr; + + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + * + * We check to see that the memory map contains at least 2 elements + * before we'll use it, because the detection code in setup.S may + * not be perfect and most every PC known to man has two memory + * regions: one from 0 to 640k, and one from 1mb up. (The IBM + * thinkpad 560x, for example, does not cooperate with the memory + * detection code.) + */ +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +{ + /* Only one memory region (or negative)? Ignore it */ + if (nr_map < 2) + return -1; + + do { + unsigned long start = biosmap->addr; + unsigned long size = biosmap->size; + unsigned long end = start + size; + unsigned long type = biosmap->type; + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + + /* + * Some BIOSes claim RAM in the 640k - 1M region. + * Not right. Fix it up. + * + * This should be removed on Hammer which is supposed to not + * have non e820 covered ISA mappings there, but I had some strange + * problems so it stays for now. -AK + */ + if (type == E820_RAM) { + if (start < 0x100000ULL && end > 0xA0000ULL) { + if (start < 0xA0000ULL) + add_memory_region(start, 0xA0000ULL-start, type); + if (end <= 0x100000ULL) + continue; + start = 0x100000ULL; + size = end - start; + } + } + + add_memory_region(start, size, type); + } while (biosmap++,--nr_map); + return 0; +} + +void __init setup_memory_region(void) +{ + char *who = "BIOS-e820"; + + /* + * Try to copy the BIOS-supplied E820-map. 
+ * + * Otherwise fake a memory map; one section from 0k->640k, + * the next section from 1mb->appropriate_mem_k + */ + sanitize_e820_map(E820_MAP, &E820_MAP_NR); + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { + unsigned long mem_size; + + /* compare results from other methods and take the greater */ + if (ALT_MEM_K < EXT_MEM_K) { + mem_size = EXT_MEM_K; + who = "BIOS-88"; + } else { + mem_size = ALT_MEM_K; + who = "BIOS-e801"; + } + + e820.nr_map = 0; + add_memory_region(0, LOWMEMSIZE(), E820_RAM); + add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); + } + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + e820_print_map(who); +} + +void __init parse_memopt(char *p, char **from) +{ + end_user_pfn = memparse(p, from); + end_user_pfn >>= PAGE_SHIFT; +} + diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c new file mode 100644 index 00000000000..750bcd0655d --- /dev/null +++ b/arch/x86_64/kernel/early_printk.c @@ -0,0 +1,220 @@ +#include <linux/console.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/string.h> +#include <asm/io.h> +#include <asm/processor.h> + +/* Simple VGA output */ + +#ifdef __i386__ +#define VGABASE (__ISA_IO_base + 0xb8000) +#else +#define VGABASE ((void __iomem *)0xffffffff800b8000UL) +#endif + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +static int current_ypos = 1, current_xpos = 0; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= MAX_YPOS) { + /* scroll 1 line up */ + for (k = 1, j = 0; k < MAX_YPOS; k++, j++) { + for (i = 0; i < MAX_XPOS; i++) { + writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), + VGABASE + 2*(MAX_XPOS*j + i)); + } + } + for (i = 0; i < MAX_XPOS; i++) + writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); + current_ypos = MAX_YPOS-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(MAX_XPOS*current_ypos + + current_xpos++)); + if (current_xpos >= MAX_XPOS) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ + +int early_serial_base = 0x3f8; /* ttyS0 */ + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + outb(ch, early_serial_base + TXR); + return timeout ? 
0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + early_serial_putc(*s); + if (*s == '\n') + early_serial_putc('\r'); + s++; + } +} + +#define DEFAULT_BAUD 9600 + +static __init void early_serial_init(char *s) +{ + unsigned char c; + unsigned divisor; + unsigned baud = DEFAULT_BAUD; + char *e; + + if (*s == ',') + ++s; + + if (*s) { + unsigned port; + if (!strncmp(s,"0x",2)) { + early_serial_base = simple_strtoul(s, &e, 16); + } else { + static int bases[] = { 0x3f8, 0x2f8 }; + + if (!strncmp(s,"ttyS",4)) + s += 4; + port = simple_strtoul(s, &e, 10); + if (port > 1 || s == e) + port = 0; + early_serial_base = bases[port]; + } + s += strcspn(s, ","); + if (*s == ',') + s++; + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + if (*s) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +struct console *early_console = &early_vga_console; +static int early_console_initialized = 0; + +void early_printk(const char *fmt, ...) +{ + char buf[512]; + int n; + va_list ap; + + va_start(ap,fmt); + n = vscnprintf(buf,512,fmt,ap); + early_console->write(early_console,buf,n); + va_end(ap); +} + +static int keep_early; + +int __init setup_early_printk(char *opt) +{ + char *space; + char buf[256]; + + if (early_console_initialized) + return -1; + + opt = strchr(opt, '=') + 1; + + strlcpy(buf,opt,sizeof(buf)); + space = strchr(buf, ' '); + if (space) + *space = 0; + + if (strstr(buf,"keep")) + keep_early = 1; + + if (!strncmp(buf, "serial", 6)) { + early_serial_init(buf + 6); + early_console = &early_serial_console; + } else if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf); + early_console = &early_serial_console; + } else if (!strncmp(buf, "vga", 3)) { + early_console = &early_vga_console; + } + early_console_initialized = 1; + register_console(early_console); + return 0; +} + +void __init disable_early_printk(void) +{ + if (!early_console_initialized || !early_console) + return; + if (!keep_early) { + printk("disabling early console\n"); + unregister_console(early_console); + early_console_initialized = 0; + } else { + printk("keeping early console\n"); + } +} + +__setup("earlyprintk=", setup_early_printk); diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S new file mode 100644 index 00000000000..e126284db7a --- /dev/null +++ b/arch/x86_64/kernel/entry.S @@ -0,0 +1,920 @@ +/* + * linux/arch/x86_64/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> + * + * $Id$ + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * + * NOTE: This code handles signal-recognition, which happens every time + * after an interrupt and after each system call. 
+ * + * Normal syscalls and interrupts don't save a full stack frame, this is + * only done for syscall tracing, signals or fork/exec et.al. + * + * A note on terminology: + * - top of stack: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. + * - partial stack frame: partially saved registers upto R11. + * - full stack frame: Like partial stack frame, but all register saved. + * + * TODO: + * - schedule it carefully for the final hardware. + */ + +#define ASSEMBLY 1 +#include <linux/config.h> +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/smp.h> +#include <asm/cache.h> +#include <asm/errno.h> +#include <asm/dwarf2.h> +#include <asm/calling.h> +#include <asm/offset.h> +#include <asm/msr.h> +#include <asm/unistd.h> +#include <asm/thread_info.h> +#include <asm/hw_irq.h> + + .code64 + +#ifdef CONFIG_PREEMPT +#define preempt_stop cli +#else +#define preempt_stop +#define retint_kernel retint_restore_args +#endif + +/* + * C code is not supposed to know about undefined top of stack. Every time + * a C function with an pt_regs argument is called from the SYSCALL based + * fast path FIXUP_TOP_OF_STACK is needed. + * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs + * manipulation. + */ + + /* %rsp:at FRAMEEND */ + .macro FIXUP_TOP_OF_STACK tmp + movq %gs:pda_oldrsp,\tmp + movq \tmp,RSP(%rsp) + movq $__USER_DS,SS(%rsp) + movq $__USER_CS,CS(%rsp) + movq $-1,RCX(%rsp) + movq R11(%rsp),\tmp /* get eflags */ + movq \tmp,EFLAGS(%rsp) + .endm + + .macro RESTORE_TOP_OF_STACK tmp,offset=0 + movq RSP-\offset(%rsp),\tmp + movq \tmp,%gs:pda_oldrsp + movq EFLAGS-\offset(%rsp),\tmp + movq \tmp,R11-\offset(%rsp) + .endm + + .macro FAKE_STACK_FRAME child_rip + /* push in order ss, rsp, eflags, cs, rip */ + xorq %rax, %rax + pushq %rax /* ss */ + CFI_ADJUST_CFA_OFFSET 8 + pushq %rax /* rsp */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_OFFSET rip,0 + pushq $(1<<9) /* eflags - interrupts on */ + CFI_ADJUST_CFA_OFFSET 8 + pushq $__KERNEL_CS /* cs */ + CFI_ADJUST_CFA_OFFSET 8 + pushq \child_rip /* rip */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_OFFSET rip,0 + pushq %rax /* orig rax */ + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro UNFAKE_STACK_FRAME + addq $8*6, %rsp + CFI_ADJUST_CFA_OFFSET -(6*8) + .endm + + .macro CFI_DEFAULT_STACK + CFI_ADJUST_CFA_OFFSET (SS) + CFI_OFFSET r15,R15-SS + CFI_OFFSET r14,R14-SS + CFI_OFFSET r13,R13-SS + CFI_OFFSET r12,R12-SS + CFI_OFFSET rbp,RBP-SS + CFI_OFFSET rbx,RBX-SS + CFI_OFFSET r11,R11-SS + CFI_OFFSET r10,R10-SS + CFI_OFFSET r9,R9-SS + CFI_OFFSET r8,R8-SS + CFI_OFFSET rax,RAX-SS + CFI_OFFSET rcx,RCX-SS + CFI_OFFSET rdx,RDX-SS + CFI_OFFSET rsi,RSI-SS + CFI_OFFSET rdi,RDI-SS + CFI_OFFSET rsp,RSP-SS + CFI_OFFSET rip,RIP-SS + .endm +/* + * A newly forked process directly context switches into this. + */ +/* rdi: prev */ +ENTRY(ret_from_fork) + CFI_STARTPROC + CFI_DEFAULT_STACK + call schedule_tail + GET_THREAD_INFO(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) + jnz rff_trace +rff_action: + RESTORE_REST + testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? + je int_ret_from_sys_call + testl $_TIF_IA32,threadinfo_flags(%rcx) + jnz int_ret_from_sys_call + RESTORE_TOP_OF_STACK %rdi,ARGOFFSET + jmp ret_from_sys_call +rff_trace: + movq %rsp,%rdi + call syscall_trace_leave + GET_THREAD_INFO(%rcx) + jmp rff_action + CFI_ENDPROC + +/* + * System call entry. Upto 6 arguments in registers are supported. 
+ * + * SYSCALL does not save anything on the stack and does not change the + * stack pointer. + */ + +/* + * Register setup: + * rax system call number + * rdi arg0 + * rcx return address for syscall/sysret, C arg3 + * rsi arg1 + * rdx arg2 + * r10 arg3 (--> moved to rcx for C) + * r8 arg4 + * r9 arg5 + * r11 eflags for syscall/sysret, temporary for C + * r12-r15,rbp,rbx saved by C code, not touched. + * + * Interrupts are off on entry. + * Only called from user space. + * + * XXX if we had a free scratch register we could save the RSP into the stack frame + * and report it properly in ps. Unfortunately we haven't. + */ + +ENTRY(system_call) + CFI_STARTPROC + swapgs + movq %rsp,%gs:pda_oldrsp + movq %gs:pda_kernelstack,%rsp + sti + SAVE_ARGS 8,1 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + movq %rcx,RIP-ARGOFFSET(%rsp) + GET_THREAD_INFO(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) + jnz tracesys + cmpq $__NR_syscall_max,%rax + ja badsys + movq %r10,%rcx + call *sys_call_table(,%rax,8) # XXX: rip relative + movq %rax,RAX-ARGOFFSET(%rsp) +/* + * Syscall return path ending with SYSRET (fast path) + * Has incomplete stack frame and undefined top of stack. + */ + .globl ret_from_sys_call +ret_from_sys_call: + movl $_TIF_WORK_MASK,%edi + /* edi: flagmask */ +sysret_check: + GET_THREAD_INFO(%rcx) + cli + movl threadinfo_flags(%rcx),%edx + andl %edi,%edx + jnz sysret_careful + movq RIP-ARGOFFSET(%rsp),%rcx + RESTORE_ARGS 0,-ARG_SKIP,1 + movq %gs:pda_oldrsp,%rsp + swapgs + sysretq + + /* Handle reschedules */ + /* edx: work, edi: workmask */ +sysret_careful: + bt $TIF_NEED_RESCHED,%edx + jnc sysret_signal + sti + pushq %rdi + call schedule + popq %rdi + jmp sysret_check + + /* Handle a signal */ +sysret_signal: + sti + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + jz 1f + + /* Really a signal */ + /* edx: work flags (arg3) */ + leaq do_notify_resume(%rip),%rax + leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 + xorl %esi,%esi # oldset -> arg2 + call ptregscall_common +1: movl $_TIF_NEED_RESCHED,%edi + jmp sysret_check + + /* Do syscall tracing */ +tracesys: + SAVE_REST + movq $-ENOSYS,RAX(%rsp) + FIXUP_TOP_OF_STACK %rdi + movq %rsp,%rdi + call syscall_trace_enter + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + cmpq $__NR_syscall_max,%rax + ja 1f + movq %r10,%rcx /* fixup for C */ + call *sys_call_table(,%rax,8) + movq %rax,RAX-ARGOFFSET(%rsp) +1: SAVE_REST + movq %rsp,%rdi + call syscall_trace_leave + RESTORE_TOP_OF_STACK %rbx + RESTORE_REST + jmp ret_from_sys_call + +badsys: + movq $-ENOSYS,RAX-ARGOFFSET(%rsp) + jmp ret_from_sys_call + +/* + * Syscall return path ending with IRET. + * Has correct top of stack, but partial stack frame. + */ +ENTRY(int_ret_from_sys_call) + cli + testl $3,CS-ARGOFFSET(%rsp) + je retint_restore_args + movl $_TIF_ALLWORK_MASK,%edi + /* edi: mask to check */ +int_with_check: + GET_THREAD_INFO(%rcx) + movl threadinfo_flags(%rcx),%edx + andl %edi,%edx + jnz int_careful + jmp retint_swapgs + + /* Either reschedule or signal or syscall exit tracking needed. */ + /* First do a reschedule test. 
*/ + /* edx: work, edi: workmask */ +int_careful: + bt $TIF_NEED_RESCHED,%edx + jnc int_very_careful + sti + pushq %rdi + call schedule + popq %rdi + jmp int_with_check + + /* handle signals and tracing -- both require a full stack frame */ +int_very_careful: + sti + SAVE_REST + /* Check for syscall exit trace */ + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx + jz int_signal + pushq %rdi + leaq 8(%rsp),%rdi # &ptregs -> arg1 + call syscall_trace_leave + popq %rdi + btr $TIF_SYSCALL_TRACE,%edi + btr $TIF_SYSCALL_AUDIT,%edi + btr $TIF_SINGLESTEP,%edi + jmp int_restore_rest + +int_signal: + testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx + jz 1f + movq %rsp,%rdi # &ptregs -> arg1 + xorl %esi,%esi # oldset -> arg2 + call do_notify_resume +1: movl $_TIF_NEED_RESCHED,%edi +int_restore_rest: + RESTORE_REST + jmp int_with_check + CFI_ENDPROC + +/* + * Certain special system calls that need to save a complete full stack frame. + */ + + .macro PTREGSCALL label,func,arg + .globl \label +\label: + leaq \func(%rip),%rax + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ + jmp ptregscall_common + .endm + + PTREGSCALL stub_clone, sys_clone, %r8 + PTREGSCALL stub_fork, sys_fork, %rdi + PTREGSCALL stub_vfork, sys_vfork, %rdi + PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx + PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx + PTREGSCALL stub_iopl, sys_iopl, %rsi + +ENTRY(ptregscall_common) + CFI_STARTPROC + popq %r11 + CFI_ADJUST_CFA_OFFSET -8 + SAVE_REST + movq %r11, %r15 + FIXUP_TOP_OF_STACK %r11 + call *%rax + RESTORE_TOP_OF_STACK %r11 + movq %r15, %r11 + RESTORE_REST + pushq %r11 + CFI_ADJUST_CFA_OFFSET 8 + ret + CFI_ENDPROC + +ENTRY(stub_execve) + CFI_STARTPROC + popq %r11 + CFI_ADJUST_CFA_OFFSET -8 + SAVE_REST + movq %r11, %r15 + FIXUP_TOP_OF_STACK %r11 + call sys_execve + GET_THREAD_INFO(%rcx) + bt $TIF_IA32,threadinfo_flags(%rcx) + jc exec_32bit + RESTORE_TOP_OF_STACK %r11 + movq %r15, %r11 + RESTORE_REST + push %r11 + ret + +exec_32bit: + CFI_ADJUST_CFA_OFFSET REST_SKIP + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC + +/* + * sigreturn is special because it needs to restore all registers on return. + * This cannot be done with SYSRET, so use the IRET return path instead. + */ +ENTRY(stub_rt_sigreturn) + CFI_STARTPROC + addq $8, %rsp + SAVE_REST + movq %rsp,%rdi + FIXUP_TOP_OF_STACK %r11 + call sys_rt_sigreturn + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC + +/* + * Interrupt entry/exit. + * + * Interrupt entry points save only callee clobbered registers in fast path. + * + * Entry runs with interrupts off. + */ + +/* 0(%rsp): interrupt number */ + .macro interrupt func + CFI_STARTPROC simple + CFI_DEF_CFA rsp,(SS-RDI) + CFI_REL_OFFSET rsp,(RSP-ORIG_RAX) + CFI_REL_OFFSET rip,(RIP-ORIG_RAX) + cld +#ifdef CONFIG_DEBUG_INFO + SAVE_ALL + movq %rsp,%rdi + /* + * Setup a stack frame pointer. This allows gdb to trace + * back to the original stack. 
+ */ + movq %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp +#else + SAVE_ARGS + leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler +#endif + testl $3,CS(%rdi) + je 1f + swapgs +1: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count + movq %gs:pda_irqstackptr,%rax + cmoveq %rax,%rsp + pushq %rdi # save old stack + call \func + .endm + +ENTRY(common_interrupt) + interrupt do_IRQ + /* 0(%rsp): oldrsp-ARGOFFSET */ +ret_from_intr: + popq %rdi + cli + subl $1,%gs:pda_irqcount +#ifdef CONFIG_DEBUG_INFO + movq RBP(%rdi),%rbp +#endif + leaq ARGOFFSET(%rdi),%rsp +exit_intr: + GET_THREAD_INFO(%rcx) + testl $3,CS-ARGOFFSET(%rsp) + je retint_kernel + + /* Interrupt came from user space */ + /* + * Has a correct top of stack, but a partial stack frame + * %rcx: thread info. Interrupts off. + */ +retint_with_reschedule: + movl $_TIF_WORK_MASK,%edi +retint_check: + movl threadinfo_flags(%rcx),%edx + andl %edi,%edx + jnz retint_careful +retint_swapgs: + cli + swapgs +retint_restore_args: + cli + RESTORE_ARGS 0,8,0 +iret_label: + iretq + + .section __ex_table,"a" + .quad iret_label,bad_iret + .previous + .section .fixup,"ax" + /* force a signal here? this matches i386 behaviour */ + /* running with kernel gs */ +bad_iret: + movq $-9999,%rdi /* better code? */ + jmp do_exit + .previous + + /* edi: workmask, edx: work */ +retint_careful: + bt $TIF_NEED_RESCHED,%edx + jnc retint_signal + sti + pushq %rdi + call schedule + popq %rdi + GET_THREAD_INFO(%rcx) + cli + jmp retint_check + +retint_signal: + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + jz retint_swapgs + sti + SAVE_REST + movq $-1,ORIG_RAX(%rsp) + xorq %rsi,%rsi # oldset + movq %rsp,%rdi # &pt_regs + call do_notify_resume + RESTORE_REST + cli + movl $_TIF_NEED_RESCHED,%edi + GET_THREAD_INFO(%rcx) + jmp retint_check + +#ifdef CONFIG_PREEMPT + /* Returning to kernel space. Check if we need preemption */ + /* rcx: threadinfo. interrupts off. */ + .p2align +retint_kernel: + cmpl $0,threadinfo_preempt_count(%rcx) + jnz retint_restore_args + bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) + jnc retint_restore_args + bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ + jnc retint_restore_args + call preempt_schedule_irq + jmp exit_intr +#endif + CFI_ENDPROC + +/* + * APIC interrupts. + */ + .macro apicinterrupt num,func + pushq $\num-256 + interrupt \func + jmp ret_from_intr + CFI_ENDPROC + .endm + +ENTRY(thermal_interrupt) + apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt + +#ifdef CONFIG_SMP +ENTRY(reschedule_interrupt) + apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt + +ENTRY(invalidate_interrupt) + apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt + +ENTRY(call_function_interrupt) + apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt +#endif + +#ifdef CONFIG_X86_LOCAL_APIC +ENTRY(apic_timer_interrupt) + apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt + +ENTRY(error_interrupt) + apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt + +ENTRY(spurious_interrupt) + apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt +#endif + +/* + * Exception entry points. 
+ */ + .macro zeroentry sym + pushq $0 /* push error code/oldrax */ + pushq %rax /* push real oldrax to the rdi slot */ + leaq \sym(%rip),%rax + jmp error_entry + .endm + + .macro errorentry sym + pushq %rax + leaq \sym(%rip),%rax + jmp error_entry + .endm + + /* error code is on the stack already */ + /* handle NMI like exceptions that can happen everywhere */ + .macro paranoidentry sym + SAVE_ALL + cld + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f + swapgs + xorl %ebx,%ebx +1: movq %rsp,%rdi + movq ORIG_RAX(%rsp),%rsi + movq $-1,ORIG_RAX(%rsp) + call \sym + .endm + +/* + * Exception entry point. This expects an error code/orig_rax on the stack + * and the exception handler in %rax. + */ +ENTRY(error_entry) + CFI_STARTPROC simple + CFI_DEF_CFA rsp,(SS-RDI) + CFI_REL_OFFSET rsp,(RSP-RDI) + CFI_REL_OFFSET rip,(RIP-RDI) + /* rdi slot contains rax, oldrax contains error code */ + cld + subq $14*8,%rsp + CFI_ADJUST_CFA_OFFSET (14*8) + movq %rsi,13*8(%rsp) + CFI_REL_OFFSET rsi,RSI + movq 14*8(%rsp),%rsi /* load rax from rdi slot */ + movq %rdx,12*8(%rsp) + CFI_REL_OFFSET rdx,RDX + movq %rcx,11*8(%rsp) + CFI_REL_OFFSET rcx,RCX + movq %rsi,10*8(%rsp) /* store rax */ + CFI_REL_OFFSET rax,RAX + movq %r8, 9*8(%rsp) + CFI_REL_OFFSET r8,R8 + movq %r9, 8*8(%rsp) + CFI_REL_OFFSET r9,R9 + movq %r10,7*8(%rsp) + CFI_REL_OFFSET r10,R10 + movq %r11,6*8(%rsp) + CFI_REL_OFFSET r11,R11 + movq %rbx,5*8(%rsp) + CFI_REL_OFFSET rbx,RBX + movq %rbp,4*8(%rsp) + CFI_REL_OFFSET rbp,RBP + movq %r12,3*8(%rsp) + CFI_REL_OFFSET r12,R12 + movq %r13,2*8(%rsp) + CFI_REL_OFFSET r13,R13 + movq %r14,1*8(%rsp) + CFI_REL_OFFSET r14,R14 + movq %r15,(%rsp) + CFI_REL_OFFSET r15,R15 + xorl %ebx,%ebx + testl $3,CS(%rsp) + je error_kernelspace +error_swapgs: + swapgs +error_sti: + movq %rdi,RDI(%rsp) + movq %rsp,%rdi + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) + call *%rax + /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ +error_exit: + movl %ebx,%eax + RESTORE_REST + cli + GET_THREAD_INFO(%rcx) + testl %eax,%eax + jne retint_kernel + movl threadinfo_flags(%rcx),%edx + movl $_TIF_WORK_MASK,%edi + andl %edi,%edx + jnz retint_careful + swapgs + RESTORE_ARGS 0,8,0 + iretq + CFI_ENDPROC + +error_kernelspace: + incl %ebx + /* There are two places in the kernel that can potentially fault with + usergs. Handle them here. The exception handlers after + iret run with kernel gs again, so don't set the user space flag. + B stepping K8s sometimes report an truncated RIP for IRET + exceptions returning to compat mode. Check for these here too. */ + leaq iret_label(%rip),%rbp + cmpq %rbp,RIP(%rsp) + je error_swapgs + movl %ebp,%ebp /* zero extend */ + cmpq %rbp,RIP(%rsp) + je error_swapgs + cmpq $gs_change,RIP(%rsp) + je error_swapgs + jmp error_sti + + /* Reload gs selector with exception handling */ + /* edi: new selector */ +ENTRY(load_gs_index) + pushf + cli + swapgs +gs_change: + movl %edi,%gs +2: mfence /* workaround */ + swapgs + popf + ret + + .section __ex_table,"a" + .align 8 + .quad gs_change,bad_gs + .previous + .section .fixup,"ax" + /* running with kernelgs */ +bad_gs: + swapgs /* switch back to user gs */ + xorl %eax,%eax + movl %eax,%gs + jmp 2b + .previous + +/* + * Create a kernel thread. 
+ * + * C extern interface: + * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) + * + * asm input arguments: + * rdi: fn, rsi: arg, rdx: flags + */ +ENTRY(kernel_thread) + CFI_STARTPROC + FAKE_STACK_FRAME $child_rip + SAVE_ALL + + # rdi: flags, rsi: usp, rdx: will be &pt_regs + movq %rdx,%rdi + orq kernel_thread_flags(%rip),%rdi + movq $-1, %rsi + movq %rsp, %rdx + + xorl %r8d,%r8d + xorl %r9d,%r9d + + # clone now + call do_fork + movq %rax,RAX(%rsp) + xorl %edi,%edi + + /* + * It isn't worth to check for reschedule here, + * so internally to the x86_64 port you can rely on kernel_thread() + * not to reschedule the child before returning, this avoids the need + * of hacks for example to fork off the per-CPU idle tasks. + * [Hopefully no generic code relies on the reschedule -AK] + */ + RESTORE_ALL + UNFAKE_STACK_FRAME + ret + CFI_ENDPROC + + +child_rip: + /* + * Here we are in the child and the registers are set as they were + * at kernel_thread() invocation in the parent. + */ + movq %rdi, %rax + movq %rsi, %rdi + call *%rax + # exit + xorq %rdi, %rdi + call do_exit + +/* + * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. + * + * C extern interface: + * extern long execve(char *name, char **argv, char **envp) + * + * asm input arguments: + * rdi: name, rsi: argv, rdx: envp + * + * We want to fallback into: + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) + * + * do_sys_execve asm fallback arguments: + * rdi: name, rsi: argv, rdx: envp, fake frame on the stack + */ +ENTRY(execve) + CFI_STARTPROC + FAKE_STACK_FRAME $0 + SAVE_ALL + call sys_execve + movq %rax, RAX(%rsp) + RESTORE_REST + testq %rax,%rax + je int_ret_from_sys_call + RESTORE_ARGS + UNFAKE_STACK_FRAME + ret + CFI_ENDPROC + +ENTRY(page_fault) + errorentry do_page_fault + +ENTRY(coprocessor_error) + zeroentry do_coprocessor_error + +ENTRY(simd_coprocessor_error) + zeroentry do_simd_coprocessor_error + +ENTRY(device_not_available) + zeroentry math_state_restore + + /* runs on exception stack */ +ENTRY(debug) + CFI_STARTPROC + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + paranoidentry do_debug + /* switch back to process stack to restore the state ptrace touched */ + movq %rax,%rsp + testl $3,CS(%rsp) + jnz paranoid_userspace + jmp paranoid_exit + CFI_ENDPROC + + /* runs on exception stack */ +ENTRY(nmi) + CFI_STARTPROC + pushq $-1 + CFI_ADJUST_CFA_OFFSET 8 + paranoidentry do_nmi + /* ebx: no swapgs flag */ +paranoid_exit: + testl %ebx,%ebx /* swapgs needed? 
*/ + jnz paranoid_restore +paranoid_swapgs: + cli + swapgs +paranoid_restore: + RESTORE_ALL 8 + iretq +paranoid_userspace: + cli + GET_THREAD_INFO(%rcx) + movl threadinfo_flags(%rcx),%edx + testl $_TIF_NEED_RESCHED,%edx + jnz paranoid_resched + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + jnz paranoid_signal + jmp paranoid_swapgs +paranoid_resched: + sti + call schedule + jmp paranoid_exit +paranoid_signal: + sti + xorl %esi,%esi /* oldset */ + movq %rsp,%rdi /* &pt_regs */ + call do_notify_resume + jmp paranoid_exit + CFI_ENDPROC + +ENTRY(int3) + zeroentry do_int3 + +ENTRY(overflow) + zeroentry do_overflow + +ENTRY(bounds) + zeroentry do_bounds + +ENTRY(invalid_op) + zeroentry do_invalid_op + +ENTRY(coprocessor_segment_overrun) + zeroentry do_coprocessor_segment_overrun + +ENTRY(reserved) + zeroentry do_reserved + + /* runs on exception stack */ +ENTRY(double_fault) + CFI_STARTPROC + paranoidentry do_double_fault + movq %rax,%rsp + testl $3,CS(%rsp) + jnz paranoid_userspace + jmp paranoid_exit + CFI_ENDPROC + +ENTRY(invalid_TSS) + errorentry do_invalid_TSS + +ENTRY(segment_not_present) + errorentry do_segment_not_present + + /* runs on exception stack */ +ENTRY(stack_segment) + CFI_STARTPROC + paranoidentry do_stack_segment + movq %rax,%rsp + testl $3,CS(%rsp) + jnz paranoid_userspace + jmp paranoid_exit + CFI_ENDPROC + +ENTRY(general_protection) + errorentry do_general_protection + +ENTRY(alignment_check) + errorentry do_alignment_check + +ENTRY(divide_error) + zeroentry do_divide_error + +ENTRY(spurious_interrupt_bug) + zeroentry do_spurious_interrupt_bug + +#ifdef CONFIG_X86_MCE + /* runs on exception stack */ +ENTRY(machine_check) + CFI_STARTPROC + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + paranoidentry do_machine_check + jmp paranoid_exit + CFI_ENDPROC +#endif + +ENTRY(call_debug) + zeroentry do_call_debug + diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c new file mode 100644 index 00000000000..d2c42fb99cb --- /dev/null +++ b/arch/x86_64/kernel/genapic.c @@ -0,0 +1,89 @@ +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Generic APIC sub-arch probe layer. + * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include <linux/config.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/module.h> + +#include <asm/smp.h> +#include <asm/ipi.h> + +/* which logical CPU number maps to which CPU (physical APIC ID) */ +u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +EXPORT_SYMBOL(x86_cpu_to_apicid); +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + +extern struct genapic apic_cluster; +extern struct genapic apic_flat; + +struct genapic *genapic = &apic_flat; + + +/* + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 
+ */ +void __init clustered_apic_check(void) +{ + long i; + u8 clusters, max_cluster; + u8 id; + u8 cluster_cnt[NUM_APIC_CLUSTERS]; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + /* AMD always uses flat mode right now */ + genapic = &apic_flat; + goto print; + } + + memset(cluster_cnt, 0, sizeof(cluster_cnt)); + + for (i = 0; i < NR_CPUS; i++) { + id = bios_cpu_apicid[i]; + if (id != BAD_APICID) + cluster_cnt[APIC_CLUSTERID(id)]++; + } + + clusters = 0; + max_cluster = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { + if (cluster_cnt[i] > 0) { + ++clusters; + if (cluster_cnt[i] > max_cluster) + max_cluster = cluster_cnt[i]; + } + } + + /* + * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode, + * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical + * else physical mode. + * (We don't use lowest priority delivery + HW APIC IRQ steering, so + * can ignore the clustered logical case and go straight to physical.) + */ + if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) + genapic = &apic_flat; + else + genapic = &apic_cluster; + +print: + printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); +} + +/* Same for both flat and clustered. */ + +void send_IPI_self(int vector) +{ + __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +} diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c new file mode 100644 index 00000000000..9703da7202e --- /dev/null +++ b/arch/x86_64/kernel/genapic_cluster.c @@ -0,0 +1,130 @@ +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Clustered APIC subarch code. Up to 255 CPUs, physical delivery. + * (A more realistic maximum is around 230 CPUs.) + * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include <linux/config.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <asm/smp.h> +#include <asm/ipi.h> + + +/* + * Set up the logical destination ID. + * + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ +static void cluster_init_apic_ldr(void) +{ + unsigned long val, id; + long i, count; + u8 lid; + u8 my_id = hard_smp_processor_id(); + u8 my_cluster = APIC_CLUSTER(my_id); + + /* Create logical APIC IDs by counting CPUs already in cluster. */ + for (count = 0, i = NR_CPUS; --i >= 0; ) { + lid = x86_cpu_to_log_apicid[i]; + if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) + ++count; + } + /* + * We only have a 4 wide bitmap in cluster mode. There's no way + * to get above 60 CPUs and still give each one it's own bit. + * But, we're using physical IRQ delivery, so we don't care. + * Use bit 3 for the 4th through Nth CPU in each cluster. + */ + if (count >= XAPIC_DEST_CPUS_SHIFT) + count = 3; + id = my_cluster | (1UL << count); + x86_cpu_to_log_apicid[smp_processor_id()] = id; + apic_write_around(APIC_DFR, APIC_DFR_CLUSTER); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(id); + apic_write_around(APIC_LDR, val); +} + +/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. 
*/ + +static cpumask_t cluster_target_cpus(void) +{ + return cpumask_of_cpu(0); +} + +static void cluster_send_IPI_mask(cpumask_t mask, int vector) +{ + send_IPI_mask_sequence(mask, vector); +} + +static void cluster_send_IPI_allbutself(int vector) +{ + cpumask_t mask = cpu_online_map; + cpu_clear(smp_processor_id(), mask); + + if (!cpus_empty(mask)) + cluster_send_IPI_mask(mask, vector); +} + +static void cluster_send_IPI_all(int vector) +{ + cluster_send_IPI_mask(cpu_online_map, vector); +} + +static int cluster_apic_id_registered(void) +{ + return 1; +} + +static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = first_cpu(cpumask); + if ((unsigned)cpu < NR_CPUS) + return x86_cpu_to_apicid[cpu]; + else + return BAD_APICID; +} + +/* cpuid returns the value latched in the HW at reset, not the APIC ID + * register's value. For any box whose BIOS changes APIC IDs, like + * clustered APIC systems, we must use hard_smp_processor_id. + * + * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID. + */ +static unsigned int phys_pkg_id(int index_msb) +{ + return hard_smp_processor_id() >> index_msb; +} + +struct genapic apic_cluster = { + .name = "clustered", + .int_delivery_mode = dest_Fixed, + .int_dest_mode = (APIC_DEST_PHYSICAL != 0), + .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED, + .target_cpus = cluster_target_cpus, + .apic_id_registered = cluster_apic_id_registered, + .init_apic_ldr = cluster_init_apic_ldr, + .send_IPI_all = cluster_send_IPI_all, + .send_IPI_allbutself = cluster_send_IPI_allbutself, + .send_IPI_mask = cluster_send_IPI_mask, + .cpu_mask_to_apicid = cluster_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, +}; diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c new file mode 100644 index 00000000000..b4cbbad0422 --- /dev/null +++ b/arch/x86_64/kernel/genapic_flat.c @@ -0,0 +1,127 @@ +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Flat APIC subarch code. Maximum 8 CPUs, logical delivery. + * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include <linux/config.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <asm/smp.h> +#include <asm/ipi.h> + + +static cpumask_t flat_target_cpus(void) +{ + return cpu_online_map; +} + +/* + * Set up the logical destination ID. + * + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ +static void flat_init_apic_ldr(void) +{ + unsigned long val; + unsigned long num, id; + + num = smp_processor_id(); + id = 1UL << num; + x86_cpu_to_log_apicid[num] = id; + apic_write_around(APIC_DFR, APIC_DFR_FLAT); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(id); + apic_write_around(APIC_LDR, val); +} + +static void flat_send_IPI_allbutself(int vector) +{ + /* + * if there are no other CPUs in the system then + * we get an APIC send error if we try to broadcast. + * thus we have to avoid sending IPIs in this case. 
+ */ + if (num_online_cpus() > 1) + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); +} + +static void flat_send_IPI_all(int vector) +{ + __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); +} + +static void flat_send_IPI_mask(cpumask_t cpumask, int vector) +{ + unsigned long mask = cpus_addr(cpumask)[0]; + unsigned long cfg; + unsigned long flags; + + local_save_flags(flags); + local_irq_disable(); + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + /* + * prepare target chip field + */ + cfg = __prepare_ICR2(mask); + apic_write_around(APIC_ICR2, cfg); + + /* + * program the ICR + */ + cfg = __prepare_ICR(0, vector, APIC_DEST_LOGICAL); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write_around(APIC_ICR, cfg); + local_irq_restore(flags); +} + +static int flat_apic_id_registered(void) +{ + return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); +} + +static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) +{ + return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; +} + +static unsigned int phys_pkg_id(int index_msb) +{ + u32 ebx; + + ebx = cpuid_ebx(1); + return ((ebx >> 24) & 0xFF) >> index_msb; +} + +struct genapic apic_flat = { + .name = "flat", + .int_delivery_mode = dest_LowestPrio, + .int_dest_mode = (APIC_DEST_LOGICAL != 0), + .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST, + .target_cpus = flat_target_cpus, + .apic_id_registered = flat_apic_id_registered, + .init_apic_ldr = flat_init_apic_ldr, + .send_IPI_all = flat_send_IPI_all, + .send_IPI_allbutself = flat_send_IPI_allbutself, + .send_IPI_mask = flat_send_IPI_mask, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, +}; diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S new file mode 100644 index 00000000000..b6d8725c1f6 --- /dev/null +++ b/arch/x86_64/kernel/head.S @@ -0,0 +1,396 @@ +/* + * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit + * + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> + * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> + * + * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $ + */ + + +#include <linux/linkage.h> +#include <linux/threads.h> +#include <asm/desc.h> +#include <asm/segment.h> +#include <asm/page.h> +#include <asm/msr.h> +#include <asm/cache.h> + +/* we are not able to switch in one step to the final KERNEL ADRESS SPACE + * because we need identity-mapped pages on setup so define __START_KERNEL to + * 0x100000 for this stage + * + */ + + .text + .code32 + .globl startup_32 +/* %bx: 1 if coming from smp trampoline on secondary cpu */ +startup_32: + + /* + * At this point the CPU runs in 32bit protected mode (CS.D = 1) with + * paging disabled and the point of this file is to switch to 64bit + * long mode with a kernel mapping for kerneland to jump into the + * kernel virtual addresses. + * There is no stack until we set one up. + */ + + /* Initialize the %ds segment register */ + movl $__KERNEL_DS,%eax + movl %eax,%ds + + /* Load new GDT with the 64bit segments using 32bit descriptor */ + lgdt pGDT32 - __START_KERNEL_map + + /* If the CPU doesn't support CPUID this will double fault. + * Unfortunately it is hard to check for CPUID without a stack. 
+ */ + + /* Check if extended functions are implemented */ + movl $0x80000000, %eax + cpuid + cmpl $0x80000000, %eax + jbe no_long_mode + /* Check if long mode is implemented */ + mov $0x80000001, %eax + cpuid + btl $29, %edx + jnc no_long_mode + + /* + * Prepare for entering 64bits mode + */ + + /* Enable PAE mode */ + xorl %eax, %eax + btsl $5, %eax + movl %eax, %cr4 + + /* Setup early boot stage 4 level pagetables */ + movl $(init_level4_pgt - __START_KERNEL_map), %eax + movl %eax, %cr3 + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + + /* Enable Long Mode */ + btsl $_EFER_LME, %eax + + /* Make changes effective */ + wrmsr + + xorl %eax, %eax + btsl $31, %eax /* Enable paging and in turn activate Long Mode */ + btsl $0, %eax /* Enable protected mode */ + /* Make changes effective */ + movl %eax, %cr0 + /* + * At this point we're in long mode but in 32bit compatibility mode + * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn + * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use + * the new gdt/idt that has __KERNEL_CS with CS.L = 1. + */ + ljmp $__KERNEL_CS, $(startup_64 - __START_KERNEL_map) + + .code64 + .org 0x100 + .globl startup_64 +startup_64: + /* We come here either from startup_32 + * or directly from a 64bit bootloader. + * Since we may have come directly from a bootloader we + * reload the page tables here. + */ + + /* Enable PAE mode and PGE */ + xorq %rax, %rax + btsq $5, %rax + btsq $7, %rax + movq %rax, %cr4 + + /* Setup early boot stage 4 level pagetables. */ + movq $(init_level4_pgt - __START_KERNEL_map), %rax + movq %rax, %cr3 + + /* Check if nx is implemented */ + movl $0x80000001, %eax + cpuid + movl %edx,%edi + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + + /* Enable System Call */ + btsl $_EFER_SCE, %eax + + /* No Execute supported? */ + btl $20,%edi + jnc 1f + btsl $_EFER_NX, %eax +1: + /* Make changes effective */ + wrmsr + + /* Setup cr0 */ + xorq %rax, %rax + btsq $31, %rax /* Enable paging */ + btsq $0, %rax /* Enable protected mode */ + btsq $1, %rax /* Enable MP */ + btsq $4, %rax /* Enable ET */ + btsq $5, %rax /* Enable NE */ + btsq $16, %rax /* Enable WP */ + btsq $18, %rax /* Enable AM */ + /* Make changes effective */ + movq %rax, %cr0 + + /* Setup a boot time stack */ + movq init_rsp(%rip),%rsp + + /* zero EFLAGS after setting rsp */ + pushq $0 + popfq + + /* + * We must switch to a new descriptor in kernel space for the GDT + * because soon the kernel won't have access anymore to the userspace + * addresses where we're currently running on. We have to do that here + * because in 32bit we couldn't load a 64bit linear address. + */ + lgdt cpu_gdt_descr + + /* + * Setup up a dummy PDA. this is just for some early bootup code + * that does in_interrupt() + */ + movl $MSR_GS_BASE,%ecx + movq $empty_zero_page,%rax + movq %rax,%rdx + shrq $32,%rdx + wrmsr + + /* set up data segments. actually 0 would do too */ + movl $__KERNEL_DS,%eax + movl %eax,%ds + movl %eax,%ss + movl %eax,%es + + /* esi is pointer to real mode structure with interesting info. 
+ pass it to C */ + movl %esi, %edi + + /* Finally jump to run C code and to be on real kernel address + * Since we are running on identity-mapped space we have to jump + * to the full 64bit address , this is only possible as indirect + * jump + */ + movq initial_code(%rip),%rax + jmp *%rax + + /* SMP bootup changes these two */ + .globl initial_code +initial_code: + .quad x86_64_start_kernel + .globl init_rsp +init_rsp: + .quad init_thread_union+THREAD_SIZE-8 + +ENTRY(early_idt_handler) + xorl %eax,%eax + movq 8(%rsp),%rsi # get rip + movq (%rsp),%rdx + movq %cr2,%rcx + leaq early_idt_msg(%rip),%rdi + call early_printk +1: hlt + jmp 1b + +early_idt_msg: + .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n" + +.code32 +ENTRY(no_long_mode) + /* This isn't an x86-64 CPU so hang */ +1: + jmp 1b + +.org 0xf00 + .globl pGDT32 +pGDT32: + .word gdt_end-cpu_gdt_table + .long cpu_gdt_table-__START_KERNEL_map + +.org 0xf10 +ljumpvector: + .long startup_64-__START_KERNEL_map + .word __KERNEL_CS + +ENTRY(stext) +ENTRY(_stext) + + /* + * This default setting generates an ident mapping at address 0x100000 + * and a mapping for the kernel that precisely maps virtual address + * 0xffffffff80000000 to physical address 0x000000. (always using + * 2Mbyte large pages provided by PAE mode) + */ +.org 0x1000 +ENTRY(init_level4_pgt) + .quad 0x0000000000102007 /* -> level3_ident_pgt */ + .fill 255,8,0 + .quad 0x000000000010a007 + .fill 254,8,0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad 0x0000000000103007 /* -> level3_kernel_pgt */ + +.org 0x2000 +ENTRY(level3_ident_pgt) + .quad 0x0000000000104007 + .fill 511,8,0 + +.org 0x3000 +ENTRY(level3_kernel_pgt) + .fill 510,8,0 + /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ + .quad 0x0000000000105007 /* -> level2_kernel_pgt */ + .fill 1,8,0 + +.org 0x4000 +ENTRY(level2_ident_pgt) + /* 40MB for bootup. */ + .quad 0x0000000000000283 + .quad 0x0000000000200183 + .quad 0x0000000000400183 + .quad 0x0000000000600183 + .quad 0x0000000000800183 + .quad 0x0000000000A00183 + .quad 0x0000000000C00183 + .quad 0x0000000000E00183 + .quad 0x0000000001000183 + .quad 0x0000000001200183 + .quad 0x0000000001400183 + .quad 0x0000000001600183 + .quad 0x0000000001800183 + .quad 0x0000000001A00183 + .quad 0x0000000001C00183 + .quad 0x0000000001E00183 + .quad 0x0000000002000183 + .quad 0x0000000002200183 + .quad 0x0000000002400183 + .quad 0x0000000002600183 + /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */ + .globl temp_boot_pmds +temp_boot_pmds: + .fill 492,8,0 + +.org 0x5000 +ENTRY(level2_kernel_pgt) + /* 40MB kernel mapping. The kernel code cannot be bigger than that. + When you change this change KERNEL_TEXT_SIZE in page.h too. 
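Each 0x...183 quadword above is a 2 Mbyte mapping: present, writable, global, with the PS bit selecting a large page; twenty of them cover the 40 MB mentioned in the comment. A decoder sketch using the architectural 2 MB PDE bit layout (this is not a structure taken from the patch):

/* Sketch: decode one of the level2 page-table quadwords above, e.g.
 * 0x0000000000200183, using the architectural 2MB-PDE layout. */
#include <stdio.h>

int main(void)
{
    unsigned long long pde = 0x0000000000200183ULL;

    printf("physical base: %#llx\n", pde & 0x000fffffffe00000ULL);
    printf("present      : %llu\n", (pde >> 0) & 1);   /* P   */
    printf("writable     : %llu\n", (pde >> 1) & 1);   /* R/W */
    printf("user         : %llu\n", (pde >> 2) & 1);   /* U/S */
    printf("2MB page (PS): %llu\n", (pde >> 7) & 1);   /* PS  */
    printf("global       : %llu\n", (pde >> 8) & 1);   /* G   */
    return 0;
}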
*/ + /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ + .quad 0x0000000000000183 + .quad 0x0000000000200183 + .quad 0x0000000000400183 + .quad 0x0000000000600183 + .quad 0x0000000000800183 + .quad 0x0000000000A00183 + .quad 0x0000000000C00183 + .quad 0x0000000000E00183 + .quad 0x0000000001000183 + .quad 0x0000000001200183 + .quad 0x0000000001400183 + .quad 0x0000000001600183 + .quad 0x0000000001800183 + .quad 0x0000000001A00183 + .quad 0x0000000001C00183 + .quad 0x0000000001E00183 + .quad 0x0000000002000183 + .quad 0x0000000002200183 + .quad 0x0000000002400183 + .quad 0x0000000002600183 + /* Module mapping starts here */ + .fill 492,8,0 + +.org 0x6000 +ENTRY(empty_zero_page) + +.org 0x7000 +ENTRY(empty_bad_page) + +.org 0x8000 +ENTRY(empty_bad_pte_table) + +.org 0x9000 +ENTRY(empty_bad_pmd_table) + +.org 0xa000 +ENTRY(level3_physmem_pgt) + .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ + + .org 0xb000 +#ifdef CONFIG_ACPI_SLEEP +ENTRY(wakeup_level4_pgt) + .quad 0x0000000000102007 /* -> level3_ident_pgt */ + .fill 255,8,0 + .quad 0x000000000010a007 + .fill 254,8,0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad 0x0000000000103007 /* -> level3_kernel_pgt */ +#endif + + .data + + .align 16 + .globl cpu_gdt_descr +cpu_gdt_descr: + .word gdt_end-cpu_gdt_table +gdt: + .quad cpu_gdt_table +#ifdef CONFIG_SMP + .rept NR_CPUS-1 + .word 0 + .quad 0 + .endr +#endif + +/* We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + */ + +.align L1_CACHE_BYTES + +/* The TLS descriptors are currently at a different place compared to i386. + Hopefully nobody expects them at a fixed place (Wine?) */ + +ENTRY(cpu_gdt_table) + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x008f9a000000ffff /* __KERNEL_COMPAT32_CS */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ + .quad 0x00cffa000000ffff /* __USER32_CS */ + .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ + .quad 0x00affa000000ffff /* __USER_CS */ + .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ + .quad 0,0 /* TSS */ + .quad 0,0 /* LDT */ + .quad 0,0,0 /* three TLS descriptors */ + .quad 0x00009a000000ffff /* __KERNEL16_CS - 16bit PM for S3 wakeup. */ + /* base must be patched for real base address. */ +gdt_end: + /* asm/segment.h:GDT_ENTRIES must match this */ + /* This should be a multiple of the cache line size */ + /* GDTs of other CPUs: */ + .fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table) + + .align L1_CACHE_BYTES +ENTRY(idt_table) + .rept 256 + .quad 0 + .quad 0 + .endr + diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c new file mode 100644 index 00000000000..6cad46c98a2 --- /dev/null +++ b/arch/x86_64/kernel/head64.c @@ -0,0 +1,117 @@ +/* + * linux/arch/x86_64/kernel/head64.c -- prepare to run common code + * + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE + * + * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $ + */ + +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/percpu.h> + +#include <asm/processor.h> +#include <asm/proto.h> +#include <asm/smp.h> +#include <asm/bootsetup.h> +#include <asm/setup.h> +#include <asm/desc.h> + +/* Don't add a printk in there. printk relies on the PDA which is not initialized + yet. 
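The cpu_gdt_table above packs each segment into one quadword. A sketch that unpacks the __KERNEL_CS entry, 0x00af9a000000ffff, using the standard descriptor layout (the field extraction below is local to this example):

/* Sketch: unpack the __KERNEL_CS quadword from cpu_gdt_table. */
#include <stdio.h>

int main(void)
{
    unsigned long long d = 0x00af9a000000ffffULL;

    unsigned int limit  = (d & 0xffff) | ((d >> 32) & 0xf0000);
    unsigned int base   = ((d >> 16) & 0xffffff) | (((d >> 56) & 0xff) << 24);
    unsigned int access = (d >> 40) & 0xff;  /* 0x9a: present, DPL0, code, readable */
    unsigned int flags  = (d >> 52) & 0xf;   /* 0xa:  G=1, L=1 (64-bit code)        */

    printf("base=%#x limit=%#x access=%#x flags=%#x\n",
           base, limit, access, flags);
    return 0;
}

The 0xa flags nibble (G=1, L=1, D=0) is what marks it as a 64-bit code segment; base and limit are ignored for code fetches in long mode anyway.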
*/ +static void __init clear_bss(void) +{ + extern char __bss_start[], __bss_end[]; + memset(__bss_start, 0, + (unsigned long) __bss_end - (unsigned long) __bss_start); +} + +extern char x86_boot_params[2048]; + +#define NEW_CL_POINTER 0x228 /* Relative to real mode data */ +#define OLD_CL_MAGIC_ADDR 0x90020 +#define OLD_CL_MAGIC 0xA33F +#define OLD_CL_BASE_ADDR 0x90000 +#define OLD_CL_OFFSET 0x90022 + +extern char saved_command_line[]; + +static void __init copy_bootdata(char *real_mode_data) +{ + int new_data; + char * command_line; + + memcpy(x86_boot_params, real_mode_data, 2048); + new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); + if (!new_data) { + if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { + printk("so old bootloader that it does not support commandline?!\n"); + return; + } + new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; + printk("old bootloader convention, maybe loadlin?\n"); + } + command_line = (char *) ((u64)(new_data)); + memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); + printk("Bootdata ok (command line is %s)\n", saved_command_line); +} + +static void __init setup_boot_cpu_data(void) +{ + unsigned int dummy, eax; + + /* get vendor info */ + cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level, + (unsigned int *)&boot_cpu_data.x86_vendor_id[0], + (unsigned int *)&boot_cpu_data.x86_vendor_id[8], + (unsigned int *)&boot_cpu_data.x86_vendor_id[4]); + + /* get cpu type */ + cpuid(1, &eax, &dummy, &dummy, + (unsigned int *) &boot_cpu_data.x86_capability); + boot_cpu_data.x86 = (eax >> 8) & 0xf; + boot_cpu_data.x86_model = (eax >> 4) & 0xf; + boot_cpu_data.x86_mask = eax & 0xf; +} + +extern char _end[]; + +void __init x86_64_start_kernel(char * real_mode_data) +{ + char *s; + int i; + + for (i = 0; i < 256; i++) + set_intr_gate(i, early_idt_handler); + asm volatile("lidt %0" :: "m" (idt_descr)); + clear_bss(); + pda_init(0); + copy_bootdata(real_mode_data); +#ifdef CONFIG_SMP + cpu_set(0, cpu_online_map); +#endif + /* default console: */ + if (!strstr(saved_command_line, "console=")) + strcat(saved_command_line, " console=tty0"); + s = strstr(saved_command_line, "earlyprintk="); + if (s != NULL) + setup_early_printk(s); +#ifdef CONFIG_DISCONTIGMEM + s = strstr(saved_command_line, "numa="); + if (s != NULL) + numa_setup(s+5); +#endif +#ifdef CONFIG_X86_IO_APIC + if (strstr(saved_command_line, "disableapic")) + disable_apic = 1; +#endif + /* You need early console to see that */ + if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) + panic("Kernel too big for kernel mapping\n"); + + setup_boot_cpu_data(); + start_kernel(); +} diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c new file mode 100644 index 00000000000..ba139cac57c --- /dev/null +++ b/arch/x86_64/kernel/i387.c @@ -0,0 +1,155 @@ +/* + * linux/arch/x86_64/kernel/i387.c + * + * Copyright (C) 1994 Linus Torvalds + * Copyright (C) 2002 Andi Kleen, SuSE Labs + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes <gareth@valinux.com>, May 2000 + * + * x86-64 rework 2002 Andi Kleen. + * Does direct fxsave in and out of user space now for signal handlers. + * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation, + * the 64bit user space sees a FXSAVE frame directly. 
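setup_boot_cpu_data() above splits CPUID leaf 1 EAX into family, model and stepping with plain shifts. The same split as a user-space sketch, assuming GCC's <cpuid.h> helper:

/* Sketch: the EAX split done by setup_boot_cpu_data(), run on the host. */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 1;

    printf("family   %u\n", (eax >> 8) & 0xf);
    printf("model    %u\n", (eax >> 4) & 0xf);
    printf("stepping %u\n", eax & 0xf);
    return 0;
}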
+ */ + +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/sigcontext.h> +#include <asm/user.h> +#include <asm/ptrace.h> +#include <asm/uaccess.h> + +unsigned int mxcsr_feature_mask = 0xffffffff; + +void mxcsr_feature_mask_init(void) +{ + unsigned int mask; + clts(); + memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); + asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); + mask = current->thread.i387.fxsave.mxcsr_mask; + if (mask == 0) mask = 0x0000ffbf; + mxcsr_feature_mask &= mask; + stts(); +} + +/* + * Called at bootup to set up the initial FPU state that is later cloned + * into all processes. + */ +void __init fpu_init(void) +{ + unsigned long oldcr0 = read_cr0(); + extern void __bad_fxsave_alignment(void); + + if (offsetof(struct task_struct, thread.i387.fxsave) & 15) + __bad_fxsave_alignment(); + set_in_cr4(X86_CR4_OSFXSR); + set_in_cr4(X86_CR4_OSXMMEXCPT); + + write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */ + + mxcsr_feature_mask_init(); + /* clean state in init */ + current_thread_info()->status = 0; + clear_used_math(); +} + +void init_fpu(struct task_struct *child) +{ + if (tsk_used_math(child)) { + if (child == current) + unlazy_fpu(child); + return; + } + memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); + child->thread.i387.fxsave.cwd = 0x37f; + child->thread.i387.fxsave.mxcsr = 0x1f80; + /* only the device not available exception or ptrace can call init_fpu */ + set_stopped_child_used_math(child); +} + +/* + * Signal frame handlers. + */ + +int save_i387(struct _fpstate __user *buf) +{ + struct task_struct *tsk = current; + int err = 0; + + { + extern void bad_user_i387_struct(void); + if (sizeof(struct user_i387_struct) != sizeof(tsk->thread.i387.fxsave)) + bad_user_i387_struct(); + } + + if ((unsigned long)buf % 16) + printk("save_i387: bad fpstate %p\n",buf); + + if (!used_math()) + return 0; + clear_used_math(); /* trigger finit */ + if (tsk->thread_info->status & TS_USEDFPU) { + err = save_i387_checking((struct i387_fxsave_struct __user *)buf); + if (err) return err; + stts(); + } else { + if (__copy_to_user(buf, &tsk->thread.i387.fxsave, + sizeof(struct i387_fxsave_struct))) + return -1; + } + return 1; +} + +/* + * ptrace request handlers. + */ + +int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk) +{ + init_fpu(tsk); + return __copy_to_user(buf, &tsk->thread.i387.fxsave, + sizeof(struct user_i387_struct)) ? -EFAULT : 0; +} + +int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf) +{ + if (__copy_from_user(&tsk->thread.i387.fxsave, buf, + sizeof(struct user_i387_struct))) + return -EFAULT; + return 0; +} + +/* + * FPU state for core dumps. 
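mxcsr_feature_mask_init() above discovers the writable MXCSR bits by running fxsave into a zeroed, 16-byte-aligned buffer and reading the MXCSR_MASK field back; a value of zero means "assume 0xffbf". A user-space sketch of the same probe, assuming the MXCSR_MASK word sits at offset 28 of the 512-byte save area as described in the architecture manuals (x86 builds only):

/* Sketch: read the MXCSR feature mask the way mxcsr_feature_mask_init
 * does, but into a local buffer. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void)
{
    unsigned char buf[512] __attribute__((aligned(16)));
    uint32_t mask;

    memset(buf, 0, sizeof(buf));
    __asm__ __volatile__("fxsave %0" : "=m" (*buf));

    memcpy(&mask, buf + 28, sizeof(mask));
    if (mask == 0)
        mask = 0x0000ffbf;          /* same fallback as the kernel code */
    printf("MXCSR feature mask: %#x\n", mask);
    return 0;
}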
+ */ + +int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) +{ + struct task_struct *tsk = current; + + if (!used_math()) + return 0; + + unlazy_fpu(tsk); + memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); + return 1; +} + +int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) +{ + int fpvalid = !!tsk_used_math(tsk); + + if (fpvalid) { + if (tsk == current) + unlazy_fpu(tsk); + memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); +} + return fpvalid; +} diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c new file mode 100644 index 00000000000..d5270124601 --- /dev/null +++ b/arch/x86_64/kernel/i8259.c @@ -0,0 +1,579 @@ +#include <linux/linkage.h> +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/timex.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/smp_lock.h> +#include <linux/init.h> +#include <linux/kernel_stat.h> +#include <linux/sysdev.h> +#include <linux/bitops.h> + +#include <asm/acpi.h> +#include <asm/atomic.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/irq.h> +#include <asm/hw_irq.h> +#include <asm/pgtable.h> +#include <asm/delay.h> +#include <asm/desc.h> +#include <asm/apic.h> + +#include <linux/irq.h> + +/* + * Common place to define all x86 IRQ vectors + * + * This builds up the IRQ handler stubs using some ugly macros in irq.h + * + * These macros create the low-level assembly IRQ routines that save + * register context and call do_IRQ(). do_IRQ() then does all the + * operations that are needed to keep the AT (or SMP IOAPIC) + * interrupt-controller happy. + */ + +#define BI(x,y) \ + BUILD_IRQ(x##y) + +#define BUILD_16_IRQS(x) \ + BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ + BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ + BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ + BI(x,c) BI(x,d) BI(x,e) BI(x,f) + +#define BUILD_14_IRQS(x) \ + BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ + BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ + BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ + BI(x,c) BI(x,d) + +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x20-0x2f) + */ +BUILD_16_IRQS(0x0) + +#ifdef CONFIG_X86_LOCAL_APIC +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. 
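The BI()/BUILD_16_IRQS() macros above stamp out one stub per vector by token pasting, and the IRQLIST macros later collect their names into the interrupt[] table. The same preprocessor idiom, reduced to ordinary C functions so it compiles standalone (every name here is local to the sketch):

/* Sketch: the BI()/IRQLIST token-pasting idiom from i8259.c, shrunk. */
#include <stdio.h>

#define BI(x, y)   static void irq##x##y(void) { printf("stub %s\n", #x #y); }
#define BUILD_4(x) BI(x,0) BI(x,1) BI(x,2) BI(x,3)

BUILD_4(0x0)                 /* defines irq0x00 .. irq0x03 */

#define IRQ(x, y)  irq##x##y
#define LIST_4(x)  IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3)

static void (*interrupt_stub[])(void) = { LIST_4(0x0) };

int main(void)
{
    for (unsigned i = 0; i < sizeof(interrupt_stub)/sizeof(interrupt_stub[0]); i++)
        interrupt_stub[i]();
    return 0;
}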
+ * + * (these are usually mapped into the 0x30-0xff vector range) + */ + BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) +BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) +BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) +BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) + +#ifdef CONFIG_PCI_MSI + BUILD_14_IRQS(0xe) +#endif + +#endif + +#undef BUILD_16_IRQS +#undef BUILD_14_IRQS +#undef BI + + +#define IRQ(x,y) \ + IRQ##x##y##_interrupt + +#define IRQLIST_16(x) \ + IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ + IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ + IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ + IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) + +#define IRQLIST_14(x) \ + IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ + IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ + IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ + IRQ(x,c), IRQ(x,d) + +void (*interrupt[NR_IRQS])(void) = { + IRQLIST_16(0x0), + +#ifdef CONFIG_X86_IO_APIC + IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), + IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), + IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), + IRQLIST_16(0xc), IRQLIST_16(0xd) + +#ifdef CONFIG_PCI_MSI + , IRQLIST_14(0xe) +#endif + +#endif +}; + +#undef IRQ +#undef IRQLIST_16 +#undef IRQLIST_14 + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. + * this file should become arch/i386/kernel/irq.c when the old irq.c + * moves to arch independent land + */ + +DEFINE_SPINLOCK(i8259A_lock); + +static void end_8259A_irq (unsigned int irq) +{ + if (irq > 256) { + char var; + printk("return %p stack %p ti %p\n", __builtin_return_address(0), &var, current->thread_info); + + BUG(); + } + + if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) && + irq_desc[irq].action) + enable_8259A_irq(irq); +} + +#define shutdown_8259A_irq disable_8259A_irq + +static void mask_and_ack_8259A(unsigned int); + +static unsigned int startup_8259A_irq(unsigned int irq) +{ + enable_8259A_irq(irq); + return 0; /* never anything pending */ +} + +static struct hw_interrupt_type i8259A_irq_type = { + "XT-PIC", + startup_8259A_irq, + shutdown_8259A_irq, + enable_8259A_irq, + disable_8259A_irq, + mask_and_ack_8259A, + end_8259A_irq, + NULL +}; + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +static unsigned int cached_irq_mask = 0xffff; + +#define __byte(x,y) (((unsigned char *)&(y))[x]) +#define cached_21 (__byte(0,cached_irq_mask)) +#define cached_A1 (__byte(1,cached_irq_mask)) + +/* + * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) + * boards the timer interrupt is not really connected to any IO-APIC pin, + * it's fed to the master 8259A's IR0 line only. + * + * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. + * this 'mixed mode' IRQ handling costs nothing because it's only used + * at IRQ setup time. 
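The 8259A helpers just below keep a single 16-bit shadow of both mask registers and push the low byte to port 0x21 (master) or the high byte to 0xA1 (slave); the __byte() aliasing relies on x86 being little-endian. A sketch of that bookkeeping with the port write replaced by a printf (outb_stub() and disable_irq() are names invented for the example):

/* Sketch: the cached_irq_mask bookkeeping used by the 8259A helpers. */
#include <stdio.h>

static unsigned int cached_irq_mask = 0xffff;

#define __byte(x, y) (((unsigned char *)&(y))[x])
#define cached_21    (__byte(0, cached_irq_mask))
#define cached_A1    (__byte(1, cached_irq_mask))

static void outb_stub(unsigned char val, unsigned short port)
{
    printf("outb(%#04x, %#x)\n", val, port);
}

static void disable_irq(unsigned int irq)
{
    cached_irq_mask |= 1u << irq;
    if (irq & 8)
        outb_stub(cached_A1, 0xA1);   /* slave PIC  */
    else
        outb_stub(cached_21, 0x21);   /* master PIC */
}

int main(void)
{
    cached_irq_mask = 0x0000;         /* pretend everything starts unmasked */
    disable_irq(3);                   /* master: bit 3  -> port 0x21 */
    disable_irq(10);                  /* slave:  bit 10 -> port 0xA1 */
    return 0;
}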
+ */ +unsigned long io_apic_irqs; + +void disable_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_A1,0xA1); + else + outb(cached_21,0x21); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +void enable_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_A1,0xA1); + else + outb(cached_21,0x21); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +int i8259A_irq_pending(unsigned int irq) +{ + unsigned int mask = 1<<irq; + unsigned long flags; + int ret; + + spin_lock_irqsave(&i8259A_lock, flags); + if (irq < 8) + ret = inb(0x20) & mask; + else + ret = inb(0xA0) & (mask >> 8); + spin_unlock_irqrestore(&i8259A_lock, flags); + + return ret; +} + +void make_8259A_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + io_apic_irqs &= ~(1<<irq); + irq_desc[irq].handler = &i8259A_irq_type; + enable_irq(irq); +} + +/* + * This function assumes to be called rarely. Switching between + * 8259A registers is slow. + * This has to be protected by the irq controller spinlock + * before being called. + */ +static inline int i8259A_irq_real(unsigned int irq) +{ + int value; + int irqmask = 1<<irq; + + if (irq < 8) { + outb(0x0B,0x20); /* ISR register */ + value = inb(0x20) & irqmask; + outb(0x0A,0x20); /* back to the IRR register */ + return value; + } + outb(0x0B,0xA0); /* ISR register */ + value = inb(0xA0) & (irqmask >> 8); + outb(0x0A,0xA0); /* back to the IRR register */ + return value; +} + +/* + * Careful! The 8259A is a fragile beast, it pretty + * much _has_ to be done exactly like this (mask it + * first, _then_ send the EOI, and the order of EOI + * to the two 8259s is important! + */ +static void mask_and_ack_8259A(unsigned int irq) +{ + unsigned int irqmask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + /* + * Lightweight spurious IRQ detection. We do not want + * to overdo spurious IRQ handling - it's usually a sign + * of hardware problems, so we only do the checks we can + * do without slowing down good hardware unnecesserily. + * + * Note that IRQ7 and IRQ15 (the two spurious IRQs + * usually resulting from the 8259A-1|2 PICs) occur + * even if the IRQ is masked in the 8259A. Thus we + * can check spurious 8259A IRQs without doing the + * quite slow i8259A_irq_real() call for every IRQ. + * This does not cover 100% of spurious interrupts, + * but should be enough to warn the user that there + * is something bad going on ... + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; + cached_irq_mask |= irqmask; + +handle_real_irq: + if (irq & 8) { + inb(0xA1); /* DUMMY - (do we need this?) */ + outb(cached_A1,0xA1); + outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */ + outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ + } else { + inb(0x21); /* DUMMY - (do we need this?) */ + outb(cached_21,0x21); + outb(0x60+irq,0x20); /* 'Specific EOI' to master */ + } + spin_unlock_irqrestore(&i8259A_lock, flags); + return; + +spurious_8259A_irq: + /* + * this is the slow path - should happen rarely. + */ + if (i8259A_irq_real(irq)) + /* + * oops, the IRQ _is_ in service according to the + * 8259A - not spurious, go handle it. + */ + goto handle_real_irq; + + { + static int spurious_irq_mask; + /* + * At this point we can be sure the IRQ is spurious, + * lets ACK and report it. 
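mask_and_ack_8259A() above finishes with a "specific EOI": 0x60 plus the IR line, sent to the slave and followed by an EOI for IR2 on the master when the IRQ is 8 or above, or sent to the master alone otherwise. A small sketch that just computes those command bytes:

/* Sketch: the OCW2 "specific EOI" bytes mask_and_ack_8259A sends.
 * 0x20 and 0xA0 are the PIC command ports. */
#include <stdio.h>

static void show_eoi(unsigned int irq)
{
    if (irq & 8)
        printf("IRQ%-2u: outb(%#x, 0xA0) then outb(0x62, 0x20)\n",
               irq, 0x60 + (irq & 7));
    else
        printf("IRQ%-2u: outb(%#x, 0x20)\n", irq, 0x60 + irq);
}

int main(void)
{
    show_eoi(1);     /* keyboard: specific EOI 0x61 to the master          */
    show_eoi(14);    /* IDE:      0x66 to the slave, then 0x62 to master   */
    return 0;
}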
[once per IRQ] + */ + if (!(spurious_irq_mask & irqmask)) { + printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); + spurious_irq_mask |= irqmask; + } + atomic_inc(&irq_err_count); + /* + * Theoretically we do not have to handle this IRQ, + * but in Linux this does not cause problems and is + * simpler for us. + */ + goto handle_real_irq; + } +} + +void init_8259A(int auto_eoi) +{ + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-2 */ + + /* + * outb_p - this has to work on a wide range of PC hardware. + */ + outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */ + outb_p(0x20 + 0, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ + outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */ + if (auto_eoi) + outb_p(0x03, 0x21); /* master does Auto EOI */ + else + outb_p(0x01, 0x21); /* master expects normal EOI */ + + outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */ + outb_p(0x20 + 8, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ + outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */ + outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode + is to be investigated) */ + + if (auto_eoi) + /* + * in AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_irq_type.ack = disable_8259A_irq; + else + i8259A_irq_type.ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_21, 0x21); /* restore master IRQ mask */ + outb(cached_A1, 0xA1); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static char irq_trigger[2]; +/** + * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ + */ +static void restore_ELCR(char *trigger) +{ + outb(trigger[0], 0x4d0); + outb(trigger[1], 0x4d1); +} + +static void save_ELCR(char *trigger) +{ + /* IRQ 0,1,2,8,13 are marked as reserved */ + trigger[0] = inb(0x4d0) & 0xF8; + trigger[1] = inb(0x4d1) & 0xDE; +} + +static int i8259A_resume(struct sys_device *dev) +{ + init_8259A(0); + restore_ELCR(irq_trigger); + return 0; +} + +static int i8259A_suspend(struct sys_device *dev, u32 state) +{ + save_ELCR(irq_trigger); + return 0; +} + +static struct sysdev_class i8259_sysdev_class = { + set_kset_name("i8259"), + .suspend = i8259A_suspend, + .resume = i8259A_resume, +}; + +static struct sys_device device_i8259A = { + .id = 0, + .cls = &i8259_sysdev_class, +}; + +static int __init i8259A_init_sysfs(void) +{ + int error = sysdev_class_register(&i8259_sysdev_class); + if (!error) + error = sysdev_register(&device_i8259A); + return error; +} + +device_initcall(i8259A_init_sysfs); + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ + +static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; + +void __init init_ISA_irqs (void) +{ + int i; + +#ifdef CONFIG_X86_LOCAL_APIC + init_bsp_APIC(); +#endif + init_8259A(0); + + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + + if (i < 16) { + /* + * 16 old-style INTA-cycle interrupts: + */ + irq_desc[i].handler = &i8259A_irq_type; + } else { + /* + * 'high' PCI IRQs filled in on demand + */ + irq_desc[i].handler = &no_irq_type; + } + } +} + +void apic_timer_interrupt(void); +void spurious_interrupt(void); +void error_interrupt(void); +void reschedule_interrupt(void); +void call_function_interrupt(void); +void invalidate_interrupt(void); +void thermal_interrupt(void); +void 
i8254_timer_resume(void); + +static void setup_timer(void) +{ + outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ + udelay(10); + outb_p(LATCH & 0xff , 0x40); /* LSB */ + udelay(10); + outb(LATCH >> 8 , 0x40); /* MSB */ +} + +static int timer_resume(struct sys_device *dev) +{ + setup_timer(); + return 0; +} + +void i8254_timer_resume(void) +{ + setup_timer(); +} + +static struct sysdev_class timer_sysclass = { + set_kset_name("timer"), + .resume = timer_resume, +}; + +static struct sys_device device_timer = { + .id = 0, + .cls = &timer_sysclass, +}; + +static int __init init_timer_sysfs(void) +{ + int error = sysdev_class_register(&timer_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(init_timer_sysfs); + +void __init init_IRQ(void) +{ + int i; + + init_ISA_irqs(); + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (i >= NR_IRQS) + break; + if (vector != IA32_SYSCALL_VECTOR && vector != KDB_VECTOR) { + set_intr_gate(vector, interrupt[i]); + } + } + +#ifdef CONFIG_SMP + /* + * IRQ0 must be given a fixed assignment and initialized, + * because it's used before the IO-APIC is set up. + */ + set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); + + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPI for invalidation */ + set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); + + /* IPI for generic function call */ + set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); +#endif + set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + +#ifdef CONFIG_X86_LOCAL_APIC + /* self generated IPI for local APIC timer */ + set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +#endif + + /* + * Set the clock to HZ Hz, we already have a valid + * vector now: + */ + setup_timer(); + + if (!acpi_ioapic) + setup_irq(2, &irq2); +} diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c new file mode 100644 index 00000000000..c4dc9149115 --- /dev/null +++ b/arch/x86_64/kernel/init_task.c @@ -0,0 +1,49 @@ +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/init_task.h> +#include <linux/fs.h> +#include <linux/mqueue.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/desc.h> + +static struct fs_struct init_fs = INIT_FS; +static struct files_struct init_files = INIT_FILES; +static struct signal_struct init_signals = INIT_SIGNALS(init_signals); +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); +struct mm_struct init_mm = INIT_MM(init_mm); + +EXPORT_SYMBOL(init_mm); + +/* + * Initial task structure. + * + * We need to make sure that this is 8192-byte aligned due to the + * way process stacks are handled. This is done by having a special + * "init_task" linker map entry.. + */ +union thread_union init_thread_union + __attribute__((__section__(".data.init_task"))) = + { INIT_THREAD_INFO(init_task) }; + +/* + * Initial task structure. 
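Back in i8259.c, setup_timer() programs 8254 channel 0 for mode 2 (rate generator) and loads LATCH as LSB then MSB through port 0x40. A sketch of the values involved, assuming the conventional 1193182 Hz PIT input clock and taking HZ=1000 purely as an example:

/* Sketch: the control word and reload value written by setup_timer(). */
#include <stdio.h>

#define CLOCK_TICK_RATE 1193182
#define HZ              1000
#define LATCH           ((CLOCK_TICK_RATE + HZ / 2) / HZ)

int main(void)
{
    printf("control word 0x34 -> channel 0, lobyte/hibyte, mode 2\n");
    printf("LATCH = %d (LSB %#x, MSB %#x)\n",
           LATCH, LATCH & 0xff, LATCH >> 8);
    return 0;
}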
+ * + * All other task structs will be allocated on slabs in fork.c + */ +struct task_struct init_task = INIT_TASK(init_task); + +EXPORT_SYMBOL(init_task); +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. The TSS size is kept cacheline-aligned + * so they are allowed to end up in the .data.cacheline_aligned + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp; + +#define ALIGN_TO_4K __attribute__((section(".data.init_task"))) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c new file mode 100644 index 00000000000..adebe70c180 --- /dev/null +++ b/arch/x86_64/kernel/io_apic.c @@ -0,0 +1,1982 @@ +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, + * further tested and cleaned up by Zach Brown <zab@redhat.com> + * and Ingo Molnar <mingo@redhat.com> + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + */ + +#include <linux/mm.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/config.h> +#include <linux/smp_lock.h> +#include <linux/mc146818rtc.h> +#include <linux/acpi.h> +#include <linux/sysdev.h> + +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/desc.h> +#include <asm/proto.h> +#include <asm/mach_apic.h> + +#define __apicdebuginit __init + +int sis_apic_bug; /* not actually supported, dummy for compile */ + +static DEFINE_SPINLOCK(ioapic_lock); + +/* + * # of IRQ routing registers + */ +int nr_ioapic_registers[MAX_IO_APICS]; + +/* + * Rough estimation of how many shared IRQs there are, can + * be changed anytime. + */ +#define MAX_PLUS_SHARED_IRQS NR_IRQS +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +static struct irq_pin_list { + short apic, pin, next; +} irq_2_pin[PIN_MAP_SIZE]; + +int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1}; +#ifdef CONFIG_PCI_MSI +#define vector_to_irq(vector) \ + (platform_legacy_irq(vector) ? vector : vector_irq[vector]) +#else +#define vector_to_irq(vector) (vector) +#endif + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. 
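io_apic.c keeps the pin(s) behind each IRQ in the fixed irq_2_pin[] array, chaining shared IRQs through the 'next' index; add_pin_to_irq() just below appends to that chain and enable_IO_APIC() later initializes every pin to -1. A reduced standalone sketch of the same structure (sizes shrunk, overflow panic omitted):

/* Sketch: the irq_2_pin "linked list inside an array" scheme. */
#include <stdio.h>

#define NR_IRQS      4
#define PIN_MAP_SIZE (2 * NR_IRQS)

static struct irq_pin_list {
    short apic, pin, next;
} irq_2_pin[PIN_MAP_SIZE];

static int first_free_entry = NR_IRQS;

static void add_pin_to_irq(unsigned int irq, int apic, int pin)
{
    struct irq_pin_list *entry = irq_2_pin + irq;

    while (entry->next)                 /* walk to the end of the chain */
        entry = irq_2_pin + entry->next;

    if (entry->pin != -1) {             /* slot taken: grab an overflow slot */
        entry->next = first_free_entry;
        entry = irq_2_pin + first_free_entry++;
    }
    entry->apic = apic;
    entry->pin  = pin;
}

int main(void)
{
    for (int i = 0; i < PIN_MAP_SIZE; i++)
        irq_2_pin[i].pin = -1;          /* done by enable_IO_APIC() in the real code */

    add_pin_to_irq(2, 0, 2);            /* first pin lands in slot 2 */
    add_pin_to_irq(2, 1, 5);            /* shared IRQ: chained into an overflow slot */

    for (struct irq_pin_list *e = irq_2_pin + 2; ; e = irq_2_pin + e->next) {
        printf("IRQ2 -> apic %d pin %d\n", e->apic, e->pin);
        if (!e->next)
            break;
    }
    return 0;
}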
+ */ +static void add_pin_to_irq(unsigned int irq, int apic, int pin) +{ + static int first_free_entry = NR_IRQS; + struct irq_pin_list *entry = irq_2_pin + irq; + + while (entry->next) + entry = irq_2_pin + entry->next; + + if (entry->pin != -1) { + entry->next = first_free_entry; + entry = irq_2_pin + entry->next; + if (++first_free_entry >= PIN_MAP_SIZE) + panic("io_apic.c: whoops"); + } + entry->apic = apic; + entry->pin = pin; +} + +#define __DO_ACTION(R, ACTION, FINAL) \ + \ +{ \ + int pin; \ + struct irq_pin_list *entry = irq_2_pin + irq; \ + \ + for (;;) { \ + unsigned int reg; \ + pin = entry->pin; \ + if (pin == -1) \ + break; \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + reg ACTION; \ + io_apic_modify(entry->apic, reg); \ + if (!entry->next) \ + break; \ + entry = irq_2_pin + entry->next; \ + } \ + FINAL; \ +} + +#define DO_ACTION(name,R,ACTION, FINAL) \ + \ + static void name##_IO_APIC_irq (unsigned int irq) \ + __DO_ACTION(R, ACTION, FINAL) + +DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) + /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) + /* mask = 0 */ + +static void mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + spin_lock_irqsave(&ioapic_lock, flags); + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (entry.delivery_mode == dest_SMI) + return; + /* + * Disable it in the IO-APIC irq-routing table: + */ + memset(&entry, 0, sizeof(entry)); + entry.mask = 1; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC (void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + clear_IO_APIC_pin(apic, pin); +} + +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; +int skip_ioapic_setup; +int ioapic_force; + +/* dummy parsing: see setup.c */ + +static int __init disable_ioapic_setup(char *str) +{ + skip_ioapic_setup = 1; + return 1; +} + +static int __init enable_ioapic_setup(char *str) +{ + ioapic_force = 1; + skip_ioapic_setup = 0; + return 1; +} + +__setup("noapic", disable_ioapic_setup); +__setup("apic", enable_ioapic_setup); + +#include <asm/pci-direct.h> +#include <linux/pci_ids.h> +#include <linux/pci.h> + +/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC + off. Check for an Nvidia or VIA PCI bridge and turn it off. + Use pci direct infrastructure because this runs before the PCI subsystem. + + Can be overwritten with "apic" + + And another hack to disable the IOMMU on VIA chipsets. + + Kludge-O-Rama. 
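check_ioapic() just below walks config space by hand: it reads the class/revision dword, keeps only PCI-to-PCI bridges, then compares the vendor ID against VIA and Nvidia. A sketch of just the class decode, with a made-up dword in place of a real read_pci_config() cycle (0x0604 is the standard PCI-to-PCI bridge class code):

/* Sketch: decoding the PCI_CLASS_REVISION dword as check_ioapic() does. */
#include <stdio.h>

#define PCI_CLASS_BRIDGE_PCI 0x0604

int main(void)
{
    unsigned int class_rev = 0x06040012;      /* hypothetical device */

    unsigned int base_sub  = class_rev >> 16; /* base class + subclass */
    unsigned int prog_if   = (class_rev >> 8) & 0xff;
    unsigned int revision  = class_rev & 0xff;

    printf("class %#06x prog-if %#x rev %#x -> %s\n",
           base_sub, prog_if, revision,
           base_sub == PCI_CLASS_BRIDGE_PCI ? "PCI-PCI bridge" : "other");
    return 0;
}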
*/ +void __init check_ioapic(void) +{ + int num,slot,func; + if (ioapic_force) + return; + + /* Poor man's PCI discovery */ + for (num = 0; num < 32; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class; + u32 vendor; + u8 type; + class = read_pci_config(num,slot,func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) + continue; + + vendor = read_pci_config(num, slot, func, + PCI_VENDOR_ID); + vendor &= 0xffff; + switch (vendor) { + case PCI_VENDOR_ID_VIA: +#ifdef CONFIG_GART_IOMMU + if ((end_pfn >= (0xffffffff>>PAGE_SHIFT) || + force_iommu) && + !iommu_aperture_allowed) { + printk(KERN_INFO + "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n"); + iommu_aperture_disabled = 1; + } +#endif + return; + case PCI_VENDOR_ID_NVIDIA: +#ifdef CONFIG_ACPI + /* All timer overrides on Nvidia + seem to be wrong. Skip them. */ + acpi_skip_timer_override = 1; + printk(KERN_INFO + "Nvidia board detected. Ignoring ACPI timer override.\n"); +#endif + /* RED-PEN skip them on mptables too? */ + return; + } + + /* No multi-function device? */ + type = read_pci_config_byte(num,slot,func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } +} + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + pirqs_enabled = 1; + apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); + +/* + * Find the IRQ entry number of a certain pin. + */ +static int find_irq_entry(int apic, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_irqtype == type && + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && + mp_irqs[i].mpc_dstirq == pin) + return i; + + return -1; +} + +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int __init find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || + mp_bus_id_to_type[lbus] == MP_BUS_EISA || + mp_bus_id_to_type[lbus] == MP_BUS_MCA) && + (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_srcbusirq == irq)) + + return mp_irqs[i].mpc_dstirq; + } + return -1; +} + +/* + * Find a specific PCI IRQ entry. 
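IO_APIC_get_PCI_irq_vector() just below matches PCI devices by pulling the slot out of bits 2 and up of mpc_srcbusirq and the interrupt pin out of the low two bits (0 corresponding to INTA by MP-table convention). A sketch of that decode with a made-up value:

/* Sketch: the mpc_srcbusirq split used for PCI entries. */
#include <stdio.h>

int main(void)
{
    unsigned char srcbusirq = (7 << 2) | 2;   /* hypothetical: slot 7, INTC */

    int slot = (srcbusirq >> 2) & 0x1f;
    int pin  = srcbusirq & 3;

    printf("slot %d, INT%c\n", slot, 'A' + pin);
    return 0;
}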
+ * Not an __init, possibly needed by modules + */ +static int pin_2_irq(int idx, int apic, int pin); + +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (mp_bus_id_to_pci_bus[bus] == -1) { + apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) + break; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && + !mp_irqs[i].mpc_irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) + return irq; + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) + best_guess = irq; + } + } + return best_guess; +} + +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < 16) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) +#define default_EISA_polarity(idx) (0) + +/* ISA interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) + +/* PCI interrupts are always polarity one level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (1) +#define default_PCI_polarity(idx) (1) + +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) (0) + +static int __init MPBIOS_polarity(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int polarity; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].mpc_irqflag & 3) + { + case 0: /* conforms, ie. 
bus-type dependent polarity */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + polarity = default_ISA_polarity(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + polarity = default_EISA_polarity(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + polarity = default_PCI_polarity(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + polarity = default_MCA_polarity(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + break; + } + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + return polarity; +} + +static int MPBIOS_trigger(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) + { + case 0: /* conforms, ie. bus-type dependent */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + trigger = default_ISA_trigger(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + trigger = default_PCI_trigger(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } + break; + } + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 0; + break; + } + } + return trigger; +} + +static inline int irq_polarity(int idx) +{ + return MPBIOS_polarity(idx); +} + +static inline int irq_trigger(int idx) +{ + return MPBIOS_trigger(idx); +} + +static int pin_2_irq(int idx, int apic, int pin) +{ + int irq, i; + int bus = mp_irqs[idx].mpc_srcbus; + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].mpc_dstirq != pin) + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + case MP_BUS_EISA: + case MP_BUS_MCA: + { + irq = mp_irqs[idx].mpc_srcbusirq; + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + break; + } + default: + { + printk(KERN_ERR "unknown bus type %d.\n",bus); + irq = 0; + break; + } + } + + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. 
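MPBIOS_polarity() and MPBIOS_trigger() above decode the MP-table irqflag word: bits 0-1 give the polarity, bits 2-3 give the trigger mode, and 0 in either field means "conforms to the bus type". The same decode condensed into lookup tables (the example flag value is made up):

/* Sketch: the irqflag decode done by MPBIOS_polarity()/MPBIOS_trigger(). */
#include <stdio.h>

static const char *pol[4]  = { "conforms", "active high", "reserved", "active low" };
static const char *trig[4] = { "conforms", "edge",        "reserved", "level"      };

int main(void)
{
    unsigned short irqflag = 0x000f;   /* hypothetical entry: low, level */

    printf("polarity: %s\n", pol[irqflag & 3]);
    printf("trigger : %s\n", trig[(irqflag >> 2) & 3]);
    return 0;
}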
+ */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } + return irq; +} + +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic,pin,mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} + +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ +u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 }; + +int assign_irq_vector(int irq) +{ + static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; + + BUG_ON(irq >= NR_IRQ_VECTORS); + if (IO_APIC_VECTOR(irq) > 0) + return IO_APIC_VECTOR(irq); +next: + current_vector += 8; + if (current_vector == IA32_SYSCALL_VECTOR) + goto next; + + if (current_vector >= FIRST_SYSTEM_VECTOR) { + offset++; + if (!(offset%8)) + return -ENOSPC; + current_vector = FIRST_DEVICE_VECTOR + offset; + } + + vector_irq[current_vector] = irq; + if (irq != AUTO_ASSIGN) + IO_APIC_VECTOR(irq) = current_vector; + + return current_vector; +} + +extern void (*interrupt[NR_IRQS])(void); +static struct hw_interrupt_type ioapic_level_type; +static struct hw_interrupt_type ioapic_edge_type; + +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger) +{ + if (use_pci_vector() && !platform_legacy_irq(irq)) { + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + irq_desc[vector].handler = &ioapic_level_type; + else + irq_desc[vector].handler = &ioapic_edge_type; + set_intr_gate(vector, interrupt[vector]); + } else { + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + irq_desc[irq].handler = &ioapic_level_type; + else + irq_desc[irq].handler = &ioapic_edge_type; + set_intr_gate(vector, interrupt[irq]); + } +} + +static void __init setup_IO_APIC_irqs(void) +{ + struct IO_APIC_route_entry entry; + int apic, pin, idx, irq, first_notcon = 1, vector; + unsigned long flags; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + + /* + * add it to the IO-APIC irq-routing table: + */ + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* enable IRQ */ + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + + idx = find_irq_entry(apic,pin,mp_INT); + if (idx == -1) { + if (first_notcon) { + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); + first_notcon = 0; + } else + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); + continue; + } + + entry.trigger = irq_trigger(idx); + entry.polarity = irq_polarity(idx); + + if (irq_trigger(idx)) { + entry.trigger = 1; + entry.mask = 1; + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + } + + irq = pin_2_irq(idx, apic, pin); + add_pin_to_irq(irq, apic, pin); + + if (!apic && !IO_APIC_IRQ(irq)) + continue; + + if (IO_APIC_IRQ(irq)) { + vector = assign_irq_vector(irq); + 
entry.vector = vector; + + ioapic_register_intr(irq, vector, IOAPIC_AUTO); + if (!apic && (irq < 16)) + disable_8259A_irq(irq); + } + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + } + + if (!first_notcon) + apic_printk(APIC_VERBOSE," not connected.\n"); +} + +/* + * Set up the 8259A-master output pin as broadcast to all + * CPUs. + */ +static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry,0,sizeof(entry)); + + disable_8259A_irq(0); + + /* mask LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU. + */ + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* unmask IRQ now */ + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.delivery_mode = INT_DELIVERY_MODE; + entry.polarity = 0; + entry.trigger = 0; + entry.vector = vector; + + /* + * The timer IRQ doesn't have to know that behind the + * scene we have a 8259A-master in AEOI mode ... + */ + irq_desc[0].handler = &ioapic_edge_type; + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + enable_8259A_irq(0); +} + +void __init UNEXPECTED_IO_APIC(void) +{ +} + +void __apicdebuginit print_IO_APIC(void) +{ + int apic, i; + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (i = 0; i < nr_ioapics; i++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + reg_01.raw = io_apic_read(apic, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(apic, 2); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk("\n"); + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ + (reg_01.bits.entries != 0x2E) && + (reg_01.bits.entries != 0x3F) && + (reg_01.bits.entries != 0x03) + ) + UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... 
: IO APIC version: %04X\n", reg_01.bits.version); + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ + (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */ + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ + ) + UNEXPECTED_IO_APIC(); + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + + if (reg_01.bits.version >= 0x10) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + } + + printk(KERN_DEBUG ".... IRQ redirection table:\n"); + + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" + " Stat Dest Deli Vect: \n"); + + for (i = 0; i <= reg_01.bits.entries; i++) { + struct IO_APIC_route_entry entry; + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk(KERN_DEBUG " %02x %03X %02X ", + i, + entry.dest.logical.logical_dest, + entry.dest.physical.physical_dest + ); + + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector + ); + } + } + if (use_pci_vector()) + printk(KERN_INFO "Using vector-based indexing\n"); + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for (i = 0; i < NR_IRQS; i++) { + struct irq_pin_list *entry = irq_2_pin + i; + if (entry->pin < 0) + continue; + if (use_pci_vector() && !platform_legacy_irq(i)) + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); + else + printk(KERN_DEBUG "IRQ%d ", i); + for (;;) { + printk("-> %d:%d", entry->apic, entry->pin); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + printk("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); + + return; +} + +#if 0 + +static __apicdebuginit void print_APIC_bitfield (int base) +{ + unsigned int v; + int i, j; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); + for (i = 0; i < 8; i++) { + v = apic_read(base + i*0x10); + for (j = 0; j < 32; j++) { + if (v & (1<<j)) + printk("1"); + else + printk("0"); + } + printk("\n"); + } +} + +void __apicdebuginit print_local_APIC(void * dummy) +{ + unsigned int v, ver, maxlvt; + + if (apic_verbosity == APIC_QUIET) + return; + + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", + smp_processor_id(), hard_smp_processor_id()); + v = apic_read(APIC_ID); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); + v = apic_read(APIC_LVR); + printk(KERN_INFO "... APIC VERSION: %08x\n", v); + ver = GET_APIC_VERSION(v); + maxlvt = get_maxlvt(); + + v = apic_read(APIC_TASKPRI); + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + v = apic_read(APIC_PROCPRI); + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + } + + v = apic_read(APIC_EOI); + printk(KERN_DEBUG "... APIC EOI: %08x\n", v); + v = apic_read(APIC_RRR); + printk(KERN_DEBUG "... 
APIC RRR: %08x\n", v); + v = apic_read(APIC_LDR); + printk(KERN_DEBUG "... APIC LDR: %08x\n", v); + v = apic_read(APIC_DFR); + printk(KERN_DEBUG "... APIC DFR: %08x\n", v); + v = apic_read(APIC_SPIV); + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); + + printk(KERN_DEBUG "... APIC ISR field:\n"); + print_APIC_bitfield(APIC_ISR); + printk(KERN_DEBUG "... APIC TMR field:\n"); + print_APIC_bitfield(APIC_TMR); + printk(KERN_DEBUG "... APIC IRR field:\n"); + print_APIC_bitfield(APIC_IRR); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } + + v = apic_read(APIC_ICR); + printk(KERN_DEBUG "... APIC ICR: %08x\n", v); + v = apic_read(APIC_ICR2); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); + + v = apic_read(APIC_LVTT); + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + printk("\n"); +} + +void print_all_local_APICs (void) +{ + on_each_cpu(print_local_APIC, NULL, 1, 1); +} + +void __apicdebuginit print_PIC(void) +{ + extern spinlock_t i8259A_lock; + unsigned int v; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "\nprinting PIC contents\n"); + + spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); + + outb(0x0b,0xa0); + outb(0x0b,0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); + + spin_unlock_irqrestore(&i8259A_lock, flags); + + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); +} + +#endif /* 0 */ + +static void __init enable_IO_APIC(void) +{ + union IO_APIC_reg_01 reg_01; + int i; + unsigned long flags; + + for (i = 0; i < PIN_MAP_SIZE; i++) { + irq_2_pin[i].pin = -1; + irq_2_pin[i].next = 0; + } + if (!pirqs_enabled) + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (i = 0; i < nr_ioapics; i++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(i, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[i] = reg_01.bits.entries+1; + } + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +} + +/* + * Not an __init, needed by the reboot code + */ +void disable_IO_APIC(void) +{ + /* + * Clear the IO-APIC before rebooting: + */ + clear_IO_APIC(); + + disconnect_bsp_APIC(); +} + +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. 
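enable_IO_APIC() above sizes each chip from register 1: bits 16-23 hold the highest redirection entry, so the pin count is entries + 1, and bits 0-7 hold the version that print_IO_APIC() sanity-checks. A decoder sketch using one of the values that check expects (0x17 entries, i.e. the typical 24-pin part):

/* Sketch: splitting IO-APIC register 1 as enable_IO_APIC() does. */
#include <stdio.h>

int main(void)
{
    unsigned int reg_01 = 0x00170011;   /* hypothetical readout */

    unsigned int version = reg_01 & 0xff;
    unsigned int entries = (reg_01 >> 16) & 0xff;

    printf("IO-APIC version %#x, %u redirection entries\n",
           version, entries + 1);
    return 0;
}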
+ * + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 + */ + +static void __init setup_ioapic_ids_from_mpc (void) +{ + union IO_APIC_reg_00 reg_00; + int apic; + int i; + unsigned char old_id; + unsigned long flags; + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (apic = 0; apic < nr_ioapics; apic++) { + + /* Read the register 0 value */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mp_ioapics[apic].mpc_apicid; + + + printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid); + + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mp_ioapics[apic].mpc_apicid) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_dstapic == old_id) + mp_irqs[i].mpc_dstapic + = mp_ioapics[apic].mpc_apicid; + + /* + * Read the right value from the MPC table and + * write it into the ID register. + */ + apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", + mp_ioapics[apic].mpc_apicid); + + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0, reg_00.raw); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE," ok.\n"); + } +} + +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + + local_irq_enable(); + /* Let ten ticks pass... */ + mdelay((10 * 1000) / HZ); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. + */ + + /* jiffies wrap? */ + if (jiffies - t1 > 4) + return 1; + return 0; +} + +/* + * In the SMP+IOAPIC case it might happen that there are an unspecified + * number of pending IRQ events unhandled. These cases are very rare, + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much + * better to do it this way as thus we do not have to be aware of + * 'pending' interrupts in the IRQ path, except at this point. + */ +/* + * Edge triggered needs to resend any interrupt + * that was delayed but this is now handled in the device + * independent code. + */ + +/* + * Starting up a edge-triggered IO-APIC interrupt is + * nasty - we need to make sure that we get the edge. + * If it is already asserted for some reason, we need + * return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake + * an edge even if it isn't on the 8259A... 
+ */ + +static unsigned int startup_edge_ioapic_irq(unsigned int irq) +{ + int was_pending = 0; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + if (irq < 16) { + disable_8259A_irq(irq); + if (i8259A_irq_pending(irq)) + was_pending = 1; + } + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return was_pending; +} + +/* + * Once we have recorded IRQ_PENDING already, we can mask the + * interrupt for real. This prevents IRQ storms from unhandled + * devices. + */ +static void ack_edge_ioapic_irq(unsigned int irq) +{ + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) + == (IRQ_PENDING | IRQ_DISABLED)) + mask_IO_APIC_irq(irq); + ack_APIC_irq(); +} + +/* + * Level triggered interrupts can just be masked, + * and shutting down and starting up the interrupt + * is the same as enabling and disabling them -- except + * with a startup need to return a "was pending" value. + * + * Level triggered interrupts are special because we + * do not touch any IO-APIC register while handling + * them. We ack the APIC in the end-IRQ handler, not + * in the start-IRQ-handler. Protection against reentrance + * from the same interrupt is still provided, both by the + * generic IRQ layer and by the fact that an unacked local + * APIC does not accept IRQs. + */ +static unsigned int startup_level_ioapic_irq (unsigned int irq) +{ + unmask_IO_APIC_irq(irq); + + return 0; /* don't check for pending */ +} + +static void end_level_ioapic_irq (unsigned int irq) +{ + ack_APIC_irq(); +} + +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + unsigned long flags; + unsigned int dest; + + dest = cpu_mask_to_apicid(mask); + + /* + * Only the high 8 bits are valid. + */ + dest = SET_APIC_LOGICAL_ID(dest); + + spin_lock_irqsave(&ioapic_lock, flags); + __DO_ACTION(1, = dest, ) + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#ifdef CONFIG_PCI_MSI +static unsigned int startup_edge_ioapic_vector(unsigned int vector) +{ + int irq = vector_to_irq(vector); + + return startup_edge_ioapic_irq(irq); +} + +static void ack_edge_ioapic_vector(unsigned int vector) +{ + int irq = vector_to_irq(vector); + + ack_edge_ioapic_irq(irq); +} + +static unsigned int startup_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + return startup_level_ioapic_irq (irq); +} + +static void end_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + end_level_ioapic_irq(irq); +} + +static void mask_IO_APIC_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + mask_IO_APIC_irq(irq); +} + +static void unmask_IO_APIC_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + unmask_IO_APIC_irq(irq); +} + +static void set_ioapic_affinity_vector (unsigned int vector, + cpumask_t cpu_mask) +{ + int irq = vector_to_irq(vector); + + set_ioapic_affinity_irq(irq, cpu_mask); +} +#endif + +/* + * Level and edge triggered IO-APIC interrupts need different handling, + * so we use two separate IRQ descriptors. Edge triggered IRQs can be + * handled with the level-triggered descriptor, but that one has slightly + * more overhead. Level-triggered interrupts cannot be handled with the + * edge-triggered handler, without risking IRQ storms and other ugly + * races. 
+ */ + +static struct hw_interrupt_type ioapic_edge_type = { + .typename = "IO-APIC-edge", + .startup = startup_edge_ioapic, + .shutdown = shutdown_edge_ioapic, + .enable = enable_edge_ioapic, + .disable = disable_edge_ioapic, + .ack = ack_edge_ioapic, + .end = end_edge_ioapic, + .set_affinity = set_ioapic_affinity, +}; + +static struct hw_interrupt_type ioapic_level_type = { + .typename = "IO-APIC-level", + .startup = startup_level_ioapic, + .shutdown = shutdown_level_ioapic, + .enable = enable_level_ioapic, + .disable = disable_level_ioapic, + .ack = mask_and_ack_level_ioapic, + .end = end_level_ioapic, + .set_affinity = set_ioapic_affinity, +}; + +static inline void init_IO_APIC_traps(void) +{ + int irq; + + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + for (irq = 0; irq < NR_IRQS ; irq++) { + int tmp = irq; + if (use_pci_vector()) { + if (!platform_legacy_irq(tmp)) + if ((tmp = vector_to_irq(tmp)) == -1) + continue; + } + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. + */ + if (irq < 16) + make_8259A_irq(irq); + else + /* Strange. Oh, well.. */ + irq_desc[irq].handler = &no_irq_type; + } + } +} + +static void enable_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static void disable_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void ack_lapic_irq (unsigned int irq) +{ + ack_APIC_irq(); +} + +static void end_lapic_irq (unsigned int i) { /* nothing */ } + +static struct hw_interrupt_type lapic_irq_type = { + .typename = "local-APIC-edge", + .startup = NULL, /* startup_irq() not used for IRQ0 */ + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ + .enable = enable_lapic_irq, + .disable = disable_lapic_irq, + .ack = ack_lapic_irq, + .end = end_lapic_irq, +}; + +static void setup_nmi (void) +{ + /* + * Dirty trick to enable the NMI watchdog ... + * We put the 8259A master into AEOI mode and + * unmask on all local APICs LVT0 as NMI. + * + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') + * is from Maciej W. Rozycki - so we do not have to EOI from + * the NMI handler or the timer interrupt. + */ + printk(KERN_INFO "activating NMI Watchdog ..."); + + enable_NMI_through_LVT0(NULL); + + printk(" done.\n"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. 
--macro + */ +static inline void unlock_ExtINT_logic(void) +{ + int pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + unsigned long flags; + + pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) + return; + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin); + *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + clear_IO_APIC_pin(0, pin); + + memset(&entry1, 0, sizeof(entry1)); + + entry1.dest_mode = 0; /* physical delivery */ + entry1.mask = 0; /* unmask IRQ now */ + entry1.dest.physical.physical_dest = hard_smp_processor_id(); + entry1.delivery_mode = dest_ExtINT; + entry1.polarity = entry0.polarity; + entry1.trigger = 0; + entry1.vector = 0; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); + io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + save_control = CMOS_READ(RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); + + i = 100; + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(0, pin); + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); + io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + */ +static inline void check_timer(void) +{ + int pin1, pin2; + int vector; + + /* + * get/set the timer IRQ vector: + */ + disable_8259A_irq(0); + vector = assign_irq_vector(0); + set_intr_gate(vector, interrupt[0]); + + /* + * Subtle, code in do_timer_interrupt() expects an AEOI + * mode for the 8259A whenever interrupts are routed + * through I/O APICs. Also IRQ0 has to be enabled in + * the 8259A which implies the virtual wire has to be + * disabled in the local APIC. + */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + init_8259A(1); + enable_8259A_irq(0); + + pin1 = find_isa_irq_pin(0, mp_INT); + pin2 = find_isa_irq_pin(0, mp_ExtINT); + + apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2); + + if (pin1 != -1) { + /* + * Ok, does IRQ0 through the IOAPIC work? + */ + unmask_IO_APIC_irq(0); + if (timer_irq_works()) { + nmi_watchdog_default(); + if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); + setup_nmi(); + enable_8259A_irq(0); + check_nmi_watchdog(); + } + return; + } + clear_IO_APIC_pin(0, pin1); + apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); + } + + apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); + if (pin2 != -1) { + apic_printk(APIC_VERBOSE,"\n..... 
(found pin %d) ...", pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + setup_ExtINT_IRQ0_pin(pin2, vector); + if (timer_irq_works()) { + printk("works.\n"); + nmi_watchdog_default(); + if (nmi_watchdog == NMI_IO_APIC) { + setup_nmi(); + check_nmi_watchdog(); + } + return; + } + /* + * Cleanup, just in case ... + */ + clear_IO_APIC_pin(0, pin2); + } + printk(" failed.\n"); + + if (nmi_watchdog) { + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + nmi_watchdog = 0; + } + + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + + disable_8259A_irq(0); + irq_desc[0].handler = &lapic_irq_type; + apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ + enable_8259A_irq(0); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, " works.\n"); + return; + } + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); + apic_printk(APIC_VERBOSE," failed.\n"); + + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + + init_8259A(0); + make_8259A_irq(0); + apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + apic_printk(APIC_VERBOSE," works.\n"); + return; + } + apic_printk(APIC_VERBOSE," failed :(.\n"); + panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); +} + +/* + * + * IRQ's that are handled by the PIC in the MPS IOAPIC case. + * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. + * Linux doesn't really care, as it's not actually used + * for any interrupt handling anyway. + */ +#define PIC_IRQS (1<<2) + +void __init setup_IO_APIC(void) +{ + enable_IO_APIC(); + + if (acpi_ioapic) + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ + else + io_apic_irqs = ~PIC_IRQS; + + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + + /* + * Set up the IO-APIC IRQ routing table. 
+ */ + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); + sync_Arb_IDs(); + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + check_timer(); + if (!acpi_ioapic) + print_IO_APIC(); +} + +struct sysfs_ioapic_data { + struct sys_device dev; + struct IO_APIC_route_entry entry[0]; +}; +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; + +static int ioapic_suspend(struct sys_device *dev, u32 state) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + spin_lock_irqsave(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +static int ioapic_resume(struct sys_device *dev) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + union IO_APIC_reg_00 reg_00; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(dev->id, 0); + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; + io_apic_write(dev->id, 0, reg_00.raw); + } + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +static struct sysdev_class ioapic_sysdev_class = { + set_kset_name("ioapic"), + .suspend = ioapic_suspend, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_sysfs(void) +{ + struct sys_device * dev; + int i, size, error = 0; + + error = sysdev_class_register(&ioapic_sysdev_class); + if (error) + return error; + + for (i = 0; i < nr_ioapics; i++ ) { + size = sizeof(struct sys_device) + nr_ioapic_registers[i] + * sizeof(struct IO_APIC_route_entry); + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); + if (!mp_ioapic_data[i]) { + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + memset(mp_ioapic_data[i], 0, size); + dev = &mp_ioapic_data[i]->dev; + dev->id = i; + dev->cls = &ioapic_sysdev_class; + error = sysdev_register(dev); + if (error) { + kfree(mp_ioapic_data[i]); + mp_ioapic_data[i] = NULL; + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + } + + return 0; +} + +device_initcall(ioapic_init_sysfs); + +/* -------------------------------------------------------------------------- + ACPI-based IOAPIC Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI_BOOT + +#define IO_APIC_MAX_ID 0xFE + +int __init io_apic_get_unique_id (int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. 
+ */ + + if (physids_empty(apic_id_map)) + apic_id_map = phys_cpu_present_map; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= IO_APIC_MAX_ID) { + apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (physid_isset(apic_id, apic_id_map)) { + + for (i = 0; i < IO_APIC_MAX_ID; i++) { + if (!physid_isset(i, apic_id_map)) + break; + } + + if (i == IO_APIC_MAX_ID) + panic("Max apic_id exceeded!\n"); + + apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + physid_set(apic_id, apic_id_map); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) + panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic); + } + + apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} + + +int __init io_apic_get_version (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} + + +int __init io_apic_get_redir_entries (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.entries; +} + + +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + /* + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. + * Note that we mask (disable) IRQs now -- these get enabled when the + * corresponding device driver registers for this IRQ. 
+ */ + + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.trigger = edge_level; + entry.polarity = active_high_low; + entry.mask = 1; /* Disabled (masked) */ + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= 16) + add_pin_to_irq(irq, ioapic, pin); + + entry.vector = assign_irq_vector(irq); + + apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " + "IRQ %d Mode:%i Active:%i)\n", ioapic, + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, + edge_level, active_high_low); + + ioapic_register_intr(irq, entry.vector, edge_level); + + if (!ioapic && (irq < 16)) + disable_8259A_irq(irq); + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +#endif /*CONFIG_ACPI_BOOT*/ + + +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be TARGET_CPUS + */ +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + set_ioapic_affinity_irq(irq, TARGET_CPUS); + } + + } +} diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c new file mode 100644 index 00000000000..cde0e868efe --- /dev/null +++ b/arch/x86_64/kernel/ioport.c @@ -0,0 +1,117 @@ +/* + * linux/arch/x86_64/kernel/ioport.c + * + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/ioport.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/thread_info.h> + +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) +{ + int i; + if (new_value) + for (i = base; i < base + extent; i++) + __set_bit(i, bitmap); + else + for (i = base; i < base + extent; i++) + clear_bit(i, bitmap); +} + +/* + * this changes the io permissions bitmap in the current task. + */ +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + unsigned int i, max_long, bytes, bytes_updated; + struct thread_struct * t = ¤t->thread; + struct tss_struct * tss; + unsigned long *bitmap; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + } + + /* + * do it in the per-thread copy and in the TSS ... 
+ * + * Disable preemption via get_cpu() - we must not switch away + * because the ->io_bitmap_max value must match the bitmap + * contents: + */ + tss = &per_cpu(init_tss, get_cpu()); + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + + /* + * Search for a (possibly new) maximum. This is simple and stupid, + * to keep it obviously correct: + */ + max_long = 0; + for (i = 0; i < IO_BITMAP_LONGS; i++) + if (t->io_bitmap_ptr[i] != ~0UL) + max_long = i; + + bytes = (max_long + 1) * sizeof(long); + bytes_updated = max(bytes, t->io_bitmap_max); + + t->io_bitmap_max = bytes; + + /* Update the TSS: */ + memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); + + put_cpu(); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + * + * Here we just change the eflags value on the stack: we allow + * only the super-user to do it. This depends on the stack-layout + * on system-call entry - see also fork() and the signal handling + * code. + */ + +asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) +{ + unsigned int old = (regs->eflags >> 12) & 3; + + if (level > 3) + return -EINVAL; + /* Trying to gain more privileges? */ + if (level > old) { + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + } + regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12); + return 0; +} diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c new file mode 100644 index 00000000000..62b112e4deb --- /dev/null +++ b/arch/x86_64/kernel/irq.c @@ -0,0 +1,108 @@ +/* + * linux/arch/x86_64/kernel/irq.c + * + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86_64-specific interrupt + * entry and irq statistics code. All the remaining irq logic is + * done by the generic kernel/irq/ code and in the + * x86_64-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) 
+ */ + +#include <linux/kernel_stat.h> +#include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/io_apic.h> + +atomic_t irq_err_count; +#ifdef CONFIG_X86_IO_APIC +#ifdef APIC_MISMATCH_DEBUG +atomic_t irq_mis_count; +#endif +#endif + +/* + * Generic, controller-independent functions: + */ + +int show_interrupts(struct seq_file *p, void *v) +{ + int i = *(loff_t *) v, j; + struct irqaction * action; + unsigned long flags; + + if (i == 0) { + seq_printf(p, " "); + for (j=0; j<NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "CPU%d ",j); + seq_putc(p, '\n'); + } + + if (i < NR_IRQS) { + spin_lock_irqsave(&irq_desc[i].lock, flags); + action = irq_desc[i].action; + if (!action) + goto skip; + seq_printf(p, "%3d: ",i); +#ifndef CONFIG_SMP + seq_printf(p, "%10u ", kstat_irqs(i)); +#else + for (j=0; j<NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "%10u ", + kstat_cpu(j).irqs[i]); +#endif + seq_printf(p, " %14s", irq_desc[i].handler->typename); + + seq_printf(p, " %s", action->name); + for (action=action->next; action; action = action->next) + seq_printf(p, ", %s", action->name); + seq_putc(p, '\n'); +skip: + spin_unlock_irqrestore(&irq_desc[i].lock, flags); + } else if (i == NR_IRQS) { + seq_printf(p, "NMI: "); + for (j = 0; j < NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "%10u ", cpu_pda[j].__nmi_count); + seq_putc(p, '\n'); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); + for (j = 0; j < NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs); + seq_putc(p, '\n'); +#endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +#ifdef CONFIG_X86_IO_APIC +#ifdef APIC_MISMATCH_DEBUG + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); +#endif +#endif + } + return 0; +} + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +asmlinkage unsigned int do_IRQ(struct pt_regs *regs) +{ + /* high bits used in ret_from_ code */ + unsigned irq = regs->orig_rax & 0xff; + + irq_enter(); + BUG_ON(irq > 256); + + __do_IRQ(irq, regs); + irq_exit(); + + return 1; +} + diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c new file mode 100644 index 00000000000..4f2a852299b --- /dev/null +++ b/arch/x86_64/kernel/kprobes.c @@ -0,0 +1,631 @@ +/* + * Kernel Probes (KProbes) + * arch/x86_64/kernel/kprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel + * Probes initial implementation ( includes contributions from + * Rusty Russell). + * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes + * interface to access function arguments. 
+ * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi + * <prasanna@in.ibm.com> adapted for x86_64 + * 2005-Mar Roland McGrath <roland@redhat.com> + * Fixed to handle %rip-relative addressing mode correctly. + */ + +#include <linux/config.h> +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/preempt.h> +#include <linux/moduleloader.h> + +#include <asm/pgtable.h> +#include <asm/kdebug.h> + +static DECLARE_MUTEX(kprobe_mutex); + +/* kprobe_status settings */ +#define KPROBE_HIT_ACTIVE 0x00000001 +#define KPROBE_HIT_SS 0x00000002 + +static struct kprobe *current_kprobe; +static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags; +static struct pt_regs jprobe_saved_regs; +static long *jprobe_saved_rsp; +static kprobe_opcode_t *get_insn_slot(void); +static void free_insn_slot(kprobe_opcode_t *slot); +void jprobe_return_end(void); + +/* copy of the kernel stack at the probe fire time */ +static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE]; + +/* + * returns non-zero if opcode modifies the interrupt flag. + */ +static inline int is_IF_modifier(kprobe_opcode_t *insn) +{ + switch (*insn) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0xcf: /* iret/iretd */ + case 0x9d: /* popf/popfd */ + return 1; + } + + if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf) + return 1; + return 0; +} + +int arch_prepare_kprobe(struct kprobe *p) +{ + /* insn: must be on special executable page on x86_64. */ + up(&kprobe_mutex); + p->ainsn.insn = get_insn_slot(); + down(&kprobe_mutex); + if (!p->ainsn.insn) { + return -ENOMEM; + } + return 0; +} + +/* + * Determine if the instruction uses the %rip-relative addressing mode. + * If it does, return the address of the 32-bit displacement word. + * If not, return null. 
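+ *
+ * Illustrative example (editor's note, not part of the original source):
+ * the instruction "mov 0x12345678(%rip),%rax" is encoded as
+ *   48 8b 05 78 56 34 12
+ * i.e. a REX.W prefix (0x48), the one-byte opcode 0x8b (which takes a
+ * ModRM byte), ModRM 0x05 (mod=00, rm=101, the %rip+disp32 form), and a
+ * little-endian 32-bit displacement. For those bytes the function below
+ * skips the REX prefix, matches (modrm & 0xc7) == 0x05, and returns a
+ * pointer to the displacement so the copied instruction can be fixed up.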
+ */ +static inline s32 *is_riprel(u8 *insn) +{ +#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 64)) + static const u64 onebyte_has_modrm[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */ + W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */ + W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */ + W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */ + W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ + W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */ + W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */ + W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */ + W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ + W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */ + W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */ + W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */ + W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */ + W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */ + W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */ + W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + }; + static const u64 twobyte_has_modrm[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */ + W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */ + W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */ + W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */ + W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */ + W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */ + W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */ + W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */ + W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */ + W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */ + W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */ + W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */ + W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */ + W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */ + W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */ + W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + }; +#undef W + int need_modrm; + + /* Skip legacy instruction prefixes. */ + while (1) { + switch (*insn) { + case 0x66: + case 0x67: + case 0x2e: + case 0x3e: + case 0x26: + case 0x64: + case 0x65: + case 0x36: + case 0xf0: + case 0xf3: + case 0xf2: + ++insn; + continue; + } + break; + } + + /* Skip REX instruction prefix. */ + if ((*insn & 0xf0) == 0x40) + ++insn; + + if (*insn == 0x0f) { /* Two-byte opcode. */ + ++insn; + need_modrm = test_bit(*insn, twobyte_has_modrm); + } else { /* One-byte opcode. */ + need_modrm = test_bit(*insn, onebyte_has_modrm); + } + + if (need_modrm) { + u8 modrm = *++insn; + if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */ + /* Displacement follows ModRM byte. */ + return (s32 *) ++insn; + } + } + + /* No %rip-relative addressing mode here. 
*/ + return NULL; +} + +void arch_copy_kprobe(struct kprobe *p) +{ + s32 *ripdisp; + memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE); + ripdisp = is_riprel(p->ainsn.insn); + if (ripdisp) { + /* + * The copied instruction uses the %rip-relative + * addressing mode. Adjust the displacement for the + * difference between the original location of this + * instruction and the location of the copy that will + * actually be run. The tricky bit here is making sure + * that the sign extension happens correctly in this + * calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the + * %rip value and yield the same 64-bit result that the + * sign-extension of the original signed 32-bit + * displacement would have given. + */ + s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn; + BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ + *ripdisp = disp; + } +} + +void arch_remove_kprobe(struct kprobe *p) +{ + up(&kprobe_mutex); + free_insn_slot(p->ainsn.insn); + down(&kprobe_mutex); +} + +static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs) +{ + *p->addr = p->opcode; + regs->rip = (unsigned long)p->addr; +} + +static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) +{ + regs->eflags |= TF_MASK; + regs->eflags &= ~IF_MASK; + /*single step inline if the instruction is an int3*/ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->rip = (unsigned long)p->addr; + else + regs->rip = (unsigned long)p->ainsn.insn; +} + +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate and they + * remain disabled thorough out this function. + */ +int kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *p; + int ret = 0; + kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t)); + + /* We're in an interrupt, but this is clear and BUG()-safe. */ + preempt_disable(); + + /* Check we're not actually recursing */ + if (kprobe_running()) { + /* We *are* holding lock here, so this is safe. + Disarm the probe we just hit, and ignore it. */ + p = get_kprobe(addr); + if (p) { + if (kprobe_status == KPROBE_HIT_SS) { + regs->eflags &= ~TF_MASK; + regs->eflags |= kprobe_saved_rflags; + unlock_kprobes(); + goto no_kprobe; + } + disarm_kprobe(p, regs); + ret = 1; + } else { + p = current_kprobe; + if (p->break_handler && p->break_handler(p, regs)) { + goto ss_probe; + } + } + /* If it's not ours, can't be delete race, (we hold lock). */ + goto no_kprobe; + } + + lock_kprobes(); + p = get_kprobe(addr); + if (!p) { + unlock_kprobes(); + if (*addr != BREAKPOINT_INSTRUCTION) { + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + */ + ret = 1; + } + /* Not one of ours: let kernel handle it */ + goto no_kprobe; + } + + kprobe_status = KPROBE_HIT_ACTIVE; + current_kprobe = p; + kprobe_saved_rflags = kprobe_old_rflags + = (regs->eflags & (TF_MASK | IF_MASK)); + if (is_IF_modifier(p->ainsn.insn)) + kprobe_saved_rflags &= ~IF_MASK; + + if (p->pre_handler && p->pre_handler(p, regs)) + /* handler has already set things up, so skip ss setup */ + return 1; + +ss_probe: + prepare_singlestep(p, regs); + kprobe_status = KPROBE_HIT_SS; + return 1; + +no_kprobe: + preempt_enable_no_resched(); + return ret; +} + +/* + * Called after single-stepping. 
p->addr is the address of the + * instruction whose first byte has been replaced by the "int 3" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. + * + * This function prepares to return from the post-single-step + * interrupt. We have to fix up the stack as follows: + * + * 0) Except in the case of absolute or indirect jump or call instructions, + * the new rip is relative to the copied instruction. We need to make + * it relative to the original instruction. + * + * 1) If the single-stepped instruction was pushfl, then the TF and IF + * flags are set in the just-pushed eflags, and may need to be cleared. + * + * 2) If the single-stepped instruction was a call, the return address + * that is atop the stack is the address following the copied instruction. + * We need to make it the address following the original instruction. + */ +static void resume_execution(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long *tos = (unsigned long *)regs->rsp; + unsigned long next_rip = 0; + unsigned long copy_rip = (unsigned long)p->ainsn.insn; + unsigned long orig_rip = (unsigned long)p->addr; + kprobe_opcode_t *insn = p->ainsn.insn; + + /*skip the REX prefix*/ + if (*insn >= 0x40 && *insn <= 0x4f) + insn++; + + switch (*insn) { + case 0x9c: /* pushfl */ + *tos &= ~(TF_MASK | IF_MASK); + *tos |= kprobe_old_rflags; + break; + case 0xe8: /* call relative - Fix return addr */ + *tos = orig_rip + (*tos - copy_rip); + break; + case 0xff: + if ((*insn & 0x30) == 0x10) { + /* call absolute, indirect */ + /* Fix return addr; rip is correct. */ + next_rip = regs->rip; + *tos = orig_rip + (*tos - copy_rip); + } else if (((*insn & 0x31) == 0x20) || /* jmp near, absolute indirect */ + ((*insn & 0x31) == 0x21)) { /* jmp far, absolute indirect */ + /* rip is correct. */ + next_rip = regs->rip; + } + break; + case 0xea: /* jmp absolute -- rip is correct */ + next_rip = regs->rip; + break; + default: + break; + } + + regs->eflags &= ~TF_MASK; + if (next_rip) { + regs->rip = next_rip; + } else { + regs->rip = orig_rip + (regs->rip - copy_rip); + } +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate and they + * remain disabled thoroughout this function. And we hold kprobe lock. + */ +int post_kprobe_handler(struct pt_regs *regs) +{ + if (!kprobe_running()) + return 0; + + if (current_kprobe->post_handler) + current_kprobe->post_handler(current_kprobe, regs, 0); + + resume_execution(current_kprobe, regs); + regs->eflags |= kprobe_saved_rflags; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, eflags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->eflags & TF_MASK) + return 0; + + return 1; +} + +/* Interrupts disabled, kprobe_lock held. */ +int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + if (current_kprobe->fault_handler + && current_kprobe->fault_handler(current_kprobe, regs, trapnr)) + return 1; + + if (kprobe_status & KPROBE_HIT_SS) { + resume_execution(current_kprobe, regs); + regs->eflags |= kprobe_old_rflags; + + unlock_kprobes(); + preempt_enable_no_resched(); + } + return 0; +} + +/* + * Wrapper routine for handling exceptions. 
+ */ +int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + struct die_args *args = (struct die_args *)data; + switch (val) { + case DIE_INT3: + if (kprobe_handler(args->regs)) + return NOTIFY_STOP; + break; + case DIE_DEBUG: + if (post_kprobe_handler(args->regs)) + return NOTIFY_STOP; + break; + case DIE_GPF: + if (kprobe_running() && + kprobe_fault_handler(args->regs, args->trapnr)) + return NOTIFY_STOP; + break; + case DIE_PAGE_FAULT: + if (kprobe_running() && + kprobe_fault_handler(args->regs, args->trapnr)) + return NOTIFY_STOP; + break; + default: + break; + } + return NOTIFY_DONE; +} + +int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + unsigned long addr; + + jprobe_saved_regs = *regs; + jprobe_saved_rsp = (long *) regs->rsp; + addr = (unsigned long)jprobe_saved_rsp; + /* + * As Linus pointed out, gcc assumes that the callee + * owns the argument space and could overwrite it, e.g. + * tailcall optimization. So, to be absolutely safe + * we also save and restore enough stack bytes to cover + * the argument area. + */ + memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr)); + regs->eflags &= ~IF_MASK; + regs->rip = (unsigned long)(jp->entry); + return 1; +} + +void jprobe_return(void) +{ + preempt_enable_no_resched(); + asm volatile (" xchg %%rbx,%%rsp \n" + " int3 \n" + " .globl jprobe_return_end \n" + " jprobe_return_end: \n" + " nop \n"::"b" + (jprobe_saved_rsp):"memory"); +} + +int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + u8 *addr = (u8 *) (regs->rip - 1); + unsigned long stack_addr = (unsigned long)jprobe_saved_rsp; + struct jprobe *jp = container_of(p, struct jprobe, kp); + + if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { + if ((long *)regs->rsp != jprobe_saved_rsp) { + struct pt_regs *saved_regs = + container_of(jprobe_saved_rsp, struct pt_regs, rsp); + printk("current rsp %p does not match saved rsp %p\n", + (long *)regs->rsp, jprobe_saved_rsp); + printk("Saved registers for jprobe %p\n", jp); + show_registers(saved_regs); + printk("Current registers\n"); + show_registers(regs); + BUG(); + } + *regs = jprobe_saved_regs; + memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack, + MIN_STACK_SIZE(stack_addr)); + return 1; + } + return 0; +} + +/* + * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped. + * By default on x86_64, pages we get from kmalloc or vmalloc are not + * executable. Single-stepping an instruction on such a page yields an + * oops. So instead of storing the instruction copies in their respective + * kprobe objects, we allocate a page, map it executable, and store all the + * instruction copies there. (We can allocate additional pages if somebody + * inserts a huge number of probes.) Each page can hold up to INSNS_PER_PAGE + * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t) + * bytes. + */ +#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t))) +struct kprobe_insn_page { + struct hlist_node hlist; + kprobe_opcode_t *insns; /* page of instruction slots */ + char slot_used[INSNS_PER_PAGE]; + int nused; +}; + +static struct hlist_head kprobe_insn_pages; + +/** + * get_insn_slot() - Find a slot on an executable page for an instruction. + * We allocate an executable page if there's no room on existing ones. 
+ */ +static kprobe_opcode_t *get_insn_slot(void) +{ + struct kprobe_insn_page *kip; + struct hlist_node *pos; + + hlist_for_each(pos, &kprobe_insn_pages) { + kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + if (kip->nused < INSNS_PER_PAGE) { + int i; + for (i = 0; i < INSNS_PER_PAGE; i++) { + if (!kip->slot_used[i]) { + kip->slot_used[i] = 1; + kip->nused++; + return kip->insns + (i*MAX_INSN_SIZE); + } + } + /* Surprise! No unused slots. Fix kip->nused. */ + kip->nused = INSNS_PER_PAGE; + } + } + + /* All out of space. Need to allocate a new page. Use slot 0.*/ + kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); + if (!kip) { + return NULL; + } + + /* + * For the %rip-relative displacement fixups to be doable, we + * need our instruction copy to be within +/- 2GB of any data it + * might access via %rip. That is, within 2GB of where the + * kernel image and loaded module images reside. So we allocate + * a page in the module loading area. + */ + kip->insns = module_alloc(PAGE_SIZE); + if (!kip->insns) { + kfree(kip); + return NULL; + } + INIT_HLIST_NODE(&kip->hlist); + hlist_add_head(&kip->hlist, &kprobe_insn_pages); + memset(kip->slot_used, 0, INSNS_PER_PAGE); + kip->slot_used[0] = 1; + kip->nused = 1; + return kip->insns; +} + +/** + * free_insn_slot() - Free instruction slot obtained from get_insn_slot(). + */ +static void free_insn_slot(kprobe_opcode_t *slot) +{ + struct kprobe_insn_page *kip; + struct hlist_node *pos; + + hlist_for_each(pos, &kprobe_insn_pages) { + kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + if (kip->insns <= slot + && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) { + int i = (slot - kip->insns) / MAX_INSN_SIZE; + kip->slot_used[i] = 0; + kip->nused--; + if (kip->nused == 0) { + /* + * Page is no longer in use. Free it unless + * it's the last one. We keep the last one + * so as not to have to set it up again the + * next time somebody inserts a probe. + */ + hlist_del(&kip->hlist); + if (hlist_empty(&kprobe_insn_pages)) { + INIT_HLIST_NODE(&kip->hlist); + hlist_add_head(&kip->hlist, + &kprobe_insn_pages); + } else { + module_free(NULL, kip->insns); + kfree(kip); + } + } + return; + } + } +} diff --git a/arch/x86_64/kernel/ldt.c b/arch/x86_64/kernel/ldt.c new file mode 100644 index 00000000000..d7e5d0cf428 --- /dev/null +++ b/arch/x86_64/kernel/ldt.c @@ -0,0 +1,253 @@ +/* + * linux/arch/x86_64/kernel/ldt.c + * + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. 
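+ *
+ * Hedged usage sketch (editor's illustration, not part of the original
+ * file): user space reaches this code through the modify_ldt system call,
+ * whose first argument selects the operation handled by sys_modify_ldt()
+ * at the end of this file (0 = read_ldt, 1 = write_ldt in the old mode,
+ * 2 = read_default_ldt, 0x11 = write_ldt):
+ *
+ *   unsigned char buf[LDT_ENTRIES * LDT_ENTRY_SIZE];      // sizes from asm/ldt.h
+ *   long n = syscall(SYS_modify_ldt, 0, buf, sizeof(buf)); // func 0: read current LDT
+ *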
+ */ + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/ldt.h> +#include <asm/desc.h> +#include <asm/proto.h> + +#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ +static void flush_ldt(void *null) +{ + if (current->active_mm) + load_LDT(¤t->active_mm->context); +} +#endif + +static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) +{ + void *oldldt; + void *newldt; + unsigned oldsize; + + if (mincount <= (unsigned)pc->size) + return 0; + oldsize = pc->size; + mincount = (mincount+511)&(~511); + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + else + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + if (oldsize) + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); + oldldt = pc->ldt; + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); + wmb(); + pc->ldt = newldt; + wmb(); + pc->size = mincount; + wmb(); + if (reload) { +#ifdef CONFIG_SMP + cpumask_t mask; + + preempt_disable(); + mask = cpumask_of_cpu(smp_processor_id()); + load_LDT(pc); + if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + smp_call_function(flush_ldt, NULL, 1, 1); + preempt_enable(); +#else + load_LDT(pc); +#endif + } + if (oldsize) { + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + kfree(oldldt); + } + return 0; +} + +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) +{ + int err = alloc_ldt(new, old->size, 0); + if (err < 0) + return err; + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + return 0; +} + +/* + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + struct mm_struct * old_mm; + int retval = 0; + + init_MUTEX(&mm->context.sem); + mm->context.size = 0; + old_mm = current->mm; + if (old_mm && old_mm->context.size > 0) { + down(&old_mm->context.sem); + retval = copy_ldt(&mm->context, &old_mm->context); + up(&old_mm->context.sem); + } + return retval; +} + +/* + * + * Don't touch the LDT register - we're already in the next thread. 
+ */ +void destroy_context(struct mm_struct *mm) +{ + if (mm->context.size) { + if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(mm->context.ldt); + else + kfree(mm->context.ldt); + mm->context.size = 0; + } +} + +static int read_ldt(void __user * ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + struct mm_struct * mm = current->mm; + + if (!mm->context.size) + return 0; + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; + + down(&mm->context.sem); + size = mm->context.size*LDT_ENTRY_SIZE; + if (size > bytecount) + size = bytecount; + + err = 0; + if (copy_to_user(ptr, mm->context.ldt, size)) + err = -EFAULT; + up(&mm->context.sem); + if (err < 0) + goto error_return; + if (size != bytecount) { + /* zero-fill the rest */ + if (clear_user(ptr+size, bytecount-size) != 0) { + err = -EFAULT; + goto error_return; + } + } + return bytecount; +error_return: + return err; +} + +static int read_default_ldt(void __user * ptr, unsigned long bytecount) +{ + /* Arbitrary number */ + /* x86-64 default LDT is all zeros */ + if (bytecount > 128) + bytecount = 128; + if (clear_user(ptr, bytecount)) + return -EFAULT; + return bytecount; +} + +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) +{ + struct task_struct *me = current; + struct mm_struct * mm = me->mm; + __u32 entry_1, entry_2, *lp; + int error; + struct user_desc ldt_info; + + error = -EINVAL; + + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, bytecount)) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + down(&mm->context.sem); + if (ldt_info.entry_number >= (unsigned)mm->context.size) { + error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); + if (error < 0) + goto out_unlock; + } + + lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); + + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + if (oldmode || LDT_empty(&ldt_info)) { + entry_1 = 0; + entry_2 = 0; + goto install; + } + } + + entry_1 = LDT_entry_a(&ldt_info); + entry_2 = LDT_entry_b(&ldt_info); + if (oldmode) + entry_2 &= ~(1 << 20); + + /* Install the new entry ... */ +install: + *lp = entry_1; + *(lp+1) = entry_2; + error = 0; + +out_unlock: + up(&mm->context.sem); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c new file mode 100644 index 00000000000..86f9fd85016 --- /dev/null +++ b/arch/x86_64/kernel/mce.c @@ -0,0 +1,548 @@ +/* + * Machine check handler. + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. 
+ */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/rcupdate.h> +#include <linux/kallsyms.h> +#include <linux/sysdev.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/mce.h> +#include <asm/kdebug.h> +#include <asm/uaccess.h> + +#define MISC_MCELOG_MINOR 227 +#define NR_BANKS 5 + +static int mce_dont_init; + +/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, + 3: never panic or exit (for testing only) */ +static int tolerant = 1; +static int banks; +static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; +static unsigned long console_logged; +static int notify_user; + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +struct mce_log mcelog = { + MCE_LOG_SIGNATURE, + MCE_LOG_LEN, +}; + +void mce_log(struct mce *mce) +{ + unsigned next, entry; + mce->finished = 0; + smp_wmb(); + for (;;) { + entry = rcu_dereference(mcelog.next); + /* When the buffer fills up discard new entries. Assume + that the earlier errors are the more interesting. */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, &mcelog.flags); + return; + } + /* Old left over entry. Skip. */ + if (mcelog.entry[entry].finished) + continue; + smp_rmb(); + next = entry + 1; + if (cmpxchg(&mcelog.next, entry, next) == entry) + break; + } + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); + smp_wmb(); + mcelog.entry[entry].finished = 1; + smp_wmb(); + + if (!test_and_set_bit(0, &console_logged)) + notify_user = 1; +} + +static void print_mce(struct mce *m) +{ + printk(KERN_EMERG "\n" + KERN_EMERG + "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + m->cpu, m->mcgstatus, m->bank, m->status); + if (m->rip) { + printk(KERN_EMERG + "RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" 
: "", + m->cs, m->rip); + if (m->cs == __KERNEL_CS) + print_symbol("{%s}", m->rip); + printk("\n"); + } + printk(KERN_EMERG "TSC %Lx ", m->tsc); + if (m->addr) + printk("ADDR %Lx ", m->addr); + if (m->misc) + printk("MISC %Lx ", m->misc); + printk("\n"); +} + +static void mce_panic(char *msg, struct mce *backup, unsigned long start) +{ + int i; + oops_begin(); + for (i = 0; i < MCE_LOG_LEN; i++) { + unsigned long tsc = mcelog.entry[i].tsc; + if (time_before(tsc, start)) + continue; + print_mce(&mcelog.entry[i]); + if (backup && mcelog.entry[i].tsc == backup->tsc) + backup = NULL; + } + if (backup) + print_mce(backup); + if (tolerant >= 3) + printk("Fake panic: %s\n", msg); + else + panic(msg); +} + +static int mce_available(struct cpuinfo_x86 *c) +{ + return test_bit(X86_FEATURE_MCE, &c->x86_capability) && + test_bit(X86_FEATURE_MCA, &c->x86_capability); +} + +/* + * The actual machine check handler + */ + +void do_machine_check(struct pt_regs * regs, long error_code) +{ + struct mce m, panicm; + int nowayout = (tolerant < 1); + int kill_it = 0; + u64 mcestart = 0; + int i; + int panicm_found = 0; + + if (regs) + notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL); + if (!banks) + return; + + memset(&m, 0, sizeof(struct mce)); + m.cpu = hard_smp_processor_id(); + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + kill_it = 1; + + rdtscll(mcestart); + barrier(); + + for (i = 0; i < banks; i++) { + if (!bank[i]) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + m.tsc = 0; + + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if ((m.status & MCI_STATUS_VAL) == 0) + continue; + + if (m.status & MCI_STATUS_EN) { + /* In theory _OVER could be a nowayout too, but + assume any overflowed errors were no fatal. */ + nowayout |= !!(m.status & MCI_STATUS_PCC); + kill_it |= !!(m.status & MCI_STATUS_UC); + } + + if (m.status & MCI_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + if (regs && (m.mcgstatus & MCG_STATUS_RIPV)) { + m.rip = regs->rip; + m.cs = regs->cs; + } else { + m.rip = 0; + m.cs = 0; + } + + if (error_code != -1) + rdtscll(m.tsc); + wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); + mce_log(&m); + + /* Did this bank cause the exception? */ + /* Assume that the bank with uncorrectable errors did it, + and that there is only a single one. */ + if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { + panicm = m; + panicm_found = 1; + } + + tainted |= TAINT_MACHINE_CHECK; + } + + /* Never do anything final in the polling timer */ + if (!regs) + goto out; + + /* If we didn't find an uncorrectable error, pick + the last one (shouldn't happen, just being safe). */ + if (!panicm_found) + panicm = m; + if (nowayout) + mce_panic("Machine check", &panicm, mcestart); + if (kill_it) { + int user_space = 0; + + if (m.mcgstatus & MCG_STATUS_RIPV) + user_space = panicm.rip && (panicm.cs & 3); + + /* When the machine was in user space and the CPU didn't get + confused it's normally not necessary to panic, unless you + are paranoid (tolerant == 0) + + RED-PEN could be more tolerant for MCEs in idle, + but most likely they occur at boot anyways, where + it is best to just halt the machine. */ + if ((!user_space && (panic_on_oops || tolerant < 2)) || + (unsigned)current->pid <= 1) + mce_panic("Uncorrected machine check", &panicm, mcestart); + + /* do_exit takes an awful lot of locks and has as + slight risk of deadlocking. 
If you don't want that
+ don't set tolerant >= 2 */
+ if (tolerant < 3)
+ do_exit(SIGBUS);
+ }
+
+ out:
+ /* Last thing done in the machine check exception to clear state. */
+ wrmsrl(MSR_IA32_MCG_STATUS, 0);
+}
+
+/*
+ * Periodic polling timer for "silent" machine check errors.
+ */
+
+static int check_interval = 5 * 60; /* 5 minutes */
+static void mcheck_timer(void *data);
+static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
+
+static void mcheck_check_cpu(void *info)
+{
+ if (mce_available(&current_cpu_data))
+ do_machine_check(NULL, 0);
+}
+
+static void mcheck_timer(void *data)
+{
+ on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
+ schedule_delayed_work(&mcheck_work, check_interval * HZ);
+
+ /*
+ * It's ok to read stale data here for notify_user and
+ * console_logged as we'll simply get the updated versions
+ * on the next mcheck_timer execution and atomic operations
+ * on console_logged act as synchronization for notify_user
+ * writes.
+ */
+ if (notify_user && console_logged) {
+ notify_user = 0;
+ clear_bit(0, &console_logged);
+ printk(KERN_INFO "Machine check events logged\n");
+ }
+}
+
+
+static __init int periodic_mcheck_init(void)
+{
+ if (check_interval)
+ schedule_delayed_work(&mcheck_work, check_interval*HZ);
+ return 0;
+}
+__initcall(periodic_mcheck_init);
+
+
+/*
+ * Initialize Machine Checks for a CPU.
+ */
+static void mce_init(void *dummy)
+{
+ u64 cap;
+ int i;
+
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+ banks = cap & 0xff;
+ if (banks > NR_BANKS) {
+ printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
+ banks = NR_BANKS;
+ }
+
+ /* Log the machine checks left over from the previous reset.
+ This also clears all registers */
+ do_machine_check(NULL, -1);
+
+ set_in_cr4(X86_CR4_MCE);
+
+ if (cap & MCG_CTL_P)
+ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+
+ for (i = 0; i < banks; i++) {
+ wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+ wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
+}
+
+/* Add per CPU specific workarounds here */
+static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
+{
+ /* This should be disabled by the BIOS, but isn't always */
+ if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
+ /* disable GART TBL walk error reporting, which trips off
+ incorrectly with the IOMMU & 3ware & Cerberus. */
+ clear_bit(10, &bank[4]);
+ }
+}
+
+static void __init mce_cpu_features(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_init(c);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * Called for each booted CPU to set up machine checks.
+ * Must be called with preempt off.
+ */
+void __init mcheck_init(struct cpuinfo_x86 *c)
+{
+ static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;
+
+ mce_cpu_quirks(c);
+
+ if (mce_dont_init ||
+ cpu_test_and_set(smp_processor_id(), mce_cpus) ||
+ !mce_available(c))
+ return;
+
+ mce_init(NULL);
+ mce_cpu_features(c);
+}
+
+/*
+ * Character device to read and clear the MCE log.
+ */ + +static void collect_tscs(void *data) +{ + unsigned long *cpu_tsc = (unsigned long *)data; + rdtscll(cpu_tsc[smp_processor_id()]); +} + +static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) +{ + unsigned long cpu_tsc[NR_CPUS]; + static DECLARE_MUTEX(mce_read_sem); + unsigned next; + char __user *buf = ubuf; + int i, err; + + down(&mce_read_sem); + next = rcu_dereference(mcelog.next); + + /* Only supports full reads right now */ + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { + up(&mce_read_sem); + return -EINVAL; + } + + err = 0; + for (i = 0; i < next; i++) { + if (!mcelog.entry[i].finished) + continue; + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); + buf += sizeof(struct mce); + } + + memset(mcelog.entry, 0, next * sizeof(struct mce)); + mcelog.next = 0; + + synchronize_kernel(); + + /* Collect entries that were still getting written before the synchronize. */ + + on_each_cpu(collect_tscs, cpu_tsc, 1, 1); + for (i = next; i < MCE_LOG_LEN; i++) { + if (mcelog.entry[i].finished && + mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { + err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce)); + smp_rmb(); + buf += sizeof(struct mce); + memset(&mcelog.entry[i], 0, sizeof(struct mce)); + } + } + up(&mce_read_sem); + return err ? -EFAULT : buf - ubuf; +} + +static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) +{ + int __user *p = (int __user *)arg; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + do { + flags = mcelog.flags; + } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static struct file_operations mce_chrdev_ops = { + .read = mce_read, + .ioctl = mce_ioctl, +}; + +static struct miscdevice mce_log_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +/* + * Old style boot options parsing. Only for compatibility. + */ + +static int __init mcheck_disable(char *str) +{ + mce_dont_init = 1; + return 0; +} + +/* mce=off disables machine check. Note you can reenable it later + using sysfs */ +static int __init mcheck_enable(char *str) +{ + if (!strcmp(str, "off")) + mce_dont_init = 1; + else + printk("mce= argument %s ignored. Please use /sys", str); + return 0; +} + +__setup("nomce", mcheck_disable); +__setup("mce", mcheck_enable); + +/* + * Sysfs support + */ + +/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */ +static int mce_resume(struct sys_device *dev) +{ + on_each_cpu(mce_init, NULL, 1, 1); + return 0; +} + +/* Reinit MCEs after user configuration changes */ +static void mce_restart(void) +{ + if (check_interval) + cancel_delayed_work(&mcheck_work); + /* Timer race is harmless here */ + on_each_cpu(mce_init, NULL, 1, 1); + if (check_interval) + schedule_delayed_work(&mcheck_work, check_interval*HZ); +} + +static struct sysdev_class mce_sysclass = { + .resume = mce_resume, + set_kset_name("machinecheck"), +}; + +static struct sys_device device_mce = { + .id = 0, + .cls = &mce_sysclass, +}; + +/* Why are there no generic functions for this? 
*/ +#define ACCESSOR(name, var, start) \ + static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ + return sprintf(buf, "%lx\n", (unsigned long)var); \ + } \ + static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ + char *end; \ + unsigned long new = simple_strtoul(buf, &end, 0); \ + if (end == buf) return -EINVAL; \ + var = new; \ + start; \ + return end-buf; \ + } \ + static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); + +ACCESSOR(bank0ctl,bank[0],mce_restart()) +ACCESSOR(bank1ctl,bank[1],mce_restart()) +ACCESSOR(bank2ctl,bank[2],mce_restart()) +ACCESSOR(bank3ctl,bank[3],mce_restart()) +ACCESSOR(bank4ctl,bank[4],mce_restart()) +ACCESSOR(tolerant,tolerant,) +ACCESSOR(check_interval,check_interval,mce_restart()) + +static __init int mce_init_device(void) +{ + int err; + if (!mce_available(&boot_cpu_data)) + return -EIO; + err = sysdev_class_register(&mce_sysclass); + if (!err) + err = sysdev_register(&device_mce); + if (!err) { + /* could create per CPU objects, but it is not worth it. */ + sysdev_create_file(&device_mce, &attr_bank0ctl); + sysdev_create_file(&device_mce, &attr_bank1ctl); + sysdev_create_file(&device_mce, &attr_bank2ctl); + sysdev_create_file(&device_mce, &attr_bank3ctl); + sysdev_create_file(&device_mce, &attr_bank4ctl); + sysdev_create_file(&device_mce, &attr_tolerant); + sysdev_create_file(&device_mce, &attr_check_interval); + } + + misc_register(&mce_log_device); + return err; + +} +device_initcall(mce_init_device); diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c new file mode 100644 index 00000000000..4db9a640069 --- /dev/null +++ b/arch/x86_64/kernel/mce_intel.c @@ -0,0 +1,99 @@ +/* + * Intel specific MCE features. + * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> + */ + +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/mce.h> +#include <asm/hw_irq.h> + +static DEFINE_PER_CPU(unsigned long, next_check); + +asmlinkage void smp_thermal_interrupt(void) +{ + struct mce m; + + ack_APIC_irq(); + + irq_enter(); + if (time_before(jiffies, __get_cpu_var(next_check))) + goto done; + + __get_cpu_var(next_check) = jiffies + HZ*300; + memset(&m, 0, sizeof(m)); + m.cpu = smp_processor_id(); + m.bank = MCE_THERMAL_BANK; + rdtscll(m.tsc); + rdmsrl(MSR_IA32_THERM_STATUS, m.status); + if (m.status & 0x1) { + printk(KERN_EMERG + "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu); + add_taint(TAINT_MACHINE_CHECK); + } else { + printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu); + } + + mce_log(&m); +done: + irq_exit(); +} + +static void __init intel_init_thermal(struct cpuinfo_x86 *c) +{ + u32 l, h; + int tm2 = 0; + unsigned int cpu = smp_processor_id(); + + if (!cpu_has(c, X86_FEATURE_ACPI)) + return; + + if (!cpu_has(c, X86_FEATURE_ACC)) + return; + + /* first check if TM1 is already enabled by the BIOS, in which + * case there might be some SMM goo which handles it, so we can't even + * put a handler since it might be delivered via SMI already. 
+ */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); + if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; + } + + if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) + tm2 = 1; + + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG + "CPU%d: Thermal LVT vector (%#x) already " + "installed\n", cpu, (h & APIC_VECTOR_MASK)); + return; + } + + h = THERMAL_APIC_VECTOR; + h |= (APIC_DM_FIXED | APIC_LVT_MASKED); + apic_write_around(APIC_LVTTHMR, h); + + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); + + l = apic_read(APIC_LVTTHMR); + apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? "TM2" : "TM1"); + return; +} + +void __init mce_intel_feature_init(struct cpuinfo_x86 *c) +{ + intel_init_thermal(c); +} diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c new file mode 100644 index 00000000000..c2ffea8845e --- /dev/null +++ b/arch/x86_64/kernel/module.c @@ -0,0 +1,166 @@ +/* Kernel module help for x86-64 + Copyright (C) 2001 Rusty Russell. + Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include <linux/moduleloader.h> +#include <linux/elf.h> +#include <linux/vmalloc.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/slab.h> + +#include <asm/system.h> +#include <asm/page.h> +#include <asm/pgtable.h> + +#define DEBUGP(fmt...) + +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); +} + +void *module_alloc(unsigned long size) +{ + struct vm_struct *area; + + if (!size) + return NULL; + size = PAGE_ALIGN(size); + if (size > MODULES_LEN) + return NULL; + + area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); + if (!area) + return NULL; + + return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); +} + +/* We don't need anything special. */ +int module_frob_arch_sections(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + char *secstrings, + struct module *mod) +{ + return 0; +} + +int apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; + Elf64_Sym *sym; + void *loc; + u64 val; + + DEBUGP("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. 
*/ + sym = (Elf64_Sym *)sechdrs[symindex].sh_addr + + ELF64_R_SYM(rel[i].r_info); + + DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), + sym->st_value, rel[i].r_addend, (u64)loc); + + val = sym->st_value + rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + break; + case R_X86_64_64: + *(u64 *)loc = val; + break; + case R_X86_64_32: + *(u32 *)loc = val; + if (val != *(u32 *)loc) + goto overflow; + break; + case R_X86_64_32S: + *(s32 *)loc = val; + if ((s64)val != *(s32 *)loc) + goto overflow; + break; + case R_X86_64_PC32: + val -= (u64)loc; + *(u32 *)loc = val; +#if 0 + if ((s64)val != *(s32 *)loc) + goto overflow; +#endif + break; + default: + printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", + me->name, ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; + +overflow: + printk(KERN_ERR "overflow in relocation type %d val %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), val); + printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", + me->name); + return -ENOEXEC; +} + +int apply_relocate(Elf_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk("non add relocation not supported\n"); + return -ENOSYS; +} + +extern void apply_alternatives(void *start, void *end); + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + /* look for .altinstructions to patch */ + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + void *seg; + if (strcmp(".altinstructions", secstrings + s->sh_name)) + continue; + seg = (void *)s->sh_addr; + apply_alternatives(seg, seg + s->sh_size); + } + return 0; +} + +void module_arch_cleanup(struct module *mod) +{ +} diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c new file mode 100644 index 00000000000..7ec031c6ca1 --- /dev/null +++ b/arch/x86_64/kernel/mpparse.c @@ -0,0 +1,949 @@ +/* + * Intel Multiprocessor Specification 1.1 and 1.4 + * compliant MP-table parsing routines. + * + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> + * + * Fixes + * Erich Boleyn : MP v1.4 and additional changes. + * Alan Cox : Added EBDA scanning + * Ingo Molnar : various cleanups and rewrites + * Maciej W. Rozycki: Bits for default MP configurations + * Paul Diefenbaugh: Added full ACPI support + */ + +#include <linux/mm.h> +#include <linux/irq.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/config.h> +#include <linux/bootmem.h> +#include <linux/smp_lock.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/acpi.h> + +#include <asm/smp.h> +#include <asm/mtrr.h> +#include <asm/mpspec.h> +#include <asm/pgalloc.h> +#include <asm/io_apic.h> +#include <asm/proto.h> + +/* Have we found an MP table */ +int smp_found_config; +unsigned int __initdata maxcpus = NR_CPUS; + +int acpi_found_madt; + +/* + * Various Linux-internal data structures created from the + * MP-table. + */ +int apic_version [MAX_APICS]; +unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; +cpumask_t pci_bus_to_cpumask [256] = { [0 ... 
255] = CPU_MASK_ALL }; + +static int mp_current_pci_id = 0; +/* I/O APIC entries */ +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; + +/* # of MP IRQ source entries */ +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* MP IRQ source entries */ +int mp_irq_entries; + +int nr_ioapics; +int pic_mode; +unsigned long mp_lapic_addr = 0; + + + +/* Processor that is doing the boot up */ +unsigned int boot_cpu_id = -1U; +/* Internal processor count */ +static unsigned int num_processors = 0; + +/* Bitmask of physically existing CPUs */ +physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; + +/* ACPI MADT entry parsing functions */ +#ifdef CONFIG_ACPI_BOOT +extern struct acpi_boot_flags acpi_boot; +#ifdef CONFIG_X86_LOCAL_APIC +extern int acpi_parse_lapic (acpi_table_entry_header *header); +extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); +extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); +#endif /*CONFIG_X86_LOCAL_APIC*/ +#ifdef CONFIG_X86_IO_APIC +extern int acpi_parse_ioapic (acpi_table_entry_header *header); +#endif /*CONFIG_X86_IO_APIC*/ +#endif /*CONFIG_ACPI_BOOT*/ + +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + + +/* + * Intel MP BIOS table parsing routines: + */ + +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +static void __init MP_processor_info (struct mpc_config_processor *m) +{ + int ver; + + if (!(m->mpc_cpuflag & CPU_ENABLED)) + return; + + printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", + m->mpc_apicid, + (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8, + (m->mpc_cpufeature & CPU_MODEL_MASK)>>4, + m->mpc_apicver); + + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + Dprintk(" Bootup CPU\n"); + boot_cpu_id = m->mpc_apicid; + } + if (num_processors >= NR_CPUS) { + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." + " Processor ignored.\n", NR_CPUS); + return; + } + if (num_processors >= maxcpus) { + printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." + " Processor ignored.\n", maxcpus); + return; + } + + num_processors++; + + if (m->mpc_apicid > MAX_APICS) { + printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", + m->mpc_apicid, MAX_APICS); + return; + } + ver = m->mpc_apicver; + + physid_set(m->mpc_apicid, phys_cpu_present_map); + /* + * Validate version + */ + if (ver == 0x0) { + printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. 
(tell your hw vendor)\n", m->mpc_apicid); + ver = 0x10; + } + apic_version[m->mpc_apicid] = ver; + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; +} + +static void __init MP_bus_info (struct mpc_config_bus *m) +{ + char str[7]; + + memcpy(str, m->mpc_bustype, 6); + str[6] = 0; + Dprintk("Bus #%d is %s\n", m->mpc_busid, str); + + if (strncmp(str, "ISA", 3) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; + } else if (strncmp(str, "EISA", 4) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; + } else if (strncmp(str, "PCI", 3) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; + mp_current_pci_id++; + } else if (strncmp(str, "MCA", 3) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; + } else { + printk(KERN_ERR "Unknown bustype %s\n", str); + } +} + +static void __init MP_ioapic_info (struct mpc_config_ioapic *m) +{ + if (!(m->mpc_flags & MPC_APIC_USABLE)) + return; + + printk("I/O APIC #%d Version %d at 0x%X.\n", + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n", + MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); + } + if (!m->mpc_apicaddr) { + printk(KERN_ERR "WARNING: bogus zero I/O APIC address" + " found in MP table, skipping!\n"); + return; + } + mp_ioapics[nr_ioapics] = *m; + nr_ioapics++; +} + +static void __init MP_intsrc_info (struct mpc_config_intsrc *m) +{ + mp_irqs [mp_irq_entries] = *m; + Dprintk("Int: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) +{ + Dprintk("Lint: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); + /* + * Well it seems all SMP boards in existence + * use ExtINT/LVT1 == LINT0 and + * NMI/LVT2 == LINT1 - the following check + * will show us if this assumptions is false. + * Until then we do not have to add baggage. 
+ */ + if ((m->mpc_irqtype == mp_ExtINT) && + (m->mpc_destapiclint != 0)) + BUG(); + if ((m->mpc_irqtype == mp_NMI) && + (m->mpc_destapiclint != 1)) + BUG(); +} + +/* + * Read/parse the MPC + */ + +static int __init smp_read_mpc(struct mp_config_table *mpc) +{ + char str[16]; + int count=sizeof(*mpc); + unsigned char *mpt=((unsigned char *)mpc)+count; + + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { + printk("SMP mptable: bad signature [%c%c%c%c]!\n", + mpc->mpc_signature[0], + mpc->mpc_signature[1], + mpc->mpc_signature[2], + mpc->mpc_signature[3]); + return 0; + } + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { + printk("SMP mptable: checksum error!\n"); + return 0; + } + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", + mpc->mpc_spec); + return 0; + } + if (!mpc->mpc_lapic) { + printk(KERN_ERR "SMP mptable: null local APIC address!\n"); + return 0; + } + memcpy(str,mpc->mpc_oem,8); + str[8]=0; + printk(KERN_INFO "OEM ID: %s ",str); + + memcpy(str,mpc->mpc_productid,12); + str[12]=0; + printk(KERN_INFO "Product ID: %s ",str); + + printk(KERN_INFO "APIC at: 0x%X\n",mpc->mpc_lapic); + + /* save the local APIC address, it might be non-default */ + if (!acpi_lapic) + mp_lapic_addr = mpc->mpc_lapic; + + /* + * Now process the configuration blocks. + */ + while (count < mpc->mpc_length) { + switch(*mpt) { + case MP_PROCESSOR: + { + struct mpc_config_processor *m= + (struct mpc_config_processor *)mpt; + if (!acpi_lapic) + MP_processor_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_BUS: + { + struct mpc_config_bus *m= + (struct mpc_config_bus *)mpt; + MP_bus_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_IOAPIC: + { + struct mpc_config_ioapic *m= + (struct mpc_config_ioapic *)mpt; + MP_ioapic_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_INTSRC: + { + struct mpc_config_intsrc *m= + (struct mpc_config_intsrc *)mpt; + + MP_intsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_LINTSRC: + { + struct mpc_config_lintsrc *m= + (struct mpc_config_lintsrc *)mpt; + MP_lintsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + } + } + clustered_apic_check(); + if (!num_processors) + printk(KERN_ERR "SMP mptable: no processors registered!\n"); + return num_processors; +} + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_config_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* conforming */ + intsrc.mpc_srcbus = 0; + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; + + intsrc.mpc_irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. + */ + if (mpc_default_type == 5) { + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) + printk(KERN_ERR "ELCR contains invalid data... 
not using ELCR\n"); + else { + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + /* fall through */ + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) + intsrc.mpc_irqflag = 13; + else + intsrc.mpc_irqflag = 0; + } + + intsrc.mpc_srcbusirq = i; + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + MP_intsrc_info(&intsrc); + } + + intsrc.mpc_irqtype = mp_ExtINT; + intsrc.mpc_srcbusirq = 0; + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ + MP_intsrc_info(&intsrc); +} + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_config_processor processor; + struct mpc_config_bus bus; + struct mpc_config_ioapic ioapic; + struct mpc_config_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.mpc_type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.mpc_cpuflag = CPU_ENABLED; + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | + boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.mpc_apicid = i; + MP_processor_info(&processor); + } + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + switch (mpc_default_type) { + default: + printk(KERN_ERR "???\nUnknown standard configuration %d\n", + mpc_default_type); + /* fall through */ + case 1: + case 5: + memcpy(bus.mpc_bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.mpc_bustype, "EISA ", 6); + break; + case 4: + case 7: + memcpy(bus.mpc_bustype, "MCA ", 6); + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.mpc_busid = 1; + memcpy(bus.mpc_bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.mpc_type = MP_IOAPIC; + ioapic.mpc_apicid = 2; + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.mpc_flags = MPC_APIC_USABLE; + ioapic.mpc_apicaddr = 0xFEC00000; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. + */ + construct_default_ioirq_mptable(mpc_default_type); + + lintsrc.mpc_type = MP_LINTSRC; + lintsrc.mpc_irqflag = 0; /* conforming */ + lintsrc.mpc_srcbusid = 0; + lintsrc.mpc_srcbusirq = 0; + lintsrc.mpc_destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.mpc_irqtype = linttypes[i]; + lintsrc.mpc_destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static struct intel_mp_floating *mpf_found; + +/* + * Scan the memory blocks for an SMP configuration block. + */ +void __init get_smp_config (void) +{ + struct intel_mp_floating *mpf = mpf_found; + + /* + * ACPI may be used to obtain the entire SMP configuration or just to + * enumerate/configure processors (CONFIG_ACPI_BOOT). Note that + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. 
+ */ + if (acpi_lapic && acpi_ioapic) { + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); + return; + } + else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); + + printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); + if (mpf->mpf_feature2 & (1<<7)) { + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + printk(KERN_INFO " Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } + + /* + * Now see if we need to read further. + */ + if (mpf->mpf_feature1 != 0) { + + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); + construct_default_ISA_mptable(mpf->mpf_feature1); + + } else if (mpf->mpf_physptr) { + + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) { + smp_found_config = 0; + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); + return; + } + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_config_bus bus; + + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + memcpy(bus.mpc_bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } + + } else + BUG(); + + printk(KERN_INFO "Processors: %d\n", num_processors); + /* + * Only use the first configuration found. + */ +} + +static int __init smp_scan_config (unsigned long base, unsigned long length) +{ + extern void __bad_mpf_size(void); + unsigned int *bp = phys_to_virt(base); + struct intel_mp_floating *mpf; + + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); + if (sizeof(*mpf) != 16) + __bad_mpf_size(); + + while (length > 0) { + mpf = (struct intel_mp_floating *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->mpf_length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->mpf_specification == 1) + || (mpf->mpf_specification == 4)) ) { + + smp_found_config = 1; + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE); + if (mpf->mpf_physptr) + reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE); + mpf_found = mpf; + return 1; + } + bp += 4; + length -= 16; + } + return 0; +} + +void __init find_intel_smp (void) +{ + unsigned int address; + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... + * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0,0x400) || + smp_scan_config(639*0x400,0x400) || + smp_scan_config(0xF0000,0x10000)) + return; + /* + * If it is an SMP machine we should know now, unless the + * configuration is in an EISA/MCA bus machine with an + * extended bios data area. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. 
+ */ + + address = *(unsigned short *)phys_to_virt(0x40E); + address <<= 4; + if (smp_scan_config(address, 0x1000)) + return; + + /* If we have come this far, we did not find an MP table */ + printk(KERN_INFO "No mptable found.\n"); +} + +/* + * - Intel MP Configuration Table + */ +void __init find_smp_config (void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + find_intel_smp(); +#endif +} + + +/* -------------------------------------------------------------------------- + ACPI-based MP Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI_BOOT + +void __init mp_register_lapic_address ( + u64 address) +{ + mp_lapic_addr = (unsigned long) address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + + if (boot_cpu_id == -1U) + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); + + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); +} + + +void __init mp_register_lapic ( + u8 id, + u8 enabled) +{ + struct mpc_config_processor processor; + int boot_cpu = 0; + + if (id >= MAX_APICS) { + printk(KERN_WARNING "Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + if (id == boot_cpu_physical_apicid) + boot_cpu = 1; + + processor.mpc_type = MP_PROCESSOR; + processor.mpc_apicid = id; + processor.mpc_apicver = 0x10; /* TBD: lapic version */ + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + + MP_processor_info(&processor); +} + +#ifdef CONFIG_X86_IO_APIC + +#define MP_ISA_BUS 0 +#define MP_MAX_IOAPIC_PIN 127 + +static struct mp_ioapic_routing { + int apic_id; + int gsi_start; + int gsi_end; + u32 pin_programmed[4]; +} mp_ioapic_routing[MAX_IO_APICS]; + + +static int mp_find_ioapic ( + int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. */ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_ioapic_routing[i].gsi_start) + && (gsi <= mp_ioapic_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + + return -1; +} + + +void __init mp_register_ioapic ( + u8 id, + u32 address, + u32 gsi_base) +{ + int idx = 0; + + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in MADT table, skipping!\n"); + return; + } + + idx = nr_ioapics++; + + mp_ioapics[idx].mpc_type = MP_IOAPIC; + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; + mp_ioapics[idx].mpc_apicaddr = address; + + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); + + /* + * Build basic IRQ lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI IRQs). 
+ */ + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; + mp_ioapic_routing[idx].gsi_start = gsi_base; + mp_ioapic_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_start, + mp_ioapic_routing[idx].gsi_end); + + return; +} + + +void __init mp_override_legacy_irq ( + u8 bus_irq, + u8 polarity, + u8 trigger, + u32 gsi) +{ + struct mpc_config_intsrc intsrc; + int ioapic = -1; + int pin = -1; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; + pin = gsi - mp_ioapic_routing[ioapic].gsi_start; + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_irqflag = (trigger << 2) | polarity; + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ + intsrc.mpc_dstirq = pin; /* INTIN# */ + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + + return; +} + + +void __init mp_config_acpi_legacy_irqs (void) +{ + struct mpc_config_intsrc intsrc; + int i = 0; + int ioapic = -1; + + /* + * Fabricate the legacy ISA bus (bus #31). + */ + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + + /* + * Locate the IOAPIC that manages the ISA IRQs (0-15). + */ + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + return; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* Conforming */ + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; + + /* + * Use the default configuration for the IRQs 0-15. Unless + * overridden by (MADT) interrupt source override entries. + */ + for (i = 0; i < 16; i++) { + int idx; + + for (idx = 0; idx < mp_irq_entries; idx++) { + struct mpc_config_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? 
*/ + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && + (irq->mpc_dstirq == i)) + break; + } + + if (idx != mp_irq_entries) { + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } + + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_srcbusirq = i; /* Identity mapped */ + intsrc.mpc_dstirq = i; + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, + intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + } + + return; +} + +int mp_register_gsi(u32 gsi, int edge_level, int active_high_low) +{ + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) + return gsi; + +#ifdef CONFIG_ACPI_BUS + /* Don't set up the ACPI SCI because it's already set up */ + if (acpi_fadt.sci_int == gsi) + return gsi; +#endif + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); + return gsi; + } + + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start; + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + bit = ioapic_pin % 32; + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); + if (idx > 3) { + printk(KERN_ERR "Invalid reference to IOAPIC pin " + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + ioapic_pin); + return gsi; + } + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", + mp_ioapic_routing[ioapic].apic_id, ioapic_pin); + return gsi; + } + + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); + + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, + edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1, + active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1); + return gsi; +} + +#endif /*CONFIG_X86_IO_APIC*/ +#endif /*CONFIG_ACPI_BOOT*/ diff --git a/arch/x86_64/kernel/msr.c b/arch/x86_64/kernel/msr.c new file mode 100644 index 00000000000..598953ab015 --- /dev/null +++ b/arch/x86_64/kernel/msr.c @@ -0,0 +1,279 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2000 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * msr.c + * + * x86 MSR access device + * + * This device is accessed by lseek() to the appropriate register number + * and then read/write in chunks of 8 bytes. A larger size means multiple + * reads or writes of the same register. + * + * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on + * an SMP box will direct the access to CPU %d. 
+ */ + +#include <linux/module.h> +#include <linux/config.h> + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/fcntl.h> +#include <linux/init.h> +#include <linux/poll.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/major.h> +#include <linux/fs.h> + +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/uaccess.h> +#include <asm/system.h> + +/* Note: "err" is handled in a funny way below. Otherwise one version + of gcc or another breaks. */ + +static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx) +{ + int err; + + asm volatile ("1: wrmsr\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl %4,%0\n" + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 8\n" " .quad 1b,3b\n" ".previous":"=&bDS" (err) + :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0)); + + return err; +} + +static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx) +{ + int err; + + asm volatile ("1: rdmsr\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl %4,%0\n" + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 8\n" + " .quad 1b,3b\n" + ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx) + :"c"(reg), "i"(-EIO), "0"(0)); + + return err; +} + +#ifdef CONFIG_SMP + +struct msr_command { + int cpu; + int err; + u32 reg; + u32 data[2]; +}; + +static void msr_smp_wrmsr(void *cmd_block) +{ + struct msr_command *cmd = (struct msr_command *)cmd_block; + + if (cmd->cpu == smp_processor_id()) + cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]); +} + +static void msr_smp_rdmsr(void *cmd_block) +{ + struct msr_command *cmd = (struct msr_command *)cmd_block; + + if (cmd->cpu == smp_processor_id()) + cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]); +} + +static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) +{ + struct msr_command cmd; + int ret; + + preempt_disable(); + if (cpu == smp_processor_id()) { + ret = wrmsr_eio(reg, eax, edx); + } else { + cmd.cpu = cpu; + cmd.reg = reg; + cmd.data[0] = eax; + cmd.data[1] = edx; + + smp_call_function(msr_smp_wrmsr, &cmd, 1, 1); + ret = cmd.err; + } + preempt_enable(); + return ret; +} + +static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx) +{ + struct msr_command cmd; + int ret; + + preempt_disable(); + if (cpu == smp_processor_id()) { + ret = rdmsr_eio(reg, eax, edx); + } else { + cmd.cpu = cpu; + cmd.reg = reg; + + smp_call_function(msr_smp_rdmsr, &cmd, 1, 1); + + *eax = cmd.data[0]; + *edx = cmd.data[1]; + + ret = cmd.err; + } + preempt_enable(); + return ret; +} + +#else /* ! CONFIG_SMP */ + +static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) +{ + return wrmsr_eio(reg, eax, edx); +} + +static inline int do_rdmsr(int cpu, u32 reg, u32 *eax, u32 *edx) +{ + return rdmsr_eio(reg, eax, edx); +} + +#endif /* ! 
CONFIG_SMP */ + +static loff_t msr_seek(struct file *file, loff_t offset, int orig) +{ + loff_t ret = -EINVAL; + + lock_kernel(); + switch (orig) { + case 0: + file->f_pos = offset; + ret = file->f_pos; + break; + case 1: + file->f_pos += offset; + ret = file->f_pos; + } + unlock_kernel(); + return ret; +} + +static ssize_t msr_read(struct file *file, char __user * buf, + size_t count, loff_t * ppos) +{ + u32 __user *tmp = (u32 __user *) buf; + u32 data[2]; + size_t rv; + u32 reg = *ppos; + int cpu = iminor(file->f_dentry->d_inode); + int err; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (rv = 0; count; count -= 8) { + err = do_rdmsr(cpu, reg, &data[0], &data[1]); + if (err) + return err; + if (copy_to_user(tmp, &data, 8)) + return -EFAULT; + tmp += 2; + } + + return ((char __user *)tmp) - buf; +} + +static ssize_t msr_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + const u32 __user *tmp = (const u32 __user *)buf; + u32 data[2]; + size_t rv; + u32 reg = *ppos; + int cpu = iminor(file->f_dentry->d_inode); + int err; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (rv = 0; count; count -= 8) { + if (copy_from_user(&data, tmp, 8)) + return -EFAULT; + err = do_wrmsr(cpu, reg, data[0], data[1]); + if (err) + return err; + tmp += 2; + } + + return ((char __user *)tmp) - buf; +} + +static int msr_open(struct inode *inode, struct file *file) +{ + unsigned int cpu = iminor(file->f_dentry->d_inode); + struct cpuinfo_x86 *c = &(cpu_data)[cpu]; + + if (cpu >= NR_CPUS || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + if (!cpu_has(c, X86_FEATURE_MSR)) + return -EIO; /* MSR not supported */ + + return 0; +} + +/* + * File operations we support + */ +static struct file_operations msr_fops = { + .owner = THIS_MODULE, + .llseek = msr_seek, + .read = msr_read, + .write = msr_write, + .open = msr_open, +}; + +static int __init msr_init(void) +{ + if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { + printk(KERN_ERR "msr: unable to get major %d for msr\n", + MSR_MAJOR); + return -EBUSY; + } + + return 0; +} + +static void __exit msr_exit(void) +{ + unregister_chrdev(MSR_MAJOR, "cpu/msr"); +} + +module_init(msr_init); +module_exit(msr_exit) + +MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); +MODULE_DESCRIPTION("x86 generic MSR driver"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c new file mode 100644 index 00000000000..d9867de6a62 --- /dev/null +++ b/arch/x86_64/kernel/nmi.c @@ -0,0 +1,488 @@ +/* + * linux/arch/x86_64/nmi.c + * + * NMI watchdog support on APIC systems + * + * Started by Ingo Molnar <mingo@redhat.com> + * + * Fixes: + * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. + * Mikael Pettersson : Power Management for local APIC NMI watchdog. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. Disable/enable API. 
+ */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/irq.h> +#include <linux/delay.h> +#include <linux/bootmem.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/mc146818rtc.h> +#include <linux/kernel_stat.h> +#include <linux/module.h> +#include <linux/sysdev.h> +#include <linux/nmi.h> +#include <linux/sysctl.h> + +#include <asm/smp.h> +#include <asm/mtrr.h> +#include <asm/mpspec.h> +#include <asm/nmi.h> +#include <asm/msr.h> +#include <asm/proto.h> +#include <asm/kdebug.h> + +/* + * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: + * - it may be reserved by some other driver, or not + * - when not reserved by some other driver, it may be used for + * the NMI watchdog, or not + * + * This is maintained separately from nmi_active because the NMI + * watchdog may also be driven from the I/O APIC timer. + */ +static DEFINE_SPINLOCK(lapic_nmi_owner_lock); +static unsigned int lapic_nmi_owner; +#define LAPIC_NMI_WATCHDOG (1<<0) +#define LAPIC_NMI_RESERVED (1<<1) + +/* nmi_active: + * +1: the lapic NMI watchdog is active, but can be disabled + * 0: the lapic NMI watchdog has not been set up, and cannot + * be enabled + * -1: the lapic NMI watchdog is disabled, but can be enabled + */ +int nmi_active; /* oprofile uses this */ +int panic_on_timeout; + +unsigned int nmi_watchdog = NMI_DEFAULT; +static unsigned int nmi_hz = HZ; +unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ + +/* Note that these events don't tick when the CPU idles. This means + the frequency varies with CPU load. */ + +#define K7_EVNTSEL_ENABLE (1 << 22) +#define K7_EVNTSEL_INT (1 << 20) +#define K7_EVNTSEL_OS (1 << 17) +#define K7_EVNTSEL_USR (1 << 16) +#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 +#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING + +#define P6_EVNTSEL0_ENABLE (1 << 22) +#define P6_EVNTSEL_INT (1 << 20) +#define P6_EVNTSEL_OS (1 << 17) +#define P6_EVNTSEL_USR (1 << 16) +#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 +#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED + +/* Run after command line and cpu_init init, but before all other checks */ +void __init nmi_watchdog_default(void) +{ + if (nmi_watchdog != NMI_DEFAULT) + return; + + /* For some reason the IO APIC watchdog doesn't work on the AMD + 8111 chipset. For now switch to local APIC mode using + perfctr0 there. On Intel CPUs we don't have code to handle + the perfctr and the IO-APIC seems to work, so use that. */ + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + nmi_watchdog = NMI_LOCAL_APIC; + printk(KERN_INFO + "Using local APIC NMI watchdog using perfctr0\n"); + } else { + printk(KERN_INFO "Using IO APIC NMI watchdog\n"); + nmi_watchdog = NMI_IO_APIC; + } +} + +/* Why is there no CPUID flag for this? */ +static __init int cpu_has_lapic(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + case X86_VENDOR_AMD: + return boot_cpu_data.x86 >= 6; + /* .... add more cpus here or find a different way to figure this out. */ + default: + return 0; + } +} + +int __init check_nmi_watchdog (void) +{ + int counts[NR_CPUS]; + int cpu; + + if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) { + nmi_watchdog = NMI_NONE; + return -1; + } + + printk(KERN_INFO "testing NMI watchdog ... 
"); + + for (cpu = 0; cpu < NR_CPUS; cpu++) + counts[cpu] = cpu_pda[cpu].__nmi_count; + local_irq_enable(); + mdelay((10*1000)/nmi_hz); // wait 10 ticks + + for (cpu = 0; cpu < NR_CPUS; cpu++) { +#ifdef CONFIG_SMP + /* Check cpu_callin_map here because that is set + after the timer is started. */ + if (!cpu_isset(cpu, cpu_callin_map)) + continue; +#endif + if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { + printk("CPU#%d: NMI appears to be stuck (%d)!\n", + cpu, + cpu_pda[cpu].__nmi_count); + nmi_active = 0; + lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; + return -1; + } + } + printk("OK.\n"); + + /* now that we know it works we can reduce NMI frequency to + something more reasonable; makes a difference in some configs */ + if (nmi_watchdog == NMI_LOCAL_APIC) + nmi_hz = 1; + + return 0; +} + +int __init setup_nmi_watchdog(char *str) +{ + int nmi; + + if (!strncmp(str,"panic",5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + + get_option(&str, &nmi); + + if (nmi >= NMI_INVALID) + return 0; + nmi_watchdog = nmi; + return 1; +} + +__setup("nmi_watchdog=", setup_nmi_watchdog); + +static void disable_lapic_nmi_watchdog(void) +{ + if (nmi_active <= 0) + return; + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + wrmsr(MSR_K7_EVNTSEL0, 0, 0); + break; + case X86_VENDOR_INTEL: + wrmsr(MSR_IA32_EVNTSEL0, 0, 0); + break; + } + nmi_active = -1; + /* tell do_nmi() and others that we're not active any more */ + nmi_watchdog = 0; +} + +static void enable_lapic_nmi_watchdog(void) +{ + if (nmi_active < 0) { + nmi_watchdog = NMI_LOCAL_APIC; + setup_apic_nmi_watchdog(); + } +} + +int reserve_lapic_nmi(void) +{ + unsigned int old_owner; + + spin_lock(&lapic_nmi_owner_lock); + old_owner = lapic_nmi_owner; + lapic_nmi_owner |= LAPIC_NMI_RESERVED; + spin_unlock(&lapic_nmi_owner_lock); + if (old_owner & LAPIC_NMI_RESERVED) + return -EBUSY; + if (old_owner & LAPIC_NMI_WATCHDOG) + disable_lapic_nmi_watchdog(); + return 0; +} + +void release_lapic_nmi(void) +{ + unsigned int new_owner; + + spin_lock(&lapic_nmi_owner_lock); + new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; + lapic_nmi_owner = new_owner; + spin_unlock(&lapic_nmi_owner_lock); + if (new_owner & LAPIC_NMI_WATCHDOG) + enable_lapic_nmi_watchdog(); +} + +void disable_timer_nmi_watchdog(void) +{ + if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0)) + return; + + disable_irq(0); + unset_nmi_callback(); + nmi_active = -1; + nmi_watchdog = NMI_NONE; +} + +void enable_timer_nmi_watchdog(void) +{ + if (nmi_active < 0) { + nmi_watchdog = NMI_IO_APIC; + touch_nmi_watchdog(); + nmi_active = 1; + enable_irq(0); + } +} + +#ifdef CONFIG_PM + +static int nmi_pm_active; /* nmi_active before suspend */ + +static int lapic_nmi_suspend(struct sys_device *dev, u32 state) +{ + nmi_pm_active = nmi_active; + disable_lapic_nmi_watchdog(); + return 0; +} + +static int lapic_nmi_resume(struct sys_device *dev) +{ + if (nmi_pm_active > 0) + enable_lapic_nmi_watchdog(); + return 0; +} + +static struct sysdev_class nmi_sysclass = { + set_kset_name("lapic_nmi"), + .resume = lapic_nmi_resume, + .suspend = lapic_nmi_suspend, +}; + +static struct sys_device device_lapic_nmi = { + .id = 0, + .cls = &nmi_sysclass, +}; + +static int __init init_lapic_nmi_sysfs(void) +{ + int error; + + if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC) + return 0; + + error = sysdev_class_register(&nmi_sysclass); + if (!error) + error = sysdev_register(&device_lapic_nmi); + return error; +} +/* must come after the local APIC's device_initcall() 
*/ +late_initcall(init_lapic_nmi_sysfs); + +#endif /* CONFIG_PM */ + +/* + * Activate the NMI watchdog via the local APIC. + * Original code written by Keith Owens. + */ + +static void setup_k7_watchdog(void) +{ + int i; + unsigned int evntsel; + + /* No check, so can start with slow frequency */ + nmi_hz = 1; + + /* XXX should check these in EFER */ + + nmi_perfctr_msr = MSR_K7_PERFCTR0; + + for(i = 0; i < 4; ++i) { + /* Simulator may not support it */ + if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) + return; + wrmsrl(MSR_K7_PERFCTR0+i, 0UL); + } + + evntsel = K7_EVNTSEL_INT + | K7_EVNTSEL_OS + | K7_EVNTSEL_USR + | K7_NMI_EVENT; + + wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); + wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= K7_EVNTSEL_ENABLE; + wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); +} + +void setup_apic_nmi_watchdog(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 < 6) + return; + if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) + return; + setup_k7_watchdog(); + break; + default: + return; + } + lapic_nmi_owner = LAPIC_NMI_WATCHDOG; + nmi_active = 1; +} + +/* + * the best way to detect whether a CPU has a 'hard lockup' problem + * is to check it's local APIC timer IRQ counts. If they are not + * changing then that CPU has some problem. + * + * as these watchdog NMI IRQs are generated on every CPU, we only + * have to check the current processor. + * + * since NMIs don't listen to _any_ locks, we have to be extremely + * careful not to rely on unsafe variables. The printk might lock + * up though, so we have to break up any console locks first ... + * [when there will be more tty-related locks, break them up + * here too!] + */ + +static unsigned int + last_irq_sums [NR_CPUS], + alert_counter [NR_CPUS]; + +void touch_nmi_watchdog (void) +{ + int i; + + /* + * Just reset the alert counters, (other CPUs might be + * spinning on locks we hold): + */ + for (i = 0; i < NR_CPUS; i++) + alert_counter[i] = 0; +} + +void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) +{ + int sum, cpu; + + cpu = safe_smp_processor_id(); + sum = read_pda(apic_timer_irqs); + if (last_irq_sums[cpu] == sum) { + /* + * Ayiee, looks like this CPU is stuck ... + * wait a few IRQs (5 seconds) before doing the oops ... 
+ */ + alert_counter[cpu]++; + if (alert_counter[cpu] == 5*nmi_hz) { + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) { + alert_counter[cpu] = 0; + return; + } + die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); + } + } else { + last_irq_sums[cpu] = sum; + alert_counter[cpu] = 0; + } + if (nmi_perfctr_msr) + wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); +} + +static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +{ + return 0; +} + +static nmi_callback_t nmi_callback = dummy_nmi_callback; + +asmlinkage void do_nmi(struct pt_regs * regs, long error_code) +{ + int cpu = safe_smp_processor_id(); + + nmi_enter(); + add_pda(__nmi_count,1); + if (!nmi_callback(regs, cpu)) + default_do_nmi(regs); + nmi_exit(); +} + +void set_nmi_callback(nmi_callback_t callback) +{ + nmi_callback = callback; +} + +void unset_nmi_callback(void) +{ + nmi_callback = dummy_nmi_callback; +} + +#ifdef CONFIG_SYSCTL + +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) +{ + unsigned char reason = get_nmi_reason(); + char buf[64]; + + if (!(reason & 0xc0)) { + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(buf,regs); + } + return 0; +} + +/* + * proc handler for /proc/sys/kernel/unknown_nmi_panic + */ +int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int old_state; + + old_state = unknown_nmi_panic; + proc_dointvec(table, write, file, buffer, length, ppos); + if (!!old_state == !!unknown_nmi_panic) + return 0; + + if (unknown_nmi_panic) { + if (reserve_lapic_nmi() < 0) { + unknown_nmi_panic = 0; + return -EBUSY; + } else { + set_nmi_callback(unknown_nmi_panic_callback); + } + } else { + release_lapic_nmi(); + unset_nmi_callback(); + } + return 0; +} + +#endif + +EXPORT_SYMBOL(nmi_active); +EXPORT_SYMBOL(nmi_watchdog); +EXPORT_SYMBOL(reserve_lapic_nmi); +EXPORT_SYMBOL(release_lapic_nmi); +EXPORT_SYMBOL(disable_timer_nmi_watchdog); +EXPORT_SYMBOL(enable_timer_nmi_watchdog); +EXPORT_SYMBOL(touch_nmi_watchdog); diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c new file mode 100644 index 00000000000..cab471cf3ed --- /dev/null +++ b/arch/x86_64/kernel/pci-dma.c @@ -0,0 +1,60 @@ +/* + * Dynamic DMA mapping support. + */ + +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/pci.h> +#include <linux/module.h> +#include <asm/io.h> + +/* Map a set of buffers described by scatterlist in streaming + * mode for DMA. This is the scatter-gather version of the + * above pci_map_single interface. Here the scatter gather list + * elements are each tagged with the appropriate dma address + * and length. They are obtained via sg_dma_{address,length}(SG). + * + * NOTE: An implementation may be able to use a smaller number of + * DMA address/length pairs than there are SG table elements. + * (for example via virtual mapping capabilities) + * The routine returns the number of addr/length pairs actually + * used, at most nents. + * + * Device ownership issues as mentioned above for pci_map_single are + * the same here. 
+ */ +int dma_map_sg(struct device *hwdev, struct scatterlist *sg, + int nents, int direction) +{ + int i; + + BUG_ON(direction == DMA_NONE); + for (i = 0; i < nents; i++ ) { + struct scatterlist *s = &sg[i]; + BUG_ON(!s->page); + s->dma_address = virt_to_bus(page_address(s->page) +s->offset); + s->dma_length = s->length; + } + return nents; +} + +EXPORT_SYMBOL(dma_map_sg); + +/* Unmap a set of streaming mode DMA translations. + * Again, cpu read rules concerning calls here are the same as for + * pci_unmap_single() above. + */ +void dma_unmap_sg(struct device *dev, struct scatterlist *sg, + int nents, int dir) +{ + int i; + for (i = 0; i < nents; i++) { + struct scatterlist *s = &sg[i]; + BUG_ON(s->page == NULL); + BUG_ON(s->dma_address == 0); + dma_unmap_single(dev, s->dma_address, s->dma_length, dir); + } +} + +EXPORT_SYMBOL(dma_unmap_sg); diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c new file mode 100644 index 00000000000..57f35c68aa3 --- /dev/null +++ b/arch/x86_64/kernel/pci-gart.c @@ -0,0 +1,980 @@ +/* + * Dynamic DMA mapping support for AMD Hammer. + * + * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. + * This allows to use PCI devices that only support 32bit addresses on systems + * with more than 4GB. + * + * See Documentation/DMA-mapping.txt for the interface specification. + * + * Copyright 2002 Andi Kleen, SuSE Labs. + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/ctype.h> +#include <linux/agp_backend.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/spinlock.h> +#include <linux/pci.h> +#include <linux/module.h> +#include <linux/topology.h> +#include <linux/interrupt.h> +#include <linux/bitops.h> +#include <asm/atomic.h> +#include <asm/io.h> +#include <asm/mtrr.h> +#include <asm/pgtable.h> +#include <asm/proto.h> +#include <asm/cacheflush.h> +#include <asm/kdebug.h> + +dma_addr_t bad_dma_address; + +unsigned long iommu_bus_base; /* GART remapping area (physical) */ +static unsigned long iommu_size; /* size of remapping area bytes */ +static unsigned long iommu_pages; /* .. and in pages */ + +u32 *iommu_gatt_base; /* Remapping table */ + +int no_iommu; +static int no_agp; +#ifdef CONFIG_IOMMU_DEBUG +int panic_on_overflow = 1; +int force_iommu = 1; +#else +int panic_on_overflow = 0; +int force_iommu = 0; +#endif +int iommu_merge = 1; +int iommu_sac_force = 0; + +/* If this is disabled the IOMMU will use an optimized flushing strategy + of only flushing when an mapping is reused. With it true the GART is flushed + for every mapping. Problem is that doing the lazy flush seems to trigger + bugs with some popular PCI cards, in particular 3ware (but has been also + also seen with Qlogic at least). */ +int iommu_fullflush = 1; + +/* This tells the BIO block layer to assume merging. Default to off + because we cannot guarantee merging later. 
*/ +int iommu_bio_merge = 0; + +#define MAX_NB 8 + +/* Allocation bitmap for the remapping area */ +static DEFINE_SPINLOCK(iommu_bitmap_lock); +static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ + +static u32 gart_unmapped_entry; + +#define GPTE_VALID 1 +#define GPTE_COHERENT 2 +#define GPTE_ENCODE(x) \ + (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) +#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) + +#define to_pages(addr,size) \ + (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) + +#define for_all_nb(dev) \ + dev = NULL; \ + while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL)\ + if (dev->bus->number == 0 && \ + (PCI_SLOT(dev->devfn) >= 24) && (PCI_SLOT(dev->devfn) <= 31)) + +static struct pci_dev *northbridges[MAX_NB]; +static u32 northbridge_flush_word[MAX_NB]; + +#define EMERGENCY_PAGES 32 /* = 128KB */ + +#ifdef CONFIG_AGP +#define AGPEXTERN extern +#else +#define AGPEXTERN +#endif + +/* backdoor interface to AGP driver */ +AGPEXTERN int agp_memory_reserved; +AGPEXTERN __u32 *agp_gatt_table; + +static unsigned long next_bit; /* protected by iommu_bitmap_lock */ +static int need_flush; /* global flush state. set for each gart wrap */ +static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem, + size_t size, int dir, int do_panic); + +/* Dummy device used for NULL arguments (normally ISA). Better would + be probably a smaller DMA mask, but this is bug-to-bug compatible to i386. */ +static struct device fallback_dev = { + .bus_id = "fallback device", + .coherent_dma_mask = 0xffffffff, + .dma_mask = &fallback_dev.coherent_dma_mask, +}; + +static unsigned long alloc_iommu(int size) +{ + unsigned long offset, flags; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); + if (offset == -1) { + need_flush = 1; + offset = find_next_zero_string(iommu_gart_bitmap,0,next_bit,size); + } + if (offset != -1) { + set_bit_string(iommu_gart_bitmap, offset, size); + next_bit = offset+size; + if (next_bit >= iommu_pages) { + next_bit = 0; + need_flush = 1; + } + } + if (iommu_fullflush) + need_flush = 1; + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + return offset; +} + +static void free_iommu(unsigned long offset, int size) +{ + unsigned long flags; + if (size == 1) { + clear_bit(offset, iommu_gart_bitmap); + return; + } + spin_lock_irqsave(&iommu_bitmap_lock, flags); + __clear_bit_string(iommu_gart_bitmap, offset, size); + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +/* + * Use global flush state to avoid races with multiple flushers. + */ +static void flush_gart(struct device *dev) +{ + unsigned long flags; + int flushed = 0; + int i, max; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + if (need_flush) { + max = 0; + for (i = 0; i < MAX_NB; i++) { + if (!northbridges[i]) + continue; + pci_write_config_dword(northbridges[i], 0x9c, + northbridge_flush_word[i] | 1); + flushed++; + max = i; + } + for (i = 0; i <= max; i++) { + u32 w; + if (!northbridges[i]) + continue; + /* Make sure the hardware actually executed the flush. 
*/ + do { + pci_read_config_dword(northbridges[i], 0x9c, &w); + } while (w & 1); + } + if (!flushed) + printk("nothing to flush?\n"); + need_flush = 0; + } + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +/* Allocate DMA memory on node near device */ +noinline +static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order) +{ + struct page *page; + int node; + if (dev->bus == &pci_bus_type) { + cpumask_t mask; + mask = pcibus_to_cpumask(to_pci_dev(dev)->bus); + node = cpu_to_node(first_cpu(mask)); + } else + node = numa_node_id(); + page = alloc_pages_node(node, gfp, order); + return page ? page_address(page) : NULL; +} + +/* + * Allocate memory for a coherent mapping. + */ +void * +dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, + unsigned gfp) +{ + void *memory; + unsigned long dma_mask = 0; + u64 bus; + + if (!dev) + dev = &fallback_dev; + dma_mask = dev->coherent_dma_mask; + if (dma_mask == 0) + dma_mask = 0xffffffff; + + /* Kludge to make it bug-to-bug compatible with i386. i386 + uses the normal dma_mask for alloc_coherent. */ + dma_mask &= *dev->dma_mask; + + again: + memory = dma_alloc_pages(dev, gfp, get_order(size)); + if (memory == NULL) + return NULL; + + { + int high, mmu; + bus = virt_to_bus(memory); + high = (bus + size) >= dma_mask; + mmu = high; + if (force_iommu && !(gfp & GFP_DMA)) + mmu = 1; + if (no_iommu || dma_mask < 0xffffffffUL) { + if (high) { + free_pages((unsigned long)memory, + get_order(size)); + + if (swiotlb) { + return + swiotlb_alloc_coherent(dev, size, + dma_handle, + gfp); + } + + if (!(gfp & GFP_DMA)) { + gfp |= GFP_DMA; + goto again; + } + return NULL; + } + mmu = 0; + } + memset(memory, 0, size); + if (!mmu) { + *dma_handle = virt_to_bus(memory); + return memory; + } + } + + *dma_handle = dma_map_area(dev, bus, size, PCI_DMA_BIDIRECTIONAL, 0); + if (*dma_handle == bad_dma_address) + goto error; + flush_gart(dev); + return memory; + +error: + if (panic_on_overflow) + panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", size); + free_pages((unsigned long)memory, get_order(size)); + return NULL; +} + +/* + * Unmap coherent memory. + * The caller must ensure that the device has finished accessing the mapping. + */ +void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t bus) +{ + if (swiotlb) { + swiotlb_free_coherent(dev, size, vaddr, bus); + return; + } + + dma_unmap_single(dev, bus, size, 0); + free_pages((unsigned long)vaddr, get_order(size)); +} + +#ifdef CONFIG_IOMMU_LEAK + +#define SET_LEAK(x) if (iommu_leak_tab) \ + iommu_leak_tab[x] = __builtin_return_address(0); +#define CLEAR_LEAK(x) if (iommu_leak_tab) \ + iommu_leak_tab[x] = NULL; + +/* Debugging aid for drivers that don't free their IOMMU tables */ +static void **iommu_leak_tab; +static int leak_trace; +int iommu_leak_pages = 20; +void dump_leak(void) +{ + int i; + static int dump; + if (dump || !iommu_leak_tab) return; + dump = 1; + show_stack(NULL,NULL); + /* Very crude. dump some from the end of the table too */ + printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); + for (i = 0; i < iommu_leak_pages; i+=2) { + printk("%lu: ", iommu_pages-i); + printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); + printk("%c", (i+1)%2 == 0 ? '\n' : ' '); + } + printk("\n"); +} +#else +#define SET_LEAK(x) +#define CLEAR_LEAK(x) +#endif + +static void iommu_full(struct device *dev, size_t size, int dir, int do_panic) +{ + /* + * Ran out of IOMMU space for this operation. This is very bad. 
+ * Unfortunately the drivers cannot handle this operation properly. + * Return some non mapped prereserved space in the aperture and + * let the Northbridge deal with it. This will result in garbage + * in the IO operation. When the size exceeds the prereserved space + * memory corruption will occur or random memory will be DMAed + * out. Hopefully no network devices use single mappings that big. + */ + + printk(KERN_ERR + "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", + size, dev->bus_id); + + if (size > PAGE_SIZE*EMERGENCY_PAGES && do_panic) { + if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic("PCI-DMA: Memory would be corrupted\n"); + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic("PCI-DMA: Random memory would be DMAed\n"); + } + +#ifdef CONFIG_IOMMU_LEAK + dump_leak(); +#endif +} + +static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) +{ + u64 mask = *dev->dma_mask; + int high = addr + size >= mask; + int mmu = high; + if (force_iommu) + mmu = 1; + if (no_iommu) { + if (high) + panic("PCI-DMA: high address but no IOMMU.\n"); + mmu = 0; + } + return mmu; +} + +static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) +{ + u64 mask = *dev->dma_mask; + int high = addr + size >= mask; + int mmu = high; + if (no_iommu) { + if (high) + panic("PCI-DMA: high address but no IOMMU.\n"); + mmu = 0; + } + return mmu; +} + +/* Map a single continuous physical area into the IOMMU. + * Caller needs to check if the iommu is needed and flush. + */ +static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem, + size_t size, int dir, int do_panic) +{ + unsigned long npages = to_pages(phys_mem, size); + unsigned long iommu_page = alloc_iommu(npages); + int i; + if (iommu_page == -1) { + if (!nonforced_iommu(dev, phys_mem, size)) + return phys_mem; + if (panic_on_overflow) + panic("dma_map_area overflow %lu bytes\n", size); + iommu_full(dev, size, dir, do_panic); + return bad_dma_address; + } + + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); + SET_LEAK(iommu_page + i); + phys_mem += PAGE_SIZE; + } + return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); +} + +/* Map a single area into the IOMMU */ +dma_addr_t dma_map_single(struct device *dev, void *addr, size_t size, int dir) +{ + unsigned long phys_mem, bus; + + BUG_ON(dir == DMA_NONE); + + if (swiotlb) + return swiotlb_map_single(dev,addr,size,dir); + if (!dev) + dev = &fallback_dev; + + phys_mem = virt_to_phys(addr); + if (!need_iommu(dev, phys_mem, size)) + return phys_mem; + + bus = dma_map_area(dev, phys_mem, size, dir, 1); + flush_gart(dev); + return bus; +} + +/* Fallback for dma_map_sg in case of overflow */ +static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, + int nents, int dir) +{ + int i; + +#ifdef CONFIG_IOMMU_DEBUG + printk(KERN_DEBUG "dma_map_sg overflow\n"); +#endif + + for (i = 0; i < nents; i++ ) { + struct scatterlist *s = &sg[i]; + unsigned long addr = page_to_phys(s->page) + s->offset; + if (nonforced_iommu(dev, addr, s->length)) { + addr = dma_map_area(dev, addr, s->length, dir, 0); + if (addr == bad_dma_address) { + if (i > 0) + dma_unmap_sg(dev, sg, i, dir); + nents = 0; + sg[0].dma_length = 0; + break; + } + } + s->dma_address = addr; + s->dma_length = s->length; + } + flush_gart(dev); + return nents; +} + +/* Map multiple scatterlist entries continuous into the first. 
*/ +static int __dma_map_cont(struct scatterlist *sg, int start, int stopat, + struct scatterlist *sout, unsigned long pages) +{ + unsigned long iommu_start = alloc_iommu(pages); + unsigned long iommu_page = iommu_start; + int i; + + if (iommu_start == -1) + return -1; + + for (i = start; i < stopat; i++) { + struct scatterlist *s = &sg[i]; + unsigned long pages, addr; + unsigned long phys_addr = s->dma_address; + + BUG_ON(i > start && s->offset); + if (i == start) { + *sout = *s; + sout->dma_address = iommu_bus_base; + sout->dma_address += iommu_page*PAGE_SIZE + s->offset; + sout->dma_length = s->length; + } else { + sout->dma_length += s->length; + } + + addr = phys_addr; + pages = to_pages(s->offset, s->length); + while (pages--) { + iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); + SET_LEAK(iommu_page); + addr += PAGE_SIZE; + iommu_page++; + } + } + BUG_ON(iommu_page - iommu_start != pages); + return 0; +} + +static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat, + struct scatterlist *sout, + unsigned long pages, int need) +{ + if (!need) { + BUG_ON(stopat - start != 1); + *sout = sg[start]; + sout->dma_length = sg[start].length; + return 0; + } + return __dma_map_cont(sg, start, stopat, sout, pages); +} + +/* + * DMA map all entries in a scatterlist. + * Merge chunks that have page aligned sizes into a continuous mapping. + */ +int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +{ + int i; + int out; + int start; + unsigned long pages = 0; + int need = 0, nextneed; + + BUG_ON(dir == DMA_NONE); + if (nents == 0) + return 0; + + if (swiotlb) + return swiotlb_map_sg(dev,sg,nents,dir); + if (!dev) + dev = &fallback_dev; + + out = 0; + start = 0; + for (i = 0; i < nents; i++) { + struct scatterlist *s = &sg[i]; + dma_addr_t addr = page_to_phys(s->page) + s->offset; + s->dma_address = addr; + BUG_ON(s->length == 0); + + nextneed = need_iommu(dev, addr, s->length); + + /* Handle the previous not yet processed entries */ + if (i > start) { + struct scatterlist *ps = &sg[i-1]; + /* Can only merge when the last chunk ends on a page + boundary and the new one doesn't have an offset. */ + if (!iommu_merge || !nextneed || !need || s->offset || + (ps->offset + ps->length) % PAGE_SIZE) { + if (dma_map_cont(sg, start, i, sg+out, pages, + need) < 0) + goto error; + out++; + pages = 0; + start = i; + } + } + + need = nextneed; + pages += to_pages(s->offset, s->length); + } + if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0) + goto error; + out++; + flush_gart(dev); + if (out < nents) + sg[out].dma_length = 0; + return out; + +error: + flush_gart(NULL); + dma_unmap_sg(dev, sg, nents, dir); + /* When it was forced try again unforced */ + if (force_iommu) + return dma_map_sg_nonforce(dev, sg, nents, dir); + if (panic_on_overflow) + panic("dma_map_sg: overflow on %lu pages\n", pages); + iommu_full(dev, pages << PAGE_SHIFT, dir, 0); + for (i = 0; i < nents; i++) + sg[i].dma_address = bad_dma_address; + return 0; +} + +/* + * Free a DMA mapping. 
+ */ +void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction) +{ + unsigned long iommu_page; + int npages; + int i; + + if (swiotlb) { + swiotlb_unmap_single(dev,dma_addr,size,direction); + return; + } + + if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || + dma_addr >= iommu_bus_base + iommu_size) + return; + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; + npages = to_pages(dma_addr, size); + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; + CLEAR_LEAK(iommu_page + i); + } + free_iommu(iommu_page, npages); +} + +/* + * Wrapper for pci_unmap_single working with scatterlists. + */ +void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +{ + int i; + if (swiotlb) { + swiotlb_unmap_sg(dev,sg,nents,dir); + return; + } + for (i = 0; i < nents; i++) { + struct scatterlist *s = &sg[i]; + if (!s->dma_length || !s->length) + break; + dma_unmap_single(dev, s->dma_address, s->dma_length, dir); + } +} + +int dma_supported(struct device *dev, u64 mask) +{ + /* Copied from i386. Doesn't make much sense, because it will + only work for pci_alloc_coherent. + The caller just has to use GFP_DMA in this case. */ + if (mask < 0x00ffffff) + return 0; + + /* Tell the device to use SAC when IOMMU force is on. + This allows the driver to use cheaper accesses in some cases. + + Problem with this is that if we overflow the IOMMU area + and return DAC as fallback address the device may not handle it correctly. + + As a special case some controllers have a 39bit address mode + that is as efficient as 32bit (aic79xx). Don't force SAC for these. + Assume all masks <= 40 bits are of this type. Normally this doesn't + make any difference, but gives more gentle handling of IOMMU overflow. */ + if (iommu_sac_force && (mask >= 0xffffffffffULL)) { + printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask); + return 0; + } + + return 1; +} + +int dma_get_cache_alignment(void) +{ + return boot_cpu_data.x86_clflush_size; +} + +EXPORT_SYMBOL(dma_unmap_sg); +EXPORT_SYMBOL(dma_map_sg); +EXPORT_SYMBOL(dma_map_single); +EXPORT_SYMBOL(dma_unmap_single); +EXPORT_SYMBOL(dma_supported); +EXPORT_SYMBOL(no_iommu); +EXPORT_SYMBOL(force_iommu); +EXPORT_SYMBOL(bad_dma_address); +EXPORT_SYMBOL(iommu_bio_merge); +EXPORT_SYMBOL(iommu_sac_force); +EXPORT_SYMBOL(dma_get_cache_alignment); +EXPORT_SYMBOL(dma_alloc_coherent); +EXPORT_SYMBOL(dma_free_coherent); + +static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) +{ + unsigned long a; + if (!iommu_size) { + iommu_size = aper_size; + if (!no_agp) + iommu_size /= 2; + } + + a = aper + iommu_size; + iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; + + if (iommu_size < 64*1024*1024) + printk(KERN_WARNING + "PCI-DMA: Warning: Small IOMMU %luMB. 
Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); + + return iommu_size; +} + +static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) +{ + unsigned aper_size = 0, aper_base_32; + u64 aper_base; + unsigned aper_order; + + pci_read_config_dword(dev, 0x94, &aper_base_32); + pci_read_config_dword(dev, 0x90, &aper_order); + aper_order = (aper_order >> 1) & 7; + + aper_base = aper_base_32 & 0x7fff; + aper_base <<= 25; + + aper_size = (32 * 1024 * 1024) << aper_order; + if (aper_base + aper_size >= 0xffffffff || !aper_size) + aper_base = 0; + + *size = aper_size; + return aper_base; +} + +/* + * Private Northbridge GATT initialization in case we cannot use the + * AGP driver for some reason. + */ +static __init int init_k8_gatt(struct agp_kern_info *info) +{ + struct pci_dev *dev; + void *gatt; + unsigned aper_base, new_aper_base; + unsigned aper_size, gatt_size, new_aper_size; + + printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); + aper_size = aper_base = info->aper_size = 0; + for_all_nb(dev) { + new_aper_base = read_aperture(dev, &new_aper_size); + if (!new_aper_base) + goto nommu; + + if (!aper_base) { + aper_size = new_aper_size; + aper_base = new_aper_base; + } + if (aper_size != new_aper_size || aper_base != new_aper_base) + goto nommu; + } + if (!aper_base) + goto nommu; + info->aper_base = aper_base; + info->aper_size = aper_size>>20; + + gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); + gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); + if (!gatt) + panic("Cannot allocate GATT table"); + memset(gatt, 0, gatt_size); + agp_gatt_table = gatt; + + for_all_nb(dev) { + u32 ctl; + u32 gatt_reg; + + gatt_reg = __pa(gatt) >> 12; + gatt_reg <<= 4; + pci_write_config_dword(dev, 0x98, gatt_reg); + pci_read_config_dword(dev, 0x90, &ctl); + + ctl |= 1; + ctl &= ~((1<<4) | (1<<5)); + + pci_write_config_dword(dev, 0x90, ctl); + } + flush_gart(NULL); + + printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); + return 0; + + nommu: + /* Should not happen anymore */ + printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" + KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction."); + return -1; +} + +extern int agp_amd64_init(void); + +static int __init pci_iommu_init(void) +{ + struct agp_kern_info info; + unsigned long aper_size; + unsigned long iommu_start; + struct pci_dev *dev; + unsigned long scratch; + long i; + +#ifndef CONFIG_AGP_AMD64 + no_agp = 1; +#else + /* Makefile puts PCI initialization via subsys_initcall first. 
*/ + /* Add other K8 AGP bridge drivers here */ + no_agp = no_agp || + (agp_amd64_init() < 0) || + (agp_copy_info(agp_bridge, &info) < 0); +#endif + + if (swiotlb) { + no_iommu = 1; + printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); + return -1; + } + + if (no_iommu || + (!force_iommu && end_pfn < 0xffffffff>>PAGE_SHIFT) || + !iommu_aperture || + (no_agp && init_k8_gatt(&info) < 0)) { + printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n"); + no_iommu = 1; + return -1; + } + + aper_size = info.aper_size * 1024 * 1024; + iommu_size = check_iommu_size(info.aper_base, aper_size); + iommu_pages = iommu_size >> PAGE_SHIFT; + + iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, + get_order(iommu_pages/8)); + if (!iommu_gart_bitmap) + panic("Cannot allocate iommu bitmap\n"); + memset(iommu_gart_bitmap, 0, iommu_pages/8); + +#ifdef CONFIG_IOMMU_LEAK + if (leak_trace) { + iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, + get_order(iommu_pages*sizeof(void *))); + if (iommu_leak_tab) + memset(iommu_leak_tab, 0, iommu_pages * 8); + else + printk("PCI-DMA: Cannot allocate leak trace area\n"); + } +#endif + + /* + * Out of IOMMU space handling. + * Reserve some invalid pages at the beginning of the GART. + */ + set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + + agp_memory_reserved = iommu_size; + printk(KERN_INFO + "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", + iommu_size>>20); + + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; + bad_dma_address = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + + /* + * Unmap the IOMMU part of the GART. The alias of the page is + * always mapped with cache enabled and there is no full cache + * coherency across the GART remapping. The unmapping avoids + * automatic prefetches from the CPU allocating cache lines in + * there. All CPU accesses are done via the direct mapping to + * the backing memory. The GART address is only used by PCI + * devices. + */ + clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); + + /* + * Try to workaround a bug (thanks to BenH) + * Set unmapped entries to a scratch page instead of 0. + * Any prefetches that hit unmapped entries won't get an bus abort + * then. + */ + scratch = get_zeroed_page(GFP_KERNEL); + if (!scratch) + panic("Cannot allocate iommu scratch page"); + gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); + for (i = EMERGENCY_PAGES; i < iommu_pages; i++) + iommu_gatt_base[i] = gart_unmapped_entry; + + for_all_nb(dev) { + u32 flag; + int cpu = PCI_SLOT(dev->devfn) - 24; + if (cpu >= MAX_NB) + continue; + northbridges[cpu] = dev; + pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */ + northbridge_flush_word[cpu] = flag; + } + + flush_gart(NULL); + + return 0; +} + +/* Must execute after PCI subsystem */ +fs_initcall(pci_iommu_init); + +/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge] + [,forcesac][,fullflush][,nomerge][,biomerge] + size set size of iommu (in bytes) + noagp don't initialize the AGP driver and use full aperture. + off don't use the IOMMU + leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on) + memaper[=order] allocate an own aperture over RAM with size 32MB^order. + noforce don't force IOMMU usage. Default. + force Force IOMMU. + merge Do lazy merging. This may improve performance on some block devices. + Implies force (experimental) + biomerge Do merging at the BIO layer. 
This is more efficient than merge, + but should be only done with very big IOMMUs. Implies merge,force. + nomerge Don't do SG merging. + forcesac For SAC mode for masks <40bits (experimental) + fullflush Flush IOMMU on each allocation (default) + nofullflush Don't use IOMMU fullflush + allowed overwrite iommu off workarounds for specific chipsets. + soft Use software bounce buffering (default for Intel machines) + noaperture Don't touch the aperture for AGP. +*/ +__init int iommu_setup(char *p) +{ + int arg; + + while (*p) { + if (!strncmp(p,"noagp",5)) + no_agp = 1; + if (!strncmp(p,"off",3)) + no_iommu = 1; + if (!strncmp(p,"force",5)) { + force_iommu = 1; + iommu_aperture_allowed = 1; + } + if (!strncmp(p,"allowed",7)) + iommu_aperture_allowed = 1; + if (!strncmp(p,"noforce",7)) { + iommu_merge = 0; + force_iommu = 0; + } + if (!strncmp(p, "memaper", 7)) { + fallback_aper_force = 1; + p += 7; + if (*p == '=') { + ++p; + if (get_option(&p, &arg)) + fallback_aper_order = arg; + } + } + if (!strncmp(p, "biomerge",8)) { + iommu_bio_merge = 4096; + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "panic",5)) + panic_on_overflow = 1; + if (!strncmp(p, "nopanic",7)) + panic_on_overflow = 0; + if (!strncmp(p, "merge",5)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "nomerge",7)) + iommu_merge = 0; + if (!strncmp(p, "forcesac",8)) + iommu_sac_force = 1; + if (!strncmp(p, "fullflush",8)) + iommu_fullflush = 1; + if (!strncmp(p, "nofullflush",11)) + iommu_fullflush = 0; + if (!strncmp(p, "soft",4)) + swiotlb = 1; + if (!strncmp(p, "noaperture",10)) + fix_aperture = 0; +#ifdef CONFIG_IOMMU_LEAK + if (!strncmp(p,"leak",4)) { + leak_trace = 1; + p += 4; + if (*p == '=') ++p; + if (isdigit(*p) && get_option(&p, &arg)) + iommu_leak_pages = arg; + } else +#endif + if (isdigit(*p) && get_option(&p, &arg)) + iommu_size = arg; + p += strcspn(p, ","); + if (*p == ',') + ++p; + } + return 1; +} diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c new file mode 100644 index 00000000000..67d90b89af0 --- /dev/null +++ b/arch/x86_64/kernel/pci-nommu.c @@ -0,0 +1,94 @@ +/* Fallback functions when the main IOMMU code is not compiled in. This + code is roughly equivalent to i386. 
*/ +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/string.h> +#include <asm/proto.h> +#include <asm/processor.h> + +int iommu_merge = 0; +EXPORT_SYMBOL(iommu_merge); + +dma_addr_t bad_dma_address; +EXPORT_SYMBOL(bad_dma_address); + +int iommu_bio_merge = 0; +EXPORT_SYMBOL(iommu_bio_merge); + +int iommu_sac_force = 0; +EXPORT_SYMBOL(iommu_sac_force); + +/* + * Dummy IO MMU functions + */ + +void *dma_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, unsigned gfp) +{ + void *ret; + u64 mask; + int order = get_order(size); + + if (hwdev) + mask = hwdev->coherent_dma_mask & *hwdev->dma_mask; + else + mask = 0xffffffff; + for (;;) { + ret = (void *)__get_free_pages(gfp, order); + if (ret == NULL) + return NULL; + *dma_handle = virt_to_bus(ret); + if ((*dma_handle & ~mask) == 0) + break; + free_pages((unsigned long)ret, order); + if (gfp & GFP_DMA) + return NULL; + gfp |= GFP_DMA; + } + + memset(ret, 0, size); + return ret; +} +EXPORT_SYMBOL(dma_alloc_coherent); + +void dma_free_coherent(struct device *hwdev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + free_pages((unsigned long)vaddr, get_order(size)); +} +EXPORT_SYMBOL(dma_free_coherent); + +int dma_supported(struct device *hwdev, u64 mask) +{ + /* + * we fall back to GFP_DMA when the mask isn't all 1s, + * so we can't guarantee allocations that must be + * within a tighter range than GFP_DMA.. + * RED-PEN this won't work for pci_map_single. Caller has to + * use GFP_DMA in the first place. + */ + if (mask < 0x00ffffff) + return 0; + + return 1; +} +EXPORT_SYMBOL(dma_supported); + +int dma_get_cache_alignment(void) +{ + return boot_cpu_data.x86_clflush_size; +} +EXPORT_SYMBOL(dma_get_cache_alignment); + +static int __init check_ram(void) +{ + if (end_pfn >= 0xffffffff>>PAGE_SHIFT) { + printk( + KERN_ERR "WARNING more than 4GB of memory but IOMMU not compiled in.\n" + KERN_ERR "WARNING 32bit PCI may malfunction.\n"); + } + return 0; +} +__initcall(check_ram); + diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c new file mode 100644 index 00000000000..9922d2ba24a --- /dev/null +++ b/arch/x86_64/kernel/process.c @@ -0,0 +1,770 @@ +/* + * linux/arch/x86-64/kernel/process.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + * + * X86-64 port + * Andi Kleen. + * + * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ + */ + +/* + * This file handles the architecture-dependent parts of process handling.. 
+ */ + +#include <stdarg.h> + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/elfcore.h> +#include <linux/smp.h> +#include <linux/slab.h> +#include <linux/user.h> +#include <linux/module.h> +#include <linux/a.out.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/irq.h> +#include <linux/ptrace.h> +#include <linux/utsname.h> +#include <linux/random.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/mmu_context.h> +#include <asm/pda.h> +#include <asm/prctl.h> +#include <asm/kdebug.h> +#include <asm/desc.h> +#include <asm/proto.h> +#include <asm/ia32.h> + +asmlinkage extern void ret_from_fork(void); + +unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; + +static atomic_t hlt_counter = ATOMIC_INIT(0); + +unsigned long boot_option_idle_override = 0; +EXPORT_SYMBOL(boot_option_idle_override); + +/* + * Powermanagement idle function, if any.. + */ +void (*pm_idle)(void); +static DEFINE_PER_CPU(unsigned int, cpu_idle_state); + +void disable_hlt(void) +{ + atomic_inc(&hlt_counter); +} + +EXPORT_SYMBOL(disable_hlt); + +void enable_hlt(void) +{ + atomic_dec(&hlt_counter); +} + +EXPORT_SYMBOL(enable_hlt); + +/* + * We use this if we don't have any better + * idle routine.. + */ +void default_idle(void) +{ + if (!atomic_read(&hlt_counter)) { + local_irq_disable(); + if (!need_resched()) + safe_halt(); + else + local_irq_enable(); + } +} + +/* + * On SMP it's slightly faster (but much more power-consuming!) + * to poll the ->need_resched flag instead of waiting for the + * cross-CPU IPI to arrive. Use this option with caution. + */ +static void poll_idle (void) +{ + int oldval; + + local_irq_enable(); + + /* + * Deal with another CPU just having chosen a thread to + * run here: + */ + oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); + + if (!oldval) { + set_thread_flag(TIF_POLLING_NRFLAG); + asm volatile( + "2:" + "testl %0,%1;" + "rep; nop;" + "je 2b;" + : : + "i" (_TIF_NEED_RESCHED), + "m" (current_thread_info()->flags)); + } else { + set_need_resched(); + } +} + +void cpu_idle_wait(void) +{ + unsigned int cpu, this_cpu = get_cpu(); + cpumask_t map; + + set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); + put_cpu(); + + cpus_clear(map); + for_each_online_cpu(cpu) { + per_cpu(cpu_idle_state, cpu) = 1; + cpu_set(cpu, map); + } + + __get_cpu_var(cpu_idle_state) = 0; + + wmb(); + do { + ssleep(1); + for_each_online_cpu(cpu) { + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) + cpu_clear(cpu, map); + } + cpus_and(map, map, cpu_online_map); + } while (!cpus_empty(map)); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle (void) +{ + /* endless idle loop with no priority at all */ + while (1) { + while (!need_resched()) { + void (*idle)(void); + + if (__get_cpu_var(cpu_idle_state)) + __get_cpu_var(cpu_idle_state) = 0; + + rmb(); + idle = pm_idle; + if (!idle) + idle = default_idle; + idle(); + } + + schedule(); + } +} + +/* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. + * We execute MONITOR against need_resched and enter optimized wait state + * through MWAIT. 
Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ */
+static void mwait_idle(void)
+{
+	local_irq_enable();
+
+	if (!need_resched()) {
+		set_thread_flag(TIF_POLLING_NRFLAG);
+		do {
+			__monitor((void *)&current_thread_info()->flags, 0, 0);
+			if (need_resched())
+				break;
+			__mwait(0, 0);
+		} while (!need_resched());
+		clear_thread_flag(TIF_POLLING_NRFLAG);
+	}
+}
+
+void __init select_idle_routine(const struct cpuinfo_x86 *c)
+{
+	static int printed;
+	if (cpu_has(c, X86_FEATURE_MWAIT)) {
+		/*
+		 * Skip, if setup has overridden idle.
+		 * One CPU supports mwait => All CPUs supports mwait
+		 */
+		if (!pm_idle) {
+			if (!printed) {
+				printk("using mwait in idle threads.\n");
+				printed = 1;
+			}
+			pm_idle = mwait_idle;
+		}
+	}
+}
+
+static int __init idle_setup (char *str)
+{
+	if (!strncmp(str, "poll", 4)) {
+		printk("using polling idle threads.\n");
+		pm_idle = poll_idle;
+	}
+
+	boot_option_idle_override = 1;
+	return 1;
+}
+
+__setup("idle=", idle_setup);
+
+/* Prints also some state that isn't saved in the pt_regs */
+void __show_regs(struct pt_regs * regs)
+{
+	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
+	unsigned int fsindex,gsindex;
+	unsigned int ds,cs,es;
+
+	printk("\n");
+	print_modules();
+	printk("Pid: %d, comm: %.20s %s %s\n",
+		current->pid, current->comm, print_tainted(), system_utsname.release);
+	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
+	printk_address(regs->rip);
+	printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
+	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
+	       regs->rax, regs->rbx, regs->rcx);
+	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
+	       regs->rdx, regs->rsi, regs->rdi);
+	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
+	       regs->rbp, regs->r8, regs->r9);
+	printk("R10: %016lx R11: %016lx R12: %016lx\n",
+	       regs->r10, regs->r11, regs->r12);
+	printk("R13: %016lx R14: %016lx R15: %016lx\n",
+	       regs->r13, regs->r14, regs->r15);
+
+	asm("movl %%ds,%0" : "=r" (ds));
+	asm("movl %%cs,%0" : "=r" (cs));
+	asm("movl %%es,%0" : "=r" (es));
+	asm("movl %%fs,%0" : "=r" (fsindex));
+	asm("movl %%gs,%0" : "=r" (gsindex));
+
+	rdmsrl(MSR_FS_BASE, fs);
+	rdmsrl(MSR_GS_BASE, gs);
+	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+
+	asm("movq %%cr0, %0": "=r" (cr0));
+	asm("movq %%cr2, %0": "=r" (cr2));
+	asm("movq %%cr3, %0": "=r" (cr3));
+	asm("movq %%cr4, %0": "=r" (cr4));
+
+	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+	       fs,fsindex,gs,gsindex,shadowgs);
+	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
+	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
+}
+
+void show_regs(struct pt_regs *regs)
+{
+	__show_regs(regs);
+	show_trace(&regs->rsp);
+}
+
+/*
+ * Free current thread data structures etc..
+ */ +void exit_thread(void) +{ + struct task_struct *me = current; + struct thread_struct *t = &me->thread; + if (me->thread.io_bitmap_ptr) { + struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + + kfree(t->io_bitmap_ptr); + t->io_bitmap_ptr = NULL; + /* + * Careful, clear this in the TSS too: + */ + memset(tss->io_bitmap, 0xff, t->io_bitmap_max); + t->io_bitmap_max = 0; + put_cpu(); + } +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + struct thread_info *t = current_thread_info(); + + if (t->flags & _TIF_ABI_PENDING) + t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); + + tsk->thread.debugreg0 = 0; + tsk->thread.debugreg1 = 0; + tsk->thread.debugreg2 = 0; + tsk->thread.debugreg3 = 0; + tsk->thread.debugreg6 = 0; + tsk->thread.debugreg7 = 0; + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + /* + * Forget coprocessor state.. + */ + clear_fpu(tsk); + clear_used_math(); +} + +void release_thread(struct task_struct *dead_task) +{ + if (dead_task->mm) { + if (dead_task->mm->context.size) { + printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); + BUG(); + } + } +} + +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) +{ + struct user_desc ud = { + .base_addr = addr, + .limit = 0xfffff, + .seg_32bit = 1, + .limit_in_pages = 1, + .useable = 1, + }; + struct n_desc_struct *desc = (void *)t->thread.tls_array; + desc += tls; + desc->a = LDT_entry_a(&ud); + desc->b = LDT_entry_b(&ud); +} + +static inline u32 read_32bit_tls(struct task_struct *t, int tls) +{ + struct desc_struct *desc = (void *)t->thread.tls_array; + desc += tls; + return desc->base0 | + (((u32)desc->base1) << 16) | + (((u32)desc->base2) << 24); +} + +/* + * This gets called before we allocate a new thread and copy + * the current task into it. + */ +void prepare_to_copy(struct task_struct *tsk) +{ + unlazy_fpu(tsk); +} + +int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, + unsigned long unused, + struct task_struct * p, struct pt_regs * regs) +{ + int err; + struct pt_regs * childregs; + struct task_struct *me = current; + + childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; + + *childregs = *regs; + + childregs->rax = 0; + childregs->rsp = rsp; + if (rsp == ~0UL) { + childregs->rsp = (unsigned long)childregs; + } + + p->thread.rsp = (unsigned long) childregs; + p->thread.rsp0 = (unsigned long) (childregs+1); + p->thread.userrsp = me->thread.userrsp; + + set_ti_thread_flag(p->thread_info, TIF_FORK); + + p->thread.fs = me->thread.fs; + p->thread.gs = me->thread.gs; + + asm("movl %%gs,%0" : "=m" (p->thread.gsindex)); + asm("movl %%fs,%0" : "=m" (p->thread.fsindex)); + asm("movl %%es,%0" : "=m" (p->thread.es)); + asm("movl %%ds,%0" : "=m" (p->thread.ds)); + + if (unlikely(me->thread.io_bitmap_ptr != NULL)) { + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { + p->thread.io_bitmap_max = 0; + return -ENOMEM; + } + memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); + } + + /* + * Set a new TLS for the child thread? 
+ */ + if (clone_flags & CLONE_SETTLS) { +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + err = ia32_child_tls(p, childregs); + else +#endif + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + if (err) + goto out; + } + err = 0; +out: + if (err && p->thread.io_bitmap_ptr) { + kfree(p->thread.io_bitmap_ptr); + p->thread.io_bitmap_max = 0; + } + return err; +} + +/* + * This special macro can be used to load a debugging register + */ +#define loaddebug(thread,r) set_debug(thread->debugreg ## r, r) + +/* + * switch_to(x,y) should switch tasks from x to y. + * + * This could still be optimized: + * - fold all the options into a flag word and test it with a single test. + * - could test fs/gs bitsliced + */ +struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + int cpu = smp_processor_id(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + + unlazy_fpu(prev_p); + + /* + * Reload esp0, LDT and the page table pointer: + */ + tss->rsp0 = next->rsp0; + + /* + * Switch DS and ES. + * This won't pick up thread selector changes, but I guess that is ok. + */ + asm volatile("movl %%es,%0" : "=m" (prev->es)); + if (unlikely(next->es | prev->es)) + loadsegment(es, next->es); + + asm volatile ("movl %%ds,%0" : "=m" (prev->ds)); + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + + load_TLS(next, cpu); + + /* + * Switch FS and GS. + */ + { + unsigned fsindex; + asm volatile("movl %%fs,%0" : "=r" (fsindex)); + /* segment register != 0 always requires a reload. + also reload when it has changed. + when prev process used 64bit base always reload + to avoid an information leak. */ + if (unlikely(fsindex | next->fsindex | prev->fs)) { + loadsegment(fs, next->fsindex); + /* check if the user used a selector != 0 + * if yes clear 64bit base, since overloaded base + * is always mapped to the Null selector + */ + if (fsindex) + prev->fs = 0; + } + /* when next process has a 64bit base use it */ + if (next->fs) + wrmsrl(MSR_FS_BASE, next->fs); + prev->fsindex = fsindex; + } + { + unsigned gsindex; + asm volatile("movl %%gs,%0" : "=r" (gsindex)); + if (unlikely(gsindex | next->gsindex | prev->gs)) { + load_gs_index(next->gsindex); + if (gsindex) + prev->gs = 0; + } + if (next->gs) + wrmsrl(MSR_KERNEL_GS_BASE, next->gs); + prev->gsindex = gsindex; + } + + /* + * Switch the PDA context. + */ + prev->userrsp = read_pda(oldrsp); + write_pda(oldrsp, next->userrsp); + write_pda(pcurrent, next_p); + write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET); + + /* + * Now maybe reload the debug registers + */ + if (unlikely(next->debugreg7)) { + loaddebug(next, 0); + loaddebug(next, 1); + loaddebug(next, 2); + loaddebug(next, 3); + /* no 4 and 5 */ + loaddebug(next, 6); + loaddebug(next, 7); + } + + + /* + * Handle the IO bitmap + */ + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { + if (next->io_bitmap_ptr) + /* + * Copy the relevant range of the IO bitmap. + * Normally this is 128 bytes or less: + */ + memcpy(tss->io_bitmap, next->io_bitmap_ptr, + max(prev->io_bitmap_max, next->io_bitmap_max)); + else { + /* + * Clear any possible leftover bits: + */ + memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); + } + } + + return prev_p; +} + +/* + * sys_execve() executes a new program. 
+ */
+asmlinkage
+long sys_execve(char __user *name, char __user * __user *argv,
+		char __user * __user *envp, struct pt_regs regs)
+{
+	long error;
+	char * filename;
+
+	filename = getname(name);
+	error = PTR_ERR(filename);
+	if (IS_ERR(filename))
+		return error;
+	error = do_execve(filename, argv, envp, &regs);
+	if (error == 0) {
+		task_lock(current);
+		current->ptrace &= ~PT_DTRACE;
+		task_unlock(current);
+	}
+	putname(filename);
+	return error;
+}
+
+void set_personality_64bit(void)
+{
+	/* inherit personality from parent */
+
+	/* Make sure to be in 64bit mode */
+	clear_thread_flag(TIF_IA32);
+
+	/* TBD: overwrites user setup. Should have two bits.
+	   But 64bit processes have always behaved this way,
+	   so it's not too bad. The main problem is just that
+	   32bit childs are affected again. */
+	current->personality &= ~READ_IMPLIES_EXEC;
+}
+
+asmlinkage long sys_fork(struct pt_regs *regs)
+{
+	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
+}
+
+asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+{
+	if (!newsp)
+		newsp = regs->rsp;
+	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
+}
+
+/*
+ * This is trivial, and on the face of it looks like it
+ * could equally well be done in user mode.
+ *
+ * Not so, for quite unobvious reasons - register pressure.
+ * In user mode vfork() cannot have a stack frame, and if
+ * done by calling the "clone()" system call directly, you
+ * do not have enough call-clobbered registers to hold all
+ * the information you need.
+ */
+asmlinkage long sys_vfork(struct pt_regs *regs)
+{
+	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
+		    NULL, NULL);
+}
+
+unsigned long get_wchan(struct task_struct *p)
+{
+	unsigned long stack;
+	u64 fp,rip;
+	int count = 0;
+
+	if (!p || p == current || p->state==TASK_RUNNING)
+		return 0;
+	stack = (unsigned long)p->thread_info;
+	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
+		return 0;
+	fp = *(u64 *)(p->thread.rsp);
+	do {
+		if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
+			return 0;
+		rip = *(u64 *)(fp+8);
+		if (!in_sched_functions(rip))
+			return rip;
+		fp = *(u64 *)fp;
+	} while (count++ < 16);
+	return 0;
+}
+
+long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
+{
+	int ret = 0;
+	int doit = task == current;
+	int cpu;
+
+	switch (code) {
+	case ARCH_SET_GS:
+		if (addr >= TASK_SIZE)
+			return -EPERM;
+		cpu = get_cpu();
+		/* handle small bases via the GDT because that's faster to
+		   switch. */
+		if (addr <= 0xffffffff) {
+			set_32bit_tls(task, GS_TLS, addr);
+			if (doit) {
+				load_TLS(&task->thread, cpu);
+				load_gs_index(GS_TLS_SEL);
+			}
+			task->thread.gsindex = GS_TLS_SEL;
+			task->thread.gs = 0;
+		} else {
+			task->thread.gsindex = 0;
+			task->thread.gs = addr;
+			if (doit) {
+				load_gs_index(0);
+				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
+			}
+		}
+		put_cpu();
+		break;
+	case ARCH_SET_FS:
+		/* Not strictly needed for fs, but do it for symmetry
+		   with gs */
+		if (addr >= TASK_SIZE)
+			return -EPERM;
+		cpu = get_cpu();
+		/* handle small bases via the GDT because that's faster to
+		   switch.
*/ + if (addr <= 0xffffffff) { + set_32bit_tls(task, FS_TLS, addr); + if (doit) { + load_TLS(&task->thread, cpu); + asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL)); + } + task->thread.fsindex = FS_TLS_SEL; + task->thread.fs = 0; + } else { + task->thread.fsindex = 0; + task->thread.fs = addr; + if (doit) { + /* set the selector to 0 to not confuse + __switch_to */ + asm volatile("movl %0,%%fs" :: "r" (0)); + ret = checking_wrmsrl(MSR_FS_BASE, addr); + } + } + put_cpu(); + break; + case ARCH_GET_FS: { + unsigned long base; + if (task->thread.fsindex == FS_TLS_SEL) + base = read_32bit_tls(task, FS_TLS); + else if (doit) { + rdmsrl(MSR_FS_BASE, base); + } else + base = task->thread.fs; + ret = put_user(base, (unsigned long __user *)addr); + break; + } + case ARCH_GET_GS: { + unsigned long base; + if (task->thread.gsindex == GS_TLS_SEL) + base = read_32bit_tls(task, GS_TLS); + else if (doit) { + rdmsrl(MSR_KERNEL_GS_BASE, base); + } else + base = task->thread.gs; + ret = put_user(base, (unsigned long __user *)addr); + break; + } + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +long sys_arch_prctl(int code, unsigned long addr) +{ + return do_arch_prctl(current, code, addr); +} + +/* + * Capture the user space registers if the task is not running (in user space) + */ +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) +{ + struct pt_regs *pp, ptregs; + + pp = (struct pt_regs *)(tsk->thread.rsp0); + --pp; + + ptregs = *pp; + ptregs.cs &= 0xffff; + ptregs.ss &= 0xffff; + + elf_core_copy_regs(regs, &ptregs); + + return 1; +} + +unsigned long arch_align_stack(unsigned long sp) +{ + if (randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c new file mode 100644 index 00000000000..0b7b101debd --- /dev/null +++ b/arch/x86_64/kernel/ptrace.c @@ -0,0 +1,547 @@ +/* ptrace.c */ +/* By Ross Biro 1/23/92 */ +/* + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + * + * x86-64 port 2000-2002 Andi Kleen + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/user.h> +#include <linux/security.h> +#include <linux/audit.h> +#include <linux/seccomp.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/system.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/debugreg.h> +#include <asm/ldt.h> +#include <asm/desc.h> +#include <asm/proto.h> +#include <asm/ia32.h> + +/* + * does not yet catch signals sent when the child dies. + * in exit.c or in signal.c. + */ + +/* determines which flags the user has access to. */ +/* 1 = access 0 = no access */ +#define FLAG_MASK 0x44dd5UL + +/* set's the trap flag. */ +#define TRAP_FLAG 0x100UL + +/* + * eflags and offset of eflags on child stack.. + */ +#define EFLAGS offsetof(struct pt_regs, eflags) +#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs))) + +/* + * this routine will get a word off of the processes privileged stack. + * the offset is how far from the base addr as stored in the TSS. + * this routine assumes that all the privileged stacks are in our + * data space. 
+ */ +static inline unsigned long get_stack_long(struct task_struct *task, int offset) +{ + unsigned char *stack; + + stack = (unsigned char *)task->thread.rsp0; + stack += offset; + return (*((unsigned long *)stack)); +} + +/* + * this routine will put a word on the processes privileged stack. + * the offset is how far from the base addr as stored in the TSS. + * this routine assumes that all the privileged stacks are in our + * data space. + */ +static inline long put_stack_long(struct task_struct *task, int offset, + unsigned long data) +{ + unsigned char * stack; + + stack = (unsigned char *) task->thread.rsp0; + stack += offset; + *(unsigned long *) stack = data; + return 0; +} + +/* + * Called by kernel/ptrace.c when detaching.. + * + * Make sure the single step bit is not set. + */ +void ptrace_disable(struct task_struct *child) +{ + long tmp; + + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG; + put_stack_long(child, EFL_OFFSET, tmp); +} + +static int putreg(struct task_struct *child, + unsigned long regno, unsigned long value) +{ + unsigned long tmp; + + /* Some code in the 64bit emulation may not be 64bit clean. + Don't take any chances. */ + if (test_tsk_thread_flag(child, TIF_IA32)) + value &= 0xffffffff; + switch (regno) { + case offsetof(struct user_regs_struct,fs): + if (value && (value & 3) != 3) + return -EIO; + child->thread.fsindex = value & 0xffff; + return 0; + case offsetof(struct user_regs_struct,gs): + if (value && (value & 3) != 3) + return -EIO; + child->thread.gsindex = value & 0xffff; + return 0; + case offsetof(struct user_regs_struct,ds): + if (value && (value & 3) != 3) + return -EIO; + child->thread.ds = value & 0xffff; + return 0; + case offsetof(struct user_regs_struct,es): + if (value && (value & 3) != 3) + return -EIO; + child->thread.es = value & 0xffff; + return 0; + case offsetof(struct user_regs_struct,ss): + if ((value & 3) != 3) + return -EIO; + value &= 0xffff; + return 0; + case offsetof(struct user_regs_struct,fs_base): + if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) + return -EIO; + child->thread.fs = value; + return 0; + case offsetof(struct user_regs_struct,gs_base): + if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) + return -EIO; + child->thread.gs = value; + return 0; + case offsetof(struct user_regs_struct, eflags): + value &= FLAG_MASK; + tmp = get_stack_long(child, EFL_OFFSET); + tmp &= ~FLAG_MASK; + value |= tmp; + break; + case offsetof(struct user_regs_struct,cs): + if ((value & 3) != 3) + return -EIO; + value &= 0xffff; + break; + } + put_stack_long(child, regno - sizeof(struct pt_regs), value); + return 0; +} + +static unsigned long getreg(struct task_struct *child, unsigned long regno) +{ + unsigned long val; + switch (regno) { + case offsetof(struct user_regs_struct, fs): + return child->thread.fsindex; + case offsetof(struct user_regs_struct, gs): + return child->thread.gsindex; + case offsetof(struct user_regs_struct, ds): + return child->thread.ds; + case offsetof(struct user_regs_struct, es): + return child->thread.es; + case offsetof(struct user_regs_struct, fs_base): + return child->thread.fs; + case offsetof(struct user_regs_struct, gs_base): + return child->thread.gs; + default: + regno = regno - sizeof(struct pt_regs); + val = get_stack_long(child, regno); + if (test_tsk_thread_flag(child, TIF_IA32)) + val &= 0xffffffff; + return val; + } + +} + +asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data) +{ + struct task_struct 
*child; + long i, ret; + unsigned ui; + + /* This lock_kernel fixes a subtle race with suid exec */ + lock_kernel(); + ret = -EPERM; + if (request == PTRACE_TRACEME) { + /* are we already being traced? */ + if (current->ptrace & PT_PTRACED) + goto out; + ret = security_ptrace(current->parent, current); + if (ret) + goto out; + /* set the ptrace bit in the process flags. */ + current->ptrace |= PT_PTRACED; + ret = 0; + goto out; + } + ret = -ESRCH; + read_lock(&tasklist_lock); + child = find_task_by_pid(pid); + if (child) + get_task_struct(child); + read_unlock(&tasklist_lock); + if (!child) + goto out; + + ret = -EPERM; + if (pid == 1) /* you may not mess with init */ + goto out_tsk; + + if (request == PTRACE_ATTACH) { + ret = ptrace_attach(child); + goto out_tsk; + } + ret = ptrace_check_attach(child, request == PTRACE_KILL); + if (ret < 0) + goto out_tsk; + + switch (request) { + /* when I and D space are separate, these will need to be fixed. */ + case PTRACE_PEEKTEXT: /* read word at location addr. */ + case PTRACE_PEEKDATA: { + unsigned long tmp; + int copied; + + copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); + ret = -EIO; + if (copied != sizeof(tmp)) + break; + ret = put_user(tmp,(unsigned long __user *) data); + break; + } + + /* read the word at location addr in the USER area. */ + case PTRACE_PEEKUSR: { + unsigned long tmp; + + ret = -EIO; + if ((addr & 7) || + addr > sizeof(struct user) - 7) + break; + + switch (addr) { + case 0 ... sizeof(struct user_regs_struct): + tmp = getreg(child, addr); + break; + case offsetof(struct user, u_debugreg[0]): + tmp = child->thread.debugreg0; + break; + case offsetof(struct user, u_debugreg[1]): + tmp = child->thread.debugreg1; + break; + case offsetof(struct user, u_debugreg[2]): + tmp = child->thread.debugreg2; + break; + case offsetof(struct user, u_debugreg[3]): + tmp = child->thread.debugreg3; + break; + case offsetof(struct user, u_debugreg[6]): + tmp = child->thread.debugreg6; + break; + case offsetof(struct user, u_debugreg[7]): + tmp = child->thread.debugreg7; + break; + default: + tmp = 0; + break; + } + ret = put_user(tmp,(unsigned long __user *) data); + break; + } + + /* when I and D space are separate, this will have to be fixed. */ + case PTRACE_POKETEXT: /* write the word at location addr. */ + case PTRACE_POKEDATA: + ret = 0; + if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data)) + break; + ret = -EIO; + break; + + case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ + ret = -EIO; + if ((addr & 7) || + addr > sizeof(struct user) - 7) + break; + + switch (addr) { + case 0 ... 
sizeof(struct user_regs_struct): + ret = putreg(child, addr, data); + break; + /* Disallows to set a breakpoint into the vsyscall */ + case offsetof(struct user, u_debugreg[0]): + if (data >= TASK_SIZE-7) break; + child->thread.debugreg0 = data; + ret = 0; + break; + case offsetof(struct user, u_debugreg[1]): + if (data >= TASK_SIZE-7) break; + child->thread.debugreg1 = data; + ret = 0; + break; + case offsetof(struct user, u_debugreg[2]): + if (data >= TASK_SIZE-7) break; + child->thread.debugreg2 = data; + ret = 0; + break; + case offsetof(struct user, u_debugreg[3]): + if (data >= TASK_SIZE-7) break; + child->thread.debugreg3 = data; + ret = 0; + break; + case offsetof(struct user, u_debugreg[6]): + if (data >> 32) + break; + child->thread.debugreg6 = data; + ret = 0; + break; + case offsetof(struct user, u_debugreg[7]): + /* See arch/i386/kernel/ptrace.c for an explanation of + * this awkward check.*/ + data &= ~DR_CONTROL_RESERVED; + for(i=0; i<4; i++) + if ((0x5454 >> ((data >> (16 + 4*i)) & 0xf)) & 1) + break; + if (i == 4) { + child->thread.debugreg7 = data; + ret = 0; + } + break; + } + break; + case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ + case PTRACE_CONT: { /* restart after signal. */ + long tmp; + + ret = -EIO; + if ((unsigned long) data > _NSIG) + break; + if (request == PTRACE_SYSCALL) + set_tsk_thread_flag(child,TIF_SYSCALL_TRACE); + else + clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + child->exit_code = data; + /* make sure the single step bit is not set. */ + tmp = get_stack_long(child, EFL_OFFSET); + tmp &= ~TRAP_FLAG; + put_stack_long(child, EFL_OFFSET,tmp); + wake_up_process(child); + ret = 0; + break; + } + +#ifdef CONFIG_IA32_EMULATION + /* This makes only sense with 32bit programs. Allow a + 64bit debugger to fully examine them too. Better + don't use it against 64bit processes, use + PTRACE_ARCH_PRCTL instead. */ + case PTRACE_SET_THREAD_AREA: { + struct user_desc __user *p; + int old; + p = (struct user_desc __user *)data; + get_user(old, &p->entry_number); + put_user(addr, &p->entry_number); + ret = do_set_thread_area(&child->thread, p); + put_user(old, &p->entry_number); + break; + case PTRACE_GET_THREAD_AREA: + p = (struct user_desc __user *)data; + get_user(old, &p->entry_number); + put_user(addr, &p->entry_number); + ret = do_get_thread_area(&child->thread, p); + put_user(old, &p->entry_number); + break; + } +#endif + /* normal 64bit interface to access TLS data. + Works just like arch_prctl, except that the arguments + are reversed. */ + case PTRACE_ARCH_PRCTL: + ret = do_arch_prctl(child, data, addr); + break; + +/* + * make the child exit. Best I can do is send it a sigkill. + * perhaps it should be put in the status that it wants to + * exit. + */ + case PTRACE_KILL: { + long tmp; + + ret = 0; + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ + break; + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + child->exit_code = SIGKILL; + /* make sure the single step bit is not set. */ + tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG; + put_stack_long(child, EFL_OFFSET, tmp); + wake_up_process(child); + break; + } + + case PTRACE_SINGLESTEP: { /* set the trap flag. 
*/ + long tmp; + + ret = -EIO; + if ((unsigned long) data > _NSIG) + break; + clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); + if ((child->ptrace & PT_DTRACE) == 0) { + /* Spurious delayed TF traps may occur */ + child->ptrace |= PT_DTRACE; + } + tmp = get_stack_long(child, EFL_OFFSET) | TRAP_FLAG; + put_stack_long(child, EFL_OFFSET, tmp); + set_tsk_thread_flag(child, TIF_SINGLESTEP); + child->exit_code = data; + /* give it a chance to run. */ + wake_up_process(child); + ret = 0; + break; + } + + case PTRACE_DETACH: + /* detach a process that was attached. */ + ret = ptrace_detach(child, data); + break; + + case PTRACE_GETREGS: { /* Get all gp regs from the child. */ + if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, + sizeof(struct user_regs_struct))) { + ret = -EIO; + break; + } + ret = 0; + for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { + ret |= __put_user(getreg(child, ui),(unsigned long __user *) data); + data += sizeof(long); + } + break; + } + + case PTRACE_SETREGS: { /* Set all gp regs in the child. */ + unsigned long tmp; + if (!access_ok(VERIFY_READ, (unsigned __user *)data, + sizeof(struct user_regs_struct))) { + ret = -EIO; + break; + } + ret = 0; + for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { + ret |= __get_user(tmp, (unsigned long __user *) data); + putreg(child, ui, tmp); + data += sizeof(long); + } + break; + } + + case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */ + if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, + sizeof(struct user_i387_struct))) { + ret = -EIO; + break; + } + ret = get_fpregs((struct user_i387_struct __user *)data, child); + break; + } + + case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */ + if (!access_ok(VERIFY_READ, (unsigned __user *)data, + sizeof(struct user_i387_struct))) { + ret = -EIO; + break; + } + set_stopped_child_used_math(child); + ret = set_fpregs(child, (struct user_i387_struct __user *)data); + break; + } + + default: + ret = ptrace_request(child, request, addr, data); + break; + } +out_tsk: + put_task_struct(child); +out: + unlock_kernel(); + return ret; +} + +static void syscall_trace(struct pt_regs *regs) +{ + +#if 0 + printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n", + current->comm, + regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0), + current_thread_info()->flags, current->ptrace); +#endif + + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) + ? 0x80 : 0)); + /* + * this isn't the same as continuing with a signal, but it will do + * for normal use. strace only continues with a signal if the + * stopping signal is not SIGTRAP. 
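
syscall_trace() above stops the child with SIGTRAP, or SIGTRAP|0x80 when PT_TRACESYSGOOD is set, on every traced syscall entry and exit. The rough userspace tracer below shows the other side of that contract; it is a minimal sketch with no error handling, and it prints each syscall number twice, once for the entry stop and once for the exit stop.

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		/* child: ask to be traced, then exec something short-lived */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		execlp("echo", "echo", "hi", (char *)NULL);
		_exit(1);
	}

	int status;
	waitpid(child, &status, 0);		/* initial stop after exec */
	/* ask for the SIGTRAP|0x80 marker described above */
	ptrace(PTRACE_SETOPTIONS, child, NULL, (void *)PTRACE_O_TRACESYSGOOD);

	for (;;) {
		ptrace(PTRACE_SYSCALL, child, NULL, NULL);
		waitpid(child, &status, 0);
		if (WIFEXITED(status))
			break;
		if (WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80)) {
			struct user_regs_struct regs;
			ptrace(PTRACE_GETREGS, child, NULL, &regs);
			printf("syscall %llu\n", (unsigned long long)regs.orig_rax);
		}
	}
	return 0;
}
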
-brl + */ + if (current->exit_code) { + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } +} + +asmlinkage void syscall_trace_enter(struct pt_regs *regs) +{ + /* do the secure computing check first */ + secure_computing(regs->orig_rax); + + if (unlikely(current->audit_context)) + audit_syscall_entry(current, regs->orig_rax, + regs->rdi, regs->rsi, + regs->rdx, regs->r10); + + if (test_thread_flag(TIF_SYSCALL_TRACE) + && (current->ptrace & PT_PTRACED)) + syscall_trace(regs); +} + +asmlinkage void syscall_trace_leave(struct pt_regs *regs) +{ + if (unlikely(current->audit_context)) + audit_syscall_exit(current, regs->rax); + + if ((test_thread_flag(TIF_SYSCALL_TRACE) + || test_thread_flag(TIF_SINGLESTEP)) + && (current->ptrace & PT_PTRACED)) + syscall_trace(regs); +} diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c new file mode 100644 index 00000000000..be4b36f762c --- /dev/null +++ b/arch/x86_64/kernel/reboot.c @@ -0,0 +1,163 @@ +/* Various gunk just to reboot the machine. */ +#include <linux/module.h> +#include <linux/reboot.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/string.h> +#include <asm/io.h> +#include <asm/kdebug.h> +#include <asm/delay.h> +#include <asm/hw_irq.h> +#include <asm/system.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/apic.h> + +/* + * Power off function, if any + */ +void (*pm_power_off)(void); + +static long no_idt[3]; +static enum { + BOOT_TRIPLE = 't', + BOOT_KBD = 'k' +} reboot_type = BOOT_KBD; +static int reboot_mode = 0; +int reboot_force; + +/* reboot=t[riple] | k[bd] [, [w]arm | [c]old] + warm Don't set the cold reboot flag + cold Set the cold reboot flag + triple Force a triple fault (init) + kbd Use the keyboard controller. cold reset (default) + force Avoid anything that could hang. + */ +static int __init reboot_setup(char *str) +{ + for (;;) { + switch (*str) { + case 'w': + reboot_mode = 0x1234; + break; + + case 'c': + reboot_mode = 0; + break; + + case 't': + case 'b': + case 'k': + reboot_type = *str; + break; + case 'f': + reboot_force = 1; + break; + } + if((str = strchr(str,',')) != NULL) + str++; + else + break; + } + return 1; +} + +__setup("reboot=", reboot_setup); + +#ifdef CONFIG_SMP +static void smp_halt(void) +{ + int cpuid = safe_smp_processor_id(); + static int first_entry = 1; + + if (reboot_force) + return; + + if (first_entry) { + first_entry = 0; + smp_call_function((void *)machine_restart, NULL, 1, 0); + } + + smp_stop_cpu(); + + /* AP calling this. 
Just halt */ + if (cpuid != boot_cpu_id) { + for (;;) + asm("hlt"); + } + + /* Wait for all other CPUs to have run smp_stop_cpu */ + while (!cpus_empty(cpu_online_map)) + rep_nop(); +} +#endif + +static inline void kb_wait(void) +{ + int i; + + for (i=0; i<0x10000; i++) + if ((inb_p(0x64) & 0x02) == 0) + break; +} + +void machine_restart(char * __unused) +{ + int i; + + printk("machine restart\n"); + +#ifdef CONFIG_SMP + smp_halt(); +#endif + + if (!reboot_force) { + local_irq_disable(); +#ifndef CONFIG_SMP + disable_local_APIC(); +#endif + disable_IO_APIC(); + local_irq_enable(); + } + + /* Tell the BIOS if we want cold or warm reboot */ + *((unsigned short *)__va(0x472)) = reboot_mode; + + for (;;) { + /* Could also try the reset bit in the Hammer NB */ + switch (reboot_type) { + case BOOT_KBD: + for (i=0; i<100; i++) { + kb_wait(); + udelay(50); + outb(0xfe,0x64); /* pulse reset low */ + udelay(50); + } + + case BOOT_TRIPLE: + __asm__ __volatile__("lidt (%0)": :"r" (&no_idt)); + __asm__ __volatile__("int3"); + + reboot_type = BOOT_KBD; + break; + } + } +} + +EXPORT_SYMBOL(machine_restart); + +void machine_halt(void) +{ +} + +EXPORT_SYMBOL(machine_halt); + +void machine_power_off(void) +{ + if (pm_power_off) + pm_power_off(); +} + +EXPORT_SYMBOL(machine_power_off); diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c new file mode 100644 index 00000000000..48f7c18172b --- /dev/null +++ b/arch/x86_64/kernel/semaphore.c @@ -0,0 +1,180 @@ +/* + * x86_64 semaphore implementation. + * + * (C) Copyright 1999 Linus Torvalds + * + * Portions Copyright 1999 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org> + */ +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <asm/errno.h> + +#include <asm/semaphore.h> + +/* + * Semaphores are implemented using a two-way counter: + * The "count" variable is decremented for each process + * that tries to acquire the semaphore, while the "sleeping" + * variable is a count of such acquires. + * + * Notably, the inline "up()" and "down()" functions can + * efficiently test if they need to do any extra work (up + * needs to do something only if count was negative before + * the increment operation. + * + * "sleeping" and the contention routine ordering is protected + * by the spinlock in the semaphore's waitqueue head. + * + * Note that these functions are only called when there is + * contention on the lock, and as such all this is the + * "non-critical" part of the whole semaphore business. The + * critical part is the inline stuff in <asm/semaphore.h> + * where we want to avoid any extra jumps and calls. + */ + +/* + * Logic: + * - only on a boundary condition do we need to care. When we go + * from a negative count to a non-negative, we wake people up. + * - when we go from a non-negative count to a negative do we + * (a) synchronize with the "sleeper" count and (b) make sure + * that we're on the wakeup list before we synchronize so that + * we cannot lose wakeup events. 
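
The count/sleepers scheme described in this comment is easiest to follow with concrete numbers. The sketch below models only the counting, using C11 atomics in userspace; the waitqueue, the spinlock and the actual sleeping are left out, so it illustrates the arithmetic rather than the locking.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Userspace stand-in for the primitive used below: add i to *v and
 * report whether the result went negative. */
static bool add_negative(int i, atomic_int *v)
{
	return atomic_fetch_add(v, i) + i < 0;
}

int main(void)
{
	atomic_int count = 1;		/* one free slot */

	/* down(): every acquirer decrements the count */
	printf("first down sleeps?  %d\n", add_negative(-1, &count));	/* 0: acquired */
	printf("second down sleeps? %d\n", add_negative(-1, &count));	/* 1: contended */

	/* __down() with sleepers == 1 adds sleepers-1 == 0, sees the
	 * count still negative and keeps waiting */
	printf("still negative?     %d\n", add_negative(0, &count));	/* 1 */

	/* up() increments the count; the woken sleeper re-runs the same
	 * check, now crossing to non-negative, and proceeds */
	atomic_fetch_add(&count, 1);
	printf("after up, sleeps?   %d\n", add_negative(0, &count));	/* 0 */
	return 0;
}
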
+ */ + +void __up(struct semaphore *sem) +{ + wake_up(&sem->wait); +} + +void __sched __down(struct semaphore * sem) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + unsigned long flags; + + tsk->state = TASK_UNINTERRUPTIBLE; + spin_lock_irqsave(&sem->wait.lock, flags); + add_wait_queue_exclusive_locked(&sem->wait, &wait); + + sem->sleepers++; + for (;;) { + int sleepers = sem->sleepers; + + /* + * Add "everybody else" into it. They aren't + * playing, because we own the spinlock in + * the wait_queue_head. + */ + if (!atomic_add_negative(sleepers - 1, &sem->count)) { + sem->sleepers = 0; + break; + } + sem->sleepers = 1; /* us - see -1 above */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + schedule(); + + spin_lock_irqsave(&sem->wait.lock, flags); + tsk->state = TASK_UNINTERRUPTIBLE; + } + remove_wait_queue_locked(&sem->wait, &wait); + wake_up_locked(&sem->wait); + spin_unlock_irqrestore(&sem->wait.lock, flags); + tsk->state = TASK_RUNNING; +} + +int __sched __down_interruptible(struct semaphore * sem) +{ + int retval = 0; + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + unsigned long flags; + + tsk->state = TASK_INTERRUPTIBLE; + spin_lock_irqsave(&sem->wait.lock, flags); + add_wait_queue_exclusive_locked(&sem->wait, &wait); + + sem->sleepers++; + for (;;) { + int sleepers = sem->sleepers; + + /* + * With signals pending, this turns into + * the trylock failure case - we won't be + * sleeping, and we* can't get the lock as + * it has contention. Just correct the count + * and exit. + */ + if (signal_pending(current)) { + retval = -EINTR; + sem->sleepers = 0; + atomic_add(sleepers, &sem->count); + break; + } + + /* + * Add "everybody else" into it. They aren't + * playing, because we own the spinlock in + * wait_queue_head. The "-1" is because we're + * still hoping to get the semaphore. + */ + if (!atomic_add_negative(sleepers - 1, &sem->count)) { + sem->sleepers = 0; + break; + } + sem->sleepers = 1; /* us - see -1 above */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + schedule(); + + spin_lock_irqsave(&sem->wait.lock, flags); + tsk->state = TASK_INTERRUPTIBLE; + } + remove_wait_queue_locked(&sem->wait, &wait); + wake_up_locked(&sem->wait); + spin_unlock_irqrestore(&sem->wait.lock, flags); + + tsk->state = TASK_RUNNING; + return retval; +} + +/* + * Trylock failed - make sure we correct for + * having decremented the count. + * + * We could have done the trylock with a + * single "cmpxchg" without failure cases, + * but then it wouldn't work on a 386. + */ +int __down_trylock(struct semaphore * sem) +{ + int sleepers; + unsigned long flags; + + spin_lock_irqsave(&sem->wait.lock, flags); + sleepers = sem->sleepers + 1; + sem->sleepers = 0; + + /* + * Add "everybody else" and us into it. They aren't + * playing, because we own the spinlock in the + * wait_queue_head. + */ + if (!atomic_add_negative(sleepers, &sem->count)) { + wake_up_locked(&sem->wait); + } + + spin_unlock_irqrestore(&sem->wait.lock, flags); + return 1; +} + + diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c new file mode 100644 index 00000000000..e50fc41de6b --- /dev/null +++ b/arch/x86_64/kernel/setup.c @@ -0,0 +1,1189 @@ +/* + * linux/arch/x86-64/kernel/setup.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Nov 2001 Dave Jones <davej@suse.de> + * Forked from i386 setup code. 
+ * + * $Id$ + */ + +/* + * This file handles the architecture-dependent parts of initialization + */ + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/stddef.h> +#include <linux/unistd.h> +#include <linux/ptrace.h> +#include <linux/slab.h> +#include <linux/user.h> +#include <linux/a.out.h> +#include <linux/tty.h> +#include <linux/ioport.h> +#include <linux/delay.h> +#include <linux/config.h> +#include <linux/init.h> +#include <linux/initrd.h> +#include <linux/highmem.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <asm/processor.h> +#include <linux/console.h> +#include <linux/seq_file.h> +#include <linux/root_dev.h> +#include <linux/pci.h> +#include <linux/acpi.h> +#include <linux/kallsyms.h> +#include <linux/edd.h> +#include <asm/mtrr.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/msr.h> +#include <asm/desc.h> +#include <video/edid.h> +#include <asm/e820.h> +#include <asm/dma.h> +#include <asm/mpspec.h> +#include <asm/mmu_context.h> +#include <asm/bootsetup.h> +#include <asm/proto.h> +#include <asm/setup.h> +#include <asm/mach_apic.h> +#include <asm/numa.h> + +/* + * Machine setup.. + */ + +struct cpuinfo_x86 boot_cpu_data; + +unsigned long mmu_cr4_features; + +int acpi_disabled; +EXPORT_SYMBOL(acpi_disabled); +#ifdef CONFIG_ACPI_BOOT +extern int __initdata acpi_ht; +extern acpi_interrupt_flags acpi_sci_flags; +int __initdata acpi_force = 0; +#endif + +int acpi_numa __initdata; + +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0x10000000; + +/* Boot loader ID as an integer, for the benefit of proc_dointvec */ +int bootloader_type; + +unsigned long saved_video_mode; + +#ifdef CONFIG_SWIOTLB +int swiotlb; +EXPORT_SYMBOL(swiotlb); +#endif + +/* + * Setup options + */ +struct drive_info_struct { char dummy[32]; } drive_info; +struct screen_info screen_info; +struct sys_desc_table_struct { + unsigned short length; + unsigned char table[0]; +}; + +struct edid_info edid_info; +struct e820map e820; + +extern int root_mountflags; +extern char _text, _etext, _edata, _end; + +char command_line[COMMAND_LINE_SIZE]; + +struct resource standard_io_resources[] = { + { .name = "dma1", .start = 0x00, .end = 0x1f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic1", .start = 0x20, .end = 0x21, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer0", .start = 0x40, .end = 0x43, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer1", .start = 0x50, .end = 0x53, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x60, .end = 0x6f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma page reg", .start = 0x80, .end = 0x8f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic2", .start = 0xa0, .end = 0xa1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma2", .start = 0xc0, .end = 0xdf, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "fpu", .start = 0xf0, .end = 0xff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO } +}; + +#define STANDARD_IO_RESOURCES \ + (sizeof standard_io_resources / sizeof standard_io_resources[0]) + +#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) + +struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_RAM, +}; +struct resource code_resource = { + .name = "Kernel code", + .start = 0, + .end = 0, + .flags = IORESOURCE_RAM, +}; + +#define 
IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM) + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_ROM, +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_ROM, +}; + +static struct resource adapter_rom_resources[] = { + { .name = "Adapter ROM", .start = 0xc8000, .end = 0, + .flags = IORESOURCE_ROM }, + { .name = "Adapter ROM", .start = 0, .end = 0, + .flags = IORESOURCE_ROM }, + { .name = "Adapter ROM", .start = 0, .end = 0, + .flags = IORESOURCE_ROM }, + { .name = "Adapter ROM", .start = 0, .end = 0, + .flags = IORESOURCE_ROM }, + { .name = "Adapter ROM", .start = 0, .end = 0, + .flags = IORESOURCE_ROM }, + { .name = "Adapter ROM", .start = 0, .end = 0, + .flags = IORESOURCE_ROM } +}; + +#define ADAPTER_ROM_RESOURCES \ + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_ROM, +}; + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_RAM, +}; + +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) + +static int __init romchecksum(unsigned char *rom, unsigned long length) +{ + unsigned char *p, sum = 0; + + for (p = rom; p < rom + length; p++) + sum += *p; + return sum == 0; +} + +static void __init probe_roms(void) +{ + unsigned long start, length, upper; + unsigned char *rom; + int i; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) 
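
probe_roms() above trusts a region only if it starts with the 0xaa55 signature, takes its size from byte 2 in 512-byte units, and requires the image to sum to zero modulo 256. Below is a small standalone sketch of those two checks over a fabricated ROM image; the byte-wise signature test matches the 16-bit 0xaa55 compare above on a little-endian machine.

#include <stdio.h>

static int romsignature(const unsigned char *rom)
{
	return rom[0] == 0x55 && rom[1] == 0xaa;
}

static int romchecksum(const unsigned char *rom, unsigned long length)
{
	unsigned char sum = 0;
	unsigned long i;

	for (i = 0; i < length; i++)
		sum += rom[i];
	return sum == 0;
}

int main(void)
{
	unsigned char rom[512] = { 0x55, 0xaa, 1 };	/* length byte: 1 * 512 bytes */
	unsigned char sum = 0;
	int i;

	/* patch the final byte so the image sums to zero, as real ROMs do */
	for (i = 0; i < 511; i++)
		sum += rom[i];
	rom[511] = (unsigned char)(0x100 - sum);

	printf("signature %d, checksum %d, length %d bytes\n",
	       romsignature(rom), romchecksum(rom, rom[2] * 512), rom[2] * 512);
	return 0;
}
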
*/ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + +static __init void parse_cmdline_early (char ** cmdline_p) +{ + char c = ' ', *to = command_line, *from = COMMAND_LINE; + int len = 0; + + /* Save unparsed command line copy for /proc/cmdline */ + memcpy(saved_command_line, COMMAND_LINE, COMMAND_LINE_SIZE); + saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; + + for (;;) { + if (c != ' ') + goto next_char; + +#ifdef CONFIG_SMP + /* + * If the BIOS enumerates physical processors before logical, + * maxcpus=N at enumeration-time can be used to disable HT. + */ + else if (!memcmp(from, "maxcpus=", 8)) { + extern unsigned int maxcpus; + + maxcpus = simple_strtoul(from + 8, NULL, 0); + } +#endif +#ifdef CONFIG_ACPI_BOOT + /* "acpi=off" disables both ACPI table parsing and interpreter init */ + if (!memcmp(from, "acpi=off", 8)) + disable_acpi(); + + if (!memcmp(from, "acpi=force", 10)) { + /* add later when we do DMI horrors: */ + acpi_force = 1; + acpi_disabled = 0; + } + + /* acpi=ht just means: do ACPI MADT parsing + at bootup, but don't enable the full ACPI interpreter */ + if (!memcmp(from, "acpi=ht", 7)) { + if (!acpi_force) + disable_acpi(); + acpi_ht = 1; + } + else if (!memcmp(from, "pci=noacpi", 10)) + acpi_disable_pci(); + else if (!memcmp(from, "acpi=noirq", 10)) + acpi_noirq_set(); + + else if (!memcmp(from, "acpi_sci=edge", 13)) + acpi_sci_flags.trigger = 1; + else if (!memcmp(from, "acpi_sci=level", 14)) + acpi_sci_flags.trigger = 3; + else if (!memcmp(from, "acpi_sci=high", 13)) + acpi_sci_flags.polarity = 1; + else if (!memcmp(from, "acpi_sci=low", 12)) + acpi_sci_flags.polarity = 3; + + /* acpi=strict disables out-of-spec workarounds */ + else if (!memcmp(from, "acpi=strict", 11)) { + acpi_strict = 1; + } +#endif + + if (!memcmp(from, "nolapic", 7) || + !memcmp(from, "disableapic", 11)) + disable_apic = 1; + + if (!memcmp(from, "noapic", 6)) + skip_ioapic_setup = 1; + + if (!memcmp(from, "apic", 4)) { + skip_ioapic_setup = 0; + ioapic_force = 1; + } + + if (!memcmp(from, "mem=", 4)) + parse_memopt(from+4, &from); + +#ifdef CONFIG_DISCONTIGMEM + if (!memcmp(from, "numa=", 5)) + numa_setup(from+5); +#endif + +#ifdef CONFIG_GART_IOMMU + if (!memcmp(from,"iommu=",6)) { + iommu_setup(from+6); + } +#endif + + if (!memcmp(from,"oops=panic", 10)) + panic_on_oops = 1; + + if (!memcmp(from, "noexec=", 7)) + nonx_setup(from + 7); + + next_char: + c = *(from++); + if (!c) + break; + if (COMMAND_LINE_SIZE <= ++len) + break; + *(to++) = c; + } + *to = '\0'; + *cmdline_p = command_line; +} + +#ifndef CONFIG_DISCONTIGMEM +static void __init contig_initmem_init(void) +{ + unsigned long bootmap_size, bootmap; + bootmap_size = 
bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); + if (bootmap == -1L) + panic("Cannot find bootmem map of size %ld\n",bootmap_size); + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); + e820_bootmem_free(&contig_page_data, 0, end_pfn << PAGE_SHIFT); + reserve_bootmem(bootmap, bootmap_size); +} +#endif + +/* Use inline assembly to define this because the nops are defined + as inline assembly strings in the include files and we cannot + get them easily into strings. */ +asm("\t.data\nk8nops: " + K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 + K8_NOP7 K8_NOP8); + +extern unsigned char k8nops[]; +static unsigned char *k8_nops[ASM_NOP_MAX+1] = { + NULL, + k8nops, + k8nops + 1, + k8nops + 1 + 2, + k8nops + 1 + 2 + 3, + k8nops + 1 + 2 + 3 + 4, + k8nops + 1 + 2 + 3 + 4 + 5, + k8nops + 1 + 2 + 3 + 4 + 5 + 6, + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +}; + +/* Replace instructions with better alternatives for this CPU type. + + This runs before SMP is initialized to avoid SMP problems with + self modifying code. This implies that assymetric systems where + APs have less capabilities than the boot processor are not handled. + In this case boot with "noreplacement". */ +void apply_alternatives(void *start, void *end) +{ + struct alt_instr *a; + int diff, i, k; + for (a = start; (void *)a < end; a++) { + if (!boot_cpu_has(a->cpuid)) + continue; + + BUG_ON(a->replacementlen > a->instrlen); + __inline_memcpy(a->instr, a->replacement, a->replacementlen); + diff = a->instrlen - a->replacementlen; + + /* Pad the rest with nops */ + for (i = a->replacementlen; diff > 0; diff -= k, i += k) { + k = diff; + if (k > ASM_NOP_MAX) + k = ASM_NOP_MAX; + __inline_memcpy(a->instr + i, k8_nops[k], k); + } + } +} + +static int no_replacement __initdata = 0; + +void __init alternative_instructions(void) +{ + extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; + if (no_replacement) + return; + apply_alternatives(__alt_instructions, __alt_instructions_end); +} + +static int __init noreplacement_setup(char *s) +{ + no_replacement = 1; + return 0; +} + +__setup("noreplacement", noreplacement_setup); + +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) +struct edd edd; +#ifdef CONFIG_EDD_MODULE +EXPORT_SYMBOL(edd); +#endif +/** + * copy_edd() - Copy the BIOS EDD information + * from boot_params into a safe place. 
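
apply_alternatives() above replaces an instruction sequence with a shorter one and must then fill the leftover bytes with multi-byte NOPs of at most ASM_NOP_MAX bytes each. The sketch below shows just that chunking loop; the value 8 for ASM_NOP_MAX is assumed here, and the NOP table itself is left out.

#include <stdio.h>

#define ASM_NOP_MAX 8	/* assumed longest entry in the k8_nops table */

static void pad_with_nops(int instrlen, int replacementlen)
{
	int diff = instrlen - replacementlen;
	int i, k;

	/* same loop shape as apply_alternatives(): biggest NOP first,
	 * then whatever remains */
	for (i = replacementlen; diff > 0; diff -= k, i += k) {
		k = diff > ASM_NOP_MAX ? ASM_NOP_MAX : diff;
		printf("copy a %d-byte nop to offset %d\n", k, i);
	}
}

int main(void)
{
	pad_with_nops(19, 5);	/* 14 spare bytes -> one 8-byte and one 6-byte nop */
	return 0;
}
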
+ * + */ +static inline void copy_edd(void) +{ + memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature)); + memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info)); + edd.mbr_signature_nr = EDD_MBR_SIG_NR; + edd.edd_info_nr = EDD_NR; +} +#else +static inline void copy_edd(void) +{ +} +#endif + +#define EBDA_ADDR_POINTER 0x40E +static void __init reserve_ebda_region(void) +{ + unsigned int addr; + /** + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E + */ + addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER); + addr <<= 4; + if (addr) + reserve_bootmem_generic(addr, PAGE_SIZE); +} + +void __init setup_arch(char **cmdline_p) +{ + unsigned long low_mem_size; + unsigned long kernel_end; + + ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); + drive_info = DRIVE_INFO; + screen_info = SCREEN_INFO; + edid_info = EDID_INFO; + saved_video_mode = SAVED_VIDEO_MODE; + bootloader_type = LOADER_TYPE; + +#ifdef CONFIG_BLK_DEV_RAM + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); +#endif + setup_memory_region(); + copy_edd(); + + if (!MOUNT_ROOT_RDONLY) + root_mountflags &= ~MS_RDONLY; + init_mm.start_code = (unsigned long) &_text; + init_mm.end_code = (unsigned long) &_etext; + init_mm.end_data = (unsigned long) &_edata; + init_mm.brk = (unsigned long) &_end; + + code_resource.start = virt_to_phys(&_text); + code_resource.end = virt_to_phys(&_etext)-1; + data_resource.start = virt_to_phys(&_etext); + data_resource.end = virt_to_phys(&_edata)-1; + + parse_cmdline_early(cmdline_p); + + early_identify_cpu(&boot_cpu_data); + + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + end_pfn = e820_end_of_ram(); + + check_efer(); + + init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); + +#ifdef CONFIG_ACPI_BOOT + /* + * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). + * Call this early for SRAT node setup. + */ + acpi_boot_table_init(); +#endif + +#ifdef CONFIG_ACPI_NUMA + /* + * Parse SRAT to discover nodes. + */ + acpi_numa_init(); +#endif + +#ifdef CONFIG_DISCONTIGMEM + numa_initmem_init(0, end_pfn); +#else + contig_initmem_init(); +#endif + + /* Reserve direct mapping */ + reserve_bootmem_generic(table_start << PAGE_SHIFT, + (table_end - table_start) << PAGE_SHIFT); + + /* reserve kernel */ + kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE); + reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY); + + /* + * reserve physical page 0 - it's a special BIOS page on many boxes, + * enabling clean reboots, SMP operation, laptop functions. + */ + reserve_bootmem_generic(0, PAGE_SIZE); + + /* reserve ebda region */ + reserve_ebda_region(); + +#ifdef CONFIG_SMP + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE); + + /* Reserve SMP trampoline */ + reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE); +#endif + +#ifdef CONFIG_ACPI_SLEEP + /* + * Reserve low memory region for sleep support. 
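
reserve_ebda_region() above reads the real-mode segment word stored at BDA offset 0x40e and shifts it left by four bits to get the physical address of the EBDA before reserving a page there. The shift is the whole trick; the segment value in this sketch is an invented example.

#include <stdio.h>

int main(void)
{
	unsigned short ebda_segment = 0x9fc0;		/* example BIOS value */
	unsigned long ebda_phys = (unsigned long)ebda_segment << 4;

	printf("EBDA segment 0x%04x -> physical 0x%lx\n",
	       ebda_segment, ebda_phys);		/* 0x9fc00 */
	return 0;
}
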
+ */ + acpi_reserve_bootmem(); +#endif +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); +#endif +#ifdef CONFIG_BLK_DEV_INITRD + if (LOADER_TYPE && INITRD_START) { + if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { + reserve_bootmem_generic(INITRD_START, INITRD_SIZE); + initrd_start = + INITRD_START ? INITRD_START + PAGE_OFFSET : 0; + initrd_end = initrd_start+INITRD_SIZE; + } + else { + printk(KERN_ERR "initrd extends beyond end of memory " + "(0x%08lx > 0x%08lx)\ndisabling initrd\n", + (unsigned long)(INITRD_START + INITRD_SIZE), + (unsigned long)(end_pfn << PAGE_SHIFT)); + initrd_start = 0; + } + } +#endif + paging_init(); + + check_ioapic(); + +#ifdef CONFIG_ACPI_BOOT + /* + * Read APIC and some other early information from ACPI tables. + */ + acpi_boot_init(); +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * get boot-time SMP configuration: + */ + if (smp_found_config) + get_smp_config(); + init_apic_mappings(); +#endif + + /* + * Request address space for all standard RAM and ROM resources + * and also for regions reported as reserved by the e820. + */ + probe_roms(); + e820_reserve_resources(); + + request_resource(&iomem_resource, &video_ram_resource); + + { + unsigned i; + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < STANDARD_IO_RESOURCES; i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + } + + /* Will likely break when you have unassigned resources with more + than 4GB memory and bridges that don't support more than 4GB. + Doing it properly would require to use pci_alloc_consistent + in this case. */ + low_mem_size = ((end_pfn << PAGE_SHIFT) + 0xfffff) & ~0xfffff; + if (low_mem_size > pci_mem_start) + pci_mem_start = low_mem_size; + +#ifdef CONFIG_GART_IOMMU + iommu_hole_init(); +#endif + +#ifdef CONFIG_VT +#if defined(CONFIG_VGA_CONSOLE) + conswitchp = &vga_con; +#elif defined(CONFIG_DUMMY_CONSOLE) + conswitchp = &dummy_con; +#endif +#endif +} + +static int __init get_model_name(struct cpuinfo_x86 *c) +{ + unsigned int *v; + + if (c->x86_cpuid_level < 0x80000004) + return 0; + + v = (unsigned int *) c->x86_model_id; + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); + c->x86_model_id[48] = 0; + return 1; +} + + +static void __init display_cacheinfo(struct cpuinfo_x86 *c) +{ + unsigned int n, dummy, eax, ebx, ecx, edx; + + n = c->x86_cpuid_level; + + if (n >= 0x80000005) { + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + c->x86_cache_size=(ecx>>24)+(edx>>24); + /* On K8 L1 TLB is inclusive, so don't count it */ + c->x86_tlbsize = 0; + } + + if (n >= 0x80000006) { + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); + ecx = cpuid_ecx(0x80000006); + c->x86_cache_size = ecx >> 16; + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); + + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", + c->x86_cache_size, ecx & 0xFF); + } + + if (n >= 0x80000007) + cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); + if (n >= 0x80000008) { + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } +} + + +static int __init init_amd(struct cpuinfo_x86 *c) +{ + int r; + int level; +#ifdef CONFIG_NUMA + int cpu; +#endif + + /* Bit 31 in normal CPUID used for 
nonstandard 3DNow ID; + 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ + clear_bit(0*32+31, &c->x86_capability); + + /* C-stepping K8? */ + level = cpuid_eax(1); + if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + set_bit(X86_FEATURE_K8_C, &c->x86_capability); + + r = get_model_name(c); + if (!r) { + switch (c->x86) { + case 15: + /* Should distinguish Models here, but this is only + a fallback anyways. */ + strcpy(c->x86_model_id, "Hammer"); + break; + } + } + display_cacheinfo(c); + + if (c->x86_cpuid_level >= 0x80000008) { + c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; + if (c->x86_num_cores & (c->x86_num_cores - 1)) + c->x86_num_cores = 1; + +#ifdef CONFIG_NUMA + /* On a dual core setup the lower bits of apic id + distingush the cores. Fix up the CPU<->node mappings + here based on that. + Assumes number of cores is a power of two. + When using SRAT use mapping from SRAT. */ + cpu = c->x86_apicid; + if (acpi_numa <= 0 && c->x86_num_cores > 1) { + cpu_to_node[cpu] = cpu >> hweight32(c->x86_num_cores - 1); + if (!node_online(cpu_to_node[cpu])) + cpu_to_node[cpu] = first_node(node_online_map); + } + printk(KERN_INFO "CPU %d(%d) -> Node %d\n", + cpu, c->x86_num_cores, cpu_to_node[cpu]); +#endif + } + + return r; +} + +static void __init detect_ht(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + u32 eax, ebx, ecx, edx; + int index_lsb, index_msb, tmp; + int cpu = smp_processor_id(); + + if (!cpu_has(c, X86_FEATURE_HT)) + return; + + cpuid(1, &eax, &ebx, &ecx, &edx); + smp_num_siblings = (ebx & 0xff0000) >> 16; + + if (smp_num_siblings == 1) { + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + } else if (smp_num_siblings > 1) { + index_lsb = 0; + index_msb = 31; + /* + * At this point we only support two siblings per + * processor package. + */ + if (smp_num_siblings > NR_CPUS) { + printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); + smp_num_siblings = 1; + return; + } + tmp = smp_num_siblings; + while ((tmp & 1) == 0) { + tmp >>=1 ; + index_lsb++; + } + tmp = smp_num_siblings; + while ((tmp & 0x80000000 ) == 0) { + tmp <<=1 ; + index_msb--; + } + if (index_lsb != index_msb ) + index_msb++; + phys_proc_id[cpu] = phys_pkg_id(index_msb); + + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + phys_proc_id[cpu]); + } +#endif +} + +static void __init sched_cmp_hack(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + /* AMD dual core looks like HT but isn't really. Hide it from the + scheduler. This works around problems with the domain scheduler. + Also probably gives slightly better scheduling and disables + SMT nice which is harmful on dual core. + TBD tune the domain scheduler for dual core. 
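
get_model_name() and detect_ht() above pull the brand string from CPUID leaves 0x80000002..0x80000004 and the logical-CPU count from bits 23:16 of leaf 1 EBX. The same data is visible from userspace; here is a quick sketch using GCC's <cpuid.h> helpers, which are a compiler facility rather than anything in this kernel tree.

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int v[13] = { 0 };	/* 48 brand bytes + terminating zero word */

	if (__get_cpuid_max(0x80000000, NULL) >= 0x80000004) {
		/* the same three leaves get_model_name() reads */
		__get_cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
		__get_cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
		__get_cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
		printf("model name: %.48s\n", (char *)v);
	}

	/* the same field detect_ht() reads: logical CPUs per package */
	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		printf("siblings: %u\n", (ebx >> 16) & 0xff);
	return 0;
}
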
*/ + if (c->x86_vendor == X86_VENDOR_AMD && cpu_has(c, X86_FEATURE_CMP_LEGACY)) + smp_num_siblings = 1; +#endif +} + +static void __init init_intel(struct cpuinfo_x86 *c) +{ + /* Cache sizes */ + unsigned n; + + init_intel_cacheinfo(c); + n = c->x86_cpuid_level; + if (n >= 0x80000008) { + unsigned eax = cpuid_eax(0x80000008); + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } + + if (c->x86 == 15) + c->x86_cache_alignment = c->x86_clflush_size * 2; +} + +void __init get_cpu_vendor(struct cpuinfo_x86 *c) +{ + char *v = c->x86_vendor_id; + + if (!strcmp(v, "AuthenticAMD")) + c->x86_vendor = X86_VENDOR_AMD; + else if (!strcmp(v, "GenuineIntel")) + c->x86_vendor = X86_VENDOR_INTEL; + else + c->x86_vendor = X86_VENDOR_UNKNOWN; +} + +struct cpu_model_info { + int vendor; + int family; + char *model_names[16]; +}; + +/* Do some early cpuid on the boot CPU to get some parameter that are + needed before check_bugs. Everything advanced is in identify_cpu + below. */ +void __init early_identify_cpu(struct cpuinfo_x86 *c) +{ + u32 tfms; + + c->loops_per_jiffy = loops_per_jiffy; + c->x86_cache_size = -1; + c->x86_vendor = X86_VENDOR_UNKNOWN; + c->x86_model = c->x86_mask = 0; /* So far unknown... */ + c->x86_vendor_id[0] = '\0'; /* Unset */ + c->x86_model_id[0] = '\0'; /* Unset */ + c->x86_clflush_size = 64; + c->x86_cache_alignment = c->x86_clflush_size; + c->x86_num_cores = 1; + c->x86_apicid = c == &boot_cpu_data ? 0 : c - cpu_data; + c->x86_cpuid_level = 0; + memset(&c->x86_capability, 0, sizeof c->x86_capability); + + /* Get vendor name */ + cpuid(0x00000000, (unsigned int *)&c->cpuid_level, + (unsigned int *)&c->x86_vendor_id[0], + (unsigned int *)&c->x86_vendor_id[8], + (unsigned int *)&c->x86_vendor_id[4]); + + get_cpu_vendor(c); + + /* Initialize the standard set of capabilities */ + /* Note that the vendor-specific code below might override */ + + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + __u32 misc; + cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], + &c->x86_capability[0]); + c->x86 = (tfms >> 8) & 0xf; + c->x86_model = (tfms >> 4) & 0xf; + c->x86_mask = tfms & 0xf; + if (c->x86 == 0xf) { + c->x86 += (tfms >> 20) & 0xff; + c->x86_model += ((tfms >> 16) & 0xF) << 4; + } + if (c->x86_capability[0] & (1<<19)) + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + c->x86_apicid = misc >> 24; + } else { + /* Have CPUID level 0 only - unheard of */ + c->x86 = 4; + } +} + +/* + * This does the hard work of actually picking apart the CPU stuff... + */ +void __init identify_cpu(struct cpuinfo_x86 *c) +{ + int i; + u32 xlvl; + + early_identify_cpu(c); + + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + c->x86_cpuid_level = xlvl; + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[5] = cpuid_ecx(0x80000001); + } + if (xlvl >= 0x80000004) + get_model_name(c); /* Default name */ + } + + /* Transmeta-defined flags: level 0x80860001 */ + xlvl = cpuid_eax(0x80860000); + if ((xlvl & 0xffff0000) == 0x80860000) { + /* Don't set x86_cpuid_level here for now to not confuse. */ + if (xlvl >= 0x80860001) + c->x86_capability[2] = cpuid_edx(0x80860001); + } + + /* + * Vendor-specific initialization. In this section we + * canonicalize the feature flags, meaning if there are + * features a certain CPU supports which CPUID doesn't + * tell us, CPUID claiming incorrect flags, or other bugs, + * we handle them here. 
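
early_identify_cpu() above decodes family, model and stepping from CPUID leaf 1 EAX, folding in the extended family and model fields only when the base family is 0xf. A standalone version of that decode, fed one example value:

#include <stdio.h>

static void decode_tfms(unsigned int tfms)
{
	unsigned int family   = (tfms >> 8) & 0xf;
	unsigned int model    = (tfms >> 4) & 0xf;
	unsigned int stepping = tfms & 0xf;

	if (family == 0xf) {
		family += (tfms >> 20) & 0xff;
		model  += ((tfms >> 16) & 0xf) << 4;
	}
	printf("family %u, model %u, stepping %u\n", family, model, stepping);
}

int main(void)
{
	decode_tfms(0x00000f48);	/* family 15, model 4, stepping 8 */
	return 0;
}
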
+ * + * At the end of this section, c->x86_capability better + * indicate the features this CPU genuinely supports! + */ + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + init_amd(c); + break; + + case X86_VENDOR_INTEL: + init_intel(c); + break; + + case X86_VENDOR_UNKNOWN: + default: + display_cacheinfo(c); + break; + } + + select_idle_routine(c); + detect_ht(c); + sched_cmp_hack(c); + + /* + * On SMP, boot_cpu_data holds the common feature set between + * all CPUs; so make sure that we indicate which features are + * common between the CPUs. The first time this routine gets + * executed, c == &boot_cpu_data. + */ + if (c != &boot_cpu_data) { + /* AND the already accumulated flags with these */ + for (i = 0 ; i < NCAPINTS ; i++) + boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; + } + +#ifdef CONFIG_X86_MCE + mcheck_init(c); +#endif +#ifdef CONFIG_NUMA + if (c != &boot_cpu_data) + numa_add_cpu(c - cpu_data); +#endif +} + + +void __init print_cpu_info(struct cpuinfo_x86 *c) +{ + if (c->x86_model_id[0]) + printk("%s", c->x86_model_id); + + if (c->x86_mask || c->cpuid_level >= 0) + printk(" stepping %02x\n", c->x86_mask); + else + printk("\n"); +} + +/* + * Get CPU information for use by the procfs. + */ + +static int show_cpuinfo(struct seq_file *m, void *v) +{ + struct cpuinfo_x86 *c = v; + + /* + * These flag bits must match the definitions in <asm/cpufeature.h>. + * NULL means this bit is undefined or reserved; either way it doesn't + * have meaning as far as Linux is concerned. Note that it's important + * to realize there is a difference between this table and CPUID -- if + * applications want to get the raw CPUID data, they should access + * /dev/cpu/<cpu_nr>/cpuid instead. + */ + static char *x86_cap_flags[] = { + /* Intel-defined */ + "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL, + + /* AMD-defined */ + "pni", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, + NULL, "fxsr_opt", NULL, NULL, NULL, "lm", "3dnowext", "3dnow", + + /* Transmeta-defined */ + "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Other (Linux-defined) */ + "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Intel-defined (#2) */ + "pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est", + "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* AMD-defined (#2) */ + "lahf_lm", "cmp_legacy", NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL + }; + static char *x86_power_flags[] = { + "ts", /* temperature sensor */ + "fid", /* frequency id control */ + "vid", /* voltage id control */ + "ttp", /* thermal trip */ + "tm", + "stc" + }; + + +#ifdef CONFIG_SMP + if (!cpu_online(c-cpu_data)) + return 0; +#endif + + seq_printf(m,"processor\t: %u\n" + 
"vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n", + (unsigned)(c-cpu_data), + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", + c->x86, + (int)c->x86_model, + c->x86_model_id[0] ? c->x86_model_id : "unknown"); + + if (c->x86_mask || c->cpuid_level >= 0) + seq_printf(m, "stepping\t: %d\n", c->x86_mask); + else + seq_printf(m, "stepping\t: unknown\n"); + + if (cpu_has(c,X86_FEATURE_TSC)) { + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", + cpu_khz / 1000, (cpu_khz % 1000)); + } + + /* Cache size */ + if (c->x86_cache_size >= 0) + seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); + +#ifdef CONFIG_SMP + seq_printf(m, "physical id\t: %d\n", phys_proc_id[c - cpu_data]); + seq_printf(m, "siblings\t: %d\n", c->x86_num_cores * smp_num_siblings); +#endif + + seq_printf(m, + "fpu\t\t: yes\n" + "fpu_exception\t: yes\n" + "cpuid level\t: %d\n" + "wp\t\t: yes\n" + "flags\t\t:", + c->cpuid_level); + + { + int i; + for ( i = 0 ; i < 32*NCAPINTS ; i++ ) + if ( test_bit(i, &c->x86_capability) && + x86_cap_flags[i] != NULL ) + seq_printf(m, " %s", x86_cap_flags[i]); + } + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), + (c->loops_per_jiffy/(5000/HZ)) % 100); + + if (c->x86_tlbsize > 0) + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); + seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); + seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); + + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", + c->x86_phys_bits, c->x86_virt_bits); + + seq_printf(m, "power management:"); + { + unsigned i; + for (i = 0; i < 32; i++) + if (c->x86_power & (1 << i)) { + if (i < ARRAY_SIZE(x86_power_flags)) + seq_printf(m, " %s", x86_power_flags[i]); + else + seq_printf(m, " [%d]", i); + } + } + seq_printf(m, "\n"); + + if (c->x86_num_cores > 1) + seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores); + + seq_printf(m, "\n\n"); + + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + return *pos < NR_CPUS ? cpu_data + *pos : NULL; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return c_start(m, pos); +} + +static void c_stop(struct seq_file *m, void *v) +{ +} + +struct seq_operations cpuinfo_op = { + .start =c_start, + .next = c_next, + .stop = c_stop, + .show = show_cpuinfo, +}; diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c new file mode 100644 index 00000000000..b5305b04bc4 --- /dev/null +++ b/arch/x86_64/kernel/setup64.c @@ -0,0 +1,292 @@ +/* + * X86-64 specific CPU setup. + * Copyright (C) 1995 Linus Torvalds + * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. + * See setup.c for older changelog. 
+ * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $ + */ +#include <linux/config.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/bootmem.h> +#include <linux/bitops.h> +#include <asm/pda.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/desc.h> +#include <asm/atomic.h> +#include <asm/mmu_context.h> +#include <asm/smp.h> +#include <asm/i387.h> +#include <asm/percpu.h> +#include <asm/mtrr.h> +#include <asm/proto.h> +#include <asm/mman.h> +#include <asm/numa.h> + +char x86_boot_params[2048] __initdata = {0,}; + +cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; + +struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; + +extern struct task_struct init_task; + +extern unsigned char __per_cpu_start[], __per_cpu_end[]; + +extern struct desc_ptr cpu_gdt_descr[]; +struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; + +char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); + +unsigned long __supported_pte_mask = ~0UL; +static int do_not_nx __initdata = 0; + +/* noexec=on|off +Control non executable mappings for 64bit processes. + +on Enable(default) +off Disable +*/ +int __init nonx_setup(char *str) +{ + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + do_not_nx = 0; + } else if (!strncmp(str, "off", 3)) { + do_not_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; +} +__setup("noexec=", nonx_setup); /* parsed early actually */ + +int force_personality32 = READ_IMPLIES_EXEC; + +/* noexec32=on|off +Control non executable heap for 32bit processes. +To control the stack too use noexec=off + +on PROT_READ does not imply PROT_EXEC for 32bit processes +off PROT_READ implies PROT_EXEC (default) +*/ +static int __init nonx32_setup(char *str) +{ + if (!strcmp(str, "on")) + force_personality32 &= ~READ_IMPLIES_EXEC; + else if (!strcmp(str, "off")) + force_personality32 |= READ_IMPLIES_EXEC; + return 0; +} +__setup("noexec32=", nonx32_setup); + +/* + * Great future plan: + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 
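
nonx_setup() above is a plain strncmp parser for the noexec= boot option that either keeps or strips the NX bit from the set of PTE bits the kernel will use. The sketch below keeps the same shape; the NX constant is an illustrative stand-in, not the real _PAGE_NX header value.

#include <stdio.h>
#include <string.h>

#define NX_BIT (1ULL << 63)	/* stand-in for _PAGE_NX */

static unsigned long long supported_pte_mask = ~0ULL;

static void parse_noexec(const char *str)
{
	if (!strncmp(str, "on", 2))
		supported_pte_mask |= NX_BIT;	/* NX pages allowed (default) */
	else if (!strncmp(str, "off", 3))
		supported_pte_mask &= ~NX_BIT;	/* never set NX in a PTE */
}

int main(void)
{
	parse_noexec("off");
	printf("NX usable: %d\n", !!(supported_pte_mask & NX_BIT));	/* 0 */
	parse_noexec("on");
	printf("NX usable: %d\n", !!(supported_pte_mask & NX_BIT));	/* 1 */
	return 0;
}
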
+ * Always point %gs to its beginning + */ +void __init setup_per_cpu_areas(void) +{ + int i; + unsigned long size; + + /* Copy section for each CPU (we discard the original) */ + size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); +#ifdef CONFIG_MODULES + if (size < PERCPU_ENOUGH_ROOM) + size = PERCPU_ENOUGH_ROOM; +#endif + + for (i = 0; i < NR_CPUS; i++) { + unsigned char *ptr; + + if (!NODE_DATA(cpu_to_node(i))) { + printk("cpu with no node %d, num_online_nodes %d\n", + i, num_online_nodes()); + ptr = alloc_bootmem(size); + } else { + ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); + } + if (!ptr) + panic("Cannot allocate cpu data for CPU %d\n", i); + cpu_pda[i].data_offset = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + } +} + +void pda_init(int cpu) +{ + struct x8664_pda *pda = &cpu_pda[cpu]; + + /* Setup up data that may be needed in __get_free_pages early */ + asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); + wrmsrl(MSR_GS_BASE, cpu_pda + cpu); + + pda->me = pda; + pda->cpunumber = cpu; + pda->irqcount = -1; + pda->kernelstack = + (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; + pda->active_mm = &init_mm; + pda->mmu_state = 0; + + if (cpu == 0) { + /* others are initialized in smpboot.c */ + pda->pcurrent = &init_task; + pda->irqstackptr = boot_cpu_stack; + } else { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", cpu); + } + + asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); + + pda->irqstackptr += IRQSTACKSIZE-64; +} + +char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ] +__attribute__((section(".bss.page_aligned"))); + +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + /* + * LSTAR and STAR live in a bit strange symbiosis. + * They both write to the same internal register. STAR allows to set CS/DS + * but only a 32bit target. LSTAR sets the 64bit rip. + */ + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsrl(MSR_LSTAR, system_call); + +#ifdef CONFIG_IA32_EMULATION + syscall32_cpu_init (); +#endif + + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); +} + +void __init check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || do_not_nx) { + __supported_pte_mask &= ~_PAGE_NX; + } +} + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + * A lot of state is already set up in PDA init. 
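
The STAR/LSTAR comment in syscall_init() above describes how the SYSCALL machinery finds its segments: bits 47:32 of STAR supply the kernel CS/SS base used on SYSCALL, bits 63:48 the base used by 32-bit SYSRET, while LSTAR holds the 64-bit entry RIP. A sketch of just the packing; the selector values are placeholders, not the kernel's real __KERNEL_CS and __USER32_CS.

#include <stdio.h>

int main(void)
{
	unsigned short kernel_cs = 0x10;	/* placeholder for __KERNEL_CS */
	unsigned short user32_cs = 0x23;	/* placeholder for __USER32_CS */

	unsigned long long star = ((unsigned long long)user32_cs << 48) |
				  ((unsigned long long)kernel_cs << 32);

	printf("STAR          = 0x%016llx\n", star);
	printf("SYSCALL base  = 0x%02llx\n", (star >> 32) & 0xffff);
	printf("SYSRET32 base = 0x%02llx\n", (star >> 48) & 0xffff);
	return 0;
}
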
+ */ +void __init cpu_init (void) +{ +#ifdef CONFIG_SMP + int cpu = stack_smp_processor_id(); +#else + int cpu = smp_processor_id(); +#endif + struct tss_struct *t = &per_cpu(init_tss, cpu); + unsigned long v; + char *estacks = NULL; + struct task_struct *me; + int i; + + /* CPU 0 is initialised in head64.c */ + if (cpu != 0) { + pda_init(cpu); + } else + estacks = boot_exception_stacks; + + me = current; + + if (cpu_test_and_set(cpu, cpu_initialized)) + panic("CPU#%d already initialized!\n", cpu); + + printk("Initializing CPU#%d\n", cpu); + + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + if (cpu) { + memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE); + } + + cpu_gdt_descr[cpu].size = GDT_SIZE; + cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu]; + asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); + asm volatile("lidt %0" :: "m" (idt_descr)); + + memcpy(me->thread.tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8); + + /* + * Delete NT + */ + + asm volatile("pushfq ; popq %%rax ; btr $14,%%rax ; pushq %%rax ; popfq" ::: "eax"); + + syscall_init(); + + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + + check_efer(); + + /* + * set up and load the per-CPU TSS + */ + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, + EXCEPTION_STACK_ORDER); + if (!estacks) + panic("Cannot allocate exception stack %ld %d\n", + v, cpu); + } + estacks += EXCEPTION_STKSZ; + t->ist[v] = (unsigned long)estacks; + } + + t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + /* + * <= is required because the CPU will access up to + * 8 bits beyond the end of the IO permission bitmap. 
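
The per-CPU exception stacks set up in cpu_init() above are carved out of one contiguous buffer: each pass bumps the running pointer by EXCEPTION_STKSZ and records it, so every IST entry points at the top of its own slice and the CPU pushes downward from there. The stack count and size below are illustrative, not the header values.

#include <stdio.h>

#define N_EXCEPTION_STACKS 5	/* illustrative */
#define EXCEPTION_STKSZ    4096	/* illustrative */

static char exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ];

int main(void)
{
	char *estacks = exception_stacks;
	unsigned long ist[N_EXCEPTION_STACKS];
	int v;

	for (v = 0; v < N_EXCEPTION_STACKS; v++) {
		estacks += EXCEPTION_STKSZ;		/* top of slice v */
		ist[v] = (unsigned long)estacks;
		printf("ist[%d] = buffer + %ld\n", v,
		       (long)(ist[v] - (unsigned long)exception_stacks));
	}
	return 0;
}
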
+ */ + for (i = 0; i <= IO_BITMAP_LONGS; i++) + t->io_bitmap[i] = ~0UL; + + atomic_inc(&init_mm.mm_count); + me->active_mm = &init_mm; + if (me->mm) + BUG(); + enter_lazy_tlb(&init_mm, me); + + set_tss_desc(cpu, t); + load_TR_desc(); + load_LDT(&init_mm.context); + + /* + * Clear all 6 debug registers: + */ + + set_debug(0UL, 0); + set_debug(0UL, 1); + set_debug(0UL, 2); + set_debug(0UL, 3); + set_debug(0UL, 6); + set_debug(0UL, 7); + + fpu_init(); +} diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c new file mode 100644 index 00000000000..5ace32c6ac5 --- /dev/null +++ b/arch/x86_64/kernel/signal.c @@ -0,0 +1,486 @@ +/* + * linux/arch/x86_64/kernel/signal.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes + * 2000-2002 x86-64 support by Andi Kleen + * + * $Id: signal.c,v 1.18 2001/10/17 22:30:37 ak Exp $ + */ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/kernel.h> +#include <linux/signal.h> +#include <linux/errno.h> +#include <linux/wait.h> +#include <linux/ptrace.h> +#include <linux/unistd.h> +#include <linux/stddef.h> +#include <linux/personality.h> +#include <linux/compiler.h> +#include <asm/ucontext.h> +#include <asm/uaccess.h> +#include <asm/i387.h> +#include <asm/proto.h> + +/* #define DEBUG_SIG 1 */ + +#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) + +void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs * regs); +void ia32_setup_frame(int sig, struct k_sigaction *ka, + sigset_t *set, struct pt_regs * regs); + +asmlinkage long +sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs) +{ + sigset_t saveset, newset; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&newset, unewset, sizeof(newset))) + return -EFAULT; + sigdelsetmask(&newset, ~_BLOCKABLE); + + spin_lock_irq(¤t->sighand->siglock); + saveset = current->blocked; + current->blocked = newset; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); +#ifdef DEBUG_SIG + printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n", + saveset, newset, regs, regs->rip); +#endif + regs->rax = -EINTR; + while (1) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + if (do_signal(regs, &saveset)) + return -EINTR; + } +} + +asmlinkage long +sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, + struct pt_regs *regs) +{ + return do_sigaltstack(uss, uoss, regs->rsp); +} + + +/* + * Do a signal return; undo the signal stack. 
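
sys_rt_sigsuspend() above atomically installs a caller-supplied signal mask and sleeps until a signal is delivered, returning -EINTR once the handler has run. The userspace view of that contract is sigsuspend(); a small self-contained example follows, where the forked child and SIGUSR1 are only a convenient way to generate a signal.

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got;

static void handler(int sig)
{
	got = sig;
}

int main(void)
{
	struct sigaction sa = { .sa_handler = handler };
	sigset_t block, old;

	sigemptyset(&sa.sa_mask);
	sigaction(SIGUSR1, &sa, NULL);

	sigemptyset(&block);
	sigaddset(&block, SIGUSR1);
	sigprocmask(SIG_BLOCK, &block, &old);	/* SIGUSR1 now held pending */

	if (fork() == 0) {
		sleep(1);
		kill(getppid(), SIGUSR1);
		_exit(0);
	}

	/* atomically switch back to the old mask and sleep; this is the
	 * userspace side of sys_rt_sigsuspend() */
	while (!got)
		sigsuspend(&old);

	printf("woken by signal %d\n", (int)got);
	return 0;
}
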
+ */ + +struct rt_sigframe +{ + char *pretcode; + struct ucontext uc; + struct siginfo info; +}; + +static int +restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax) +{ + unsigned int err = 0; + + /* Always make any pending restarted system calls return -EINTR */ + current_thread_info()->restart_block.fn = do_no_restart_syscall; + +#define COPY(x) err |= __get_user(regs->x, &sc->x) + + COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx); + COPY(rdx); COPY(rcx); COPY(rip); + COPY(r8); + COPY(r9); + COPY(r10); + COPY(r11); + COPY(r12); + COPY(r13); + COPY(r14); + COPY(r15); + + { + unsigned int tmpflags; + err |= __get_user(tmpflags, &sc->eflags); + regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); + regs->orig_rax = -1; /* disable syscall checks */ + } + + { + struct _fpstate __user * buf; + err |= __get_user(buf, &sc->fpstate); + + if (buf) { + if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) + goto badframe; + err |= restore_i387(buf); + } else { + struct task_struct *me = current; + if (used_math()) { + clear_fpu(me); + clear_used_math(); + } + } + } + + err |= __get_user(*prax, &sc->rax); + return err; + +badframe: + return 1; +} + +asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + sigset_t set; + unsigned long eax; + + frame = (struct rt_sigframe __user *)(regs->rsp - 8); + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) { + goto badframe; + } + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) { + goto badframe; + } + + sigdelsetmask(&set, ~_BLOCKABLE); + spin_lock_irq(¤t->sighand->siglock); + current->blocked = set; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) + goto badframe; + +#ifdef DEBUG_SIG + printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs.rip,regs.rsp,frame,eax); +#endif + + if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) + goto badframe; + + return eax; + +badframe: + signal_fault(regs,frame,"sigreturn"); + return 0; +} + +/* + * Set up a signal frame. + */ + +static inline int +setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) +{ + int err = 0; + unsigned long eflags; + + err |= __put_user(0, &sc->gs); + err |= __put_user(0, &sc->fs); + + err |= __put_user(regs->rdi, &sc->rdi); + err |= __put_user(regs->rsi, &sc->rsi); + err |= __put_user(regs->rbp, &sc->rbp); + err |= __put_user(regs->rsp, &sc->rsp); + err |= __put_user(regs->rbx, &sc->rbx); + err |= __put_user(regs->rdx, &sc->rdx); + err |= __put_user(regs->rcx, &sc->rcx); + err |= __put_user(regs->rax, &sc->rax); + err |= __put_user(regs->r8, &sc->r8); + err |= __put_user(regs->r9, &sc->r9); + err |= __put_user(regs->r10, &sc->r10); + err |= __put_user(regs->r11, &sc->r11); + err |= __put_user(regs->r12, &sc->r12); + err |= __put_user(regs->r13, &sc->r13); + err |= __put_user(regs->r14, &sc->r14); + err |= __put_user(regs->r15, &sc->r15); + err |= __put_user(me->thread.trap_no, &sc->trapno); + err |= __put_user(me->thread.error_code, &sc->err); + err |= __put_user(regs->rip, &sc->rip); + eflags = regs->eflags; + if (current->ptrace & PT_PTRACED) { + eflags &= ~TF_MASK; + } + err |= __put_user(eflags, &sc->eflags); + err |= __put_user(mask, &sc->oldmask); + err |= __put_user(me->thread.cr2, &sc->cr2); + + return err; +} + +/* + * Determine which stack to use.. 
+ */ + +static void __user * +get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) +{ + unsigned long rsp; + + /* Default to using normal stack - redzone*/ + rsp = regs->rsp - 128; + + /* This is the X/Open sanctioned signal stack switching. */ + /* RED-PEN: redzone on that stack? */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (sas_ss_flags(rsp) == 0) + rsp = current->sas_ss_sp + current->sas_ss_size; + } + + return (void __user *)round_down(rsp - size, 16); +} + +static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs * regs) +{ + struct rt_sigframe __user *frame; + struct _fpstate __user *fp = NULL; + int err = 0; + struct task_struct *me = current; + + if (used_math()) { + fp = get_stack(ka, regs, sizeof(struct _fpstate)); + frame = (void __user *)round_down( + (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; + + if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) + goto give_sigsegv; + + if (save_i387(fp) < 0) + err |= -1; + } else + frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + goto give_sigsegv; + + if (ka->sa.sa_flags & SA_SIGINFO) { + err |= copy_siginfo_to_user(&frame->info, info); + if (err) + goto give_sigsegv; + } + + /* Create the ucontext. */ + err |= __put_user(0, &frame->uc.uc_flags); + err |= __put_user(0, &frame->uc.uc_link); + err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(sas_ss_flags(regs->rsp), + &frame->uc.uc_stack.ss_flags); + err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); + err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); + if (sizeof(*set) == 16) { + __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); + __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); + } else + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + /* x86-64 should always use SA_RESTORER. */ + if (ka->sa.sa_flags & SA_RESTORER) { + err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); + } else { + /* could use a vstub here */ + goto give_sigsegv; + } + + if (err) + goto give_sigsegv; + +#ifdef DEBUG_SIG + printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); +#endif + + /* Set up registers for signal handler */ + { + struct exec_domain *ed = current_thread_info()->exec_domain; + if (unlikely(ed && ed->signal_invmap && sig < 32)) + sig = ed->signal_invmap[sig]; + } + regs->rdi = sig; + /* In case the signal handler was declared without prototypes */ + regs->rax = 0; + + /* This also works for non SA_SIGINFO handlers because they expect the + next argument after the signal number on the stack. 
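
get_stack() and setup_rt_frame() above place the signal frame by stepping over the 128-byte red zone below the interrupted RSP, reserving room for the frame, aligning down to 16 bytes and finally subtracting 8 so the handler starts with the stack looking as it would right after a CALL. The arithmetic in isolation, with made-up sizes and addresses:

#include <stdio.h>

static unsigned long round_down_to(unsigned long x, unsigned long align)
{
	return x & ~(align - 1);
}

int main(void)
{
	unsigned long user_rsp   = 0x7fffffffe360UL;	/* made-up interrupted RSP */
	unsigned long frame_size = 0x400;		/* stand-in for sizeof(struct rt_sigframe) */

	unsigned long rsp   = user_rsp - 128;		/* honour the red zone */
	unsigned long frame = round_down_to(rsp - frame_size, 16) - 8;

	printf("frame at 0x%lx (RSP %% 16 == %lu at handler entry)\n",
	       frame, frame % 16);
	return 0;
}
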
 */
+	regs->rsi = (unsigned long)&frame->info;
+	regs->rdx = (unsigned long)&frame->uc;
+	regs->rip = (unsigned long) ka->sa.sa_handler;
+
+	regs->rsp = (unsigned long)frame;
+
+	set_fs(USER_DS);
+	if (regs->eflags & TF_MASK) {
+		if ((current->ptrace & (PT_PTRACED | PT_DTRACE)) == (PT_PTRACED | PT_DTRACE)) {
+			ptrace_notify(SIGTRAP);
+		} else {
+			regs->eflags &= ~TF_MASK;
+		}
+	}
+
+#ifdef DEBUG_SIG
+	printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+		current->comm, current->pid, frame, regs->rip, frame->pretcode);
+#endif
+
+	return;
+
+give_sigsegv:
+	force_sigsegv(sig, current);
+}
+
+/*
+ * OK, we're invoking a handler
+ */
+
+static void
+handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
+		sigset_t *oldset, struct pt_regs *regs)
+{
+#ifdef DEBUG_SIG
+	printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n",
+		current->pid, sig,
+		regs->rip, regs->rsp, regs);
+#endif
+
+	/* Are we from a system call? */
+	if ((long)regs->orig_rax >= 0) {
+		/* If so, check system call restarting.. */
+		switch (regs->rax) {
+			case -ERESTART_RESTARTBLOCK:
+			case -ERESTARTNOHAND:
+				regs->rax = -EINTR;
+				break;
+
+			case -ERESTARTSYS:
+				if (!(ka->sa.sa_flags & SA_RESTART)) {
+					regs->rax = -EINTR;
+					break;
+				}
+				/* fallthrough */
+			case -ERESTARTNOINTR:
+				regs->rax = regs->orig_rax;
+				regs->rip -= 2;
+				break;
+		}
+	}
+
+#ifdef CONFIG_IA32_EMULATION
+	if (test_thread_flag(TIF_IA32)) {
+		if (ka->sa.sa_flags & SA_SIGINFO)
+			ia32_setup_rt_frame(sig, ka, info, oldset, regs);
+		else
+			ia32_setup_frame(sig, ka, oldset, regs);
+	} else
+#endif
+	setup_rt_frame(sig, ka, info, oldset, regs);
+
+	if (!(ka->sa.sa_flags & SA_NODEFER)) {
+		spin_lock_irq(&current->sighand->siglock);
+		sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
+		sigaddset(&current->blocked, sig);
+		recalc_sigpending();
+		spin_unlock_irq(&current->sighand->siglock);
+	}
+}
+
+/*
+ * Note that 'init' is a special process: it doesn't get signals it doesn't
+ * want to handle. Thus you cannot kill init with a SIGKILL, even by
+ * mistake.
+ */
+int do_signal(struct pt_regs *regs, sigset_t *oldset)
+{
+	struct k_sigaction ka;
+	siginfo_t info;
+	int signr;
+
+	/*
+	 * We want the common case to go fast, which
+	 * is why we may in certain cases get here from
+	 * kernel mode. Just return without doing anything
+	 * if so.
+	 */
+	if ((regs->cs & 3) != 3)
+		return 1;
+
+	if (try_to_freeze(0))
+		goto no_signal;
+
+	if (!oldset)
+		oldset = &current->blocked;
+
+	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
+	if (signr > 0) {
+		/* Re-enable any watchpoints before delivering the
+		 * signal to user space. The processor register will
+		 * have been cleared if the watchpoint triggered
+		 * inside the kernel.
+		 */
+		if (current->thread.debugreg7)
+			asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7));
+
+		/* Whee!  Actually deliver the signal.  */
+		handle_signal(signr, &info, &ka, oldset, regs);
+		return 1;
+	}
+
+ no_signal:
+	/* Did we come from a system call?
*/ + if ((long)regs->orig_rax >= 0) { + /* Restart the system call - no handlers present */ + long res = regs->rax; + if (res == -ERESTARTNOHAND || + res == -ERESTARTSYS || + res == -ERESTARTNOINTR) { + regs->rax = regs->orig_rax; + regs->rip -= 2; + } + if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { + regs->rax = __NR_restart_syscall; + regs->rip -= 2; + } + } + return 0; +} + +void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags) +{ +#ifdef DEBUG_SIG + printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", + thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); +#endif + + /* Pending single-step? */ + if (thread_info_flags & _TIF_SINGLESTEP) { + regs->eflags |= TF_MASK; + clear_thread_flag(TIF_SINGLESTEP); + } + + /* deal with pending signal delivery */ + if (thread_info_flags & _TIF_SIGPENDING) + do_signal(regs,oldset); +} + +void signal_fault(struct pt_regs *regs, void __user *frame, char *where) +{ + struct task_struct *me = current; + if (exception_trace) + printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", + me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); + + force_sig(SIGSEGV, me); +} diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c new file mode 100644 index 00000000000..e3cdbf9a88b --- /dev/null +++ b/arch/x86_64/kernel/smp.c @@ -0,0 +1,415 @@ +/* + * Intel SMP support routines. + * + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> + * (c) 2002,2003 Andi Kleen, SuSE Labs. + * + * This code is released under the GNU General Public License version 2 or + * later. + */ + +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/irq.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/smp_lock.h> +#include <linux/smp.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/interrupt.h> + +#include <asm/mtrr.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/mach_apic.h> +#include <asm/mmu_context.h> +#include <asm/proto.h> + +/* + * Smarter SMP flushing macros. + * c/o Linus Torvalds. + * + * These mean you can really definitely utterly forget about + * writing to user space from interrupts. (Its not allowed anyway). + * + * Optimizations Manfred Spraul <manfred@colorfullife.com> + */ + +static cpumask_t flush_cpumask; +static struct mm_struct * flush_mm; +static unsigned long flush_va; +static DEFINE_SPINLOCK(tlbstate_lock); +#define FLUSH_ALL -1ULL + +/* + * We cannot call mmdrop() because we are in interrupt context, + * instead update mm->cpu_vm_mask. + */ +static inline void leave_mm (unsigned long cpu) +{ + if (read_pda(mmu_state) == TLBSTATE_OK) + BUG(); + clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask); + load_cr3(swapper_pg_dir); +} + +/* + * + * The flush IPI assumes that a thread switch happens in this order: + * [cpu0: the cpu that switches] + * 1) switch_mm() either 1a) or 1b) + * 1a) thread switch to a different mm + * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask); + * Stop ipi delivery for the old mm. This is not synchronized with + * the other cpus, but smp_invalidate_interrupt ignore flush ipis + * for the wrong mm, and in the worst case we perform a superfluous + * tlb flush. + * 1a2) set cpu mmu_state to TLBSTATE_OK + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 + * was in lazy tlb mode. 
+ * 1a3) update cpu active_mm + * Now cpu0 accepts tlb flushes for the new mm. + * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask); + * Now the other cpus will send tlb flush ipis. + * 1a4) change cr3. + * 1b) thread switch without mm change + * cpu active_mm is correct, cpu0 already handles + * flush ipis. + * 1b1) set cpu mmu_state to TLBSTATE_OK + * 1b2) test_and_set the cpu bit in cpu_vm_mask. + * Atomically set the bit [other cpus will start sending flush ipis], + * and test the bit. + * 1b3) if the bit was 0: leave_mm was called, flush the tlb. + * 2) switch %%esp, ie current + * + * The interrupt must handle 2 special cases: + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. + * - the cpu performs speculative tlb reads, i.e. even if the cpu only + * runs in kernel space, the cpu could load tlb entries for user space + * pages. + * + * The good news is that cpu mmu_state is local to each cpu, no + * write/read ordering problems. + */ + +/* + * TLB flush IPI: + * + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. + * 2) Leave the mm if we are in the lazy tlb mode. + */ + +asmlinkage void smp_invalidate_interrupt (void) +{ + unsigned long cpu; + + cpu = get_cpu(); + + if (!cpu_isset(cpu, flush_cpumask)) + goto out; + /* + * This was a BUG() but until someone can quote me the + * line from the intel manual that guarantees an IPI to + * multiple CPUs is retried _only_ on the erroring CPUs + * its staying as a return + * + * BUG(); + */ + + if (flush_mm == read_pda(active_mm)) { + if (read_pda(mmu_state) == TLBSTATE_OK) { + if (flush_va == FLUSH_ALL) + local_flush_tlb(); + else + __flush_tlb_one(flush_va); + } else + leave_mm(cpu); + } + ack_APIC_irq(); + cpu_clear(cpu, flush_cpumask); + +out: + put_cpu_no_resched(); +} + +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, + unsigned long va) +{ + cpumask_t tmp; + /* + * A couple of (to be removed) sanity checks: + * + * - we do not send IPIs to not-yet booted CPUs. + * - current CPU must not be in mask + * - mask must exist :) + */ + BUG_ON(cpus_empty(cpumask)); + cpus_and(tmp, cpumask, cpu_online_map); + BUG_ON(!cpus_equal(tmp, cpumask)); + BUG_ON(cpu_isset(smp_processor_id(), cpumask)); + if (!mm) + BUG(); + + /* + * I'm not happy about this global shared spinlock in the + * MM hot path, but we'll see how contended it is. + * Temporarily this turns IRQs off, so that lockups are + * detected by the NMI watchdog. + */ + spin_lock(&tlbstate_lock); + + flush_mm = mm; + flush_va = va; + cpus_or(flush_cpumask, cpumask, flush_cpumask); + + /* + * We have to send the IPI only to + * CPUs affected. + */ + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); + + while (!cpus_empty(flush_cpumask)) + mb(); /* nothing. 
lockup detection does not belong here */; + + flush_mm = NULL; + flush_va = 0; + spin_unlock(&tlbstate_lock); +} + +void flush_tlb_current_task(void) +{ + struct mm_struct *mm = current->mm; + cpumask_t cpu_mask; + + preempt_disable(); + cpu_mask = mm->cpu_vm_mask; + cpu_clear(smp_processor_id(), cpu_mask); + + local_flush_tlb(); + if (!cpus_empty(cpu_mask)) + flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + preempt_enable(); +} + +void flush_tlb_mm (struct mm_struct * mm) +{ + cpumask_t cpu_mask; + + preempt_disable(); + cpu_mask = mm->cpu_vm_mask; + cpu_clear(smp_processor_id(), cpu_mask); + + if (current->active_mm == mm) { + if (current->mm) + local_flush_tlb(); + else + leave_mm(smp_processor_id()); + } + if (!cpus_empty(cpu_mask)) + flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + + preempt_enable(); +} + +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) +{ + struct mm_struct *mm = vma->vm_mm; + cpumask_t cpu_mask; + + preempt_disable(); + cpu_mask = mm->cpu_vm_mask; + cpu_clear(smp_processor_id(), cpu_mask); + + if (current->active_mm == mm) { + if(current->mm) + __flush_tlb_one(va); + else + leave_mm(smp_processor_id()); + } + + if (!cpus_empty(cpu_mask)) + flush_tlb_others(cpu_mask, mm, va); + + preempt_enable(); +} + +static void do_flush_tlb_all(void* info) +{ + unsigned long cpu = smp_processor_id(); + + __flush_tlb_all(); + if (read_pda(mmu_state) == TLBSTATE_LAZY) + leave_mm(cpu); +} + +void flush_tlb_all(void) +{ + on_each_cpu(do_flush_tlb_all, NULL, 1, 1); +} + +void smp_kdb_stop(void) +{ + send_IPI_allbutself(KDB_VECTOR); +} + +/* + * this function sends a 'reschedule' IPI to another CPU. + * it goes straight through and wastes no time serializing + * anything. Worst case is that we lose a reschedule ... + */ + +void smp_send_reschedule(int cpu) +{ + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); +} + +/* + * Structure and data for smp_call_function(). This is designed to minimise + * static memory requirements. It also looks cleaner. + */ +static DEFINE_SPINLOCK(call_lock); + +struct call_data_struct { + void (*func) (void *info); + void *info; + atomic_t started; + atomic_t finished; + int wait; +}; + +static struct call_data_struct * call_data; + +/* + * this function sends a 'generic call function' IPI to all other CPUs + * in the system. + */ +static void __smp_call_function (void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + struct call_data_struct data; + int cpus = num_online_cpus()-1; + + if (!cpus) + return; + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + wmb(); + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (!wait) + return; + + while (atomic_read(&data.finished) != cpus) + cpu_relax(); +} + +/* + * smp_call_function - run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: currently unused. + * @wait: If true, wait (atomically) until function has completed on other + * CPUs. + * + * Returns 0 on success, else a negative status code. Does not return until + * remote CPUs are nearly ready to execute func or are or have executed. 
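The call_data handshake that __smp_call_function() implements above boils down to: fill in a request, publish it, kick the other CPUs, then spin until each of them has bumped started (and, for wait, finished). A userspace sketch of the same rendezvous, with POSIX threads standing in for the other CPUs and an atomic pointer store standing in for the IPI; the names and counts here are illustrative, not kernel APIs:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NWORKERS 3

struct call_data {
	void (*func)(void *);
	void *info;
	atomic_int started;
	atomic_int finished;
	int wait;
};

static struct call_data * _Atomic call_data;	/* NULL until the request is published */

static void *worker(void *arg)
{
	struct call_data *d;

	while ((d = atomic_load(&call_data)) == NULL)
		;				/* wait for the "IPI" */
	atomic_fetch_add(&d->started, 1);	/* "I have grabbed the data" */
	d->func(d->info);
	if (d->wait)
		atomic_fetch_add(&d->finished, 1);
	return NULL;
}

static void print_info(void *info)
{
	printf("running func with info=%s\n", (const char *)info);
}

int main(void)
{
	pthread_t tid[NWORKERS];
	struct call_data d;
	int i;

	d.func = print_info;
	d.info = "hello";
	atomic_init(&d.started, 0);
	atomic_init(&d.finished, 0);
	d.wait = 1;

	for (i = 0; i < NWORKERS; i++)
		pthread_create(&tid[i], NULL, worker, NULL);

	atomic_store(&call_data, &d);		/* publish the request + "send IPI" */

	while (atomic_load(&d.started) != NWORKERS)
		;				/* wait for response */
	while (d.wait && atomic_load(&d.finished) != NWORKERS)
		;
	printf("all %d workers done\n", NWORKERS);

	for (i = 0; i < NWORKERS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}

The ordering requirement is the same as above: the request must be fully written before it becomes visible, which the sequentially consistent atomic store provides here and the wmb() provides in the kernel. Build with something like cc -pthread.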
+ * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + * Actually there are a few legal cases, like panic. + */ +int smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait) +{ + spin_lock(&call_lock); + __smp_call_function(func,info,nonatomic,wait); + spin_unlock(&call_lock); + return 0; +} + +void smp_stop_cpu(void) +{ + /* + * Remove this CPU: + */ + cpu_clear(smp_processor_id(), cpu_online_map); + local_irq_disable(); + disable_local_APIC(); + local_irq_enable(); +} + +static void smp_really_stop_cpu(void *dummy) +{ + smp_stop_cpu(); + for (;;) + asm("hlt"); +} + +void smp_send_stop(void) +{ + int nolock = 0; + if (reboot_force) + return; + /* Don't deadlock on the call lock in panic */ + if (!spin_trylock(&call_lock)) { + /* ignore locking because we have paniced anyways */ + nolock = 1; + } + __smp_call_function(smp_really_stop_cpu, NULL, 0, 0); + if (!nolock) + spin_unlock(&call_lock); + + local_irq_disable(); + disable_local_APIC(); + local_irq_enable(); +} + +/* + * Reschedule call back. Nothing to do, + * all the work is done automatically when + * we return from the interrupt. + */ +asmlinkage void smp_reschedule_interrupt(void) +{ + ack_APIC_irq(); +} + +asmlinkage void smp_call_function_interrupt(void) +{ + void (*func) (void *info) = call_data->func; + void *info = call_data->info; + int wait = call_data->wait; + + ack_APIC_irq(); + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&call_data->started); + /* + * At this point the info structure may be out of scope unless wait==1 + */ + irq_enter(); + (*func)(info); + irq_exit(); + if (wait) { + mb(); + atomic_inc(&call_data->finished); + } +} diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c new file mode 100644 index 00000000000..afd3600b919 --- /dev/null +++ b/arch/x86_64/kernel/smpboot.c @@ -0,0 +1,938 @@ +/* + * x86 SMP booting functions + * + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> + * Copyright 2001 Andi Kleen, SuSE Labs. + * + * Much of the core SMP work is based on previous work by Thomas Radke, to + * whom a great many thanks are extended. + * + * Thanks to Intel for making available several different Pentium, + * Pentium Pro and Pentium-II/Xeon MP machines. + * Original development of Linux SMP code supported by Caldera. + * + * This code is released under the GNU General Public License version 2 or + * later. + * + * Fixes + * Felix Koop : NR_CPUS used properly + * Jose Renau : Handle single CPU case. + * Alan Cox : By repeated request 8) - Total BogoMIP report. + * Greg Wright : Fix for kernel stacks panic. + * Erich Boleyn : MP v1.4 and additional changes. + * Matthias Sattler : Changes for 2.1 kernel map. + * Michel Lespinasse : Changes for 2.1 kernel map. + * Michael Chastain : Change trampoline.S to gnu as. + * Alan Cox : Dumb bug: 'B' step PPro's are fine + * Ingo Molnar : Added APIC timers, based on code + * from Jose Renau + * Ingo Molnar : various cleanups and rewrites + * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. + * Maciej W. Rozycki : Bits for genuine 82489DX APICs + * Andi Kleen : Changed for SMP boot into long mode. + * Rusty Russell : Hacked into shape for new "hotplug" boot process. 
+ */ + +#include <linux/config.h> +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/kernel_stat.h> +#include <linux/smp_lock.h> +#include <linux/irq.h> +#include <linux/bootmem.h> +#include <linux/thread_info.h> +#include <linux/module.h> + +#include <linux/delay.h> +#include <linux/mc146818rtc.h> +#include <asm/mtrr.h> +#include <asm/pgalloc.h> +#include <asm/desc.h> +#include <asm/kdebug.h> +#include <asm/tlbflush.h> +#include <asm/proto.h> + +/* Number of siblings per CPU package */ +int smp_num_siblings = 1; +/* Package ID of each logical CPU */ +u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +EXPORT_SYMBOL(phys_proc_id); + +/* Bitmask of currently online CPUs */ +cpumask_t cpu_online_map; + +cpumask_t cpu_callin_map; +cpumask_t cpu_callout_map; +static cpumask_t smp_commenced_mask; + +/* Per CPU bogomips and other parameters */ +struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; + +cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; + +/* + * Trampoline 80x86 program as an array. + */ + +extern unsigned char trampoline_data []; +extern unsigned char trampoline_end []; + +/* + * Currently trivial. Write the real->protected mode + * bootstrap into the page concerned. The caller + * has made sure it's suitably aligned. + */ + +static unsigned long __init setup_trampoline(void) +{ + void *tramp = __va(SMP_TRAMPOLINE_BASE); + memcpy(tramp, trampoline_data, trampoline_end - trampoline_data); + return virt_to_phys(tramp); +} + +/* + * The bootstrap kernel entry code has set these up. Save them for + * a given CPU + */ + +static void __init smp_store_cpu_info(int id) +{ + struct cpuinfo_x86 *c = cpu_data + id; + + *c = boot_cpu_data; + identify_cpu(c); +} + +/* + * TSC synchronization. + * + * We first check whether all CPUs have their TSC's synchronized, + * then we print a warning if not, and always resync. + */ + +static atomic_t tsc_start_flag = ATOMIC_INIT(0); +static atomic_t tsc_count_start = ATOMIC_INIT(0); +static atomic_t tsc_count_stop = ATOMIC_INIT(0); +static unsigned long long tsc_values[NR_CPUS]; + +#define NR_LOOPS 5 + +extern unsigned int fast_gettimeoffset_quotient; + +static void __init synchronize_tsc_bp (void) +{ + int i; + unsigned long long t0; + unsigned long long sum, avg; + long long delta; + long one_usec; + int buggy = 0; + + printk(KERN_INFO "checking TSC synchronization across %u CPUs: ",num_booting_cpus()); + + one_usec = cpu_khz; + + atomic_set(&tsc_start_flag, 1); + wmb(); + + /* + * We loop a few times to get a primed instruction cache, + * then the last pass is more or less synchronized and + * the BP and APs set their cycle counters to zero all at + * once. This reduces the chance of having random offsets + * between the processors, and guarantees that the maximum + * delay between the cycle counters is never bigger than + * the latency of information-passing (cachelines) between + * two CPUs. 
+ */ + for (i = 0; i < NR_LOOPS; i++) { + /* + * all APs synchronize but they loop on '== num_cpus' + */ + while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) mb(); + atomic_set(&tsc_count_stop, 0); + wmb(); + /* + * this lets the APs save their current TSC: + */ + atomic_inc(&tsc_count_start); + + sync_core(); + rdtscll(tsc_values[smp_processor_id()]); + /* + * We clear the TSC in the last loop: + */ + if (i == NR_LOOPS-1) + write_tsc(0, 0); + + /* + * Wait for all APs to leave the synchronization point: + */ + while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) mb(); + atomic_set(&tsc_count_start, 0); + wmb(); + atomic_inc(&tsc_count_stop); + } + + sum = 0; + for (i = 0; i < NR_CPUS; i++) { + if (cpu_isset(i, cpu_callout_map)) { + t0 = tsc_values[i]; + sum += t0; + } + } + avg = sum / num_booting_cpus(); + + sum = 0; + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_isset(i, cpu_callout_map)) + continue; + + delta = tsc_values[i] - avg; + if (delta < 0) + delta = -delta; + /* + * We report bigger than 2 microseconds clock differences. + */ + if (delta > 2*one_usec) { + long realdelta; + if (!buggy) { + buggy = 1; + printk("\n"); + } + realdelta = delta / one_usec; + if (tsc_values[i] < avg) + realdelta = -realdelta; + + printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", + i, realdelta); + } + + sum += delta; + } + if (!buggy) + printk("passed.\n"); +} + +static void __init synchronize_tsc_ap (void) +{ + int i; + + /* + * Not every cpu is online at the time + * this gets called, so we first wait for the BP to + * finish SMP initialization: + */ + while (!atomic_read(&tsc_start_flag)) mb(); + + for (i = 0; i < NR_LOOPS; i++) { + atomic_inc(&tsc_count_start); + while (atomic_read(&tsc_count_start) != num_booting_cpus()) mb(); + + sync_core(); + rdtscll(tsc_values[smp_processor_id()]); + if (i == NR_LOOPS-1) + write_tsc(0, 0); + + atomic_inc(&tsc_count_stop); + while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb(); + } +} +#undef NR_LOOPS + +static atomic_t init_deasserted; + +static void __init smp_callin(void) +{ + int cpuid, phys_id; + unsigned long timeout; + + /* + * If waken up by an INIT in an 82489DX configuration + * we may get here before an INIT-deassert IPI reaches + * our local APIC. We have to wait for the IPI or we'll + * lock up on an APIC access. + */ + while (!atomic_read(&init_deasserted)); + + /* + * (This works even if the APIC is not enabled.) + */ + phys_id = GET_APIC_ID(apic_read(APIC_ID)); + cpuid = smp_processor_id(); + if (cpu_isset(cpuid, cpu_callin_map)) { + panic("smp_callin: phys CPU#%d, CPU#%d already present??\n", + phys_id, cpuid); + } + Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); + + /* + * STARTUP IPIs are fragile beasts as they might sometimes + * trigger some glue motherboard logic. Complete APIC bus + * silence for 1 second, this overestimates the time the + * boot CPU is spending to send the up to 2 STARTUP IPIs + * by a factor of two. This should be enough. + */ + + /* + * Waiting 2s total for startup (udelay is not yet working) + */ + timeout = jiffies + 2*HZ; + while (time_before(jiffies, timeout)) { + /* + * Has the boot CPU finished it's STARTUP sequence? + */ + if (cpu_isset(cpuid, cpu_callout_map)) + break; + rep_nop(); + } + + if (!time_before(jiffies, timeout)) { + panic("smp_callin: CPU%d started up but did not get a callout!\n", + cpuid); + } + + /* + * the boot CPU has finished the init stage and is spinning + * on callin_map until we finish. 
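The skew check in synchronize_tsc_bp() above is plain integer arithmetic over one TSC sample per CPU: average the samples, then flag any CPU whose sample is more than two microseconds from the mean. A userspace sketch of that arithmetic, using the __rdtsc() intrinsic (x86 only) and an assumed 2 GHz clock; the kernel uses the measured cpu_khz and takes one sample per CPU at the IPI rendezvous rather than back-to-back reads on one CPU:

#include <stdio.h>
#include <x86intrin.h>			/* __rdtsc() */

#define NSAMPLES	4
#define CPU_KHZ		2000000ULL	/* assumed 2 GHz part */

int main(void)
{
	unsigned long long tsc[NSAMPLES], sum = 0, avg;
	unsigned long long cyc_per_usec = CPU_KHZ / 1000;
	long long delta;
	int i;

	/* Stand-in for the per-CPU samples collected at the sync point. */
	for (i = 0; i < NSAMPLES; i++)
		tsc[i] = __rdtsc();

	for (i = 0; i < NSAMPLES; i++)
		sum += tsc[i];
	avg = sum / NSAMPLES;

	for (i = 0; i < NSAMPLES; i++) {
		delta = (long long)(tsc[i] - avg);
		if (delta < 0)
			delta = -delta;
		if (delta > 2 * (long long)cyc_per_usec)
			printf("sample %d is %lld us off the mean\n",
			       i, delta / (long long)cyc_per_usec);
		else
			printf("sample %d is within 2 us of the mean\n", i);
	}
	return 0;
}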
We are free to set up this + * CPU, first the APIC. (this is probably redundant on most + * boards) + */ + + Dprintk("CALLIN, before setup_local_APIC().\n"); + setup_local_APIC(); + + local_irq_enable(); + + /* + * Get our bogomips. + */ + calibrate_delay(); + Dprintk("Stack at about %p\n",&cpuid); + + disable_APIC_timer(); + + /* + * Save our processor parameters + */ + smp_store_cpu_info(cpuid); + + local_irq_disable(); + + /* + * Allow the master to continue. + */ + cpu_set(cpuid, cpu_callin_map); + + /* + * Synchronize the TSC with the BP + */ + if (cpu_has_tsc) + synchronize_tsc_ap(); +} + +static int cpucount; + +/* + * Activate a secondary processor. + */ +void __init start_secondary(void) +{ + /* + * Dont put anything before smp_callin(), SMP + * booting is too fragile that we want to limit the + * things done here to the most necessary things. + */ + cpu_init(); + smp_callin(); + + /* otherwise gcc will move up the smp_processor_id before the cpu_init */ + barrier(); + + Dprintk("cpu %d: waiting for commence\n", smp_processor_id()); + while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) + rep_nop(); + + Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); + setup_secondary_APIC_clock(); + + Dprintk("cpu %d: enabling apic timer\n", smp_processor_id()); + + if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); + enable_NMI_through_LVT0(NULL); + enable_8259A_irq(0); + } + + + enable_APIC_timer(); + + /* + * low-memory mappings have been cleared, flush them from + * the local TLBs too. + */ + local_flush_tlb(); + + Dprintk("cpu %d eSetting cpu_online_map\n", smp_processor_id()); + cpu_set(smp_processor_id(), cpu_online_map); + wmb(); + + cpu_idle(); +} + +extern volatile unsigned long init_rsp; +extern void (*initial_code)(void); + +#if APIC_DEBUG +static inline void inquire_remote_apic(int apicid) +{ + unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; + char *names[] = { "ID", "VERSION", "SPIV" }; + int timeout, status; + + printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); + + for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) { + printk("... APIC #%d %s: ", apicid, names[i]); + + /* + * Wait for idle. 
+ */ + apic_wait_icr_idle(); + + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); + apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); + + timeout = 0; + do { + udelay(100); + status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; + } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); + + switch (status) { + case APIC_ICR_RR_VALID: + status = apic_read(APIC_RRR); + printk("%08x\n", status); + break; + default: + printk("failed\n"); + } + } +} +#endif + +static int __init wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) +{ + unsigned long send_status = 0, accept_status = 0; + int maxlvt, timeout, num_starts, j; + + Dprintk("Asserting INIT.\n"); + + /* + * Turn INIT on target chip + */ + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + + /* + * Send IPI + */ + apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT + | APIC_DM_INIT); + + Dprintk("Waiting for send to finish...\n"); + timeout = 0; + do { + Dprintk("+"); + udelay(100); + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + } while (send_status && (timeout++ < 1000)); + + mdelay(10); + + Dprintk("Deasserting INIT.\n"); + + /* Target chip */ + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + + /* Send IPI */ + apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); + + Dprintk("Waiting for send to finish...\n"); + timeout = 0; + do { + Dprintk("+"); + udelay(100); + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + } while (send_status && (timeout++ < 1000)); + + atomic_set(&init_deasserted, 1); + + /* + * Should we send STARTUP IPIs ? + * + * Determine this based on the APIC version. + * If we don't have an integrated APIC, don't send the STARTUP IPIs. + */ + if (APIC_INTEGRATED(apic_version[phys_apicid])) + num_starts = 2; + else + num_starts = 0; + + /* + * Run STARTUP IPI loop. + */ + Dprintk("#startup loops: %d.\n", num_starts); + + maxlvt = get_maxlvt(); + + for (j = 1; j <= num_starts; j++) { + Dprintk("Sending STARTUP #%d.\n",j); + apic_read_around(APIC_SPIV); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + Dprintk("After apic_write.\n"); + + /* + * STARTUP IPI + */ + + /* Target chip */ + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + + /* Boot on the stack */ + /* Kick the second */ + apic_write_around(APIC_ICR, APIC_DM_STARTUP + | (start_rip >> 12)); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(300); + + Dprintk("Startup point 1.\n"); + + Dprintk("Waiting for send to finish...\n"); + timeout = 0; + do { + Dprintk("+"); + udelay(100); + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + } while (send_status && (timeout++ < 1000)); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(200); + /* + * Due to the Pentium erratum 3AP. + */ + if (maxlvt > 3) { + apic_read_around(APIC_SPIV); + apic_write(APIC_ESR, 0); + } + accept_status = (apic_read(APIC_ESR) & 0xEF); + if (send_status || accept_status) + break; + } + Dprintk("After Startup.\n"); + + if (send_status) + printk(KERN_ERR "APIC never delivered???\n"); + if (accept_status) + printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); + + return (send_status | accept_status); +} + +static void __init do_boot_cpu (int apicid) +{ + struct task_struct *idle; + unsigned long boot_error; + int timeout, cpu; + unsigned long start_rip; + + cpu = ++cpucount; + /* + * We can't use kernel_thread since we must avoid to + * reschedule the child. 
+ */ + idle = fork_idle(cpu); + if (IS_ERR(idle)) + panic("failed fork for CPU %d", cpu); + x86_cpu_to_apicid[cpu] = apicid; + + cpu_pda[cpu].pcurrent = idle; + + start_rip = setup_trampoline(); + + init_rsp = idle->thread.rsp; + per_cpu(init_tss,cpu).rsp0 = init_rsp; + initial_code = start_secondary; + clear_ti_thread_flag(idle->thread_info, TIF_FORK); + + printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, + start_rip, init_rsp); + + /* + * This grunge runs the startup process for + * the targeted processor. + */ + + atomic_set(&init_deasserted, 0); + + Dprintk("Setting warm reset code and vector.\n"); + + CMOS_WRITE(0xa, 0xf); + local_flush_tlb(); + Dprintk("1.\n"); + *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4; + Dprintk("2.\n"); + *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf; + Dprintk("3.\n"); + + /* + * Be paranoid about clearing APIC errors. + */ + if (APIC_INTEGRATED(apic_version[apicid])) { + apic_read_around(APIC_SPIV); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } + + /* + * Status is now clean + */ + boot_error = 0; + + /* + * Starting actual IPI sequence... + */ + boot_error = wakeup_secondary_via_INIT(apicid, start_rip); + + if (!boot_error) { + /* + * allow APs to start initializing. + */ + Dprintk("Before Callout %d.\n", cpu); + cpu_set(cpu, cpu_callout_map); + Dprintk("After Callout %d.\n", cpu); + + /* + * Wait 5s total for a response + */ + for (timeout = 0; timeout < 50000; timeout++) { + if (cpu_isset(cpu, cpu_callin_map)) + break; /* It has booted */ + udelay(100); + } + + if (cpu_isset(cpu, cpu_callin_map)) { + /* number CPUs logically, starting from 1 (BSP is 0) */ + Dprintk("OK.\n"); + print_cpu_info(&cpu_data[cpu]); + Dprintk("CPU has booted.\n"); + } else { + boot_error = 1; + if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE)) + == 0xA5) + /* trampoline started but...? */ + printk("Stuck ??\n"); + else + /* trampoline code not run */ + printk("Not responding.\n"); +#if APIC_DEBUG + inquire_remote_apic(apicid); +#endif + } + } + if (boot_error) { + cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ + clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + cpucount--; + x86_cpu_to_apicid[cpu] = BAD_APICID; + x86_cpu_to_log_apicid[cpu] = BAD_APICID; + } +} + +static void smp_tune_scheduling (void) +{ + int cachesize; /* kB */ + unsigned long bandwidth = 1000; /* MB/s */ + /* + * Rough estimation for SMP scheduling, this is the number of + * cycles it takes for a fully memory-limited process to flush + * the SMP-local cache. + * + * (For a P5 this pretty much means we will choose another idle + * CPU almost always at wakeup time (this is due to the small + * L1 cache), on PIIs it's around 50-100 usecs, depending on + * the cache size) + */ + + if (!cpu_khz) { + return; + } else { + cachesize = boot_cpu_data.x86_cache_size; + if (cachesize == -1) { + cachesize = 16; /* Pentiums, 2x8kB cache */ + bandwidth = 100; + } + } +} + +/* + * Cycle through the processors sending APIC IPIs to boot each. 
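The two writes to 0x469 and 0x467 in do_boot_cpu() above program the real-mode warm-reset vector: the word at 0x469 receives the trampoline's segment (address >> 4) and the word at 0x467 its offset (address & 0xf), so a CPU coming out of INIT jumps straight into the trampoline. A tiny sketch of that split and its recombination, with an illustrative address standing in for the real SMP_TRAMPOLINE_BASE:

#include <stdio.h>

int main(void)
{
	unsigned long start_rip = 0x6000;	/* example trampoline base, below 1 MB */
	unsigned short seg = start_rip >> 4;	/* goes to 0x469 */
	unsigned short off = start_rip & 0xf;	/* goes to 0x467 */

	printf("trampoline %#lx -> warm reset vector %04x:%04x\n",
	       start_rip, seg, off);
	/* real mode computes segment*16 + offset, recovering the address */
	printf("real mode recombines it as %#lx\n",
	       ((unsigned long)seg << 4) + off);
	return 0;
}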
+ */ + +static void __init smp_boot_cpus(unsigned int max_cpus) +{ + unsigned apicid, cpu, bit, kicked; + + nmi_watchdog_default(); + + /* + * Setup boot CPU information + */ + smp_store_cpu_info(0); /* Final full version of the data */ + printk(KERN_INFO "CPU%d: ", 0); + print_cpu_info(&cpu_data[0]); + + current_thread_info()->cpu = 0; + smp_tune_scheduling(); + + if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { + printk("weird, boot CPU (#%d) not listed by the BIOS.\n", + hard_smp_processor_id()); + physid_set(hard_smp_processor_id(), phys_cpu_present_map); + } + + /* + * If we couldn't find an SMP configuration at boot time, + * get out of here now! + */ + if (!smp_found_config) { + printk(KERN_NOTICE "SMP motherboard not detected.\n"); + io_apic_irqs = 0; + cpu_online_map = cpumask_of_cpu(0); + cpu_set(0, cpu_sibling_map[0]); + phys_cpu_present_map = physid_mask_of_physid(0); + if (APIC_init_uniprocessor()) + printk(KERN_NOTICE "Local APIC not detected." + " Using dummy APIC emulation.\n"); + goto smp_done; + } + + /* + * Should not be necessary because the MP table should list the boot + * CPU too, but we do it for the sake of robustness anyway. + */ + if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) { + printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n", + boot_cpu_id); + physid_set(hard_smp_processor_id(), phys_cpu_present_map); + } + + /* + * If we couldn't find a local APIC, then get out of here now! + */ + if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) { + printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_id); + printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); + io_apic_irqs = 0; + cpu_online_map = cpumask_of_cpu(0); + cpu_set(0, cpu_sibling_map[0]); + phys_cpu_present_map = physid_mask_of_physid(0); + disable_apic = 1; + goto smp_done; + } + + verify_local_APIC(); + + /* + * If SMP should be disabled, then really disable it! + */ + if (!max_cpus) { + smp_found_config = 0; + printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); + io_apic_irqs = 0; + cpu_online_map = cpumask_of_cpu(0); + cpu_set(0, cpu_sibling_map[0]); + phys_cpu_present_map = physid_mask_of_physid(0); + disable_apic = 1; + goto smp_done; + } + + connect_bsp_APIC(); + setup_local_APIC(); + + if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) + BUG(); + + x86_cpu_to_apicid[0] = boot_cpu_id; + + /* + * Now scan the CPU present map and fire up the other CPUs. + */ + Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map)); + + kicked = 1; + for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) { + apicid = cpu_present_to_apicid(bit); + /* + * Don't even attempt to start the boot CPU! + */ + if (apicid == boot_cpu_id || (apicid == BAD_APICID)) + continue; + + if (!physid_isset(apicid, phys_cpu_present_map)) + continue; + if ((max_cpus >= 0) && (max_cpus <= cpucount+1)) + continue; + + do_boot_cpu(apicid); + ++kicked; + } + + /* + * Cleanup possible dangling ends... + */ + { + /* + * Install writable page 0 entry to set BIOS data area. + */ + local_flush_tlb(); + + /* + * Paranoid: Set warm reset code and vector here back + * to default values. + */ + CMOS_WRITE(0, 0xf); + + *((volatile int *) phys_to_virt(0x467)) = 0; + } + + /* + * Allow the user to impress friends. 
+ */ + + Dprintk("Before bogomips.\n"); + if (!cpucount) { + printk(KERN_INFO "Only one processor found.\n"); + } else { + unsigned long bogosum = 0; + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (cpu_isset(cpu, cpu_callout_map)) + bogosum += cpu_data[cpu].loops_per_jiffy; + printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + cpucount+1, + bogosum/(500000/HZ), + (bogosum/(5000/HZ))%100); + Dprintk("Before bogocount - setting activated=1.\n"); + } + + /* + * Construct cpu_sibling_map[], so that we can tell the + * sibling CPU efficiently. + */ + for (cpu = 0; cpu < NR_CPUS; cpu++) + cpus_clear(cpu_sibling_map[cpu]); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + int siblings = 0; + int i; + if (!cpu_isset(cpu, cpu_callout_map)) + continue; + + if (smp_num_siblings > 1) { + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_isset(i, cpu_callout_map)) + continue; + if (phys_proc_id[cpu] == phys_proc_id[i]) { + siblings++; + cpu_set(i, cpu_sibling_map[cpu]); + } + } + } else { + siblings++; + cpu_set(cpu, cpu_sibling_map[cpu]); + } + + if (siblings != smp_num_siblings) { + printk(KERN_WARNING + "WARNING: %d siblings found for CPU%d, should be %d\n", + siblings, cpu, smp_num_siblings); + smp_num_siblings = siblings; + } + } + + Dprintk("Boot done.\n"); + + /* + * Here we can be sure that there is an IO-APIC in the system. Let's + * go and set it up: + */ + if (!skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); + else + nr_ioapics = 0; + + setup_boot_APIC_clock(); + + /* + * Synchronize the TSC with the AP + */ + if (cpu_has_tsc && cpucount) + synchronize_tsc_bp(); + + smp_done: + time_init_smp(); +} + +/* These are wrappers to interface to the new boot process. Someone + who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ +void __init smp_prepare_cpus(unsigned int max_cpus) +{ + smp_boot_cpus(max_cpus); +} + +void __devinit smp_prepare_boot_cpu(void) +{ + cpu_set(smp_processor_id(), cpu_online_map); + cpu_set(smp_processor_id(), cpu_callout_map); +} + +int __devinit __cpu_up(unsigned int cpu) +{ + /* This only works at boot for x86. See "rewrite" above. */ + if (cpu_isset(cpu, smp_commenced_mask)) { + local_irq_enable(); + return -ENOSYS; + } + + /* In case one didn't come up */ + if (!cpu_isset(cpu, cpu_callin_map)) { + local_irq_enable(); + return -EIO; + } + local_irq_enable(); + + /* Unleash the CPU! */ + Dprintk("waiting for cpu %d\n", cpu); + + cpu_set(cpu, smp_commenced_mask); + while (!cpu_isset(cpu, cpu_online_map)) + mb(); + return 0; +} + +void __init smp_cpus_done(unsigned int max_cpus) +{ +#ifdef CONFIG_X86_IO_APIC + setup_ioapic_dest(); +#endif + zap_low_mappings(); +} + diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c new file mode 100644 index 00000000000..ebaa1e37d65 --- /dev/null +++ b/arch/x86_64/kernel/suspend.c @@ -0,0 +1,157 @@ +/* + * Suspend support specific for i386. 
+ * + * Distribute under GPLv2 + * + * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> + * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/poll.h> +#include <linux/delay.h> +#include <linux/sysrq.h> +#include <linux/proc_fs.h> +#include <linux/irq.h> +#include <linux/pm.h> +#include <linux/device.h> +#include <linux/suspend.h> +#include <asm/uaccess.h> +#include <asm/acpi.h> +#include <asm/tlbflush.h> +#include <asm/io.h> +#include <asm/proto.h> + +struct saved_context saved_context; + +unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx; +unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi; +unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11; +unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15; +unsigned long saved_context_eflags; + +void __save_processor_state(struct saved_context *ctxt) +{ + kernel_fpu_begin(); + + /* + * descriptor tables + */ + asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); + asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); + asm volatile ("sldt %0" : "=m" (ctxt->ldt)); + asm volatile ("str %0" : "=m" (ctxt->tr)); + + /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ + /* EFER should be constant for kernel version, no need to handle it. */ + /* + * segment registers + */ + asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); + asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); + asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); + asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); + asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); + + rdmsrl(MSR_FS_BASE, ctxt->fs_base); + rdmsrl(MSR_GS_BASE, ctxt->gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + + /* + * control registers + */ + asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0)); + asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); + asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); + asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4)); +} + +void save_processor_state(void) +{ + __save_processor_state(&saved_context); +} + +static void +do_fpu_end(void) +{ + /* restore FPU regs if necessary */ + /* Do it out of line so that gcc does not move cr0 load to some stupid place */ + kernel_fpu_end(); + mxcsr_feature_mask_init(); +} + +void __restore_processor_state(struct saved_context *ctxt) +{ + /* + * control registers + */ + asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); + asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); + asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2)); + asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0)); + + /* + * segment registers + */ + asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); + asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); + asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); + load_gs_index(ctxt->gs); + asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); + + wrmsrl(MSR_FS_BASE, ctxt->fs_base); + wrmsrl(MSR_GS_BASE, ctxt->gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + + /* + * now restore the descriptor tables to their proper values + * ltr is done i fix_processor_context(). 
+ */
+	asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
+	asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
+	asm volatile ("lldt %0" :: "m" (ctxt->ldt));
+
+	fix_processor_context();
+
+	do_fpu_end();
+}
+
+void restore_processor_state(void)
+{
+	__restore_processor_state(&saved_context);
+}
+
+void fix_processor_context(void)
+{
+	int cpu = smp_processor_id();
+	struct tss_struct *t = &per_cpu(init_tss, cpu);
+
+	set_tss_desc(cpu,t);	/* This just modifies memory; it should not be necessary. But... it is necessary, because 386 hardware has the concept of a busy TSS or some similar stupidity. */
+
+	cpu_gdt_table[cpu][GDT_ENTRY_TSS].type = 9;
+
+	syscall_init();				/* This sets MSR_*STAR and related */
+	load_TR_desc();				/* This does ltr */
+	load_LDT(&current->active_mm->context);	/* This does lldt */
+
+	/*
+	 * Now maybe reload the debug registers
+	 */
+	if (current->thread.debugreg7){
+		loaddebug(&current->thread, 0);
+		loaddebug(&current->thread, 1);
+		loaddebug(&current->thread, 2);
+		loaddebug(&current->thread, 3);
+		/* no 4 and 5 */
+		loaddebug(&current->thread, 6);
+		loaddebug(&current->thread, 7);
+	}
+
+}
+
+
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
new file mode 100644
index 00000000000..53f8e165951
--- /dev/null
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -0,0 +1,104 @@
+/* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * Distribute under GPLv2.
+ *
+ * swsusp_arch_resume may not use any stack, nor any variable that is
+ * not "NoSave" during copying pages:
+ *
+ * It's rewriting one kernel image with another. What is the stack in the
+ * "old" image could very well be a data page in the "new" image, and
+ * overwriting your own stack under you is a bad idea.
+ */
+
+	.text
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/offset.h>
+
+ENTRY(swsusp_arch_suspend)
+
+	movq %rsp, saved_context_esp(%rip)
+	movq %rax, saved_context_eax(%rip)
+	movq %rbx, saved_context_ebx(%rip)
+	movq %rcx, saved_context_ecx(%rip)
+	movq %rdx, saved_context_edx(%rip)
+	movq %rbp, saved_context_ebp(%rip)
+	movq %rsi, saved_context_esi(%rip)
+	movq %rdi, saved_context_edi(%rip)
+	movq %r8, saved_context_r08(%rip)
+	movq %r9, saved_context_r09(%rip)
+	movq %r10, saved_context_r10(%rip)
+	movq %r11, saved_context_r11(%rip)
+	movq %r12, saved_context_r12(%rip)
+	movq %r13, saved_context_r13(%rip)
+	movq %r14, saved_context_r14(%rip)
+	movq %r15, saved_context_r15(%rip)
+	pushfq ; popq saved_context_eflags(%rip)
+
+	call swsusp_save
+	ret
+
+ENTRY(swsusp_arch_resume)
+	/* set up cr3 */
+	leaq	init_level4_pgt(%rip),%rax
+	subq	$__START_KERNEL_map,%rax
+	movq	%rax,%cr3
+
+	movq	mmu_cr4_features(%rip), %rax
+	movq	%rax, %rdx
+	andq	$~(1<<7), %rdx	# PGE
+	movq	%rdx, %cr4;	# turn off PGE
+	movq	%cr3, %rcx;	# flush TLB
+	movq	%rcx, %cr3;
+	movq	%rax, %cr4;	# turn PGE back on
+
+	movq	pagedir_nosave(%rip), %rdx
+loop:
+	testq	%rdx, %rdx
+	jz	done
+
+	/* get addresses from the pbe and copy the page */
+	movq	pbe_address(%rdx), %rsi
+	movq	pbe_orig_address(%rdx), %rdi
+	movq	$512, %rcx
+	rep
+	movsq
+
+	/* progress to the next pbe */
+	movq	pbe_next(%rdx), %rdx
+	jmp	loop
+done:
+	/* Flush TLB, including "global" things (vmalloc) */
+	movq	mmu_cr4_features(%rip), %rax
+	movq	%rax, %rdx
+	andq	$~(1<<7), %rdx;	# PGE
+	movq	%rdx, %cr4;	# turn off PGE
+	movq	%cr3, %rcx;	# flush TLB
+	movq	%rcx, %cr3
+	movq	%rax, %cr4;	# turn PGE back on
+
+	movl	$24, %eax
+	movl	%eax, %ds
+
+	movq saved_context_esp(%rip), %rsp
+	movq saved_context_ebp(%rip), %rbp
+	/* Don't restore %rax, it must be 0 anyway */
+	movq saved_context_ebx(%rip), %rbx
+	movq saved_context_ecx(%rip), %rcx
+	movq saved_context_edx(%rip), %rdx
+	movq saved_context_esi(%rip), %rsi
+	movq saved_context_edi(%rip), %rdi
+	movq saved_context_r08(%rip), %r8
+	movq saved_context_r09(%rip), %r9
+	movq saved_context_r10(%rip), %r10
+	movq saved_context_r11(%rip), %r11
+	movq saved_context_r12(%rip), %r12
+	movq saved_context_r13(%rip), %r13
+	movq saved_context_r14(%rip), %r14
+	movq saved_context_r15(%rip), %r15
+	pushq saved_context_eflags(%rip) ; popfq
+
+	xorq	%rax, %rax
+
+	ret
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c
new file mode 100644
index 00000000000..477d8be57d6
--- /dev/null
+++ b/arch/x86_64/kernel/sys_x86_64.c
@@ -0,0 +1,173 @@
+/*
+ * linux/arch/x86_64/kernel/sys_x86_64.c
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/stat.h>
+#include <linux/mman.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <linux/personality.h>
+
+#include <asm/uaccess.h>
+#include <asm/ia32.h>
+
+/*
+ * sys_pipe() is the normal C calling standard for creating
+ * a pipe. It's not the way Unix traditionally does this, though.
+ */
+asmlinkage long sys_pipe(int __user *fildes)
+{
+	int fd[2];
+	int error;
+
+	error = do_pipe(fd);
+	if (!error) {
+		if (copy_to_user(fildes, fd, 2*sizeof(int)))
+			error = -EFAULT;
+	}
+	return error;
+}
+
+asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
+	unsigned long fd, unsigned long off)
+{
+	long error;
+	struct file * file;
+
+	error = -EINVAL;
+	if (off & ~PAGE_MASK)
+		goto out;
+
+	error = -EBADF;
+	file = NULL;
+	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+	if (!(flags & MAP_ANONYMOUS)) {
+		file = fget(fd);
+		if (!file)
+			goto out;
+	}
+	down_write(&current->mm->mmap_sem);
+	error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
+	up_write(&current->mm->mmap_sem);
+
+	if (file)
+		fput(file);
+out:
+	return error;
+}
+
+static void find_start_end(unsigned long flags, unsigned long *begin,
+			   unsigned long *end)
+{
+#ifdef CONFIG_IA32_EMULATION
+	if (test_thread_flag(TIF_IA32)) {
+		*begin = TASK_UNMAPPED_32;
+		*end = IA32_PAGE_OFFSET;
+	} else
+#endif
+	if (flags & MAP_32BIT) {
+		/* This is usually needed to map code in the small
+		   model, so it needs to be in the first 31 bits. Limit
+		   it to that. This means we need to move the
+		   unmapped base down for this case. This can give
+		   conflicts with the heap, but we assume that glibc
+		   malloc knows how to fall back to mmap. Give it 1GB
+		   of playground for now.
-AK */ + *begin = 0x40000000; + *end = 0x80000000; + } else { + *begin = TASK_UNMAPPED_64; + *end = TASK_SIZE; + } +} + +unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long start_addr; + unsigned long begin, end; + + find_start_end(flags, &begin, &end); + + if (len > end) + return -ENOMEM; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (end - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + addr = mm->free_area_cache; + if (addr < begin) + addr = begin; + start_addr = addr; + +full_search: + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). */ + if (end - len < addr) { + /* + * Start a new search - just in case we missed + * some holes. + */ + if (start_addr != begin) { + start_addr = addr = begin; + goto full_search; + } + return -ENOMEM; + } + if (!vma || addr + len <= vma->vm_start) { + /* + * Remember the place where we stopped the search: + */ + mm->free_area_cache = addr + len; + return addr; + } + addr = vma->vm_end; + } +} + +asmlinkage long sys_uname(struct new_utsname __user * name) +{ + int err; + down_read(&uts_sem); + err = copy_to_user(name, &system_utsname, sizeof (*name)); + up_read(&uts_sem); + if (personality(current->personality) == PER_LINUX32) + err |= copy_to_user(&name->machine, "i686", 5); + return err ? -EFAULT : 0; +} + +asmlinkage long wrap_sys_shmat(int shmid, char __user *shmaddr, int shmflg) +{ + unsigned long raddr; + return do_shmat(shmid,shmaddr,shmflg,&raddr) ?: (long)raddr; +} + +asmlinkage long sys_time64(long __user * tloc) +{ + struct timeval now; + int i; + + do_gettimeofday(&now); + i = now.tv_sec; + if (tloc) { + if (put_user(i,tloc)) + i = -EFAULT; + } + return i; +} diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c new file mode 100644 index 00000000000..e263685f864 --- /dev/null +++ b/arch/x86_64/kernel/syscall.c @@ -0,0 +1,26 @@ +/* System call table for x86-64. */ + +#include <linux/linkage.h> +#include <linux/sys.h> +#include <linux/cache.h> +#include <linux/config.h> + +#define __NO_STUBS + +#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; +#undef _ASM_X86_64_UNISTD_H_ +#include <asm-x86_64/unistd.h> + +#undef __SYSCALL +#define __SYSCALL(nr, sym) [ nr ] = sym, +#undef _ASM_X86_64_UNISTD_H_ + +typedef void (*sys_call_ptr_t)(void); + +extern void sys_ni_syscall(void); + +sys_call_ptr_t sys_call_table[__NR_syscall_max+1] __cacheline_aligned = { + /* Smells like a like a compiler bug -- it doesn't work when the & below is removed. */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include <asm-x86_64/unistd.h> +}; diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c new file mode 100644 index 00000000000..3bafe438fa7 --- /dev/null +++ b/arch/x86_64/kernel/time.c @@ -0,0 +1,1262 @@ +/* + * linux/arch/x86-64/kernel/time.c + * + * "High Precision Event Timer" based timekeeping. 
+ * + * Copyright (c) 1991,1992,1995 Linus Torvalds + * Copyright (c) 1994 Alan Modra + * Copyright (c) 1995 Markus Kuhn + * Copyright (c) 1996 Ingo Molnar + * Copyright (c) 1998 Andrea Arcangeli + * Copyright (c) 2002 Vojtech Pavlik + * Copyright (c) 2003 Andi Kleen + * RTC support code taken from arch/i386/kernel/timers/time_hpet.c + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/mc146818rtc.h> +#include <linux/irq.h> +#include <linux/time.h> +#include <linux/ioport.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/sysdev.h> +#include <linux/bcd.h> +#include <linux/kallsyms.h> +#include <asm/8253pit.h> +#include <asm/pgtable.h> +#include <asm/vsyscall.h> +#include <asm/timex.h> +#include <asm/proto.h> +#include <asm/hpet.h> +#include <asm/sections.h> +#include <linux/cpufreq.h> +#include <linux/hpet.h> +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/apic.h> +#endif + +u64 jiffies_64 = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + +#ifdef CONFIG_CPU_FREQ +static void cpufreq_delayed_get(void); +#endif +extern void i8254_timer_resume(void); +extern int using_apic_timer; + +DEFINE_SPINLOCK(rtc_lock); +DEFINE_SPINLOCK(i8253_lock); + +static int nohpet __initdata = 0; +static int notsc __initdata = 0; + +#undef HPET_HACK_ENABLE_DANGEROUS + +unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +static unsigned long hpet_period; /* fsecs / HPET clock */ +unsigned long hpet_tick; /* HPET clocks / interrupt */ +unsigned long vxtime_hz = PIT_TICK_RATE; +int report_lost_ticks; /* command line option */ +unsigned long long monotonic_base; + +struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ + +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES; +struct timespec __xtime __section_xtime; +struct timezone __sys_tz __section_sys_tz; + +static inline void rdtscll_sync(unsigned long *tsc) +{ +#ifdef CONFIG_SMP + sync_core(); +#endif + rdtscll(*tsc); +} + +/* + * do_gettimeoffset() returns microseconds since last timer interrupt was + * triggered by hardware. A memory read of HPET is slower than a register read + * of TSC, but much more reliable. It's also synchronized to the timer + * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a + * timer interrupt has happened already, but vxtime.trigger wasn't updated yet. + * This is not a problem, because jiffies hasn't updated either. They are bound + * together by xtime_lock. + */ + +static inline unsigned int do_gettimeoffset_tsc(void) +{ + unsigned long t; + unsigned long x; + rdtscll_sync(&t); + if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ + x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; + return x; +} + +static inline unsigned int do_gettimeoffset_hpet(void) +{ + return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32; +} + +unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; + +/* + * This version of gettimeofday() has microsecond resolution and better than + * microsecond precision, as we're using at least a 10 MHz (usually 14.31818 + * MHz) HPET timer. + */ + +void do_gettimeofday(struct timeval *tv) +{ + unsigned long seq, t; + unsigned int sec, usec; + + do { + seq = read_seqbegin(&xtime_lock); + + sec = xtime.tv_sec; + usec = xtime.tv_nsec / 1000; + + /* i386 does some correction here to keep the clock + monotonous even when ntpd is fixing drift. 
+ But they didn't work for me, there is a non monotonic + clock anyways with ntp. + I dropped all corrections now until a real solution can + be found. Note when you fix it here you need to do the same + in arch/x86_64/kernel/vsyscall.c and export all needed + variables in vmlinux.lds. -AK */ + + t = (jiffies - wall_jiffies) * (1000000L / HZ) + + do_gettimeoffset(); + usec += t; + + } while (read_seqretry(&xtime_lock, seq)); + + tv->tv_sec = sec + usec / 1000000; + tv->tv_usec = usec % 1000000; +} + +EXPORT_SYMBOL(do_gettimeofday); + +/* + * settimeofday() first undoes the correction that gettimeofday would do + * on the time, and then saves it. This is ugly, but has been like this for + * ages already. + */ + +int do_settimeofday(struct timespec *tv) +{ + time_t wtm_sec, sec = tv->tv_sec; + long wtm_nsec, nsec = tv->tv_nsec; + + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irq(&xtime_lock); + + nsec -= do_gettimeoffset() * 1000 + + (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ); + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + + set_normalized_timespec(&xtime, sec, nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; + + write_sequnlock_irq(&xtime_lock); + clock_was_set(); + return 0; +} + +EXPORT_SYMBOL(do_settimeofday); + +unsigned long profile_pc(struct pt_regs *regs) +{ + unsigned long pc = instruction_pointer(regs); + + /* Assume the lock function has either no stack frame or only a single word. + This checks if the address on the stack looks like a kernel text address. + There is a small window for false hits, but in that case the tick + is just accounted to the spinlock function. + Better would be to write these functions in assembler again + and check exactly. */ + if (in_lock_functions(pc)) { + char *v = *(char **)regs->rsp; + if ((v >= _stext && v <= _etext) || + (v >= _sinittext && v <= _einittext) || + (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END)) + return (unsigned long)v; + return ((unsigned long *)regs->rsp)[1]; + } + return pc; +} +EXPORT_SYMBOL(profile_pc); + +/* + * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500 + * ms after the second nowtime has started, because when nowtime is written + * into the registers of the CMOS clock, it will jump to the next second + * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data + * sheet for details. + */ + +static void set_rtc_mmss(unsigned long nowtime) +{ + int real_seconds, real_minutes, cmos_minutes; + unsigned char control, freq_select; + +/* + * IRQs are disabled when we're called from the timer interrupt, + * no need for spin_lock_irqsave() + */ + + spin_lock(&rtc_lock); + +/* + * Tell the clock it's being set and stop it. + */ + + control = CMOS_READ(RTC_CONTROL); + CMOS_WRITE(control | RTC_SET, RTC_CONTROL); + + freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT); + + cmos_minutes = CMOS_READ(RTC_MINUTES); + BCD_TO_BIN(cmos_minutes); + +/* + * since we're only adjusting minutes and seconds, don't interfere with hour + * overflow. This avoids messing with unknown time zones but requires your RTC + * not to be off by more than 15 minutes. 
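set_rtc_mmss() talks to the CMOS clock in BCD, where each nibble holds one decimal digit, so the minutes value read above and the values written back below go through BCD_TO_BIN and BIN_TO_BCD. A standalone round-trip of the same nibble arithmetic (the macros mirror the kernel's definitions from the RTC headers):

#include <stdio.h>

#define BCD_TO_BIN(x)	((x) = ((x) & 0x0f) + ((x) >> 4) * 10)
#define BIN_TO_BCD(x)	((x) = (((x) / 10) << 4) + (x) % 10)

int main(void)
{
	unsigned char m = 0x59;		/* BCD byte as read from the CMOS minutes register */

	BCD_TO_BIN(m);
	printf("BCD 0x59 -> %u minutes\n", m);	/* 59 */

	m = 7;
	BIN_TO_BCD(m);
	printf("7 minutes -> BCD 0x%02x\n", m);	/* 0x07 */
	return 0;
}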
Since we're calling it only when + * our clock is externally synchronized using NTP, this shouldn't be a problem. + */ + + real_seconds = nowtime % 60; + real_minutes = nowtime / 60; + if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1) + real_minutes += 30; /* correct for half hour time zone */ + real_minutes %= 60; + +#if 0 + /* AMD 8111 is a really bad time keeper and hits this regularly. + It probably was an attempt to avoid screwing up DST, but ignore + that for now. */ + if (abs(real_minutes - cmos_minutes) >= 30) { + printk(KERN_WARNING "time.c: can't update CMOS clock " + "from %d to %d\n", cmos_minutes, real_minutes); + } else +#endif + + { + BIN_TO_BCD(real_seconds); + BIN_TO_BCD(real_minutes); + CMOS_WRITE(real_seconds, RTC_SECONDS); + CMOS_WRITE(real_minutes, RTC_MINUTES); + } + +/* + * The following flags have to be released exactly in this order, otherwise the + * DS12887 (popular MC146818A clone with integrated battery and quartz) will + * not reset the oscillator and will not update precisely 500 ms later. You + * won't find this mentioned in the Dallas Semiconductor data sheets, but who + * believes data sheets anyway ... -- Markus Kuhn + */ + + CMOS_WRITE(control, RTC_CONTROL); + CMOS_WRITE(freq_select, RTC_FREQ_SELECT); + + spin_unlock(&rtc_lock); +} + + +/* monotonic_clock(): returns # of nanoseconds passed since time_init() + * Note: This function is required to return accurate + * time even in the absence of multiple timer ticks. + */ +unsigned long long monotonic_clock(void) +{ + unsigned long seq; + u32 last_offset, this_offset, offset; + unsigned long long base; + + if (vxtime.mode == VXTIME_HPET) { + do { + seq = read_seqbegin(&xtime_lock); + + last_offset = vxtime.last; + base = monotonic_base; + this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick; + + } while (read_seqretry(&xtime_lock, seq)); + offset = (this_offset - last_offset); + offset *=(NSEC_PER_SEC/HZ)/hpet_tick; + return base + offset; + }else{ + do { + seq = read_seqbegin(&xtime_lock); + + last_offset = vxtime.last_tsc; + base = monotonic_base; + } while (read_seqretry(&xtime_lock, seq)); + sync_core(); + rdtscll(this_offset); + offset = (this_offset - last_offset)*1000/cpu_khz; + return base + offset; + } + + +} +EXPORT_SYMBOL(monotonic_clock); + +static noinline void handle_lost_ticks(int lost, struct pt_regs *regs) +{ + static long lost_count; + static int warned; + + if (report_lost_ticks) { + printk(KERN_WARNING "time.c: Lost %d timer " + "tick(s)! ", lost); + print_symbol("rip %s)\n", regs->rip); + } + + if (lost_count == 1000 && !warned) { + printk(KERN_WARNING + "warning: many lost ticks.\n" + KERN_WARNING "Your time source seems to be instable or " + "some driver is hogging interupts\n"); + print_symbol("rip %s\n", regs->rip); + if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) { + printk(KERN_WARNING "Falling back to HPET\n"); + vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; + vxtime.mode = VXTIME_HPET; + do_gettimeoffset = do_gettimeoffset_hpet; + } + /* else should fall back to PIT, but code missing. */ + warned = 1; + } else + lost_count++; + +#ifdef CONFIG_CPU_FREQ + /* In some cases the CPU can change frequency without us noticing + (like going into thermal throttle) + Give cpufreq a change to catch up. 
*/ + if ((lost_count+1) % 25 == 0) { + cpufreq_delayed_get(); + } +#endif +} + +static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + static unsigned long rtc_update = 0; + unsigned long tsc; + int delay, offset = 0, lost = 0; + +/* + * Here we are in the timer irq handler. We have irqs locally disabled (so we + * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running + * on the other CPU, so we need a lock. We also need to lock the vsyscall + * variables, because both do_timer() and us change them -arca+vojtech + */ + + write_seqlock(&xtime_lock); + + if (vxtime.hpet_address) { + offset = hpet_readl(HPET_T0_CMP) - hpet_tick; + delay = hpet_readl(HPET_COUNTER) - offset; + } else { + spin_lock(&i8253_lock); + outb_p(0x00, 0x43); + delay = inb_p(0x40); + delay |= inb(0x40) << 8; + spin_unlock(&i8253_lock); + delay = LATCH - 1 - delay; + } + + rdtscll_sync(&tsc); + + if (vxtime.mode == VXTIME_HPET) { + if (offset - vxtime.last > hpet_tick) { + lost = (offset - vxtime.last) / hpet_tick - 1; + } + + monotonic_base += + (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; + + vxtime.last = offset; + } else { + offset = (((tsc - vxtime.last_tsc) * + vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); + + if (offset < 0) + offset = 0; + + if (offset > (USEC_PER_SEC / HZ)) { + lost = offset / (USEC_PER_SEC / HZ); + offset %= (USEC_PER_SEC / HZ); + } + + monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ; + + vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; + + if ((((tsc - vxtime.last_tsc) * + vxtime.tsc_quot) >> 32) < offset) + vxtime.last_tsc = tsc - + (((long) offset << 32) / vxtime.tsc_quot) - 1; + } + + if (lost > 0) { + handle_lost_ticks(lost, regs); + jiffies += lost; + } + +/* + * Do the timer stuff. + */ + + do_timer(regs); +#ifndef CONFIG_SMP + update_process_times(user_mode(regs)); +#endif + +/* + * In the SMP case we use the local APIC timer interrupt to do the profiling, + * except when we simulate SMP mode on a uniprocessor system, in that case we + * have to call the local interrupt handler. + */ + +#ifndef CONFIG_X86_LOCAL_APIC + profile_tick(CPU_PROFILING, regs); +#else + if (!using_apic_timer) + smp_local_timer_interrupt(regs); +#endif + +/* + * If we have an externally synchronized Linux clock, then update CMOS clock + * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy + * closest to exactly 500 ms before the next second. If the update fails, we + * don't care, as it'll be updated on the next turn, and the problem (time way + * off) isn't likely to go away much sooner anyway. + */ + + if ((~time_status & STA_UNSYNC) && xtime.tv_sec > rtc_update && + abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) { + set_rtc_mmss(xtime.tv_sec); + rtc_update = xtime.tv_sec + 660; + } + + write_sequnlock(&xtime_lock); + + return IRQ_HANDLED; +} + +static unsigned int cyc2ns_scale; +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline void set_cyc2ns_scale(unsigned long cpu_mhz) +{ + cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + +unsigned long long sched_clock(void) +{ + unsigned long a = 0; + +#if 0 + /* Don't do a HPET read here. Using TSC always is much faster + and HPET may not be mapped yet when the scheduler first runs. + Disadvantage is a small drift between CPUs in some configurations, + but that should be tolerable. 
*/ + if (__vxtime.mode == VXTIME_HPET) + return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32; +#endif + + /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, + which means it is not completely exact and may not be monotonous between + CPUs. But the errors should be too small to matter for scheduling + purposes. */ + + rdtscll(a); + return cycles_2_ns(a); +} + +unsigned long get_cmos_time(void) +{ + unsigned int timeout, year, mon, day, hour, min, sec; + unsigned char last, this; + unsigned long flags; + +/* + * The Linux interpretation of the CMOS clock register contents: When the + * Update-In-Progress (UIP) flag goes from 1 to 0, the RTC registers show the + * second which has precisely just started. Waiting for this can take up to 1 + * second, we timeout approximately after 2.4 seconds on a machine with + * standard 8.3 MHz ISA bus. + */ + + spin_lock_irqsave(&rtc_lock, flags); + + timeout = 1000000; + last = this = 0; + + while (timeout && last && !this) { + last = this; + this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP; + timeout--; + } + +/* + * Here we are safe to assume the registers won't change for a whole second, so + * we just go ahead and read them. + */ + + sec = CMOS_READ(RTC_SECONDS); + min = CMOS_READ(RTC_MINUTES); + hour = CMOS_READ(RTC_HOURS); + day = CMOS_READ(RTC_DAY_OF_MONTH); + mon = CMOS_READ(RTC_MONTH); + year = CMOS_READ(RTC_YEAR); + + spin_unlock_irqrestore(&rtc_lock, flags); + +/* + * We know that x86-64 always uses BCD format, no need to check the config + * register. + */ + + BCD_TO_BIN(sec); + BCD_TO_BIN(min); + BCD_TO_BIN(hour); + BCD_TO_BIN(day); + BCD_TO_BIN(mon); + BCD_TO_BIN(year); + +/* + * x86-64 systems only exists since 2002. + * This will work up to Dec 31, 2100 + */ + year += 2000; + + return mktime(year, mon, day, hour, min, sec); +} + +#ifdef CONFIG_CPU_FREQ + +/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency + changes. + + RED-PEN: On SMP we assume all CPUs run with the same frequency. It's + not that important because current Opteron setups do not support + scaling on SMP anyroads. + + Should fix up last_tsc too. Currently gettimeofday in the + first tick after the change will be slightly wrong. */ + +#include <linux/workqueue.h> + +static unsigned int cpufreq_delayed_issched = 0; +static unsigned int cpufreq_init = 0; +static struct work_struct cpufreq_delayed_get_work; + +static void handle_cpufreq_delayed_get(void *v) +{ + unsigned int cpu; + for_each_online_cpu(cpu) { + cpufreq_get(cpu); + } + cpufreq_delayed_issched = 0; +} + +/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries + * to verify the CPU frequency the timing core thinks the CPU is running + * at is still correct. + */ +static void cpufreq_delayed_get(void) +{ + static int warned; + if (cpufreq_init && !cpufreq_delayed_issched) { + cpufreq_delayed_issched = 1; + if (!warned) { + warned = 1; + printk(KERN_DEBUG "Losing some ticks... 
checking if CPU frequency changed.\n"); + } + schedule_work(&cpufreq_delayed_get_work); + } +} + +static unsigned int ref_freq = 0; +static unsigned long loops_per_jiffy_ref = 0; + +static unsigned long cpu_khz_ref = 0; + +static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + unsigned long *lpj, dummy; + + lpj = &dummy; + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) +#ifdef CONFIG_SMP + lpj = &cpu_data[freq->cpu].loops_per_jiffy; +#else + lpj = &boot_cpu_data.loops_per_jiffy; +#endif + + + + if (!ref_freq) { + ref_freq = freq->old; + loops_per_jiffy_ref = *lpj; + cpu_khz_ref = cpu_khz; + } + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + *lpj = + cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + + cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + vxtime.tsc_quot = (1000L << 32) / cpu_khz; + } + + set_cyc2ns_scale(cpu_khz_ref / 1000); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); + if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER)) + cpufreq_init = 1; + return 0; +} + +core_initcall(cpufreq_tsc); + +#endif + +/* + * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing + * it to the HPET timer of known frequency. + */ + +#define TICK_COUNT 100000000 + +static unsigned int __init hpet_calibrate_tsc(void) +{ + int tsc_start, hpet_start; + int tsc_now, hpet_now; + unsigned long flags; + + local_irq_save(flags); + local_irq_disable(); + + hpet_start = hpet_readl(HPET_COUNTER); + rdtscl(tsc_start); + + do { + local_irq_disable(); + hpet_now = hpet_readl(HPET_COUNTER); + sync_core(); + rdtscl(tsc_now); + local_irq_restore(flags); + } while ((tsc_now - tsc_start) < TICK_COUNT && + (hpet_now - hpet_start) < TICK_COUNT); + + return (tsc_now - tsc_start) * 1000000000L + / ((hpet_now - hpet_start) * hpet_period / 1000); +} + + +/* + * pit_calibrate_tsc() uses the speaker output (channel 2) of + * the PIT. This is better than using the timer interrupt output, + * because we can read the value of the speaker with just one inb(), + * where we need three i/o operations for the interrupt channel. + * We count how many ticks the TSC does in 50 ms. + */ + +static unsigned int __init pit_calibrate_tsc(void) +{ + unsigned long start, end; + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + + outb((inb(0x61) & ~0x02) | 0x01, 0x61); + + outb(0xb0, 0x43); + outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42); + outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42); + rdtscll(start); + sync_core(); + while ((inb(0x61) & 0x20) == 0); + sync_core(); + rdtscll(end); + + spin_unlock_irqrestore(&i8253_lock, flags); + + return (end - start) / 50; +} + +#ifdef CONFIG_HPET +static __init int late_hpet_init(void) +{ + struct hpet_data hd; + unsigned int ntimer; + + if (!vxtime.hpet_address) + return -1; + + memset(&hd, 0, sizeof (hd)); + + ntimer = hpet_readl(HPET_ID); + ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; + ntimer++; + + /* + * Register with driver. + * Timer0 and Timer1 is used by platform. 
+ */ + hd.hd_phys_address = vxtime.hpet_address; + hd.hd_address = (void *)fix_to_virt(FIX_HPET_BASE); + hd.hd_nirqs = ntimer; + hd.hd_flags = HPET_DATA_PLATFORM; + hpet_reserve_timer(&hd, 0); +#ifdef CONFIG_HPET_EMULATE_RTC + hpet_reserve_timer(&hd, 1); +#endif + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + if (ntimer > 2) { + struct hpet *hpet; + struct hpet_timer *timer; + int i; + + hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); + + for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer; + timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & + Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; + + } + + hpet_alloc(&hd); + return 0; +} +fs_initcall(late_hpet_init); +#endif + +static int hpet_timer_stop_set_go(unsigned long tick) +{ + unsigned int cfg; + +/* + * Stop the timers and reset the main counter. + */ + + cfg = hpet_readl(HPET_CFG); + cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); + hpet_writel(cfg, HPET_CFG); + hpet_writel(0, HPET_COUNTER); + hpet_writel(0, HPET_COUNTER + 4); + +/* + * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, + * and period also hpet_tick. + */ + + hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + HPET_TN_32BIT, HPET_T0_CFG); + hpet_writel(hpet_tick, HPET_T0_CMP); + hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ + +/* + * Go! + */ + + cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY; + hpet_writel(cfg, HPET_CFG); + + return 0; +} + +static int hpet_init(void) +{ + unsigned int id; + + if (!vxtime.hpet_address) + return -1; + set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address); + __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); + +/* + * Read the period, compute tick and quotient. + */ + + id = hpet_readl(HPET_ID); + + if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) || + !(id & HPET_ID_LEGSUP)) + return -1; + + hpet_period = hpet_readl(HPET_PERIOD); + if (hpet_period < 100000 || hpet_period > 100000000) + return -1; + + hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) / + hpet_period; + + return hpet_timer_stop_set_go(hpet_tick); +} + +static int hpet_reenable(void) +{ + return hpet_timer_stop_set_go(hpet_tick); +} + +void __init pit_init(void) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + outb_p(0x34, 0x43); /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(LATCH & 0xff, 0x40); /* LSB */ + outb_p(LATCH >> 8, 0x40); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); +} + +int __init time_setup(char *str) +{ + report_lost_ticks = 1; + return 1; +} + +static struct irqaction irq0 = { + timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL +}; + +extern void __init config_acpi_tables(void); + +void __init time_init(void) +{ + char *timename; + +#ifdef HPET_HACK_ENABLE_DANGEROUS + if (!vxtime.hpet_address) { + printk(KERN_WARNING "time.c: WARNING: Enabling HPET base " + "manually!\n"); + outl(0x800038a0, 0xcf8); + outl(0xff000001, 0xcfc); + outl(0x800038a0, 0xcf8); + vxtime.hpet_address = inl(0xcfc) & 0xfffffffe; + printk(KERN_WARNING "time.c: WARNING: Enabled HPET " + "at %#lx.\n", vxtime.hpet_address); + } +#endif + if (nohpet) + vxtime.hpet_address = 0; + + xtime.tv_sec = get_cmos_time(); + xtime.tv_nsec = 0; + + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + if (!hpet_init()) { + vxtime_hz = (1000000000000000L + hpet_period / 2) / + hpet_period; + cpu_khz = hpet_calibrate_tsc(); + timename = "HPET"; + } else { + pit_init(); + cpu_khz = pit_calibrate_tsc(); + 
timename = "PIT"; + } + + printk(KERN_INFO "time.c: Using %ld.%06ld MHz %s timer.\n", + vxtime_hz / 1000000, vxtime_hz % 1000000, timename); + printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + vxtime.mode = VXTIME_TSC; + vxtime.quot = (1000000L << 32) / vxtime_hz; + vxtime.tsc_quot = (1000L << 32) / cpu_khz; + vxtime.hz = vxtime_hz; + rdtscll_sync(&vxtime.last_tsc); + setup_irq(0, &irq0); + + set_cyc2ns_scale(cpu_khz / 1000); +} + +void __init time_init_smp(void) +{ + char *timetype; + + /* + * AMD systems with more than one CPU don't have fully synchronized + * TSCs. Always use HPET gettimeofday for these, although it is slower. + * Intel SMP systems usually have synchronized TSCs, so use always + * the TSC. + * + * Exceptions: + * IBM Summit2 checked by oem_force_hpet_timer(). + * AMD dual core may also not need HPET. Check me. + * + * Can be turned off with "notsc". + */ + if (num_online_cpus() > 1 && + boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + notsc = 1; + /* Some systems will want to disable TSC and use HPET. */ + if (oem_force_hpet_timer()) + notsc = 1; + if (vxtime.hpet_address && notsc) { + timetype = "HPET"; + vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; + vxtime.mode = VXTIME_HPET; + do_gettimeoffset = do_gettimeoffset_hpet; + } else { + timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC"; + vxtime.mode = VXTIME_TSC; + } + + printk(KERN_INFO "time.c: Using %s based timekeeping.\n", timetype); +} + +__setup("report_lost_ticks", time_setup); + +static long clock_cmos_diff; +static unsigned long sleep_start; + +static int timer_suspend(struct sys_device *dev, u32 state) +{ + /* + * Estimate time zone so that set_time can update the clock + */ + long cmos_time = get_cmos_time(); + + clock_cmos_diff = -cmos_time; + clock_cmos_diff += get_seconds(); + sleep_start = cmos_time; + return 0; +} + +static int timer_resume(struct sys_device *dev) +{ + unsigned long flags; + unsigned long sec; + unsigned long ctime = get_cmos_time(); + unsigned long sleep_length = (ctime - sleep_start) * HZ; + + if (vxtime.hpet_address) + hpet_reenable(); + else + i8254_timer_resume(); + + sec = ctime + clock_cmos_diff; + write_seqlock_irqsave(&xtime_lock,flags); + xtime.tv_sec = sec; + xtime.tv_nsec = 0; + write_sequnlock_irqrestore(&xtime_lock,flags); + jiffies += sleep_length; + wall_jiffies += sleep_length; + return 0; +} + +static struct sysdev_class timer_sysclass = { + .resume = timer_resume, + .suspend = timer_suspend, + set_kset_name("timer"), +}; + + +/* XXX this driverfs stuff should probably go elsewhere later -john */ +static struct sys_device device_timer = { + .id = 0, + .cls = &timer_sysclass, +}; + +static int time_init_device(void) +{ + int error = sysdev_class_register(&timer_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(time_init_device); + +#ifdef CONFIG_HPET_EMULATE_RTC +/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET + * is enabled, we support RTC interrupt functionality in software. + * RTC has 3 kinds of interrupts: + * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock + * is updated + * 2) Alarm Interrupt - generate an interrupt at a specific time of day + * 3) Periodic Interrupt - generate periodic interrupt, with frequencies + * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) + * (1) and (2) above are implemented using polling at a frequency of + * 64 Hz. 
The exact frequency is a tradeoff between accuracy and interrupt + * overhead. (DEFAULT_RTC_INT_FREQ) + * For (3), we use interrupts at 64Hz or user specified periodic + * frequency, whichever is higher. + */ +#include <linux/rtc.h> + +extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs); + +#define DEFAULT_RTC_INT_FREQ 64 +#define RTC_NUM_INTS 1 + +static unsigned long UIE_on; +static unsigned long prev_update_sec; + +static unsigned long AIE_on; +static struct rtc_time alarm_time; + +static unsigned long PIE_on; +static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; +static unsigned long PIE_count; + +static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ + +int is_hpet_enabled(void) +{ + return vxtime.hpet_address != 0; +} + +/* + * Timer 1 for RTC, we do not use periodic interrupt feature, + * even if HPET supports periodic interrupts on Timer 1. + * The reason being, to set up a periodic interrupt in HPET, we need to + * stop the main counter. And if we do that everytime someone diables/enables + * RTC, we will have adverse effect on main kernel timer running on Timer 0. + * So, for the time being, simulate the periodic interrupt in software. + * + * hpet_rtc_timer_init() is called for the first time and during subsequent + * interuppts reinit happens through hpet_rtc_timer_reinit(). + */ +int hpet_rtc_timer_init(void) +{ + unsigned int cfg, cnt; + unsigned long flags; + + if (!is_hpet_enabled()) + return 0; + /* + * Set the counter 1 and enable the interrupts. + */ + if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) + hpet_rtc_int_freq = PIE_freq; + else + hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; + + local_irq_save(flags); + cnt = hpet_readl(HPET_COUNTER); + cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); + hpet_writel(cnt, HPET_T1_CMP); + local_irq_restore(flags); + + cfg = hpet_readl(HPET_T1_CFG); + cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T1_CFG); + + return 1; +} + +static void hpet_rtc_timer_reinit(void) +{ + unsigned int cfg, cnt; + + if (!(PIE_on | AIE_on | UIE_on)) + return; + + if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) + hpet_rtc_int_freq = PIE_freq; + else + hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; + + /* It is more accurate to use the comparator value than current count.*/ + cnt = hpet_readl(HPET_T1_CMP); + cnt += hpet_tick*HZ/hpet_rtc_int_freq; + hpet_writel(cnt, HPET_T1_CMP); + + cfg = hpet_readl(HPET_T1_CFG); + cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T1_CFG); + + return; +} + +/* + * The functions below are called from rtc driver. + * Return 0 if HPET is not being used. + * Otherwise do the necessary changes and return 1. 
+ */ +int hpet_mask_rtc_irq_bit(unsigned long bit_mask) +{ + if (!is_hpet_enabled()) + return 0; + + if (bit_mask & RTC_UIE) + UIE_on = 0; + if (bit_mask & RTC_PIE) + PIE_on = 0; + if (bit_mask & RTC_AIE) + AIE_on = 0; + + return 1; +} + +int hpet_set_rtc_irq_bit(unsigned long bit_mask) +{ + int timer_init_reqd = 0; + + if (!is_hpet_enabled()) + return 0; + + if (!(PIE_on | AIE_on | UIE_on)) + timer_init_reqd = 1; + + if (bit_mask & RTC_UIE) { + UIE_on = 1; + } + if (bit_mask & RTC_PIE) { + PIE_on = 1; + PIE_count = 0; + } + if (bit_mask & RTC_AIE) { + AIE_on = 1; + } + + if (timer_init_reqd) + hpet_rtc_timer_init(); + + return 1; +} + +int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) +{ + if (!is_hpet_enabled()) + return 0; + + alarm_time.tm_hour = hrs; + alarm_time.tm_min = min; + alarm_time.tm_sec = sec; + + return 1; +} + +int hpet_set_periodic_freq(unsigned long freq) +{ + if (!is_hpet_enabled()) + return 0; + + PIE_freq = freq; + PIE_count = 0; + + return 1; +} + +int hpet_rtc_dropped_irq(void) +{ + if (!is_hpet_enabled()) + return 0; + + return 1; +} + +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct rtc_time curr_time; + unsigned long rtc_int_flag = 0; + int call_rtc_interrupt = 0; + + hpet_rtc_timer_reinit(); + + if (UIE_on | AIE_on) { + rtc_get_rtc_time(&curr_time); + } + if (UIE_on) { + if (curr_time.tm_sec != prev_update_sec) { + /* Set update int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag = RTC_UF; + prev_update_sec = curr_time.tm_sec; + } + } + if (PIE_on) { + PIE_count++; + if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { + /* Set periodic int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag |= RTC_PF; + PIE_count = 0; + } + } + if (AIE_on) { + if ((curr_time.tm_sec == alarm_time.tm_sec) && + (curr_time.tm_min == alarm_time.tm_min) && + (curr_time.tm_hour == alarm_time.tm_hour)) { + /* Set alarm int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag |= RTC_AF; + } + } + if (call_rtc_interrupt) { + rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); + rtc_interrupt(rtc_int_flag, dev_id, regs); + } + return IRQ_HANDLED; +} +#endif + + + +static int __init nohpet_setup(char *s) +{ + nohpet = 1; + return 0; +} + +__setup("nohpet", nohpet_setup); + + +static int __init notsc_setup(char *s) +{ + notsc = 1; + return 0; +} + +__setup("notsc", notsc_setup); + + diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S new file mode 100644 index 00000000000..6d9c9a8e7d0 --- /dev/null +++ b/arch/x86_64/kernel/trampoline.S @@ -0,0 +1,64 @@ +/* + * + * Trampoline.S Derived from Setup.S by Linus Torvalds + * + * 4 Jan 1997 Michael Chastain: changed to gnu as. + * + * Entry: CS:IP point to the start of our code, we are + * in real mode with no stack, but the rest of the + * trampoline page to make our stack and everything else + * is a mystery. + * + * In fact we don't actually need a stack so we don't + * set one up. + * + * On entry to trampoline_data, the processor is in real mode + * with 16-bit addressing and 16-bit data. CS has some value + * and IP is zero. Thus, data addresses need to be absolute + * (no relocation) and are taken with regard to r_base. + * + * If you work on this file, check the object module with objdump + * --full-contents --reloc to make sure there are no relocation + * entries. For the GDT entry we do hand relocation in smpboot.c + * because of 64bit linker limitations. 
+ */ + +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/page.h> + +.data + +.code16 + +ENTRY(trampoline_data) +r_base = . + wbinvd + mov %cs, %ax # Code and data in the same place + mov %ax, %ds + + cli # We should be safe anyway + + movl $0xA5A5A5A5, trampoline_data - r_base + # write marker for master knows we're running + + lidt idt_48 - r_base # load idt with 0, 0 + lgdt gdt_48 - r_base # load gdt with whatever is appropriate + + xor %ax, %ax + inc %ax # protected mode (PE) bit + lmsw %ax # into protected mode + # flaush prefetch and jump to startup_32 in arch/x86_64/kernel/head.S + ljmpl $__KERNEL32_CS, $(startup_32-__START_KERNEL_map) + + # Careful these need to be in the same 64K segment as the above; +idt_48: + .word 0 # idt limit = 0 + .word 0, 0 # idt base = 0L + +gdt_48: + .short __KERNEL32_CS + 7 # gdt limit + .long cpu_gdt_table-__START_KERNEL_map + +.globl trampoline_end +trampoline_end: diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c new file mode 100644 index 00000000000..d87ac6496cc --- /dev/null +++ b/arch/x86_64/kernel/traps.c @@ -0,0 +1,948 @@ +/* + * linux/arch/x86-64/traps.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + * + * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $ + */ + +/* + * 'Traps.c' handles hardware traps and faults after we have saved some + * state in 'entry.S'. + */ +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/moduleparam.h> + +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/atomic.h> +#include <asm/debugreg.h> +#include <asm/desc.h> +#include <asm/i387.h> +#include <asm/kdebug.h> +#include <asm/processor.h> + +#include <asm/smp.h> +#include <asm/pgalloc.h> +#include <asm/pda.h> +#include <asm/proto.h> +#include <asm/nmi.h> + +#include <linux/irq.h> + + +extern struct gate_struct idt_table[256]; + +asmlinkage void divide_error(void); +asmlinkage void debug(void); +asmlinkage void nmi(void); +asmlinkage void int3(void); +asmlinkage void overflow(void); +asmlinkage void bounds(void); +asmlinkage void invalid_op(void); +asmlinkage void device_not_available(void); +asmlinkage void double_fault(void); +asmlinkage void coprocessor_segment_overrun(void); +asmlinkage void invalid_TSS(void); +asmlinkage void segment_not_present(void); +asmlinkage void stack_segment(void); +asmlinkage void general_protection(void); +asmlinkage void page_fault(void); +asmlinkage void coprocessor_error(void); +asmlinkage void simd_coprocessor_error(void); +asmlinkage void reserved(void); +asmlinkage void alignment_check(void); +asmlinkage void machine_check(void); +asmlinkage void spurious_interrupt_bug(void); +asmlinkage void call_debug(void); + +struct notifier_block *die_chain; +static DEFINE_SPINLOCK(die_notifier_lock); + +int register_die_notifier(struct notifier_block *nb) +{ + int err = 0; + unsigned long flags; + spin_lock_irqsave(&die_notifier_lock, flags); + err = notifier_chain_register(&die_chain, nb); + spin_unlock_irqrestore(&die_notifier_lock, flags); + return err; +} + +static inline void conditional_sti(struct pt_regs *regs) 
+{ + if (regs->eflags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static int kstack_depth_to_print = 10; + +#ifdef CONFIG_KALLSYMS +#include <linux/kallsyms.h> +int printk_address(unsigned long address) +{ + unsigned long offset = 0, symsize; + const char *symname; + char *modname; + char *delim = ":"; + char namebuf[128]; + + symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); + if (!symname) + return printk("[<%016lx>]", address); + if (!modname) + modname = delim = ""; + return printk("<%016lx>{%s%s%s%s%+ld}", + address,delim,modname,delim,symname,offset); +} +#else +int printk_address(unsigned long address) +{ + return printk("[<%016lx>]", address); +} +#endif + +unsigned long *in_exception_stack(int cpu, unsigned long stack) +{ + int k; + for (k = 0; k < N_EXCEPTION_STACKS; k++) { + struct tss_struct *tss = &per_cpu(init_tss, cpu); + unsigned long start = tss->ist[k] - EXCEPTION_STKSZ; + + if (stack >= start && stack < tss->ist[k]) + return (unsigned long *)tss->ist[k]; + } + return NULL; +} + +/* + * x86-64 can have upto three kernel stacks: + * process stack + * interrupt stack + * severe exception (double fault, nmi, stack fault) hardware stack + * Check and process them in order. + */ + +void show_trace(unsigned long *stack) +{ + unsigned long addr; + unsigned long *irqstack, *irqstack_end, *estack_end; + const int cpu = safe_smp_processor_id(); + int i; + + printk("\nCall Trace:"); + i = 0; + + estack_end = in_exception_stack(cpu, (unsigned long)stack); + if (estack_end) { + while (stack < estack_end) { + addr = *stack++; + if (__kernel_text_address(addr)) { + i += printk_address(addr); + i += printk(" "); + if (i > 50) { + printk("\n"); + i = 0; + } + } + } + i += printk(" <EOE> "); + i += 7; + stack = (unsigned long *) estack_end[-2]; + } + + irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr); + irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE + 64); + + if (stack >= irqstack && stack < irqstack_end) { + printk("<IRQ> "); + while (stack < irqstack_end) { + addr = *stack++; + /* + * If the address is either in the text segment of the + * kernel, or in the region which contains vmalloc'ed + * memory, it *may* be the address of a calling + * routine; if so, print it so that someone tracing + * down the cause of the crash will be able to figure + * out the call path that was taken. + */ + if (__kernel_text_address(addr)) { + i += printk_address(addr); + i += printk(" "); + if (i > 50) { + printk("\n "); + i = 0; + } + } + } + stack = (unsigned long *) (irqstack_end[-1]); + printk(" <EOI> "); + i += 7; + } + + while (((long) stack & (THREAD_SIZE-1)) != 0) { + addr = *stack++; + if (__kernel_text_address(addr)) { + i += printk_address(addr); + i += printk(" "); + if (i > 50) { + printk("\n "); + i = 0; + } + } + } + printk("\n"); +} + +void show_stack(struct task_struct *tsk, unsigned long * rsp) +{ + unsigned long *stack; + int i; + const int cpu = safe_smp_processor_id(); + unsigned long *irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr); + unsigned long *irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE); + + // debugging aid: "show_stack(NULL, NULL);" prints the + // back trace for this cpu. 
+ + if (rsp == NULL) { + if (tsk) + rsp = (unsigned long *)tsk->thread.rsp; + else + rsp = (unsigned long *)&rsp; + } + + stack = rsp; + for(i=0; i < kstack_depth_to_print; i++) { + if (stack >= irqstack && stack <= irqstack_end) { + if (stack == irqstack_end) { + stack = (unsigned long *) (irqstack_end[-1]); + printk(" <EOI> "); + } + } else { + if (((long) stack & (THREAD_SIZE-1)) == 0) + break; + } + if (i && ((i % 4) == 0)) + printk("\n "); + printk("%016lx ", *stack++); + } + show_trace((unsigned long *)rsp); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long dummy; + show_trace(&dummy); +} + +EXPORT_SYMBOL(dump_stack); + +void show_registers(struct pt_regs *regs) +{ + int i; + int in_kernel = (regs->cs & 3) == 0; + unsigned long rsp; + const int cpu = safe_smp_processor_id(); + struct task_struct *cur = cpu_pda[cpu].pcurrent; + + rsp = regs->rsp; + + printk("CPU %d ", cpu); + __show_regs(regs); + printk("Process %s (pid: %d, threadinfo %p, task %p)\n", + cur->comm, cur->pid, cur->thread_info, cur); + + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. + */ + if (in_kernel) { + + printk("Stack: "); + show_stack(NULL, (unsigned long*)rsp); + + printk("\nCode: "); + if(regs->rip < PAGE_OFFSET) + goto bad; + + for(i=0;i<20;i++) + { + unsigned char c; + if(__get_user(c, &((unsigned char*)regs->rip)[i])) { +bad: + printk(" Bad RIP value."); + break; + } + printk("%02x ", c); + } + } + printk("\n"); +} + +void handle_BUG(struct pt_regs *regs) +{ + struct bug_frame f; + char tmp; + + if (regs->cs & 3) + return; + if (__copy_from_user(&f, (struct bug_frame *) regs->rip, + sizeof(struct bug_frame))) + return; + if ((unsigned long)f.filename < __PAGE_OFFSET || + f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) + return; + if (__get_user(tmp, f.filename)) + f.filename = "unmapped filename"; + printk("----------- [cut here ] --------- [please bite here ] ---------\n"); + printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", f.filename, f.line); +} + +void out_of_line_bug(void) +{ + BUG(); +} + +static DEFINE_SPINLOCK(die_lock); +static int die_owner = -1; + +void oops_begin(void) +{ + int cpu = safe_smp_processor_id(); + /* racy, but better than risking deadlock. */ + local_irq_disable(); + if (!spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. 
should stop eventually */; + else + spin_lock(&die_lock); + } + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); +} + +void oops_end(void) +{ + die_owner = -1; + bust_spinlocks(0); + spin_unlock(&die_lock); + if (panic_on_oops) + panic("Oops"); +} + +void __die(const char * str, struct pt_regs * regs, long err) +{ + static int die_counter; + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); + show_registers(regs); + /* Executive summary in case the oops scrolled away */ + printk(KERN_ALERT "RIP "); + printk_address(regs->rip); + printk(" RSP <%016lx>\n", regs->rsp); +} + +void die(const char * str, struct pt_regs * regs, long err) +{ + oops_begin(); + handle_BUG(regs); + __die(str, regs, err); + oops_end(); + do_exit(SIGSEGV); +} +static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) +{ + if (!(regs->eflags & VM_MASK) && (regs->cs == __KERNEL_CS)) + die(str, regs, err); +} + +void die_nmi(char *str, struct pt_regs *regs) +{ + oops_begin(); + /* + * We are in trouble anyway, lets at least try + * to get a message out. + */ + printk(str, safe_smp_processor_id()); + show_registers(regs); + if (panic_on_timeout || panic_on_oops) + panic("nmi watchdog"); + printk("console shuts up ...\n"); + oops_end(); + do_exit(SIGSEGV); +} + +static void do_trap(int trapnr, int signr, char *str, + struct pt_regs * regs, long error_code, siginfo_t *info) +{ + conditional_sti(regs); + +#ifdef CONFIG_CHECKING + { + unsigned long gs; + struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); + rdmsrl(MSR_GS_BASE, gs); + if (gs != (unsigned long)pda) { + wrmsrl(MSR_GS_BASE, pda); + printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda, + regs->rip); + } + } +#endif + + if ((regs->cs & 3) != 0) { + struct task_struct *tsk = current; + + if (exception_trace && unhandled_signal(tsk, signr)) + printk(KERN_INFO + "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", + tsk->comm, tsk->pid, str, + regs->rip,regs->rsp,error_code); + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + if (info) + force_sig_info(signr, info, tsk); + else + force_sig(signr, tsk); + return; + } + + + /* kernel trap */ + { + const struct exception_table_entry *fixup; + fixup = search_exception_tables(regs->rip); + if (fixup) { + regs->rip = fixup->fixup; + } else + die(str, regs, error_code); + return; + } +} + +#define DO_ERROR(trapnr, signr, str, name) \ +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, regs, error_code, NULL); \ +} + +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, regs, error_code, &info); \ +} + +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) +DO_ERROR( 4, SIGSEGV, "overflow", overflow) +DO_ERROR( 5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO( 6, SIGILL, "invalid 
operand", invalid_op, ILL_ILLOPN, regs->rip) +DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) +DO_ERROR(18, SIGSEGV, "reserved", reserved) + +#define DO_ERROR_STACK(trapnr, signr, str, name) \ +asmlinkage void *do_##name(struct pt_regs * regs, long error_code) \ +{ \ + struct pt_regs *pr = ((struct pt_regs *)(current->thread.rsp0))-1; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return regs; \ + if (regs->cs & 3) { \ + memcpy(pr, regs, sizeof(struct pt_regs)); \ + regs = pr; \ + } \ + do_trap(trapnr, signr, str, regs, error_code, NULL); \ + return regs; \ +} + +DO_ERROR_STACK(12, SIGBUS, "stack segment", stack_segment) +DO_ERROR_STACK( 8, SIGSEGV, "double fault", double_fault) + +asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) +{ + conditional_sti(regs); + +#ifdef CONFIG_CHECKING + { + unsigned long gs; + struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); + rdmsrl(MSR_GS_BASE, gs); + if (gs != (unsigned long)pda) { + wrmsrl(MSR_GS_BASE, pda); + oops_in_progress++; + printk("general protection handler: wrong gs %lx expected %p\n", gs, pda); + oops_in_progress--; + } + } +#endif + + if ((regs->cs & 3)!=0) { + struct task_struct *tsk = current; + + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) + printk(KERN_INFO + "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", + tsk->comm, tsk->pid, + regs->rip,regs->rsp,error_code); + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + force_sig(SIGSEGV, tsk); + return; + } + + /* kernel gp */ + { + const struct exception_table_entry *fixup; + fixup = search_exception_tables(regs->rip); + if (fixup) { + regs->rip = fixup->fixup; + return; + } + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; + die("general protection fault", regs, error_code); + } +} + +static void mem_parity_error(unsigned char reason, struct pt_regs * regs) +{ + printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); + printk("You probably have a hardware problem with your RAM chips\n"); + + /* Clear and disable the memory parity error line. */ + reason = (reason & 0xf) | 4; + outb(reason, 0x61); +} + +static void io_check_error(unsigned char reason, struct pt_regs * regs) +{ + printk("NMI: IOCK error (debug interrupt?)\n"); + show_registers(regs); + + /* Re-enable the IOCK line, wait for a few seconds */ + reason = (reason & 0xf) | 8; + outb(reason, 0x61); + mdelay(2000); + reason &= ~8; + outb(reason, 0x61); +} + +static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) +{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); + printk("Dazed and confused, but trying to continue\n"); + printk("Do you have a strange power saving mode enabled?\n"); +} + +asmlinkage void default_do_nmi(struct pt_regs *regs) +{ + unsigned char reason = 0; + + /* Only the BSP gets external NMIs from the system. 
*/ + if (!smp_processor_id()) + reason = get_nmi_reason(); + + if (!(reason & 0xc0)) { + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) + == NOTIFY_STOP) + return; +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Ok, so this is none of the documented NMI sources, + * so it must be the NMI watchdog. + */ + if (nmi_watchdog > 0) { + nmi_watchdog_tick(regs,reason); + return; + } +#endif + unknown_nmi_error(reason, regs); + return; + } + if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP) + return; + + /* AK: following checks seem to be broken on modern chipsets. FIXME */ + + if (reason & 0x80) + mem_parity_error(reason, regs); + if (reason & 0x40) + io_check_error(reason, regs); +} + +asmlinkage void do_int3(struct pt_regs * regs, long error_code) +{ + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { + return; + } + do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); + return; +} + +/* runs on IST stack. */ +asmlinkage void *do_debug(struct pt_regs * regs, unsigned long error_code) +{ + struct pt_regs *pr; + unsigned long condition; + struct task_struct *tsk = current; + siginfo_t info; + + pr = (struct pt_regs *)(current->thread.rsp0)-1; + if (regs->cs & 3) { + memcpy(pr, regs, sizeof(struct pt_regs)); + regs = pr; + } + +#ifdef CONFIG_CHECKING + { + /* RED-PEN interaction with debugger - could destroy gs */ + unsigned long gs; + struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); + rdmsrl(MSR_GS_BASE, gs); + if (gs != (unsigned long)pda) { + wrmsrl(MSR_GS_BASE, pda); + printk("debug handler: wrong gs %lx expected %p\n", gs, pda); + } + } +#endif + + asm("movq %%db6,%0" : "=r" (condition)); + + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, + SIGTRAP) == NOTIFY_STOP) { + return regs; + } + conditional_sti(regs); + + /* Mask out spurious debug traps due to lazy DR7 setting */ + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { + if (!tsk->thread.debugreg7) { + goto clear_dr7; + } + } + + tsk->thread.debugreg6 = condition; + + /* Mask out spurious TF errors due to lazy TF clearing */ + if ((condition & DR_STEP) && + (notify_die(DIE_DEBUGSTEP, "debugstep", regs, condition, + 1, SIGTRAP) != NOTIFY_STOP)) { + /* + * The TF error should be masked out only if the current + * process is not traced and if the TRAP flag has been set + * previously by a tracing process (condition detected by + * the PT_DTRACE flag); remember that the i386 TRAP flag + * can be modified by the process itself in user mode, + * allowing programs to debug themselves without the ptrace() + * interface. 
+ */ + if ((regs->cs & 3) == 0) + goto clear_TF_reenable; + if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) + goto clear_TF; + } + + /* Ok, finally something we can handle */ + tsk->thread.trap_no = 1; + tsk->thread.error_code = error_code; + info.si_signo = SIGTRAP; + info.si_errno = 0; + info.si_code = TRAP_BRKPT; + if ((regs->cs & 3) == 0) + goto clear_dr7; + + info.si_addr = (void __user *)regs->rip; + force_sig_info(SIGTRAP, &info, tsk); +clear_dr7: + asm volatile("movq %0,%%db7"::"r"(0UL)); + notify_die(DIE_DEBUG, "debug", regs, condition, 1, SIGTRAP); + return regs; + +clear_TF_reenable: + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + +clear_TF: + /* RED-PEN could cause spurious errors */ + if (notify_die(DIE_DEBUG, "debug2", regs, condition, 1, SIGTRAP) + != NOTIFY_STOP) + regs->eflags &= ~TF_MASK; + return regs; +} + +static int kernel_math_error(struct pt_regs *regs, char *str) +{ + const struct exception_table_entry *fixup; + fixup = search_exception_tables(regs->rip); + if (fixup) { + regs->rip = fixup->fixup; + return 1; + } + notify_die(DIE_GPF, str, regs, 0, 16, SIGFPE); +#if 0 + /* This should be a die, but warn only for now */ + die(str, regs, 0); +#else + printk(KERN_DEBUG "%s: %s at ", current->comm, str); + printk_address(regs->rip); + printk("\n"); +#endif + return 0; +} + +/* + * Note that we play around with the 'TS' bit in an attempt to get + * the correct behaviour even in the presence of the asynchronous + * IRQ13 behaviour + */ +asmlinkage void do_coprocessor_error(struct pt_regs *regs) +{ + void __user *rip = (void __user *)(regs->rip); + struct task_struct * task; + siginfo_t info; + unsigned short cwd, swd; + + conditional_sti(regs); + if ((regs->cs & 3) == 0 && + kernel_math_error(regs, "kernel x87 math error")) + return; + + /* + * Save the info for the exception handler and clear the error. + */ + task = current; + save_init_fpu(task); + task->thread.trap_no = 16; + task->thread.error_code = 0; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = rip; + /* + * (~cwd & swd) will mask out exceptions that are not set to unmasked + * status. 0x3f is the exception bits in these regs, 0x200 is the + * C1 reg you need in case of a stack fault, 0x040 is the stack + * fault bit. 
We should only be taking one exception at a time, + * so if this combination doesn't produce any single exception, + * then we have a bad program that isn't synchronizing its FPU usage + * and it will suffer the consequences since we won't be able to + * fully reproduce the context of the exception + */ + cwd = get_fpu_cwd(task); + swd = get_fpu_swd(task); + switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) { + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + case 0x041: /* Stack Fault */ + case 0x241: /* Stack Fault | Direction */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); +} + +asmlinkage void bad_intr(void) +{ + printk("bad interrupt"); +} + +asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) +{ + void __user *rip = (void __user *)(regs->rip); + struct task_struct * task; + siginfo_t info; + unsigned short mxcsr; + + conditional_sti(regs); + if ((regs->cs & 3) == 0 && + kernel_math_error(regs, "simd math error")) + return; + + /* + * Save the info for the exception handler and clear the error. + */ + task = current; + save_init_fpu(task); + task->thread.trap_no = 19; + task->thread.error_code = 0; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = rip; + /* + * The SIMD FPU exceptions are handled a little differently, as there + * is only a single status/control register. Thus, to determine which + * unmasked exception was caught we must mask the exception mask bits + * at 0x1f80, and then use these to mask the exception bits at 0x3f. + */ + mxcsr = get_fpu_mxcsr(task); + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); +} + +asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) +{ +} + +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +{ +} + +/* + * 'math_state_restore()' saves the current math information in the + * old math state array, and gets the new ones from the current task + * + * Careful.. There are problems with IBM-designed IRQ13 behaviour. + * Don't touch unless you *really* know how it works. 
+ */ +asmlinkage void math_state_restore(void) +{ + struct task_struct *me = current; + clts(); /* Allow maths ops (or we recurse) */ + + if (!used_math()) + init_fpu(me); + restore_fpu_checking(&me->thread.i387.fxsave); + me->thread_info->status |= TS_USEDFPU; +} + +void do_call_debug(struct pt_regs *regs) +{ + notify_die(DIE_CALL, "debug call", regs, 0, 255, SIGINT); +} + +void __init trap_init(void) +{ + set_intr_gate(0,÷_error); + set_intr_gate_ist(1,&debug,DEBUG_STACK); + set_intr_gate_ist(2,&nmi,NMI_STACK); + set_system_gate(3,&int3); + set_system_gate(4,&overflow); /* int4-5 can be called from all */ + set_system_gate(5,&bounds); + set_intr_gate(6,&invalid_op); + set_intr_gate(7,&device_not_available); + set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK); + set_intr_gate(9,&coprocessor_segment_overrun); + set_intr_gate(10,&invalid_TSS); + set_intr_gate(11,&segment_not_present); + set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK); + set_intr_gate(13,&general_protection); + set_intr_gate(14,&page_fault); + set_intr_gate(15,&spurious_interrupt_bug); + set_intr_gate(16,&coprocessor_error); + set_intr_gate(17,&alignment_check); +#ifdef CONFIG_X86_MCE + set_intr_gate_ist(18,&machine_check, MCE_STACK); +#endif + set_intr_gate(19,&simd_coprocessor_error); + +#ifdef CONFIG_IA32_EMULATION + set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); +#endif + + set_intr_gate(KDB_VECTOR, call_debug); + + /* + * Should be a barrier for any external CPU state. + */ + cpu_init(); +} + + +/* Actual parsing is done early in setup.c. */ +static int __init oops_dummy(char *s) +{ + panic_on_oops = 1; + return -1; +} +__setup("oops=", oops_dummy); + +static int __init kstack_setup(char *s) +{ + kstack_depth_to_print = simple_strtoul(s,NULL,0); + return 0; +} +__setup("kstack=", kstack_setup); + diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S new file mode 100644 index 00000000000..59ebd5beda8 --- /dev/null +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -0,0 +1,164 @@ +/* ld script to make x86-64 Linux kernel + * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; + */ + +#include <asm-generic/vmlinux.lds.h> +#include <linux/config.h> + +OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") +OUTPUT_ARCH(i386:x86-64) +ENTRY(phys_startup_64) +jiffies_64 = jiffies; +SECTIONS +{ + . = 0xffffffff80100000; + phys_startup_64 = startup_64 - LOAD_OFFSET; + _text = .; /* Text and read-only data */ + .text : { + *(.text) + SCHED_TEXT + LOCK_TEXT + *(.fixup) + *(.gnu.warning) + } = 0x9090 + .text.lock : { *(.text.lock) } /* out-of-line lock text */ + + _etext = .; /* End of text section */ + + . = ALIGN(16); /* Exception table */ + __start___ex_table = .; + __ex_table : { *(__ex_table) } + __stop___ex_table = .; + + RODATA + + .data : { /* Data */ + *(.data) + CONSTRUCTORS + } + + _edata = .; /* End of data section */ + + __bss_start = .; /* BSS */ + .bss : { + *(.bss.page_aligned) + *(.bss) + } + __bss_end = .; + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .data.cacheline_aligned : { *(.data.cacheline_aligned) } + +#define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16) +#define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1)) +#define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES) + + .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) } + __vsyscall_0 = LOADADDR(.vsyscall_0); + . 
= ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) } + xtime_lock = LOADADDR(.xtime_lock); + .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) } + vxtime = LOADADDR(.vxtime); + .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) } + wall_jiffies = LOADADDR(.wall_jiffies); + .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) } + sys_tz = LOADADDR(.sys_tz); + .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) } + sysctl_vsyscall = LOADADDR(.sysctl_vsyscall); + .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) } + xtime = LOADADDR(.xtime); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) } + jiffies = LOADADDR(.jiffies); + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) } + . = LOADADDR(.vsyscall_0) + 4096; + + . = ALIGN(8192); /* init_task */ + .data.init_task : { *(.data.init_task) } + + . = ALIGN(4096); + .data.page_aligned : { *(.data.page_aligned) } + + . = ALIGN(4096); /* Init code and data */ + __init_begin = .; + .init.text : { + _sinittext = .; + *(.init.text) + _einittext = .; + } + __initdata_begin = .; + .init.data : { *(.init.data) } + __initdata_end = .; + . = ALIGN(16); + __setup_start = .; + .init.setup : { *(.init.setup) } + __setup_end = .; + __initcall_start = .; + .initcall.init : { + *(.initcall1.init) + *(.initcall2.init) + *(.initcall3.init) + *(.initcall4.init) + *(.initcall5.init) + *(.initcall6.init) + *(.initcall7.init) + } + __initcall_end = .; + __con_initcall_start = .; + .con_initcall.init : { *(.con_initcall.init) } + __con_initcall_end = .; + SECURITY_INIT + . = ALIGN(8); + __alt_instructions = .; + .altinstructions : { *(.altinstructions) } + __alt_instructions_end = .; + .altinstr_replacement : { *(.altinstr_replacement) } + /* .exit.text is discard at runtime, not link time, to deal with references + from .altinstructions and .eh_frame */ + .exit.text : { *(.exit.text) } + .exit.data : { *(.exit.data) } + . = ALIGN(4096); + __initramfs_start = .; + .init.ramfs : { *(.init.ramfs) } + __initramfs_end = .; + . = ALIGN(32); + __per_cpu_start = .; + .data.percpu : { *(.data.percpu) } + __per_cpu_end = .; + . = ALIGN(4096); + __init_end = .; + + . = ALIGN(4096); + __nosave_begin = .; + .data_nosave : { *(.data.nosave) } + . = ALIGN(4096); + __nosave_end = .; + + _end = . ; + + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) +#ifndef CONFIG_DEBUG_INFO + *(.eh_frame) +#endif + } + + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + + + .comment 0 : { *(.comment) } +} diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c new file mode 100644 index 00000000000..b4b8dc59663 --- /dev/null +++ b/arch/x86_64/kernel/vsyscall.c @@ -0,0 +1,225 @@ +/* + * linux/arch/x86_64/kernel/vsyscall.c + * + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Thanks to hpa@transmeta.com for some useful hint. 
+ * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. + * + * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located + * at virtual address -10Mbyte+1024bytes etc... There are at max 4 + * vsyscalls. One vsyscall can reserve more than 1 slot to avoid + * jumping out of line if necessary. We cannot add more with this + * mechanism because older kernels won't return -ENOSYS. + * If we want more than four we need a vDSO. + * + * Note: the concept clashes with user mode linux. If you use UML and + * want per guest time just set the kernel.vsyscall64 sysctl to 0. + */ + +#include <linux/time.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/timer.h> +#include <linux/seqlock.h> +#include <linux/jiffies.h> +#include <linux/sysctl.h> + +#include <asm/vsyscall.h> +#include <asm/pgtable.h> +#include <asm/page.h> +#include <asm/fixmap.h> +#include <asm/errno.h> +#include <asm/io.h> + +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) +#define force_inline __attribute__((always_inline)) inline + +int __sysctl_vsyscall __section_sysctl_vsyscall = 1; +seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; + +#include <asm/unistd.h> + +static force_inline void timeval_normalize(struct timeval * tv) +{ + time_t __sec; + + __sec = tv->tv_usec / 1000000; + if (__sec) { + tv->tv_usec %= 1000000; + tv->tv_sec += __sec; + } +} + +static force_inline void do_vgettimeofday(struct timeval * tv) +{ + long sequence, t; + unsigned long sec, usec; + + do { + sequence = read_seqbegin(&__xtime_lock); + + sec = __xtime.tv_sec; + usec = (__xtime.tv_nsec / 1000) + + (__jiffies - __wall_jiffies) * (1000000 / HZ); + + if (__vxtime.mode == VXTIME_TSC) { + sync_core(); + rdtscll(t); + if (t < __vxtime.last_tsc) + t = __vxtime.last_tsc; + usec += ((t - __vxtime.last_tsc) * + __vxtime.tsc_quot) >> 32; + /* See comment in x86_64 do_gettimeofday. */ + } else { + usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - + __vxtime.last) * __vxtime.quot) >> 32; + } + } while (read_seqretry(&__xtime_lock, sequence)); + + tv->tv_sec = sec + usec / 1000000; + tv->tv_usec = usec % 1000000; +} + +/* RED-PEN may want to readd seq locking, but then the variable should be write-once. 
*/ +static force_inline void do_get_tz(struct timezone * tz) +{ + *tz = __sys_tz; +} + +static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz) +{ + int ret; + asm volatile("vsysc2: syscall" + : "=a" (ret) + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); + return ret; +} + +static force_inline long time_syscall(long *t) +{ + long secs; + asm volatile("vsysc1: syscall" + : "=a" (secs) + : "0" (__NR_time),"D" (t) : __syscall_clobber); + return secs; +} + +static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) +{ + if (unlikely(!__sysctl_vsyscall)) + return gettimeofday(tv,tz); + if (tv) + do_vgettimeofday(tv); + if (tz) + do_get_tz(tz); + return 0; +} + +/* This will break when the xtime seconds get inaccurate, but that is + * unlikely */ +static time_t __vsyscall(1) vtime(time_t *t) +{ + if (unlikely(!__sysctl_vsyscall)) + return time_syscall(t); + else if (t) + *t = __xtime.tv_sec; + return __xtime.tv_sec; +} + +static long __vsyscall(2) venosys_0(void) +{ + return -ENOSYS; +} + +static long __vsyscall(3) venosys_1(void) +{ + return -ENOSYS; +} + +#ifdef CONFIG_SYSCTL + +#define SYSCALL 0x050f +#define NOP2 0x9090 + +/* + * NOP out syscall in vsyscall page when not needed. + */ +static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + extern u16 vsysc1, vsysc2; + u16 *map1, *map2; + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + if (!write) + return ret; + /* gcc has some trouble with __va(__pa()), so just do it this + way. */ + map1 = ioremap(__pa_symbol(&vsysc1), 2); + if (!map1) + return -ENOMEM; + map2 = ioremap(__pa_symbol(&vsysc2), 2); + if (!map2) { + ret = -ENOMEM; + goto out; + } + if (!sysctl_vsyscall) { + *map1 = SYSCALL; + *map2 = SYSCALL; + } else { + *map1 = NOP2; + *map2 = NOP2; + } + iounmap(map2); +out: + iounmap(map1); + return ret; +} + +static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + return -ENOSYS; +} + +static ctl_table kernel_table2[] = { + { .ctl_name = 99, .procname = "vsyscall64", + .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644, + .strategy = vsyscall_sysctl_nostrat, + .proc_handler = vsyscall_sysctl_change }, + { 0, } +}; + +static ctl_table kernel_root_table2[] = { + { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, + .child = kernel_table2 }, + { 0 }, +}; + +#endif + +static void __init map_vsyscall(void) +{ + extern char __vsyscall_0; + unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); + + __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); +} + +static int __init vsyscall_init(void) +{ + BUG_ON(((unsigned long) &vgettimeofday != + VSYSCALL_ADDR(__NR_vgettimeofday))); + BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); + BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); + map_vsyscall(); + sysctl_vsyscall = 1; + register_sysctl_table(kernel_root_table2, 0); + return 0; +} + +__initcall(vsyscall_init); diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c new file mode 100644 index 00000000000..88626e62688 --- /dev/null +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -0,0 +1,221 @@ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/user.h> +#include <linux/sched.h> +#include <linux/in6.h> +#include <linux/interrupt.h> 
+#include <linux/smp_lock.h> +#include <linux/pm.h> +#include <linux/pci.h> +#include <linux/apm_bios.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/syscalls.h> +#include <linux/tty.h> +#include <linux/ioctl32.h> + +#include <asm/semaphore.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/uaccess.h> +#include <asm/checksum.h> +#include <asm/io.h> +#include <asm/delay.h> +#include <asm/irq.h> +#include <asm/mmx.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/nmi.h> +#include <asm/kdebug.h> +#include <asm/unistd.h> +#include <asm/tlbflush.h> +#include <asm/kdebug.h> + +extern spinlock_t rtc_lock; + +#ifdef CONFIG_SMP +extern void __write_lock_failed(rwlock_t *rw); +extern void __read_lock_failed(rwlock_t *rw); +#endif + +#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) +extern struct drive_info_struct drive_info; +EXPORT_SYMBOL(drive_info); +#endif + +extern unsigned long get_cmos_time(void); + +/* platform dependent support */ +EXPORT_SYMBOL(boot_cpu_data); +//EXPORT_SYMBOL(dump_fpu); +EXPORT_SYMBOL(__ioremap); +EXPORT_SYMBOL(ioremap_nocache); +EXPORT_SYMBOL(iounmap); +EXPORT_SYMBOL(enable_irq); +EXPORT_SYMBOL(disable_irq); +EXPORT_SYMBOL(disable_irq_nosync); +EXPORT_SYMBOL(probe_irq_mask); +EXPORT_SYMBOL(kernel_thread); +EXPORT_SYMBOL(pm_idle); +EXPORT_SYMBOL(pm_power_off); +EXPORT_SYMBOL(get_cmos_time); + +EXPORT_SYMBOL(__down_failed); +EXPORT_SYMBOL(__down_failed_interruptible); +EXPORT_SYMBOL(__down_failed_trylock); +EXPORT_SYMBOL(__up_wakeup); +/* Networking helper routines. */ +EXPORT_SYMBOL(csum_partial_copy_nocheck); +EXPORT_SYMBOL(ip_compute_csum); +/* Delay loops */ +EXPORT_SYMBOL(__udelay); +EXPORT_SYMBOL(__ndelay); +EXPORT_SYMBOL(__delay); +EXPORT_SYMBOL(__const_udelay); + +EXPORT_SYMBOL(__get_user_1); +EXPORT_SYMBOL(__get_user_2); +EXPORT_SYMBOL(__get_user_4); +EXPORT_SYMBOL(__get_user_8); +EXPORT_SYMBOL(__put_user_1); +EXPORT_SYMBOL(__put_user_2); +EXPORT_SYMBOL(__put_user_4); +EXPORT_SYMBOL(__put_user_8); + +EXPORT_SYMBOL(strpbrk); +EXPORT_SYMBOL(strstr); + +EXPORT_SYMBOL(strncpy_from_user); +EXPORT_SYMBOL(__strncpy_from_user); +EXPORT_SYMBOL(clear_user); +EXPORT_SYMBOL(__clear_user); +EXPORT_SYMBOL(copy_user_generic); +EXPORT_SYMBOL(copy_from_user); +EXPORT_SYMBOL(copy_to_user); +EXPORT_SYMBOL(copy_in_user); +EXPORT_SYMBOL(strnlen_user); + +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_alloc_consistent); +EXPORT_SYMBOL(pci_free_consistent); +#endif + +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif + +EXPORT_SYMBOL(copy_page); +EXPORT_SYMBOL(clear_page); + +EXPORT_SYMBOL(cpu_pda); +#ifdef CONFIG_SMP +EXPORT_SYMBOL(cpu_data); +EXPORT_SYMBOL(cpu_online_map); +EXPORT_SYMBOL(__write_lock_failed); +EXPORT_SYMBOL(__read_lock_failed); + +EXPORT_SYMBOL(synchronize_irq); +EXPORT_SYMBOL(smp_call_function); +EXPORT_SYMBOL(cpu_callout_map); +#endif + +#ifdef CONFIG_VT +EXPORT_SYMBOL(screen_info); +#endif + +EXPORT_SYMBOL(get_wchan); + +EXPORT_SYMBOL(rtc_lock); + +EXPORT_SYMBOL_GPL(set_nmi_callback); +EXPORT_SYMBOL_GPL(unset_nmi_callback); + +/* Export string functions. We normally rely on gcc builtin for most of these, + but gcc sometimes decides not to inline them. 
*/ +#undef memcpy +#undef memset +#undef memmove +#undef memchr +#undef strlen +#undef strcpy +#undef strncmp +#undef strncpy +#undef strchr +#undef strcmp +#undef strcpy +#undef strcat +#undef memcmp + +extern void * memset(void *,int,__kernel_size_t); +extern size_t strlen(const char *); +extern void * memmove(void * dest,const void *src,size_t count); +extern char * strcpy(char * dest,const char *src); +extern int strcmp(const char * cs,const char * ct); +extern void *memchr(const void *s, int c, size_t n); +extern void * memcpy(void *,const void *,__kernel_size_t); +extern void * __memcpy(void *,const void *,__kernel_size_t); +extern char * strcat(char *, const char *); +extern int memcmp(const void * cs,const void * ct,size_t count); + +EXPORT_SYMBOL(memset); +EXPORT_SYMBOL(strlen); +EXPORT_SYMBOL(memmove); +EXPORT_SYMBOL(strcpy); +EXPORT_SYMBOL(strncmp); +EXPORT_SYMBOL(strncpy); +EXPORT_SYMBOL(strchr); +EXPORT_SYMBOL(strcmp); +EXPORT_SYMBOL(strcat); +EXPORT_SYMBOL(strncat); +EXPORT_SYMBOL(memchr); +EXPORT_SYMBOL(strrchr); +EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(memscan); +EXPORT_SYMBOL(memcpy); +EXPORT_SYMBOL(__memcpy); +EXPORT_SYMBOL(memcmp); + +#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM +/* prototypes are wrong, these are assembly with custom calling functions */ +extern void rwsem_down_read_failed_thunk(void); +extern void rwsem_wake_thunk(void); +extern void rwsem_downgrade_thunk(void); +extern void rwsem_down_write_failed_thunk(void); +EXPORT_SYMBOL(rwsem_down_read_failed_thunk); +EXPORT_SYMBOL(rwsem_wake_thunk); +EXPORT_SYMBOL(rwsem_downgrade_thunk); +EXPORT_SYMBOL(rwsem_down_write_failed_thunk); +#endif + +EXPORT_SYMBOL(empty_zero_page); + +#ifdef CONFIG_HAVE_DEC_LOCK +EXPORT_SYMBOL(_atomic_dec_and_lock); +#endif + +EXPORT_SYMBOL(die_chain); +EXPORT_SYMBOL(register_die_notifier); + +#ifdef CONFIG_SMP +EXPORT_SYMBOL(cpu_sibling_map); +EXPORT_SYMBOL(smp_num_siblings); +#endif + +extern void do_softirq_thunk(void); +EXPORT_SYMBOL(do_softirq_thunk); + +void out_of_line_bug(void); +EXPORT_SYMBOL(out_of_line_bug); + +EXPORT_SYMBOL(init_level4_pgt); + +extern unsigned long __supported_pte_mask; +EXPORT_SYMBOL(__supported_pte_mask); + +#ifdef CONFIG_SMP +EXPORT_SYMBOL(flush_tlb_page); +#endif + +EXPORT_SYMBOL(cpu_khz); |
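
A note on the vsyscall mechanism added above: the comment at the top of arch/x86_64/kernel/vsyscall.c describes user-visible entry points at fixed virtual addresses (slot 0 at -10 MByte, each further slot 1024 bytes higher). The fragment below is a minimal userspace sketch of how such an entry point could be called directly on a kernel of this vintage; it is illustrative only. The base address 0xffffffffff600000 is -10 MByte in the 64-bit address space, and VSYSCALL_BASE, VSYSCALL_SLOT and the typedef names are invented for the example, not identifiers from the tree above. On later kernels the legacy vsyscall page may be emulated or absent, so this may not work as shown.

/* Userspace sketch: call the fixed-address vsyscall slots directly.
 * Assumes slot 0 = vgettimeofday() and slot 1 = vtime(), per the
 * __vsyscall(nr) section markers in vsyscall.c above. */
#include <stdio.h>
#include <sys/time.h>
#include <time.h>

#define VSYSCALL_BASE 0xffffffffff600000UL /* -10 MByte; illustrative name */
#define VSYSCALL_SLOT(nr) (VSYSCALL_BASE + (nr) * 1024UL)

typedef int (*vgtod_fn)(struct timeval *, struct timezone *);
typedef time_t (*vtime_fn)(time_t *);

int main(void)
{
	vgtod_fn vgettimeofday = (vgtod_fn) VSYSCALL_SLOT(0);
	vtime_fn vtime = (vtime_fn) VSYSCALL_SLOT(1);
	struct timeval tv;

	if (vgettimeofday(&tv, NULL) == 0)
		printf("gettimeofday: %ld.%06ld\n",
		       (long) tv.tv_sec, (long) tv.tv_usec);
	printf("time: %ld\n", (long) vtime(NULL));
	return 0;
}

Because user space always jumps to the same fixed addresses, the kernel can patch the page in place (see vsyscall_sysctl_change above, which swaps the syscall instruction for NOPs and back) without any cooperation from callers.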