diff options
Diffstat (limited to 'arch/x86')
136 files changed, 5474 insertions, 3685 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bb0c0d0f6db..96e0c2ebc38 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -23,6 +23,8 @@ config X86 select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_DYNAMIC_FTRACE + select HAVE_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER @@ -168,6 +170,7 @@ config GENERIC_PENDING_IRQ config X86_SMP bool depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64) + select USE_GENERIC_SMP_HELPERS default y config X86_32_SMP @@ -181,12 +184,12 @@ config X86_64_SMP config X86_HT bool depends on SMP - depends on (X86_32 && !(X86_VISWS || X86_VOYAGER)) || X86_64 + depends on (X86_32 && !X86_VOYAGER) || X86_64 default y config X86_BIOS_REBOOT bool - depends on !X86_VISWS && !X86_VOYAGER + depends on !X86_VOYAGER default y config X86_TRAMPOLINE @@ -232,13 +235,13 @@ config SMP config X86_FIND_SMP_CONFIG def_bool y - depends on X86_MPPARSE || X86_VOYAGER || X86_VISWS + depends on X86_MPPARSE || X86_VOYAGER if ACPI config X86_MPPARSE def_bool y bool "Enable MPS table" - depends on X86_LOCAL_APIC && !X86_VISWS + depends on X86_LOCAL_APIC help For old smp systems that do not have proper acpi support. Newer systems (esp with 64bit cpus) with acpi support, MADT and DSDT will override it @@ -247,7 +250,7 @@ endif if !ACPI config X86_MPPARSE def_bool y - depends on X86_LOCAL_APIC && !X86_VISWS + depends on X86_LOCAL_APIC endif choice @@ -281,18 +284,6 @@ config X86_VOYAGER If you do not specifically know you have a Voyager based machine, say N here, otherwise the kernel you build will not be bootable. -config X86_VISWS - bool "SGI 320/540 (Visual Workstation)" - depends on X86_32 && !PCI - help - The SGI Visual Workstation series is an IA32-based workstation - based on SGI systems chips with some legacy PC hardware attached. - - Say Y here to create a kernel to run on the SGI 320 or 540. - - A kernel compiled for the Visual Workstation will not run on PCs - and vice versa. See <file:Documentation/sgi-visws.txt> for details. - config X86_GENERICARCH bool "Generic architecture" depends on X86_32 @@ -355,7 +346,7 @@ config X86_RDC321X config X86_VSMP bool "Support for ScaleMP vSMP" select PARAVIRT - depends on X86_64 && !PCI + depends on X86_64 && PCI help Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is supposed to run on these EM64T-based machines. Only choose this option @@ -363,6 +354,18 @@ config X86_VSMP endchoice +config X86_VISWS + bool "SGI 320/540 (Visual Workstation)" + depends on X86_32 && PCI && !X86_VOYAGER && X86_MPPARSE && PCI_GODIRECT + help + The SGI Visual Workstation series is an IA32-based workstation + based on SGI systems chips with some legacy PC hardware attached. + + Say Y here to create a kernel to run on the SGI 320 or 540. + + A kernel compiled for the Visual Workstation will run on general + PCs as well. See <file:Documentation/sgi-visws.txt> for details. + config SCHED_NO_NO_OMIT_FRAME_POINTER def_bool y prompt "Single-depth WCHAN output" @@ -391,7 +394,7 @@ config VMI bool "VMI Guest support" select PARAVIRT depends on X86_32 - depends on !(X86_VISWS || X86_VOYAGER) + depends on !X86_VOYAGER help VMI provides a paravirtualized interface to the VMware ESX server (it could be used by other hypervisors in theory too, but is not @@ -402,7 +405,7 @@ config KVM_CLOCK bool "KVM paravirtualized clock" select PARAVIRT select PARAVIRT_CLOCK - depends on !(X86_VISWS || X86_VOYAGER) + depends on !X86_VOYAGER help Turning on this option will allow you to run a paravirtualized clock when running over the KVM hypervisor. Instead of relying on a PIT @@ -413,7 +416,7 @@ config KVM_CLOCK config KVM_GUEST bool "KVM Guest support" select PARAVIRT - depends on !(X86_VISWS || X86_VOYAGER) + depends on !X86_VOYAGER help This option enables various optimizations for running under the KVM hypervisor. @@ -422,7 +425,7 @@ source "arch/x86/lguest/Kconfig" config PARAVIRT bool "Enable paravirtualization code" - depends on !(X86_VISWS || X86_VOYAGER) + depends on !X86_VOYAGER help This changes the kernel so it can modify itself when it is run under a hypervisor, potentially improving performance significantly @@ -445,7 +448,6 @@ config PARAVIRT_DEBUG config MEMTEST bool "Memtest" depends on X86_64 - default y help This option adds a kernel parameter 'memtest', which allows memtest to be set. @@ -453,7 +455,7 @@ config MEMTEST memtest=1, mean do 1 test pattern; ... memtest=4, mean do 4 test patterns. - If you are unsure how to answer this question, answer Y. + If you are unsure how to answer this question, answer N. config X86_SUMMIT_NUMA def_bool y @@ -575,7 +577,7 @@ config SWIOTLB 3 GB of memory. If unsure, say Y. config IOMMU_HELPER - def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB) + def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU) config MAXSMP bool "Configure Maximum number of SMP Processors and NUMA Nodes" depends on X86_64 && SMP @@ -628,7 +630,7 @@ source "kernel/Kconfig.preempt" config X86_UP_APIC bool "Local APIC support on uniprocessors" - depends on X86_32 && !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH) + depends on X86_32 && !SMP && !(X86_VOYAGER || X86_GENERICARCH) help A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. If you have a single-CPU @@ -653,11 +655,11 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y - depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH)) + depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) config X86_IO_APIC def_bool y - depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH)) + depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) config X86_VISWS_APIC def_bool y @@ -711,7 +713,7 @@ config X86_MCE_NONFATAL config X86_MCE_P4THERMAL bool "check for P4 thermal throttling interrupt." - depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) && !X86_VISWS + depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) help Enabling this feature will cause a message to be printed when the P4 enters thermal throttling. @@ -1133,21 +1135,18 @@ config MTRR See <file:Documentation/mtrr.txt> for more information. config MTRR_SANITIZER - def_bool y + bool prompt "MTRR cleanup support" depends on MTRR help - Convert MTRR layout from continuous to discrete, so some X driver - could add WB entries. - - Say N here if you see bootup problems (boot crash, boot hang, - spontaneous reboots). + Convert MTRR layout from continuous to discrete, so X drivers can + add writeback entries. - Could be disabled with disable_mtrr_cleanup. Also mtrr_chunk_size - could be used to send largest mtrr entry size for continuous block - to hold holes (aka. UC entries) + Can be disabled with disable_mtrr_cleanup on the kernel command line. + The largest mtrr entry size for a continous block can be set with + mtrr_chunk_size. - If unsure, say Y. + If unsure, say N. config MTRR_SANITIZER_ENABLE_DEFAULT int "MTRR cleanup enable value (0-1)" @@ -1164,7 +1163,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT depends on MTRR_SANITIZER help mtrr cleanup spare entries default, it can be changed via - mtrr_spare_reg_nr= + mtrr_spare_reg_nr=N on the kernel command line. config X86_PAT bool @@ -1414,7 +1413,7 @@ config X86_APM_BOOT menuconfig APM tristate "APM (Advanced Power Management) BIOS support" - depends on X86_32 && PM_SLEEP && !X86_VISWS + depends on X86_32 && PM_SLEEP ---help--- APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with @@ -1561,7 +1560,7 @@ config PCI choice prompt "PCI access mode" - depends on X86_32 && PCI && !X86_VISWS + depends on X86_32 && PCI default PCI_GOANY ---help--- On PCI systems, the BIOS can be used to detect the PCI devices and @@ -1598,12 +1597,12 @@ endchoice config PCI_BIOS def_bool y - depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) + depends on X86_32 && PCI && (PCI_GOBIOS || PCI_GOANY) # x86-64 doesn't support PCI BIOS access from long mode so always go direct. config PCI_DIRECT def_bool y - depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC) || X86_VISWS) + depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC)) config PCI_MMCONFIG def_bool y @@ -1663,7 +1662,7 @@ if X86_32 config ISA bool "ISA support" - depends on !(X86_VOYAGER || X86_VISWS) + depends on !X86_VOYAGER help Find out whether you have ISA slots on your motherboard. ISA is the name of a bus system, i.e. the way the CPU talks to the other stuff @@ -1690,7 +1689,7 @@ config EISA source "drivers/eisa/Kconfig" config MCA - bool "MCA support" if !(X86_VISWS || X86_VOYAGER) + bool "MCA support" if !X86_VOYAGER default y if X86_VOYAGER help MicroChannel Architecture is found in some IBM PS/2 machines and diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 3d22bb8175b..abff1b84ed5 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -344,7 +344,7 @@ config X86_F00F_BUG config X86_WP_WORKS_OK def_bool y - depends on X86_32 && !M386 + depends on !M386 config X86_INVLPG def_bool y diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index acc0271920f..ae36bfa814e 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -171,6 +171,33 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. +config MMIOTRACE_HOOKS + bool + +config MMIOTRACE + bool "Memory mapped IO tracing" + depends on DEBUG_KERNEL && PCI + select TRACING + select MMIOTRACE_HOOKS + help + Mmiotrace traces Memory Mapped I/O access and is meant for + debugging and reverse engineering. It is called from the ioremap + implementation and works via page faults. Tracing is disabled by + default and can be enabled at run-time. + + See Documentation/tracers/mmiotrace.txt. + If you are not helping to develop drivers, say N. + +config MMIOTRACE_TEST + tristate "Test module for mmiotrace" + depends on MMIOTRACE && m + help + This is a dumb module for testing mmiotrace. It is very dangerous + as it will write garbage to IO memory starting at a given address. + However, it should be safe to use on e.g. unused portion of VRAM. + + Say N, unless you absolutely know what you are doing. + # # IO delay types: # diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b03d24b44bf..919ce21ea65 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -113,10 +113,6 @@ mcore-y := arch/x86/mach-default/ mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/ -# VISWS subarch support -mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-x86/mach-visws -mcore-$(CONFIG_X86_VISWS) := arch/x86/mach-visws/ - # generic subarchitecture mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 24e4d4928d6..20371d0635e 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -116,7 +116,7 @@ ENTRY(ia32_sysenter_target) pushfq CFI_ADJUST_CFA_OFFSET 8 /*CFI_REL_OFFSET rflags,0*/ - movl 8*3-THREAD_SIZE+threadinfo_sysenter_return(%rsp), %r10d + movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d CFI_REGISTER rip,r10 pushq $__USER32_CS CFI_ADJUST_CFA_OFFSET 8 @@ -136,8 +136,9 @@ ENTRY(ia32_sysenter_target) .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - orl $TS_COMPAT,threadinfo_status(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) + orl $TS_COMPAT,TI_status(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ + TI_flags(%r10) CFI_REMEMBER_STATE jnz sysenter_tracesys sysenter_do_call: @@ -149,9 +150,9 @@ sysenter_do_call: GET_THREAD_INFO(%r10) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) + testl $_TIF_ALLWORK_MASK,TI_flags(%r10) jnz int_ret_from_sys_call - andl $~TS_COMPAT,threadinfo_status(%r10) + andl $~TS_COMPAT,TI_status(%r10) /* clear IF, that popfq doesn't enable interrupts early */ andl $~0x200,EFLAGS-R11(%rsp) movl RIP-R11(%rsp),%edx /* User %eip */ @@ -240,8 +241,9 @@ ENTRY(ia32_cstar_target) .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - orl $TS_COMPAT,threadinfo_status(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) + orl $TS_COMPAT,TI_status(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ + TI_flags(%r10) CFI_REMEMBER_STATE jnz cstar_tracesys cstar_do_call: @@ -253,9 +255,9 @@ cstar_do_call: GET_THREAD_INFO(%r10) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) + testl $_TIF_ALLWORK_MASK,TI_flags(%r10) jnz int_ret_from_sys_call - andl $~TS_COMPAT,threadinfo_status(%r10) + andl $~TS_COMPAT,TI_status(%r10) RESTORE_ARGS 1,-ARG_SKIP,1,1,1 movl RIP-ARGOFFSET(%rsp),%ecx CFI_REGISTER rip,rcx @@ -333,8 +335,9 @@ ENTRY(ia32_syscall) this could be a problem. */ SAVE_ARGS 0,0,1 GET_THREAD_INFO(%r10) - orl $TS_COMPAT,threadinfo_status(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) + orl $TS_COMPAT,TI_status(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ + TI_flags(%r10) jnz ia32_tracesys ia32_do_syscall: cmpl $(IA32_NR_syscalls-1),%eax diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 54829e2b516..da140611bb5 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -6,6 +6,12 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) +ifdef CONFIG_FTRACE +# Do not profile debug utilities +CFLAGS_REMOVE_tsc.o = -pg +CFLAGS_REMOVE_rtc.o = -pg +endif + # # vsyscalls (which work on the user stack) should have # no stack-protector checks: @@ -13,12 +19,13 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) CFLAGS_hpet.o := $(nostackp) -CFLAGS_tsc_64.o := $(nostackp) +CFLAGS_tsc.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o +obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o @@ -26,7 +33,7 @@ obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o -obj-y += tsc_$(BITS).o io_delay.o rtc.o +obj-y += tsc.o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o @@ -56,6 +63,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 5c0107602b6..f489d7a9be9 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -37,6 +37,7 @@ #include <asm/pgtable.h> #include <asm/io_apic.h> #include <asm/apic.h> +#include <asm/genapic.h> #include <asm/io.h> #include <asm/mpspec.h> #include <asm/smp.h> @@ -83,8 +84,6 @@ int acpi_lapic; int acpi_ioapic; int acpi_strict; -static int disable_irq0_through_ioapic __initdata; - u8 acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; int acpi_skip_timer_override __initdata; @@ -108,21 +107,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; */ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; -#ifdef CONFIG_X86_64 - -/* rely on all ACPI tables being in the direct mapping */ -char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size) -{ - if (!phys_addr || !size) - return NULL; - - if (phys_addr+size <= (max_pfn_mapped << PAGE_SHIFT) + PAGE_SIZE) - return __va(phys_addr); - - return NULL; -} - -#else /* * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, @@ -141,11 +125,15 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) unsigned long base, offset, mapped_size; int idx; - if (phys + size < 8 * 1024 * 1024) + if (!phys || !size) + return NULL; + + if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT)) return __va(phys); offset = phys & (PAGE_SIZE - 1); mapped_size = PAGE_SIZE - offset; + clear_fixmap(FIX_ACPI_END); set_fixmap(FIX_ACPI_END, phys); base = fix_to_virt(FIX_ACPI_END); @@ -157,13 +145,13 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) if (--idx < FIX_ACPI_BEGIN) return NULL; /* cannot handle this */ phys += PAGE_SIZE; + clear_fixmap(idx); set_fixmap(idx, phys); mapped_size += PAGE_SIZE; } return ((unsigned char *)base + offset); } -#endif #ifdef CONFIG_PCI_MMCONFIG /* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ @@ -992,10 +980,6 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) int pin; struct mp_config_intsrc mp_irq; - /* Skip the 8254 timer interrupt (IRQ 0) if requested. */ - if (bus_irq == 0 && disable_irq0_through_ioapic) - return; - /* * Convert 'gsi' to 'ioapic.pin'. */ @@ -1062,10 +1046,6 @@ void __init mp_config_acpi_legacy_irqs(void) for (i = 0; i < 16; i++) { int idx; - /* Skip the 8254 timer interrupt (IRQ 0) if requested. */ - if (i == 0 && disable_irq0_through_ioapic) - continue; - for (idx = 0; idx < mp_irq_entries; idx++) { struct mp_config_intsrc *irq = mp_irqs + idx; @@ -1373,8 +1353,6 @@ static void __init acpi_process_madt(void) return; } -#ifdef __i386__ - static int __init disable_acpi_irq(const struct dmi_system_id *d) { if (!acpi_force) { @@ -1425,13 +1403,12 @@ static int __init force_acpi_ht(const struct dmi_system_id *d) } /* - * Don't register any I/O APIC entries for the 8254 timer IRQ. + * Force ignoring BIOS IRQ0 pin2 override */ -static int __init -dmi_disable_irq0_through_ioapic(const struct dmi_system_id *d) +static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) { - pr_notice("%s detected: disabling IRQ 0 through I/O APIC\n", d->ident); - disable_irq0_through_ioapic = 1; + pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident); + acpi_skip_timer_override = 1; return 0; } @@ -1609,11 +1586,11 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { * is enabled. This input is incorrectly designated the * ISA IRQ 0 via an interrupt source override even though * it is wired to the output of the master 8259A and INTIN0 - * is not connected at all. Abandon any attempts to route - * IRQ 0 through the I/O APIC therefore. + * is not connected at all. Force ignoring BIOS IRQ0 pin2 + * override in that cases. */ { - .callback = dmi_disable_irq0_through_ioapic, + .callback = dmi_ignore_irq0_timer_override, .ident = "HP NX6125 laptop", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), @@ -1621,7 +1598,7 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { }, }, { - .callback = dmi_disable_irq0_through_ioapic, + .callback = dmi_ignore_irq0_timer_override, .ident = "HP NX6325 laptop", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), @@ -1631,8 +1608,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { {} }; -#endif /* __i386__ */ - /* * acpi_boot_table_init() and acpi_boot_init() * called from setup_arch(), always. @@ -1660,9 +1635,7 @@ int __init acpi_boot_table_init(void) { int error; -#ifdef __i386__ dmi_check_system(acpi_dmi_table); -#endif /* * If acpi_disabled, bail out diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index de2d2e4ebad..7c074eec39f 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -56,6 +56,12 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_ACPI)) buf[2] |= ACPI_PDC_T_FFH; + /* + * If mwait/monitor is unsupported, C2/C3_FFH will be disabled + */ + if (!cpu_has(c, X86_FEATURE_MWAIT)) + buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); + obj->type = ACPI_TYPE_BUFFER; obj->buffer.length = 12; obj->buffer.pointer = (u8 *) buf; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index e6a4b564cca..868de3d5c39 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -23,6 +23,15 @@ static unsigned long acpi_realmode; static char temp_stack[10240]; #endif +/* XXX: this macro should move to asm-x86/segment.h and be shared with the + boot code... */ +#define GDT_ENTRY(flags, base, limit) \ + (((u64)(base & 0xff000000) << 32) | \ + ((u64)flags << 40) | \ + ((u64)(limit & 0x00ff0000) << 32) | \ + ((u64)(base & 0x00ffffff) << 16) | \ + ((u64)(limit & 0x0000ffff))) + /** * acpi_save_state_mem - save kernel state * @@ -51,18 +60,27 @@ int acpi_save_state_mem(void) header->video_mode = saved_video_mode; header->wakeup_jmp_seg = acpi_wakeup_address >> 4; + + /* + * Set up the wakeup GDT. We set these up as Big Real Mode, + * that is, with limits set to 4 GB. At least the Lenovo + * Thinkpad X61 is known to need this for the video BIOS + * initialization quirk to work; this is likely to also + * be the case for other laptops or integrated video devices. + */ + /* GDT[0]: GDT self-pointer */ header->wakeup_gdt[0] = (u64)(sizeof(header->wakeup_gdt) - 1) + ((u64)(acpi_wakeup_address + ((char *)&header->wakeup_gdt - (char *)acpi_realmode)) << 16); - /* GDT[1]: real-mode-like code segment */ - header->wakeup_gdt[1] = (0x009bULL << 40) + - ((u64)acpi_wakeup_address << 16) + 0xffff; - /* GDT[2]: real-mode-like data segment */ - header->wakeup_gdt[2] = (0x0093ULL << 40) + - ((u64)acpi_wakeup_address << 16) + 0xffff; + /* GDT[1]: big real mode-like code segment */ + header->wakeup_gdt[1] = + GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); + /* GDT[2]: big real mode-like data segment */ + header->wakeup_gdt[2] = + GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff); #ifndef CONFIG_64BIT store_gdt((struct desc_ptr *)&header->pmode_gdt); @@ -140,6 +158,8 @@ static int __init acpi_sleep_setup(char *str) acpi_realmode_flags |= 2; if (strncmp(str, "s3_beep", 7) == 0) acpi_realmode_flags |= 4; + if (strncmp(str, "old_ordering", 12) == 0) + acpi_old_suspend_ordering(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 65c7857a90d..2763cb37b55 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,6 +1,6 @@ #include <linux/module.h> #include <linux/sched.h> -#include <linux/spinlock.h> +#include <linux/mutex.h> #include <linux/list.h> #include <linux/kprobes.h> #include <linux/mm.h> @@ -143,7 +143,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { #ifdef CONFIG_X86_64 extern char __vsyscall_0; -static inline const unsigned char*const * find_nop_table(void) +const unsigned char *const *find_nop_table(void) { return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; @@ -162,7 +162,7 @@ static const struct nop { { -1, NULL } }; -static const unsigned char*const * find_nop_table(void) +const unsigned char *const *find_nop_table(void) { const unsigned char *const *noptable = intel_nops; int i; @@ -279,7 +279,7 @@ struct smp_alt_module { struct list_head next; }; static LIST_HEAD(smp_alt_modules); -static DEFINE_SPINLOCK(smp_alt); +static DEFINE_MUTEX(smp_alt); static int smp_mode = 1; /* protected by smp_alt */ void alternatives_smp_module_add(struct module *mod, char *name, @@ -312,12 +312,12 @@ void alternatives_smp_module_add(struct module *mod, char *name, __func__, smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); - spin_lock(&smp_alt); + mutex_lock(&smp_alt); list_add_tail(&smp->next, &smp_alt_modules); if (boot_cpu_has(X86_FEATURE_UP)) alternatives_smp_unlock(smp->locks, smp->locks_end, smp->text, smp->text_end); - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); } void alternatives_smp_module_del(struct module *mod) @@ -327,17 +327,17 @@ void alternatives_smp_module_del(struct module *mod) if (smp_alt_once || noreplace_smp) return; - spin_lock(&smp_alt); + mutex_lock(&smp_alt); list_for_each_entry(item, &smp_alt_modules, next) { if (mod != item->mod) continue; list_del(&item->next); - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); DPRINTK("%s: %s\n", __func__, item->name); kfree(item); return; } - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); } void alternatives_smp_switch(int smp) @@ -359,7 +359,7 @@ void alternatives_smp_switch(int smp) return; BUG_ON(!smp && (num_online_cpus() > 1)); - spin_lock(&smp_alt); + mutex_lock(&smp_alt); /* * Avoid unnecessary switches because it forces JIT based VMs to @@ -383,7 +383,7 @@ void alternatives_smp_switch(int smp) mod->text, mod->text_end); } smp_mode = smp; - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); } #endif diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 6dea8306d8c..a437d027f20 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -82,6 +82,11 @@ int pic_mode; /* Have we found an MP table */ int smp_found_config; +static struct resource lapic_resource = { + .name = "Local APIC", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + static unsigned int calibration_result; static int lapic_next_event(unsigned long delta, @@ -969,7 +974,7 @@ void __cpuinit setup_local_APIC(void) * Double-check whether this APIC is really registered. */ if (!apic_id_registered()) - BUG(); + WARN_ON_ONCE(1); /* * Intel recommends to set DFR, LDR and TPR before enabling @@ -1335,6 +1340,10 @@ void __init smp_intr_init(void) /* IPI for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* IPI for single call function */ + set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); } #endif @@ -1720,3 +1729,21 @@ static int __init apic_set_verbosity(char *str) } __setup("apic=", apic_set_verbosity); +static int __init lapic_insert_resource(void) +{ + if (!apic_phys) + return -1; + + /* Put local APIC into the resource map. */ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + + return 0; +} + +/* + * need call insert after e820_reserve_resources() + * that is using request_resource + */ +late_initcall(lapic_insert_resource); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 00e6d137095..bf9b441331e 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -204,6 +204,7 @@ #include <linux/module.h> #include <linux/poll.h> +#include <linux/smp_lock.h> #include <linux/types.h> #include <linux/stddef.h> #include <linux/timer.h> @@ -1212,9 +1213,9 @@ static int suspend(int vetoable) if (err != APM_SUCCESS) apm_error("suspend", err); err = (err == APM_SUCCESS) ? 0 : -EIO; - device_power_up(); + device_power_up(PMSG_RESUME); local_irq_enable(); - device_resume(); + device_resume(PMSG_RESUME); queue_event(APM_NORMAL_RESUME, NULL); spin_lock(&user_list_lock); for (as = user_list; as != NULL; as = as->next) { @@ -1239,7 +1240,7 @@ static void standby(void) apm_error("standby", err); local_irq_disable(); - device_power_up(); + device_power_up(PMSG_RESUME); local_irq_enable(); } @@ -1325,7 +1326,7 @@ static void check_events(void) ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - device_resume(); + device_resume(PMSG_RESUME); queue_event(event, NULL); } ignore_normal_resume = 0; @@ -1549,10 +1550,12 @@ static int do_open(struct inode *inode, struct file *filp) { struct apm_user *as; + lock_kernel(); as = kmalloc(sizeof(*as), GFP_KERNEL); if (as == NULL) { printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", sizeof(*as)); + unlock_kernel(); return -ENOMEM; } as->magic = APM_BIOS_MAGIC; @@ -1574,6 +1577,7 @@ static int do_open(struct inode *inode, struct file *filp) user_list = as; spin_unlock(&user_list_lock); filp->private_data = as; + unlock_kernel(); return 0; } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 3295e7c08fe..bacf5deeec2 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -34,7 +34,7 @@ int main(void) ENTRY(pid); BLANK(); #undef ENTRY -#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) +#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry)) ENTRY(flags); ENTRY(addr_limit); ENTRY(preempt_count); diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 958526d6a74..7c36fb8a28d 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -199,10 +199,15 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * Don't do it for gbpages because there seems very little * benefit in doing so. */ - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) && - (tseg >> PMD_SHIFT) < - (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT))) + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + printk(KERN_DEBUG "tseg: %010llx\n", tseg); + if ((tseg>>PMD_SHIFT) < + (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || + ((tseg>>PMD_SHIFT) < + (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && + (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) set_memory_4k((unsigned long)__va(tseg), 1); + } } } diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c index 13526fd5cce..1d181c40e2e 100644 --- a/arch/x86/kernel/cpu/centaur_64.c +++ b/arch/x86/kernel/cpu/centaur_64.c @@ -10,20 +10,12 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) { if (c->x86 == 0x6 && c->x86_model >= 0xf) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + + set_cpu_cap(c, X86_FEATURE_SYSENTER32); } static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { - /* Cache sizes */ - unsigned n; - - n = c->extended_cpuid_level; - if (n >= 0x80000008) { - unsigned eax = cpuid_eax(0x80000008); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } - if (c->x86 == 0x6 && c->x86_model >= 0xf) { c->x86_cache_alignment = c->x86_clflush_size * 2; set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 75185023529..7b8cc72feb4 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -98,7 +98,7 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c) void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { - unsigned int n, dummy, eax, ebx, ecx, edx; + unsigned int n, dummy, ebx, ecx, edx; n = c->extended_cpuid_level; @@ -121,11 +121,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", c->x86_cache_size, ecx & 0xFF); } - if (n >= 0x80000008) { - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } } void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -314,6 +309,16 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); + if (c->extended_cpuid_level >= 0x80000008) { + u32 eax = cpuid_eax(0x80000008); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } + + /* Assume all 64-bit CPUs support 32-bit syscall */ + set_cpu_cap(c, X86_FEATURE_SYSCALL32); + if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fe9224c51d3..70609efdf1d 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -226,6 +226,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has_bts) ds_init_intel(c); + +#ifdef CONFIG_X86_NUMAQ + numaq_tsc_disable(); +#endif } static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c index fcb1cc9d75c..1019c58d39f 100644 --- a/arch/x86/kernel/cpu/intel_64.c +++ b/arch/x86/kernel/cpu/intel_64.c @@ -12,6 +12,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + + set_cpu_cap(c, X86_FEATURE_SYSENTER32); } /* @@ -52,9 +54,6 @@ static void __cpuinit srat_detect_node(void) static void __cpuinit init_intel(struct cpuinfo_x86 *c) { - /* Cache sizes */ - unsigned n; - init_intel_cacheinfo(c); if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); @@ -76,13 +75,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has_bts) ds_init_intel(c); - n = c->extended_cpuid_level; - if (n >= 0x80000008) { - unsigned eax = cpuid_eax(0x80000008); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } - if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; if (c->x86 == 6) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 501ca1cea27..c4a7ec31394 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -9,6 +9,7 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/smp_lock.h> #include <linux/string.h> #include <linux/rcupdate.h> #include <linux/kallsyms.h> @@ -363,7 +364,7 @@ static void mcheck_check_cpu(void *info) static void mcheck_timer(struct work_struct *work) { - on_each_cpu(mcheck_check_cpu, NULL, 1, 1); + on_each_cpu(mcheck_check_cpu, NULL, 1); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -532,10 +533,12 @@ static int open_exclu; /* already open exclusive? */ static int mce_open(struct inode *inode, struct file *file) { + lock_kernel(); spin_lock(&mce_state_lock); if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { spin_unlock(&mce_state_lock); + unlock_kernel(); return -EBUSY; } @@ -544,6 +547,7 @@ static int mce_open(struct inode *inode, struct file *file) open_count++; spin_unlock(&mce_state_lock); + unlock_kernel(); return nonseekable_open(inode, file); } @@ -617,7 +621,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, * Collect entries that were still getting written before the * synchronize. */ - on_each_cpu(collect_tscs, cpu_tsc, 1, 1); + on_each_cpu(collect_tscs, cpu_tsc, 1); for (i = next; i < MCE_LOG_LEN; i++) { if (mcelog.entry[i].finished && mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { @@ -742,7 +746,7 @@ static void mce_restart(void) if (next_interval) cancel_delayed_work(&mcheck_work); /* Timer race is harmless here */ - on_each_cpu(mce_init, NULL, 1, 1); + on_each_cpu(mce_init, NULL, 1); next_interval = check_interval * HZ; if (next_interval) schedule_delayed_work(&mcheck_work, diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index 00ccb6c14ec..cc1fccdd31e 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -59,7 +59,7 @@ static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); static void mce_work_fn(struct work_struct *work) { - on_each_cpu(mce_checkregs, NULL, 1, 1); + on_each_cpu(mce_checkregs, NULL, 1); schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 105afe12beb..6f23969c8fa 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -223,7 +223,7 @@ static void set_mtrr(unsigned int reg, unsigned long base, atomic_set(&data.gate,0); /* Start the ball rolling on other CPUs */ - if (smp_call_function(ipi_handler, &data, 1, 0) != 0) + if (smp_call_function(ipi_handler, &data, 0) != 0) panic("mtrr: timed out waiting for other CPUs\n"); local_irq_save(flags); @@ -1682,7 +1682,7 @@ void mtrr_ap_init(void) */ void mtrr_save_state(void) { - smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1); + smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); } static int __init mtrr_init_finialize(void) diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 2e9bef6e3aa..6d4bdc02388 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -189,7 +189,7 @@ void disable_lapic_nmi_watchdog(void) if (atomic_read(&nmi_active) <= 0) return; - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); + on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); if (wd_ops) wd_ops->unreserve(); @@ -213,7 +213,7 @@ void enable_lapic_nmi_watchdog(void) return; } - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + on_each_cpu(setup_apic_nmi_watchdog, NULL, 1); touch_nmi_watchdog(); } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index daff52a6224..2de5fa2bbf7 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -33,6 +33,7 @@ #include <linux/init.h> #include <linux/poll.h> #include <linux/smp.h> +#include <linux/smp_lock.h> #include <linux/major.h> #include <linux/fs.h> #include <linux/smp_lock.h> @@ -95,7 +96,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, for (; count; count -= 16) { cmd.eax = pos; cmd.ecx = pos >> 32; - smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); + smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); if (copy_to_user(tmp, &cmd, 16)) return -EFAULT; tmp += 16; @@ -107,15 +108,23 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, static int cpuid_open(struct inode *inode, struct file *file) { - unsigned int cpu = iminor(file->f_path.dentry->d_inode); - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (cpu >= NR_CPUS || !cpu_online(cpu)) - return -ENXIO; /* No such CPU */ + unsigned int cpu; + struct cpuinfo_x86 *c; + int ret = 0; + + lock_kernel(); + + cpu = iminor(file->f_path.dentry->d_inode); + if (cpu >= NR_CPUS || !cpu_online(cpu)) { + ret = -ENXIO; /* No such CPU */ + goto out; + } + c = &cpu_data(cpu); if (c->cpuid_level < 0) - return -EIO; /* CPUID not supported */ - - return 0; + ret = -EIO; /* CPUID not supported */ +out: + unlock_kernel(); + return ret; } /* diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d0335853ff5..28c29180b38 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -19,6 +19,7 @@ #include <linux/mm.h> #include <linux/pfn.h> #include <linux/suspend.h> +#include <linux/firmware-map.h> #include <asm/pgtable.h> #include <asm/page.h> @@ -27,7 +28,22 @@ #include <asm/setup.h> #include <asm/trampoline.h> +/* + * The e820 map is the map that gets modified e.g. with command line parameters + * and that is also registered with modifications in the kernel resource tree + * with the iomem_resource as parent. + * + * The e820_saved is directly saved after the BIOS-provided memory map is + * copied. It doesn't get modified afterwards. It's registered for the + * /sys/firmware/memmap interface. + * + * That memory map is not modified and is used as base for kexec. The kexec'd + * kernel should get the same memory map as the firmware provides. Then the + * user can e.g. boot the original kernel with mem=1G while still booting the + * next kernel with full memory. + */ struct e820map e820; +struct e820map e820_saved; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -398,8 +414,9 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map) return __append_e820_map(biosmap, nr_map); } -u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, - unsigned new_type) +static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, + u64 size, unsigned old_type, + unsigned new_type) { int i; u64 real_updated_size = 0; @@ -410,7 +427,7 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, size = ULLONG_MAX - start; for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + struct e820entry *ei = &e820x->map[i]; u64 final_start, final_end; if (ei->type != old_type) continue; @@ -438,6 +455,19 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, return real_updated_size; } +u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, + unsigned new_type) +{ + return e820_update_range_map(&e820, start, size, old_type, new_type); +} + +static u64 __init e820_update_range_saved(u64 start, u64 size, + unsigned old_type, unsigned new_type) +{ + return e820_update_range_map(&e820_saved, start, size, old_type, + new_type); +} + /* make e820 not cover the range */ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, int checktype) @@ -487,6 +517,15 @@ void __init update_e820(void) printk(KERN_INFO "modified physical RAM map:\n"); e820_print_map("modified"); } +static void __init update_e820_saved(void) +{ + int nr_map; + + nr_map = e820_saved.nr_map; + if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) + return; + e820_saved.nr_map = nr_map; +} #define MAX_GAP_END 0x100000000ull /* * Search for a gap in the e820 memory space from start_addr to end_addr. @@ -991,8 +1030,10 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) addr = round_down(start + size - sizet, align); e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); + e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); printk(KERN_INFO "update e820 for early_reserve_e820\n"); update_e820(); + update_e820_saved(); return addr; } @@ -1008,30 +1049,51 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) #endif /* - * Last pfn which the user wants to use. - */ -unsigned long __initdata end_user_pfn = MAX_ARCH_PFN; - -/* * Find the highest page frame number we have available */ -unsigned long __init e820_end_of_ram(void) +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) { - unsigned long last_pfn; + int i; + unsigned long last_pfn = 0; unsigned long max_arch_pfn = MAX_ARCH_PFN; - last_pfn = find_max_pfn_with_active_regions(); + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long start_pfn; + unsigned long end_pfn; + + if (ei->type != type) + continue; + + start_pfn = ei->addr >> PAGE_SHIFT; + end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT; + + if (start_pfn >= limit_pfn) + continue; + if (end_pfn > limit_pfn) { + last_pfn = limit_pfn; + break; + } + if (end_pfn > last_pfn) + last_pfn = end_pfn; + } if (last_pfn > max_arch_pfn) last_pfn = max_arch_pfn; - if (last_pfn > end_user_pfn) - last_pfn = end_user_pfn; printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", last_pfn, max_arch_pfn); return last_pfn; } +unsigned long __init e820_end_of_ram_pfn(void) +{ + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); +} +unsigned long __init e820_end_of_low_ram_pfn(void) +{ + return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); +} /* * Finds an active region in the address range from start_pfn to last_pfn and * returns its range in ei_startpfn and ei_endpfn for the e820 entry. @@ -1062,12 +1124,6 @@ int __init e820_find_active_region(const struct e820entry *ei, if (*ei_endpfn > last_pfn) *ei_endpfn = last_pfn; - /* Obey end_user_pfn to save on memmap */ - if (*ei_startpfn >= end_user_pfn) - return 0; - if (*ei_endpfn > end_user_pfn) - *ei_endpfn = end_user_pfn; - return 1; } @@ -1113,6 +1169,8 @@ static void early_panic(char *msg) panic(msg); } +static int userdef __initdata; + /* "mem=nopentium" disables the 4MB page tables. */ static int __init parse_memopt(char *p) { @@ -1128,22 +1186,22 @@ static int __init parse_memopt(char *p) } #endif + userdef = 1; mem_size = memparse(p, &p); - end_user_pfn = mem_size>>PAGE_SHIFT; - e820_update_range(mem_size, ULLONG_MAX - mem_size, - E820_RAM, E820_RESERVED); + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); return 0; } early_param("mem", parse_memopt); -static int userdef __initdata; - static int __init parse_memmap_opt(char *p) { char *oldp; u64 start_at, mem_size; + if (!p) + return -EINVAL; + if (!strcmp(p, "exactmap")) { #ifdef CONFIG_CRASH_DUMP /* @@ -1151,9 +1209,7 @@ static int __init parse_memmap_opt(char *p) * the real mem size before original memory map is * reset. */ - e820_register_active_regions(0, 0, -1UL); - saved_max_pfn = e820_end_of_ram(); - remove_all_active_ranges(); + saved_max_pfn = e820_end_of_ram_pfn(); #endif e820.nr_map = 0; userdef = 1; @@ -1175,11 +1231,9 @@ static int __init parse_memmap_opt(char *p) } else if (*p == '$') { start_at = memparse(p+1, &p); e820_add_region(start_at, mem_size, E820_RESERVED); - } else { - end_user_pfn = (mem_size >> PAGE_SHIFT); - e820_update_range(mem_size, ULLONG_MAX - mem_size, - E820_RAM, E820_RESERVED); - } + } else + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + return *p == '\0' ? 0 : -EINVAL; } early_param("memmap", parse_memmap_opt); @@ -1198,6 +1252,17 @@ void __init finish_e820_parsing(void) } } +static inline const char *e820_type_to_string(int e820_type) +{ + switch (e820_type) { + case E820_RESERVED_KERN: + case E820_RAM: return "System RAM"; + case E820_ACPI: return "ACPI Tables"; + case E820_NVS: return "ACPI Non-volatile Storage"; + default: return "reserved"; + } +} + /* * Mark e820 reserved areas as busy for the resource manager. */ @@ -1209,13 +1274,6 @@ void __init e820_reserve_resources(void) res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); for (i = 0; i < e820.nr_map; i++) { - switch (e820.map[i].type) { - case E820_RESERVED_KERN: - case E820_RAM: res->name = "System RAM"; break; - case E820_ACPI: res->name = "ACPI Tables"; break; - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; - default: res->name = "reserved"; - } end = e820.map[i].addr + e820.map[i].size - 1; #ifndef CONFIG_RESOURCES_64BIT if (end > 0x100000000ULL) { @@ -1223,6 +1281,7 @@ void __init e820_reserve_resources(void) continue; } #endif + res->name = e820_type_to_string(e820.map[i].type); res->start = e820.map[i].addr; res->end = end; @@ -1230,8 +1289,20 @@ void __init e820_reserve_resources(void) insert_resource(&iomem_resource, res); res++; } + + for (i = 0; i < e820_saved.nr_map; i++) { + struct e820entry *entry = &e820_saved.map[i]; + firmware_map_add_early(entry->addr, + entry->addr + entry->size - 1, + e820_type_to_string(entry->type)); + } } +/* + * Non-standard memory setup can be specified via this quirk: + */ +char * (*arch_memory_setup_quirk)(void); + char *__init default_machine_specific_memory_setup(void) { char *who = "BIOS-e820"; @@ -1272,6 +1343,12 @@ char *__init default_machine_specific_memory_setup(void) char *__init __attribute__((weak)) machine_specific_memory_setup(void) { + if (arch_memory_setup_quirk) { + char *who = arch_memory_setup_quirk(); + + if (who) + return who; + } return default_machine_specific_memory_setup(); } @@ -1283,8 +1360,12 @@ char * __init __attribute__((weak)) memory_setup(void) void __init setup_memory_map(void) { + char *who; + + who = memory_setup(); + memcpy(&e820_saved, &e820, sizeof(struct e820map)); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map(memory_setup()); + e820_print_map(who); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index a4665f37cfc..a0e11c0cc87 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -120,7 +120,18 @@ static struct chipset early_qrk[] __initdata = { {} }; -static void __init check_dev_quirk(int num, int slot, int func) +/** + * check_dev_quirk - apply early quirks to a given PCI device + * @num: bus number + * @slot: slot number + * @func: PCI function + * + * Check the vendor & device ID against the early quirks table. + * + * If the device is single function, let early_quirks() know so we don't + * poke at this device again. + */ +static int __init check_dev_quirk(int num, int slot, int func) { u16 class; u16 vendor; @@ -131,7 +142,7 @@ static void __init check_dev_quirk(int num, int slot, int func) class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); if (class == 0xffff) - return; + return -1; /* no class, treat as single function */ vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID); @@ -154,7 +165,9 @@ static void __init check_dev_quirk(int num, int slot, int func) type = read_pci_config_byte(num, slot, func, PCI_HEADER_TYPE); if (!(type & 0x80)) - return; + return -1; + + return 0; } void __init early_quirks(void) @@ -167,6 +180,9 @@ void __init early_quirks(void) /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) for (slot = 0; slot < 32; slot++) - for (func = 0; func < 8; func++) - check_dev_quirk(num, slot, func); + for (func = 0; func < 8; func++) { + /* Only probe function 0 on single fn devices */ + if (check_dev_quirk(num, slot, func)) + break; + } } diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 643fd861b72..ff9e7350da5 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -196,7 +196,7 @@ static struct console simnow_console = { static struct console *early_console = &early_vga_console; static int early_console_initialized; -void early_printk(const char *fmt, ...) +asmlinkage void early_printk(const char *fmt, ...) { char buf[512]; int n; diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 94382faeadb..06cc8d4254b 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -473,7 +473,7 @@ void __init efi_enter_virtual_mode(void) size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; - if (PFN_UP(end) <= max_pfn_mapped) + if (PFN_UP(end) <= max_low_pfn_mapped) va = __va(md->phys_addr); else va = efi_ioremap(md->phys_addr, size); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 53393c306e1..6bc07f0f120 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -51,6 +51,7 @@ #include <asm/percpu.h> #include <asm/dwarf2.h> #include <asm/processor-flags.h> +#include <asm/ftrace.h> #include <asm/irq_vectors.h> /* @@ -1024,6 +1025,7 @@ ENTRY(xen_sysenter_target) RING0_INT_FRAME addl $5*4, %esp /* remove xen-provided frame */ jmp sysenter_past_esp + CFI_ENDPROC ENTRY(xen_hypervisor_callback) CFI_STARTPROC @@ -1110,6 +1112,77 @@ ENDPROC(xen_failsafe_callback) #endif /* CONFIG_XEN */ +#ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + subl $MCOUNT_INSN_SIZE, %eax + +.globl mcount_call +mcount_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $ftrace_stub, ftrace_trace_function + jnz trace +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ + .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 07d69f26233..ae63e584c34 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -51,9 +51,115 @@ #include <asm/page.h> #include <asm/irqflags.h> #include <asm/paravirt.h> +#include <asm/ftrace.h> .code64 +#ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +ENTRY(mcount) + + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + subq $MCOUNT_INSN_SIZE, %rdi + +.globl mcount_call +mcount_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + retq +END(mcount) + +ENTRY(ftrace_caller) + + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + +.globl ftrace_stub +ftrace_stub: + retq +END(ftrace_caller) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ +ENTRY(mcount) + cmpq $ftrace_stub, ftrace_trace_function + jnz trace +.globl ftrace_stub +ftrace_stub: + retq + +trace: + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi + + call *ftrace_trace_function + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif @@ -168,13 +274,13 @@ ENTRY(ret_from_fork) CFI_ADJUST_CFA_OFFSET -4 call schedule_tail GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) jnz rff_trace rff_action: RESTORE_REST testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? je int_ret_from_sys_call - testl $_TIF_IA32,threadinfo_flags(%rcx) + testl $_TIF_IA32,TI_flags(%rcx) jnz int_ret_from_sys_call RESTORE_TOP_OF_STACK %rdi,ARGOFFSET jmp ret_from_sys_call @@ -243,7 +349,8 @@ ENTRY(system_call_after_swapgs) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ + TI_flags(%rcx) jnz tracesys cmpq $__NR_syscall_max,%rax ja badsys @@ -262,7 +369,7 @@ sysret_check: GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - movl threadinfo_flags(%rcx),%edx + movl TI_flags(%rcx),%edx andl %edi,%edx jnz sysret_careful CFI_REMEMBER_STATE @@ -305,7 +412,7 @@ sysret_signal: leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $_TIF_WORK_MASK,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ DISABLE_INTERRUPTS(CLBR_NONE) @@ -347,10 +454,10 @@ int_ret_from_sys_call: int_with_check: LOCKDEP_SYS_EXIT_IRQ GET_THREAD_INFO(%rcx) - movl threadinfo_flags(%rcx),%edx + movl TI_flags(%rcx),%edx andl %edi,%edx jnz int_careful - andl $~TS_COMPAT,threadinfo_status(%rcx) + andl $~TS_COMPAT,TI_status(%rcx) jmp retint_swapgs /* Either reschedule or signal or syscall exit tracking needed. */ @@ -393,7 +500,7 @@ int_signal: movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $_TIF_WORK_MASK,%edi int_restore_rest: RESTORE_REST DISABLE_INTERRUPTS(CLBR_NONE) @@ -558,7 +665,7 @@ retint_with_reschedule: movl $_TIF_WORK_MASK,%edi retint_check: LOCKDEP_SYS_EXIT_IRQ - movl threadinfo_flags(%rcx),%edx + movl TI_flags(%rcx),%edx andl %edi,%edx CFI_REMEMBER_STATE jnz retint_careful @@ -646,17 +753,16 @@ retint_signal: RESTORE_REST DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - movl $_TIF_NEED_RESCHED,%edi GET_THREAD_INFO(%rcx) - jmp retint_check + jmp retint_with_reschedule #ifdef CONFIG_PREEMPT /* Returning to kernel space. Check if we need preemption */ /* rcx: threadinfo. interrupts off. */ ENTRY(retint_kernel) - cmpl $0,threadinfo_preempt_count(%rcx) + cmpl $0,TI_preempt_count(%rcx) jnz retint_restore_args - bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) + bt $TIF_NEED_RESCHED,TI_flags(%rcx) jnc retint_restore_args bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ jnc retint_restore_args @@ -710,6 +816,9 @@ END(invalidate_interrupt\num) ENTRY(call_function_interrupt) apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt END(call_function_interrupt) +ENTRY(call_function_single_interrupt) + apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt +END(call_function_single_interrupt) ENTRY(irq_move_cleanup_interrupt) apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt END(irq_move_cleanup_interrupt) @@ -819,7 +928,7 @@ paranoid_restore\trace: jmp irq_return paranoid_userspace\trace: GET_THREAD_INFO(%rcx) - movl threadinfo_flags(%rcx),%ebx + movl TI_flags(%rcx),%ebx andl $_TIF_WORK_MASK,%ebx jz paranoid_swapgs\trace movq %rsp,%rdi /* &pt_regs */ @@ -917,7 +1026,7 @@ error_exit: testl %eax,%eax jne retint_kernel LOCKDEP_SYS_EXIT_IRQ - movl threadinfo_flags(%rcx),%edx + movl TI_flags(%rcx),%edx movl $_TIF_WORK_MASK,%edi andl %edi,%edx jnz retint_careful diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c new file mode 100644 index 00000000000..ab115cd15fd --- /dev/null +++ b/arch/x86/kernel/ftrace.c @@ -0,0 +1,141 @@ +/* + * Code for replacing ftrace calls with jumps. + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * + * Thanks goes to Ingo Molnar, for suggesting the idea. + * Mathieu Desnoyers, for suggesting postponing the modifications. + * Arjan van de Ven, for keeping me straight, and explaining to me + * the dangers of modifying code on the run. + */ + +#include <linux/spinlock.h> +#include <linux/hardirq.h> +#include <linux/ftrace.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/list.h> + +#include <asm/alternative.h> +#include <asm/ftrace.h> + + +/* Long is fine, even if it is only 4 bytes ;-) */ +static long *ftrace_nop; + +union ftrace_code_union { + char code[MCOUNT_INSN_SIZE]; + struct { + char e8; + int offset; + } __attribute__((packed)); +}; + + +static int notrace ftrace_calc_offset(long ip, long addr) +{ + return (int)(addr - ip); +} + +notrace unsigned char *ftrace_nop_replace(void) +{ + return (char *)ftrace_nop; +} + +notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + static union ftrace_code_union calc; + + calc.e8 = 0xe8; + calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); + + /* + * No locking needed, this must be called via kstop_machine + * which in essence is like running on a uniprocessor machine. + */ + return calc.code; +} + +notrace int +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code) +{ + unsigned replaced; + unsigned old = *(unsigned *)old_code; /* 4 bytes */ + unsigned new = *(unsigned *)new_code; /* 4 bytes */ + unsigned char newch = new_code[4]; + int faulted = 0; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. + * + * No real locking needed, this code is run through + * kstop_machine. + */ + asm volatile ( + "1: lock\n" + " cmpxchg %3, (%2)\n" + " jnz 2f\n" + " movb %b4, 4(%2)\n" + "2:\n" + ".section .fixup, \"ax\"\n" + "3: movl $1, %0\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : "=r"(faulted), "=a"(replaced) + : "r"(ip), "r"(new), "c"(newch), + "0"(faulted), "a"(old) + : "memory"); + sync_core(); + + if (replaced != old && replaced != new) + faulted = 2; + + return faulted; +} + +notrace int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char old[MCOUNT_INSN_SIZE], *new; + int ret; + + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = ftrace_modify_code(ip, old, new); + + return ret; +} + +notrace int ftrace_mcount_set(unsigned long *data) +{ + unsigned long ip = (long)(&mcount_call); + unsigned long *addr = data; + unsigned char old[MCOUNT_INSN_SIZE], *new; + + /* + * Replace the mcount stub with a pointer to the + * ip recorder function. + */ + memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); + new = ftrace_call_replace(ip, *addr); + *addr = ftrace_modify_code(ip, old, new); + + return 0; +} + +int __init ftrace_dyn_arch_init(void *data) +{ + const unsigned char *const *noptable = find_nop_table(); + + /* This is running in kstop_machine */ + + ftrace_mcount_set(data); + + ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE]; + + return 0; +} diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 45e84acca8a..711f11c30b0 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -8,6 +8,7 @@ * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. */ +#include <linux/kernel.h> #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/string.h> @@ -20,6 +21,7 @@ #include <asm/smp.h> #include <asm/ipi.h> #include <asm/genapic.h> +#include <asm/pgtable.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> @@ -208,14 +210,79 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) BUG(); } +static __init void map_low_mmrs(void) +{ + init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); + init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); +} + +enum map_type {map_wb, map_uc}; + +static void map_high(char *id, unsigned long base, int shift, enum map_type map_type) +{ + unsigned long bytes, paddr; + + paddr = base << shift; + bytes = (1UL << shift); + printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, + paddr + bytes); + if (map_type == map_uc) + init_extra_mapping_uc(paddr, bytes); + else + init_extra_mapping_wb(paddr, bytes); + +} +static __init void map_gru_high(int max_pnode) +{ + union uvh_rh_gam_gru_overlay_config_mmr_u gru; + int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; + + gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); + if (gru.s.enable) + map_high("GRU", gru.s.base, shift, map_wb); +} + +static __init void map_config_high(int max_pnode) +{ + union uvh_rh_gam_cfg_overlay_config_mmr_u cfg; + int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT; + + cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); + if (cfg.s.enable) + map_high("CONFIG", cfg.s.base, shift, map_uc); +} + +static __init void map_mmr_high(int max_pnode) +{ + union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; + int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; + + mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); + if (mmr.s.enable) + map_high("MMR", mmr.s.base, shift, map_uc); +} + +static __init void map_mmioh_high(int max_pnode) +{ + union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; + int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + + mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); + if (mmioh.s.enable) + map_high("MMIOH", mmioh.s.base, shift, map_uc); +} + static __init void uv_system_init(void) { union uvh_si_addr_map_config_u m_n_config; union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; + int max_pnode = 0; unsigned long mmr_base, present; + map_low_mmrs(); + m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; @@ -281,12 +348,18 @@ static __init void uv_system_init(void) uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; + max_pnode = max(pnode, max_pnode); - printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, pnode %d, nid %d, " + printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " "lcpu %d, blade %d\n", cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, lcpu, blade); } + + map_gru_high(max_pnode); + map_mmr_high(max_pnode); + map_config_high(max_pnode); + map_mmioh_high(max_pnode); } /* diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ea230ec6905..0ea6a19bfdf 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -36,26 +36,15 @@ static inline void hpet_writel(unsigned long d, unsigned long a) } #ifdef CONFIG_X86_64 - #include <asm/pgtable.h> - -static inline void hpet_set_mapping(void) -{ - set_fixmap_nocache(FIX_HPET_BASE, hpet_address); - __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); - hpet_virt_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); -} - -static inline void hpet_clear_mapping(void) -{ - hpet_virt_address = NULL; -} - -#else +#endif static inline void hpet_set_mapping(void) { hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); +#ifdef CONFIG_X86_64 + __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); +#endif } static inline void hpet_clear_mapping(void) @@ -63,7 +52,6 @@ static inline void hpet_clear_mapping(void) iounmap(hpet_virt_address); hpet_virt_address = NULL; } -#endif /* * HPET command line enable / disable diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index deb43785e92..dd7ebee446a 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -1,7 +1,14 @@ #include <linux/module.h> + #include <asm/checksum.h> -#include <asm/desc.h> #include <asm/pgtable.h> +#include <asm/desc.h> +#include <asm/ftrace.h> + +#ifdef CONFIG_FTRACE +/* mcount is defined in assembly */ +EXPORT_SYMBOL(mcount); +#endif /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 337ec3438a8..558abf4c796 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1569,7 +1569,7 @@ void /*__init*/ print_local_APIC(void *dummy) void print_all_local_APICs(void) { - on_each_cpu(print_local_APIC, NULL, 1, 1); + on_each_cpu(print_local_APIC, NULL, 1); } void /*__init*/ print_PIC(void) @@ -2020,7 +2020,7 @@ static inline void init_IO_APIC_traps(void) * The local APIC irq-chip implementation: */ -static void ack_apic(unsigned int irq) +static void ack_lapic_irq(unsigned int irq) { ack_APIC_irq(); } @@ -2045,9 +2045,17 @@ static struct irq_chip lapic_chip __read_mostly = { .name = "local-APIC", .mask = mask_lapic_irq, .unmask = unmask_lapic_irq, - .eoi = ack_apic, + .ack = ack_lapic_irq, }; +static void lapic_register_intr(int irq, int vector) +{ + irq_desc[irq].status &= ~IRQ_LEVEL; + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + "edge"); + set_intr_gate(vector, interrupt[irq]); +} + static void __init setup_nmi(void) { /* @@ -2247,8 +2255,7 @@ static inline void __init check_timer(void) printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); - set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, - "fasteoi"); + lapic_register_intr(0, vector); apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); @@ -2280,11 +2287,21 @@ out: } /* - * - * IRQ's that are handled by the PIC in the MPS IOAPIC case. - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. - * Linux doesn't really care, as it's not actually used - * for any interrupt handling anyway. + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available + * to devices. However there may be an I/O APIC pin available for + * this interrupt regardless. The pin may be left unconnected, but + * typically it will be reused as an ExtINT cascade interrupt for + * the master 8259A. In the MPS case such a pin will normally be + * reported as an ExtINT interrupt in the MP table. With ACPI + * there is no provision for ExtINT interrupts, and in the absence + * of an override it would be treated as an ordinary ISA I/O APIC + * interrupt, that is edge-triggered and unmasked by default. We + * used to do this, but it caused problems on some systems because + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using + * the same ExtINT cascade interrupt to drive the local APIC of the + * bootstrap processor. Therefore we refrain from routing IRQ2 to + * the I/O APIC in all cases now. No actual device should request + * it anyway. --macro */ #define PIC_IRQS (1 << PIC_CASCADE_IR) @@ -2298,10 +2315,7 @@ void __init setup_IO_APIC(void) enable_IO_APIC(); - if (acpi_ioapic) - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ - else - io_apic_irqs = ~PIC_IRQS; + io_apic_irqs = ~PIC_IRQS; printk("ENABLING IO-APIC IRQs\n"); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 2b4c40bc12c..6510cde36b3 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1160,7 +1160,7 @@ void __apicdebuginit print_local_APIC(void * dummy) void print_all_local_APICs (void) { - on_each_cpu(print_local_APIC, NULL, 1, 1); + on_each_cpu(print_local_APIC, NULL, 1); } void __apicdebuginit print_PIC(void) @@ -1554,7 +1554,7 @@ static inline void init_IO_APIC_traps(void) } } -static void enable_lapic_irq (unsigned int irq) +static void unmask_lapic_irq(unsigned int irq) { unsigned long v; @@ -1562,7 +1562,7 @@ static void enable_lapic_irq (unsigned int irq) apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } -static void disable_lapic_irq (unsigned int irq) +static void mask_lapic_irq(unsigned int irq) { unsigned long v; @@ -1575,19 +1575,20 @@ static void ack_lapic_irq (unsigned int irq) ack_APIC_irq(); } -static void end_lapic_irq (unsigned int i) { /* nothing */ } - -static struct hw_interrupt_type lapic_irq_type __read_mostly = { - .name = "local-APIC", - .typename = "local-APIC-edge", - .startup = NULL, /* startup_irq() not used for IRQ0 */ - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ - .enable = enable_lapic_irq, - .disable = disable_lapic_irq, - .ack = ack_lapic_irq, - .end = end_lapic_irq, +static struct irq_chip lapic_chip __read_mostly = { + .name = "local-APIC", + .mask = mask_lapic_irq, + .unmask = unmask_lapic_irq, + .ack = ack_lapic_irq, }; +static void lapic_register_intr(int irq) +{ + irq_desc[irq].status &= ~IRQ_LEVEL; + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + "edge"); +} + static void __init setup_nmi(void) { /* @@ -1714,11 +1715,6 @@ static inline void __init check_timer(void) apic2 = apic1; } - replace_pin_at_irq(0, 0, 0, apic1, pin1); - apic1 = 0; - pin1 = 0; - setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); - if (pin1 != -1) { /* * Ok, does IRQ0 through the IOAPIC work? @@ -1779,7 +1775,7 @@ static inline void __init check_timer(void) apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); - irq_desc[0].chip = &lapic_irq_type; + lapic_register_intr(0); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); @@ -1817,11 +1813,21 @@ static int __init notimercheck(char *s) __setup("no_timer_check", notimercheck); /* - * - * IRQs that are handled by the PIC in the MPS IOAPIC case. - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. - * Linux doesn't really care, as it's not actually used - * for any interrupt handling anyway. + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available + * to devices. However there may be an I/O APIC pin available for + * this interrupt regardless. The pin may be left unconnected, but + * typically it will be reused as an ExtINT cascade interrupt for + * the master 8259A. In the MPS case such a pin will normally be + * reported as an ExtINT interrupt in the MP table. With ACPI + * there is no provision for ExtINT interrupts, and in the absence + * of an override it would be treated as an ordinary ISA I/O APIC + * interrupt, that is edge-triggered and unmasked by default. We + * used to do this, but it caused problems on some systems because + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using + * the same ExtINT cascade interrupt to drive the local APIC of the + * bootstrap processor. Therefore we refrain from routing IRQ2 to + * the I/O APIC in all cases now. No actual device should request + * it anyway. --macro */ #define PIC_IRQS (1<<2) @@ -1832,10 +1838,7 @@ void __init setup_IO_APIC(void) * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - if (acpi_ioapic) - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ - else - io_apic_irqs = ~PIC_IRQS; + io_apic_irqs = ~PIC_IRQS; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 31f49e8f46a..0373e88de95 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -199,6 +199,10 @@ void __init native_init_IRQ(void) /* IPI for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + /* IPI for generic single function call */ + alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); + /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); #endif diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 21f2bae98c1..a8449571858 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -68,7 +68,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) load_LDT(pc); mask = cpumask_of_cpu(smp_processor_id()); if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, current->mm, 1, 1); + smp_call_function(flush_ldt, current->mm, 1); preempt_enable(); #else load_LDT(pc); diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index f4960171bc6..8864230d55a 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -11,6 +11,8 @@ #include <linux/delay.h> #include <linux/init.h> #include <linux/numa.h> +#include <linux/ftrace.h> + #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> @@ -107,6 +109,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) unsigned long page_list[PAGES_NR]; void *control_page; + tracer_disable(); + /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 7830dc4a838..9dd9262693a 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -11,6 +11,8 @@ #include <linux/string.h> #include <linux/reboot.h> #include <linux/numa.h> +#include <linux/ftrace.h> + #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -184,6 +186,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) unsigned long page_list[PAGES_NR]; void *control_page; + tracer_disable(); + /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 9758fea87c5..56b933119a0 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -76,6 +76,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/sched.h> +#include <linux/smp_lock.h> #include <linux/cpumask.h> #include <linux/module.h> #include <linux/slab.h> @@ -423,6 +424,7 @@ out: static int microcode_open (struct inode *unused1, struct file *unused2) { + cycle_kernel_lock(); return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; } @@ -489,7 +491,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); #define microcode_dev_exit() do { } while(0) #endif -static long get_next_ucode_from_buffer(void **mc, void *buf, +static long get_next_ucode_from_buffer(void **mc, const u8 *buf, unsigned long size, long offset) { microcode_header_t *mc_header; @@ -523,7 +525,7 @@ static int cpu_request_microcode(int cpu) char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; - void *buf; + const u8 *buf; unsigned long size; long offset = 0; int error; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 8b6b1e05c30..3b25e49380c 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -726,12 +726,22 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) static struct intel_mp_floating *mpf_found; /* + * Machine specific quirk for finding the SMP config before other setup + * activities destroy the table: + */ +int (*mach_get_smp_config_quirk)(unsigned int early); + +/* * Scan the memory blocks for an SMP configuration block. */ -static void __init __get_smp_config(unsigned early) +static void __init __get_smp_config(unsigned int early) { struct intel_mp_floating *mpf = mpf_found; + if (mach_get_smp_config_quirk) { + if (mach_get_smp_config_quirk(early)) + return; + } if (acpi_lapic && early) return; /* @@ -889,10 +899,16 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, return 0; } -static void __init __find_smp_config(unsigned reserve) +int (*mach_find_smp_config_quirk)(unsigned int reserve); + +static void __init __find_smp_config(unsigned int reserve) { unsigned int address; + if (mach_find_smp_config_quirk) { + if (mach_find_smp_config_quirk(reserve)) + return; + } /* * FIXME: Linux assumes you have 640K of base ram.. * this continues the error... diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 1f3abe048e9..a153b3905f6 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -117,12 +117,20 @@ static int msr_open(struct inode *inode, struct file *file) { unsigned int cpu = iminor(file->f_path.dentry->d_inode); struct cpuinfo_x86 *c = &cpu_data(cpu); + int ret = 0; - if (cpu >= NR_CPUS || !cpu_online(cpu)) - return -ENXIO; /* No such CPU */ - if (!cpu_has(c, X86_FEATURE_MSR)) - return -EIO; /* MSR not supported */ + lock_kernel(); + cpu = iminor(file->f_path.dentry->d_inode); + if (cpu >= NR_CPUS || !cpu_online(cpu)) { + ret = -ENXIO; /* No such CPU */ + goto out; + } + c = &cpu_data(cpu); + if (!cpu_has(c, X86_FEATURE_MSR)) + ret = -EIO; /* MSR not supported */ +out: + unlock_kernel(); return 0; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 8dfe9db87a9..ec024b3baad 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -130,7 +130,7 @@ int __init check_nmi_watchdog(void) #ifdef CONFIG_SMP if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); #endif for_each_possible_cpu(cpu) @@ -171,6 +171,9 @@ int __init check_nmi_watchdog(void) error: if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259) disable_8259A_irq(0); +#ifdef CONFIG_X86_32 + timer_ack = 0; +#endif return -1; } @@ -269,7 +272,7 @@ static void __acpi_nmi_enable(void *__unused) void acpi_nmi_enable(void) { if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_enable, NULL, 1); } static void __acpi_nmi_disable(void *__unused) @@ -283,7 +286,7 @@ static void __acpi_nmi_disable(void *__unused) void acpi_nmi_disable(void) { if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_disable, NULL, 1); } void setup_apic_nmi_watchdog(void *unused) diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index f0f1de1c4a1..a23e8233b9a 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -93,12 +93,13 @@ int __init get_memcfg_numaq(void) return 1; } -static int __init numaq_tsc_disable(void) +void __init numaq_tsc_disable(void) { + if (!found_numaq) + return; + if (num_online_nodes() > 1) { printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); setup_clear_cpu_cap(X86_FEATURE_TSC); } - return 0; } -arch_initcall(numaq_tsc_disable); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e7e5652f65b..e0f571d58c1 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -285,7 +285,7 @@ struct pv_time_ops pv_time_ops = { .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, .sched_clock = native_sched_clock, - .get_cpu_khz = native_calculate_cpu_khz, + .get_tsc_khz = native_calibrate_tsc, }; struct pv_irq_ops pv_irq_ops = { diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index d0d18db5d2a..c3fe78406d1 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -630,6 +630,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) struct pci_dev *dev; void *gatt; int i, error; + unsigned long start_pfn, end_pfn; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; @@ -674,6 +675,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info) printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", aper_base, aper_size>>10); + + /* need to map that range */ + end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + if (end_pfn > max_low_pfn_mapped) { + start_pfn = (aper_base>>PAGE_SHIFT); + init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); + } return 0; nommu: diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4061d63aabe..4d629c62f4f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -7,6 +7,12 @@ #include <linux/module.h> #include <linux/pm.h> #include <linux/clockchips.h> +#include <asm/system.h> + +unsigned long idle_halt; +EXPORT_SYMBOL(idle_halt); +unsigned long idle_nomwait; +EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; @@ -132,7 +138,7 @@ void cpu_idle_wait(void) { smp_mb(); /* kick all the CPUs so that they exit out of pm_idle */ - smp_call_function(do_nothing, NULL, 0, 1); + smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(cpu_idle_wait); @@ -325,7 +331,27 @@ static int __init idle_setup(char *str) pm_idle = poll_idle; } else if (!strcmp(str, "mwait")) force_mwait = 1; - else + else if (!strcmp(str, "halt")) { + /* + * When the boot option of idle=halt is added, halt is + * forced to be used for CPU idle. In such case CPU C2/C3 + * won't be used again. + * To continue to load the CPU idle driver, don't touch + * the boot_option_idle_override. + */ + pm_idle = default_idle; + idle_halt = 1; + return 0; + } else if (!strcmp(str, "nomwait")) { + /* + * If the boot option of "idle=nomwait" is added, + * it means that mwait will be disabled for CPU C2/C3 + * states. In such case it won't touch the variable + * of boot_option_idle_override. + */ + idle_nomwait = 1; + return 0; + } else return -1; boot_option_idle_override = 1; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9a139f6c9df..0c3927accb0 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -142,7 +142,10 @@ void cpu_idle(void) local_irq_disable(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; + /* Don't trace irqs off for idle */ + stop_critical_timings(); pm_idle(); + start_critical_timings(); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index db5eb963e4d..a8e53626ac9 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -134,7 +134,10 @@ void cpu_idle(void) */ local_irq_disable(); enter_idle(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); pm_idle(); + start_critical_timings(); /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. */ diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 79bdcd11c66..d1385881810 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -266,6 +266,8 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev) hpet_print_force_info(); } +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1, + old_ich_force_enable_hpet_user); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, old_ich_force_enable_hpet_user); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cfcfbefee0b..531b55b8e81 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -394,11 +394,10 @@ static void __init parse_setup_data(void) } } -static void __init reserve_setup_data(void) +static void __init e820_reserve_setup_data(void) { struct setup_data *data; u64 pa_data; - char buf[32]; int found = 0; if (boot_params.hdr.version < 0x0209) @@ -406,8 +405,6 @@ static void __init reserve_setup_data(void) pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_ioremap(pa_data, sizeof(*data)); - sprintf(buf, "setup data %x", data->type); - reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); e820_update_range(pa_data, sizeof(*data)+data->len, E820_RAM, E820_RESERVED_KERN); found = 1; @@ -418,10 +415,29 @@ static void __init reserve_setup_data(void) return; sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + memcpy(&e820_saved, &e820, sizeof(struct e820map)); printk(KERN_INFO "extended physical RAM map:\n"); e820_print_map("reserve setup_data"); } +static void __init reserve_early_setup_data(void) +{ + struct setup_data *data; + u64 pa_data; + char buf[32]; + + if (boot_params.hdr.version < 0x0209) + return; + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_ioremap(pa_data, sizeof(*data)); + sprintf(buf, "setup data %x", data->type); + reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); + pa_data = data->next; + early_iounmap(data, sizeof(*data)); + } +} + /* * --------- Crashkernel reservation ------------------------------ */ @@ -580,6 +596,7 @@ void __init setup_arch(char **cmdline_p) { #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); + visws_early_detect(); pre_setup_arch_hook(); early_cpu_init(); #else @@ -626,6 +643,8 @@ void __init setup_arch(char **cmdline_p) setup_memory_map(); parse_setup_data(); + /* update the e820_saved too */ + e820_reserve_setup_data(); copy_edd(); @@ -656,7 +675,7 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); /* after early param, so could get panic from serial */ - reserve_setup_data(); + reserve_early_setup_data(); if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC @@ -665,6 +684,11 @@ void __init setup_arch(char **cmdline_p) clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); } +#ifdef CONFIG_PCI + if (pci_early_dump_regs) + early_dump_pci_devices(); +#endif + finish_e820_parsing(); #ifdef CONFIG_X86_32 @@ -691,22 +715,18 @@ void __init setup_arch(char **cmdline_p) early_gart_iommu_check(); #endif - e820_register_active_regions(0, 0, -1UL); /* * partially used pages are not usable - thus * we are rounding upwards: */ - max_pfn = e820_end_of_ram(); + max_pfn = e820_end_of_ram_pfn(); /* preallocate 4k for mptable mpc */ early_reserve_e820_mpc_new(); /* update e820 for memory not covered by WB MTRRs */ mtrr_bp_init(); - if (mtrr_trim_uncached_memory(max_pfn)) { - remove_all_active_ranges(); - e820_register_active_regions(0, 0, -1UL); - max_pfn = e820_end_of_ram(); - } + if (mtrr_trim_uncached_memory(max_pfn)) + max_pfn = e820_end_of_ram_pfn(); #ifdef CONFIG_X86_32 /* max_low_pfn get updated here */ @@ -718,12 +738,26 @@ void __init setup_arch(char **cmdline_p) /* How many end-of-memory variables you have, grandma! */ /* need this before calling reserve_initrd */ - max_low_pfn = max_pfn; + if (max_pfn > (1UL<<(32 - PAGE_SHIFT))) + max_low_pfn = e820_end_of_low_ram_pfn(); + else + max_low_pfn = max_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; #endif /* max_pfn_mapped is updated here */ - max_pfn_mapped = init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT)); + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); + max_pfn_mapped = max_low_pfn_mapped; + +#ifdef CONFIG_X86_64 + if (max_pfn > max_low_pfn) { + max_pfn_mapped = init_memory_mapping(1UL<<32, + max_pfn<<PAGE_SHIFT); + /* can we preseve max_low_pfn ?*/ + max_low_pfn = max_pfn; + } +#endif /* * NOTE: On x86-32, only from this point on, fixmaps are ready for use. @@ -749,9 +783,6 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_table_init(); - /* Remove active ranges so rediscovery with NUMA-awareness happens */ - remove_all_active_ranges(); - #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. @@ -823,6 +854,14 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); #endif +#ifdef CONFIG_X86_NUMAQ + /* + * need to check online nodes num, call it + * here before time_init/tsc_init + */ + numaq_tsc_disable(); +#endif + init_apic_mappings(); ioapic_init_mappings(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5fc310f746f..cac68430d31 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -343,23 +343,23 @@ static const cpumask_t cpu_mask_none; /* * Returns a pointer to the bitmask of CPUs on Node 'node'. */ -cpumask_t *_node_to_cpumask_ptr(int node) +const cpumask_t *_node_to_cpumask_ptr(int node) { if (node_to_cpumask_map == NULL) { printk(KERN_WARNING "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", node); dump_stack(); - return &cpu_online_map; + return (const cpumask_t *)&cpu_online_map; } if (node >= nr_node_ids) { printk(KERN_WARNING "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n", node, nr_node_ids); dump_stack(); - return (cpumask_t *)&cpu_mask_none; + return &cpu_mask_none; } - return (cpumask_t *)&node_to_cpumask_map[node]; + return &node_to_cpumask_map[node]; } EXPORT_SYMBOL(_node_to_cpumask_ptr); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 0cb7aadc87c..361b7a4c640 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -121,132 +121,23 @@ static void native_smp_send_reschedule(int cpu) send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } -/* - * Structure and data for smp_call_function(). This is designed to minimise - * static memory requirements. It also looks cleaner. - */ -static DEFINE_SPINLOCK(call_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - atomic_t started; - atomic_t finished; - int wait; -}; - -void lock_ipi_call_lock(void) +void native_send_call_func_single_ipi(int cpu) { - spin_lock_irq(&call_lock); -} - -void unlock_ipi_call_lock(void) -{ - spin_unlock_irq(&call_lock); -} - -static struct call_data_struct *call_data; - -static void __smp_call_function(void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - struct call_data_struct data; - int cpus = num_online_cpus() - 1; - - if (!cpus) - return; - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - mb(); - - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_allbutself(CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (wait) - while (atomic_read(&data.finished) != cpus) - cpu_relax(); + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR); } - -/** - * smp_call_function_mask(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on. Must not include the current cpu. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -static int -native_smp_call_function_mask(cpumask_t mask, - void (*func)(void *), void *info, - int wait) +void native_send_call_func_ipi(cpumask_t mask) { - struct call_data_struct data; cpumask_t allbutself; - int cpus; - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - /* Holding any lock stops cpus from going down. */ - spin_lock(&call_lock); allbutself = cpu_online_map; cpu_clear(smp_processor_id(), allbutself); - cpus_and(mask, mask, allbutself); - cpus = cpus_weight(mask); - - if (!cpus) { - spin_unlock(&call_lock); - return 0; - } - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - wmb(); - - /* Send a message to other CPUs */ if (cpus_equal(mask, allbutself) && cpus_equal(cpu_online_map, cpu_callout_map)) send_IPI_allbutself(CALL_FUNCTION_VECTOR); else send_IPI_mask(mask, CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (wait) - while (atomic_read(&data.finished) != cpus) - cpu_relax(); - spin_unlock(&call_lock); - - return 0; } static void stop_this_cpu(void *dummy) @@ -268,18 +159,13 @@ static void stop_this_cpu(void *dummy) static void native_smp_send_stop(void) { - int nolock; unsigned long flags; if (reboot_force) return; - /* Don't deadlock on the call lock in panic */ - nolock = !spin_trylock(&call_lock); + smp_call_function(stop_this_cpu, NULL, 0); local_irq_save(flags); - __smp_call_function(stop_this_cpu, NULL, 0, 0); - if (!nolock) - spin_unlock(&call_lock); disable_local_APIC(); local_irq_restore(flags); } @@ -301,33 +187,28 @@ void smp_reschedule_interrupt(struct pt_regs *regs) void smp_call_function_interrupt(struct pt_regs *regs) { - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; - ack_APIC_irq(); - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(&call_data->started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ irq_enter(); - (*func)(info); + generic_smp_call_function_interrupt(); #ifdef CONFIG_X86_32 __get_cpu_var(irq_stat).irq_call_count++; #else add_pda(irq_call_count, 1); #endif irq_exit(); +} - if (wait) { - mb(); - atomic_inc(&call_data->finished); - } +void smp_call_function_single_interrupt(struct pt_regs *regs) +{ + ack_APIC_irq(); + irq_enter(); + generic_smp_call_function_single_interrupt(); +#ifdef CONFIG_X86_32 + __get_cpu_var(irq_stat).irq_call_count++; +#else + add_pda(irq_call_count, 1); +#endif + irq_exit(); } struct smp_ops smp_ops = { @@ -338,7 +219,8 @@ struct smp_ops smp_ops = { .smp_send_stop = native_smp_send_stop, .smp_send_reschedule = native_smp_send_reschedule, - .smp_call_function_mask = native_smp_call_function_mask, + + .send_call_func_ipi = native_send_call_func_ipi, + .send_call_func_single_ipi = native_send_call_func_single_ipi, }; EXPORT_SYMBOL_GPL(smp_ops); - diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e1200b202ed..687376ab07e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -327,12 +327,12 @@ static void __cpuinit start_secondary(void *unused) * lock helps us to not include this cpu in a currently in progress * smp_call_function(). */ - lock_ipi_call_lock(); + ipi_call_lock_irq(); #ifdef CONFIG_X86_IO_APIC setup_vector_irq(smp_processor_id()); #endif cpu_set(smp_processor_id(), cpu_online_map); - unlock_ipi_call_lock(); + ipi_call_unlock_irq(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; setup_secondary_clock(); @@ -939,9 +939,9 @@ do_rest: inquire_remote_apic(apicid); } } - +#ifdef CONFIG_X86_64 restore_state: - +#endif if (boot_error) { /* Try to put things back the way they were before ... */ numa_remove_cpu(cpu); /* was set by numa_add_cpu */ diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c index 3449064d141..99941b37eca 100644 --- a/arch/x86/kernel/smpcommon.c +++ b/arch/x86/kernel/smpcommon.c @@ -25,59 +25,3 @@ __cpuinit void init_gdt(int cpu) per_cpu(cpu_number, cpu) = cpu; } #endif - -/** - * smp_call_function(): Run a function on all other CPUs. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Unused. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -int smp_call_function(void (*func) (void *info), void *info, int nonatomic, - int wait) -{ - return smp_call_function_mask(cpu_online_map, func, info, wait); -} -EXPORT_SYMBOL(smp_call_function); - -/** - * smp_call_function_single - Run a function on a specific CPU - * @cpu: The target CPU. Cannot be the calling CPU. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Unused. - * @wait: If true, wait until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - */ -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - /* prevent preemption and reschedule on another processor */ - int ret; - int me = get_cpu(); - if (cpu == me) { - local_irq_disable(); - func(info); - local_irq_enable(); - put_cpu(); - return 0; - } - - ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); - - put_cpu(); - return ret; -} -EXPORT_SYMBOL(smp_call_function_single); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index c28c342c162..a03e7f6d90c 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -74,6 +74,7 @@ void save_stack_trace(struct stack_trace *trace) if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } +EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { @@ -81,3 +82,4 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } +EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 5f29f12da50..059ca6ee59b 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -39,9 +39,6 @@ #include "do_timer.h" -unsigned int cpu_khz; /* Detected as we calibrate the TSC */ -EXPORT_SYMBOL(cpu_khz); - int timer_ack; unsigned long profile_pc(struct pt_regs *regs) diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 39ae8511a13..e3d49c553af 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -56,7 +56,7 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id) /* calibrate_cpu is used on systems with fixed rate TSCs to determine * processor frequency */ #define TICK_COUNT 100000000 -unsigned long __init native_calculate_cpu_khz(void) +unsigned long __init calibrate_cpu(void) { int tsc_start, tsc_now; int i, no_ctr_free; @@ -116,25 +116,11 @@ void __init hpet_time_init(void) void __init time_init(void) { - tsc_calibrate(); - - cpu_khz = tsc_khz; - if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && - (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) - cpu_khz = calculate_cpu_khz(); - - lpj_fine = ((unsigned long)tsc_khz * 1000)/HZ; - - if (unsynchronized_tsc()) - mark_tsc_unstable("TSCs unsynchronized"); - + tsc_init(); if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) vgetcpu_mode = VGETCPU_RDTSCP; else vgetcpu_mode = VGETCPU_LSL; - printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - init_tsc_clocksource(); late_time_init = choose_time_init(); } diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index 9bb2363851a..fec1ecedc9b 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -238,6 +238,6 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); + on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index 5039d0f097a..dcbf7a1159e 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -275,5 +275,5 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); + on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index d7cc292691f..8a768973c4f 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1,5 +1,6 @@ /* * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 @@ -60,8 +61,6 @@ #include "mach_traps.h" -int panic_on_unrecovered_nmi; - DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); @@ -98,19 +97,22 @@ asmlinkage void alignment_check(void); asmlinkage void spurious_interrupt_bug(void); asmlinkage void machine_check(void); +int panic_on_unrecovered_nmi; int kstack_depth_to_print = 24; static unsigned int code_bytes = 64; +static int ignore_nmis; +static int die_counter; void printk_address(unsigned long address, int reliable) { #ifdef CONFIG_KALLSYMS - char namebuf[KSYM_NAME_LEN]; unsigned long offset = 0; unsigned long symsize; const char *symname; - char reliab[4] = ""; - char *delim = ":"; char *modname; + char *delim = ":"; + char namebuf[KSYM_NAME_LEN]; + char reliab[4] = ""; symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); @@ -130,22 +132,23 @@ void printk_address(unsigned long address, int reliable) #endif } -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size) +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size) { - return p > (void *)tinfo && - p <= (void *)tinfo + THREAD_SIZE - size; + void *t = tinfo; + return p > t && p <= t + THREAD_SIZE - size; } /* The form of the top of the frame on the stack */ struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; + struct stack_frame *next_frame; + unsigned long return_address; }; static inline unsigned long print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data) { struct stack_frame *frame = (struct stack_frame *)bp; @@ -167,8 +170,6 @@ print_context_stack(struct thread_info *tinfo, return bp; } -#define MSG(msg) ops->warning(data, msg) - void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) @@ -178,7 +179,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!stack) { unsigned long dummy; - stack = &dummy; if (task != current) stack = (unsigned long *)task->thread.sp; @@ -196,7 +196,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, } #endif - while (1) { + for (;;) { struct thread_info *context; context = (struct thread_info *) @@ -248,10 +248,10 @@ static void print_trace_address(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, }; static void @@ -351,15 +351,14 @@ void show_registers(struct pt_regs *regs) printk(KERN_EMERG "Code: "); ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { /* try starting at EIP */ ip = (u8 *)regs->ip; code_len = code_len - code_prologue + 1; } for (i = 0; i < code_len; i++, ip++) { if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { + probe_kernel_address(ip, c)) { printk(" Bad EIP value."); break; } @@ -384,8 +383,6 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } -static int die_counter; - int __kprobes __die(const char *str, struct pt_regs *regs, long err) { unsigned short ss; @@ -402,26 +399,22 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) { - - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss & 0xffff; - } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; - return 0; + show_registers(regs); + /* Executive summary in case the oops scrolled away */ + sp = (unsigned long) (®s->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; } - - return 1; + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); + return 0; } /* @@ -546,7 +539,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ { \ trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + == NOTIFY_STOP) \ return; \ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -562,7 +555,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + == NOTIFY_STOP) \ return; \ do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ } @@ -571,7 +564,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ void do_##name(struct pt_regs *regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + == NOTIFY_STOP) \ return; \ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ } @@ -586,27 +579,29 @@ void do_##name(struct pt_regs *regs, long error_code) \ info.si_addr = (void __user *)siaddr; \ trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + == NOTIFY_STOP) \ return; \ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ } -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) #ifndef CONFIG_KPROBES DO_VM86_ERROR(3, SIGTRAP, "int3", int3) #endif DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) -void __kprobes do_general_protection(struct pt_regs *regs, long error_code) +void __kprobes +do_general_protection(struct pt_regs *regs, long error_code) { + struct task_struct *tsk; struct thread_struct *thread; struct tss_struct *tss; int cpu; @@ -647,23 +642,24 @@ void __kprobes do_general_protection(struct pt_regs *regs, long error_code) if (regs->flags & X86_VM_MASK) goto gp_in_vm86; + tsk = current; if (!user_mode(regs)) goto gp_in_kernel; - current->thread.error_code = error_code; - current->thread.trap_no = 13; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; - if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && - printk_ratelimit()) { + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", - current->comm, task_pid_nr(current), - regs->ip, regs->sp, error_code); + "%s[%d] general protection ip:%lx sp:%lx error:%lx", + tsk->comm, task_pid_nr(tsk), + regs->ip, regs->sp, error_code); print_vma_addr(" in ", regs->ip); printk("\n"); } - force_sig(SIGSEGV, current); + force_sig(SIGSEGV, tsk); return; gp_in_vm86: @@ -672,14 +668,15 @@ gp_in_vm86: return; gp_in_kernel: - if (!fixup_exception(regs)) { - current->thread.error_code = error_code; - current->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, + if (fixup_exception(regs)) + return; + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); - } + return; + die("general protection fault", regs, error_code); } static notrace __kprobes void @@ -792,14 +789,17 @@ void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; + int cpu; + + cpu = smp_processor_id(); - /* Only the BSP gets external NMIs from the system: */ - if (!smp_processor_id()) + /* Only the BSP gets external NMIs from the system. */ + if (!cpu) reason = get_nmi_reason(); if (!(reason & 0xc0)) { if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) + == NOTIFY_STOP) return; #ifdef CONFIG_X86_LOCAL_APIC /* @@ -808,7 +808,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) */ if (nmi_watchdog_tick(regs, reason)) return; - if (!do_nmi_callback(regs, smp_processor_id())) + if (!do_nmi_callback(regs, cpu)) unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); @@ -818,6 +818,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; + + /* AK: following checks seem to be broken on modern chipsets. FIXME */ if (reason & 0x80) mem_parity_error(reason, regs); if (reason & 0x40) @@ -829,8 +831,6 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) reassert_nmi(); } -static int ignore_nmis; - notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { int cpu; @@ -915,7 +915,7 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) tsk->thread.debugctlmsr = 0; if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) + SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ if (regs->flags & X86_EFLAGS_IF) @@ -976,9 +976,8 @@ clear_TF_reenable: void math_error(void __user *ip) { struct task_struct *task; - unsigned short cwd; - unsigned short swd; siginfo_t info; + unsigned short cwd, swd; /* * Save the info for the exception handler and clear the error. @@ -997,7 +996,7 @@ void math_error(void __user *ip) * C1 reg you need in case of a stack fault, 0x040 is the stack * fault bit. We should only be taking one exception at a time, * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't syncronizing its FPU usage + * then we have a bad program that isn't synchronizing its FPU usage * and it will suffer the consequences since we won't be able to * fully reproduce the context of the exception */ @@ -1006,7 +1005,7 @@ void math_error(void __user *ip) switch (swd & ~cwd & 0x3f) { case 0x000: /* No unmasked exception */ return; - default: /* Multiple exceptions */ + default: /* Multiple exceptions */ break; case 0x001: /* Invalid Op */ /* @@ -1042,8 +1041,8 @@ void do_coprocessor_error(struct pt_regs *regs, long error_code) static void simd_math_error(void __user *ip) { struct task_struct *task; - unsigned short mxcsr; siginfo_t info; + unsigned short mxcsr; /* * Save the info for the exception handler and clear the error. @@ -1198,16 +1197,16 @@ void __init trap_init(void) early_iounmap(p, 4); #endif - set_trap_gate(0, ÷_error); - set_intr_gate(1, &debug); - set_intr_gate(2, &nmi); - set_system_intr_gate(3, &int3); /* int3/4 can be called from all */ - set_system_gate(4, &overflow); - set_trap_gate(5, &bounds); - set_trap_gate(6, &invalid_op); - set_trap_gate(7, &device_not_available); - set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); - set_trap_gate(9, &coprocessor_segment_overrun); + set_trap_gate(0, ÷_error); + set_intr_gate(1, &debug); + set_intr_gate(2, &nmi); + set_system_intr_gate(3, &int3); /* int3 can be called from all */ + set_system_gate(4, &overflow); /* int4 can be called from all */ + set_trap_gate(5, &bounds); + set_trap_gate(6, &invalid_op); + set_trap_gate(7, &device_not_available); + set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); + set_trap_gate(9, &coprocessor_segment_overrun); set_trap_gate(10, &invalid_TSS); set_trap_gate(11, &segment_not_present); set_trap_gate(12, &stack_segment); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 80ba6d37bfe..2696a683778 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -10,49 +10,49 @@ * 'Traps.c' handles hardware traps and faults after we have saved some * state in 'entry.S'. */ -#include <linux/sched.h> +#include <linux/moduleparam.h> +#include <linux/interrupt.h> +#include <linux/kallsyms.h> +#include <linux/spinlock.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/utsname.h> +#include <linux/kdebug.h> #include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ptrace.h> #include <linux/string.h> +#include <linux/unwind.h> +#include <linux/delay.h> #include <linux/errno.h> -#include <linux/ptrace.h> +#include <linux/kexec.h> +#include <linux/sched.h> #include <linux/timer.h> -#include <linux/mm.h> #include <linux/init.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/interrupt.h> -#include <linux/kallsyms.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/nmi.h> -#include <linux/kprobes.h> -#include <linux/kexec.h> -#include <linux/unwind.h> -#include <linux/uaccess.h> #include <linux/bug.h> -#include <linux/kdebug.h> -#include <linux/utsname.h> - -#include <mach_traps.h> +#include <linux/nmi.h> +#include <linux/mm.h> #if defined(CONFIG_EDAC) #include <linux/edac.h> #endif -#include <asm/system.h> -#include <asm/io.h> -#include <asm/atomic.h> +#include <asm/stacktrace.h> +#include <asm/processor.h> #include <asm/debugreg.h> +#include <asm/atomic.h> +#include <asm/system.h> +#include <asm/unwind.h> #include <asm/desc.h> #include <asm/i387.h> -#include <asm/processor.h> -#include <asm/unwind.h> +#include <asm/nmi.h> #include <asm/smp.h> +#include <asm/io.h> #include <asm/pgalloc.h> -#include <asm/pda.h> #include <asm/proto.h> -#include <asm/nmi.h> -#include <asm/stacktrace.h> +#include <asm/pda.h> + +#include <mach_traps.h> asmlinkage void divide_error(void); asmlinkage void debug(void); @@ -72,12 +72,14 @@ asmlinkage void page_fault(void); asmlinkage void coprocessor_error(void); asmlinkage void simd_coprocessor_error(void); asmlinkage void alignment_check(void); -asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); +asmlinkage void machine_check(void); int panic_on_unrecovered_nmi; +int kstack_depth_to_print = 12; static unsigned int code_bytes = 64; -static unsigned ignore_nmis; +static int ignore_nmis; +static int die_counter; static inline void conditional_sti(struct pt_regs *regs) { @@ -101,34 +103,9 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) dec_preempt_count(); } -int kstack_depth_to_print = 12; - void printk_address(unsigned long address, int reliable) { -#ifdef CONFIG_KALLSYMS - unsigned long offset = 0, symsize; - const char *symname; - char *modname; - char *delim = ":"; - char namebuf[KSYM_NAME_LEN]; - char reliab[4] = ""; - - symname = kallsyms_lookup(address, &symsize, &offset, - &modname, namebuf); - if (!symname) { - printk(" [<%016lx>]\n", address); - return; - } - if (!reliable) - strcpy(reliab, "? "); - - if (!modname) - modname = delim = ""; - printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n", - address, reliab, delim, modname, delim, symname, offset, symsize); -#else - printk(" [<%016lx>]\n", address); -#endif + printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address); } static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, @@ -205,8 +182,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, return NULL; } -#define MSG(txt) ops->warning(data, txt) - /* * x86-64 can have up to three kernel stacks: * process stack @@ -233,11 +208,11 @@ struct stack_frame { unsigned long return_address; }; - -static inline unsigned long print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) +static inline unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end) { struct stack_frame *frame = (struct stack_frame *)bp; @@ -259,7 +234,7 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo, return bp; } -void dump_trace(struct task_struct *tsk, struct pt_regs *regs, +void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { @@ -268,36 +243,34 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned used = 0; struct thread_info *tinfo; - if (!tsk) - tsk = current; - tinfo = task_thread_info(tsk); + if (!task) + task = current; if (!stack) { unsigned long dummy; stack = &dummy; - if (tsk && tsk != current) - stack = (unsigned long *)tsk->thread.sp; + if (task && task != current) + stack = (unsigned long *)task->thread.sp; } #ifdef CONFIG_FRAME_POINTER if (!bp) { - if (tsk == current) { + if (task == current) { /* Grab bp right from our regs */ - asm("movq %%rbp, %0" : "=r" (bp):); + asm("movq %%rbp, %0" : "=r" (bp) :); } else { /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) tsk->thread.sp; + bp = *(unsigned long *) task->thread.sp; } } #endif - - /* * Print function call entries in all stacks, starting at the * current stack address. If the stacks consist of nested * exceptions */ + tinfo = task_thread_info(task); for (;;) { char *id; unsigned long *estack_end; @@ -382,18 +355,17 @@ static const struct stacktrace_ops print_trace_ops = { .address = print_trace_address, }; -void -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, - unsigned long bp) +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) { printk("\nCall Trace:\n"); - dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL); + dump_trace(task, regs, stack, bp, &print_trace_ops, NULL); printk("\n"); } static void -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, - unsigned long bp) +_show_stack(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp) { unsigned long *stack; int i; @@ -405,14 +377,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, // back trace for this cpu. if (sp == NULL) { - if (tsk) - sp = (unsigned long *)tsk->thread.sp; + if (task) + sp = (unsigned long *)task->thread.sp; else sp = (unsigned long *)&sp; } stack = sp; - for(i=0; i < kstack_depth_to_print; i++) { + for (i = 0; i < kstack_depth_to_print; i++) { if (stack >= irqstack && stack <= irqstack_end) { if (stack == irqstack_end) { stack = (unsigned long *) (irqstack_end[-1]); @@ -427,12 +399,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, printk(" %016lx", *stack++); touch_nmi_watchdog(); } - show_trace(tsk, regs, sp, bp); + show_trace(task, regs, sp, bp); } -void show_stack(struct task_struct *tsk, unsigned long * sp) +void show_stack(struct task_struct *task, unsigned long *sp) { - _show_stack(tsk, NULL, sp, 0); + _show_stack(task, NULL, sp, 0); } /* @@ -440,8 +412,8 @@ void show_stack(struct task_struct *tsk, unsigned long * sp) */ void dump_stack(void) { - unsigned long dummy; unsigned long bp = 0; + unsigned long stack; #ifdef CONFIG_FRAME_POINTER if (!bp) @@ -453,7 +425,7 @@ void dump_stack(void) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(NULL, NULL, &dummy, bp); + show_trace(NULL, NULL, &stack, bp); } EXPORT_SYMBOL(dump_stack); @@ -464,12 +436,8 @@ void show_registers(struct pt_regs *regs) unsigned long sp; const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; - u8 *ip; - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; sp = regs->sp; - ip = (u8 *) regs->ip - code_prologue; printk("CPU %d ", cpu); __show_regs(regs); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", @@ -480,15 +448,21 @@ void show_registers(struct pt_regs *regs) * time of the fault.. */ if (!user_mode(regs)) { + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; unsigned char c; + u8 *ip; + printk("Stack: "); _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); printk("\n"); printk(KERN_EMERG "Code: "); + + ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { /* try starting at RIP */ - ip = (u8 *) regs->ip; + ip = (u8 *)regs->ip; code_len = code_len - code_prologue + 1; } for (i = 0; i < code_len; i++, ip++) { @@ -504,7 +478,7 @@ void show_registers(struct pt_regs *regs) } } printk("\n"); -} +} int is_valid_bugaddr(unsigned long ip) { @@ -562,10 +536,9 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) do_exit(signr); } -int __kprobes __die(const char * str, struct pt_regs * regs, long err) +int __kprobes __die(const char *str, struct pt_regs *regs, long err) { - static int die_counter; - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); #endif @@ -576,8 +549,10 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) return 1; + show_registers(regs); add_taint(TAINT_DIE); /* Executive summary in case the oops scrolled away */ @@ -589,7 +564,7 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err) return 0; } -void die(const char * str, struct pt_regs * regs, long err) +void die(const char *str, struct pt_regs *regs, long err) { unsigned long flags = oops_begin(); @@ -606,8 +581,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) { unsigned long flags; - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == - NOTIFY_STOP) + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) return; flags = oops_begin(); @@ -629,44 +603,44 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) do_exit(SIGBUS); } -static void __kprobes do_trap(int trapnr, int signr, char *str, - struct pt_regs * regs, long error_code, - siginfo_t *info) +static void __kprobes +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, + long error_code, siginfo_t *info) { struct task_struct *tsk = current; - if (user_mode(regs)) { - /* - * We want error_code and trap_no set for userspace - * faults and kernelspace faults which result in - * die(), but not kernelspace faults which are fixed - * up. die() gives the process no chance to handle - * the signal and notice the kernel fault information, - * so that won't result in polluting the information - * about previously queued, but not yet delivered, - * faults. See also do_general_protection below. - */ - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - - if (show_unhandled_signals && unhandled_signal(tsk, signr) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } + if (!user_mode(regs)) + goto kernel_trap; - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; + /* + * We want error_code and trap_no set for userspace faults and + * kernelspace faults which result in die(), but not + * kernelspace faults which are fixed up. die() gives the + * process no chance to handle the signal and notice the + * kernel fault information, so that won't result in polluting + * the information about previously queued, but not yet + * delivered, faults. See also do_general_protection below. + */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); } + if (info) + force_sig_info(signr, info, tsk); + else + force_sig(signr, tsk); + return; +kernel_trap: if (!fixup_exception(regs)) { tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; @@ -676,38 +650,38 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, } #define DO_ERROR(trapnr, signr, str, name) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ + do_trap(trapnr, signr, str, regs, error_code, NULL); \ } -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + trace_hardirqs_fixup(); \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ + do_trap(trapnr, signr, str, regs, error_code, &info); \ } -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_ERROR( 4, SIGSEGV, "overflow", overflow) -DO_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_ERROR(4, SIGSEGV, "overflow", overflow) +DO_ERROR(5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) /* Runs on IST stack */ @@ -738,31 +712,34 @@ asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) die(str, regs, error_code); } -asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, - long error_code) +asmlinkage void __kprobes +do_general_protection(struct pt_regs *regs, long error_code) { - struct task_struct *tsk = current; + struct task_struct *tsk; conditional_sti(regs); - if (user_mode(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } + tsk = current; + if (!user_mode(regs)) + goto gp_in_kernel; - force_sig(SIGSEGV, tsk); - return; - } + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] general protection ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + + force_sig(SIGSEGV, tsk); + return; + +gp_in_kernel: if (fixup_exception(regs)) return; @@ -775,14 +752,14 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, } static notrace __kprobes void -mem_parity_error(unsigned char reason, struct pt_regs * regs) +mem_parity_error(unsigned char reason, struct pt_regs *regs) { printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", reason); printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); #if defined(CONFIG_EDAC) - if(edac_handler_set()) { + if (edac_handler_set()) { edac_atomic_assert_error(); return; } @@ -799,7 +776,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) } static notrace __kprobes void -io_check_error(unsigned char reason, struct pt_regs * regs) +io_check_error(unsigned char reason, struct pt_regs *regs) { printk("NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); @@ -829,14 +806,14 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs) /* Runs on IST stack. This code must keep interrupts off all the time. Nested NMIs are prevented by the CPU. */ -asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) +asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int cpu; cpu = smp_processor_id(); - /* Only the BSP gets external NMIs from the system. */ + /* Only the BSP gets external NMIs from the system. */ if (!cpu) reason = get_nmi_reason(); @@ -848,18 +825,17 @@ asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. */ - if (nmi_watchdog_tick(regs,reason)) + if (nmi_watchdog_tick(regs, reason)) return; - if (!do_nmi_callback(regs,cpu)) + if (!do_nmi_callback(regs, cpu)) unknown_nmi_error(reason, regs); return; } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; + return; /* AK: following checks seem to be broken on modern chipsets. FIXME */ - if (reason & 0x80) mem_parity_error(reason, regs); if (reason & 0x40) @@ -870,9 +846,12 @@ asmlinkage notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { nmi_enter(); + add_pda(__nmi_count, 1); + if (!ignore_nmis) default_do_nmi(regs); + nmi_exit(); } @@ -889,13 +868,14 @@ void restart_nmi(void) } /* runs on IST stack. */ -asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) +asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { trace_hardirqs_fixup(); - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) return; - } + preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); @@ -926,8 +906,8 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) asmlinkage void __kprobes do_debug(struct pt_regs * regs, unsigned long error_code) { - unsigned long condition; struct task_struct *tsk = current; + unsigned long condition; siginfo_t info; trace_hardirqs_fixup(); @@ -948,21 +928,19 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) { + if (!tsk->thread.debugreg7) goto clear_dr7; - } } tsk->thread.debugreg6 = condition; - /* * Single-stepping through TF: make sure we ignore any events in * kernel space (but re-enable TF when returning to user mode). */ if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; + if (!user_mode(regs)) + goto clear_TF_reenable; } /* Ok, finally something we can handle */ @@ -975,7 +953,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, force_sig_info(SIGTRAP, &info, tsk); clear_dr7: - set_debugreg(0UL, 7); + set_debugreg(0, 7); preempt_conditional_cli(regs); return; @@ -983,6 +961,7 @@ clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; preempt_conditional_cli(regs); + return; } static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) @@ -1005,7 +984,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) asmlinkage void do_coprocessor_error(struct pt_regs *regs) { void __user *ip = (void __user *)(regs->ip); - struct task_struct * task; + struct task_struct *task; siginfo_t info; unsigned short cwd, swd; @@ -1038,30 +1017,30 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); switch (swd & ~cwd & 0x3f) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; + case 0x000: /* No unmasked exception */ + default: /* Multiple exceptions */ + break; + case 0x001: /* Invalid Op */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; } force_sig_info(SIGFPE, &info, task); } @@ -1074,7 +1053,7 @@ asmlinkage void bad_intr(void) asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) { void __user *ip = (void __user *)(regs->ip); - struct task_struct * task; + struct task_struct *task; siginfo_t info; unsigned short mxcsr; @@ -1102,25 +1081,25 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) */ mxcsr = get_fpu_mxcsr(task); switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; } force_sig_info(SIGFPE, &info, task); } @@ -1138,7 +1117,7 @@ asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) } /* - * 'math_state_restore()' saves the current math information in the + * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task * * Careful.. There are problems with IBM-designed IRQ13 behaviour. @@ -1163,7 +1142,7 @@ asmlinkage void math_state_restore(void) local_irq_disable(); } - clts(); /* Allow maths ops (or we recurse) */ + clts(); /* Allow maths ops (or we recurse) */ restore_fpu_checking(&me->thread.xstate->fxsave); task_thread_info(me)->status |= TS_USEDFPU; me->fpu_counter++; @@ -1172,64 +1151,61 @@ EXPORT_SYMBOL_GPL(math_state_restore); void __init trap_init(void) { - set_intr_gate(0,÷_error); - set_intr_gate_ist(1,&debug,DEBUG_STACK); - set_intr_gate_ist(2,&nmi,NMI_STACK); - set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */ - set_system_gate(4,&overflow); /* int4 can be called from all */ - set_intr_gate(5,&bounds); - set_intr_gate(6,&invalid_op); - set_intr_gate(7,&device_not_available); - set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK); - set_intr_gate(9,&coprocessor_segment_overrun); - set_intr_gate(10,&invalid_TSS); - set_intr_gate(11,&segment_not_present); - set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK); - set_intr_gate(13,&general_protection); - set_intr_gate(14,&page_fault); - set_intr_gate(15,&spurious_interrupt_bug); - set_intr_gate(16,&coprocessor_error); - set_intr_gate(17,&alignment_check); + set_intr_gate(0, ÷_error); + set_intr_gate_ist(1, &debug, DEBUG_STACK); + set_intr_gate_ist(2, &nmi, NMI_STACK); + set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */ + set_system_gate(4, &overflow); /* int4 can be called from all */ + set_intr_gate(5, &bounds); + set_intr_gate(6, &invalid_op); + set_intr_gate(7, &device_not_available); + set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); + set_intr_gate(9, &coprocessor_segment_overrun); + set_intr_gate(10, &invalid_TSS); + set_intr_gate(11, &segment_not_present); + set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); + set_intr_gate(13, &general_protection); + set_intr_gate(14, &page_fault); + set_intr_gate(15, &spurious_interrupt_bug); + set_intr_gate(16, &coprocessor_error); + set_intr_gate(17, &alignment_check); #ifdef CONFIG_X86_MCE - set_intr_gate_ist(18,&machine_check, MCE_STACK); + set_intr_gate_ist(18, &machine_check, MCE_STACK); #endif - set_intr_gate(19,&simd_coprocessor_error); + set_intr_gate(19, &simd_coprocessor_error); #ifdef CONFIG_IA32_EMULATION set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); #endif - /* * initialize the per thread extended state: */ - init_thread_xstate(); + init_thread_xstate(); /* - * Should be a barrier for any external CPU state. + * Should be a barrier for any external CPU state: */ cpu_init(); } - static int __init oops_setup(char *s) -{ +{ if (!s) return -EINVAL; if (!strcmp(s, "panic")) panic_on_oops = 1; return 0; -} +} early_param("oops", oops_setup); static int __init kstack_setup(char *s) { if (!s) return -EINVAL; - kstack_depth_to_print = simple_strtoul(s,NULL,0); + kstack_depth_to_print = simple_strtoul(s, NULL, 0); return 0; } early_param("kstack", kstack_setup); - static int __init code_bytes_setup(char *s) { code_bytes = simple_strtoul(s, NULL, 0); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c new file mode 100644 index 00000000000..7603c055390 --- /dev/null +++ b/arch/x86/kernel/tsc.c @@ -0,0 +1,535 @@ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/timer.h> +#include <linux/acpi_pmtmr.h> +#include <linux/cpufreq.h> +#include <linux/dmi.h> +#include <linux/delay.h> +#include <linux/clocksource.h> +#include <linux/percpu.h> + +#include <asm/hpet.h> +#include <asm/timer.h> +#include <asm/vgtod.h> +#include <asm/time.h> +#include <asm/delay.h> + +unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +EXPORT_SYMBOL(cpu_khz); +unsigned int tsc_khz; +EXPORT_SYMBOL(tsc_khz); + +/* + * TSC can be unstable due to cpufreq or due to unsynced TSCs + */ +static int tsc_unstable; + +/* native_sched_clock() is called before tsc_init(), so + we must start with the TSC soft disabled to prevent + erroneous rdtsc usage on !cpu_has_tsc processors */ +static int tsc_disabled = -1; + +/* + * Scheduler clock - returns current time in nanosec units. + */ +u64 native_sched_clock(void) +{ + u64 this_offset; + + /* + * Fall back to jiffies if there's no TSC available: + * ( But note that we still use it if the TSC is marked + * unstable. We do this because unlike Time Of Day, + * the scheduler clock tolerates small errors and it's + * very important for it to be as fast as the platform + * can achive it. ) + */ + if (unlikely(tsc_disabled)) { + /* No locking but a rare wrong value is not a big deal: */ + return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); + } + + /* read the Time Stamp Counter: */ + rdtscll(this_offset); + + /* return the value in ns */ + return cycles_2_ns(this_offset); +} + +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#else +unsigned long long +sched_clock(void) __attribute__((alias("native_sched_clock"))); +#endif + +int check_tsc_unstable(void) +{ + return tsc_unstable; +} +EXPORT_SYMBOL_GPL(check_tsc_unstable); + +#ifdef CONFIG_X86_TSC +int __init notsc_setup(char *str) +{ + printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " + "cannot disable TSC completely.\n"); + tsc_disabled = 1; + return 1; +} +#else +/* + * disable flag for tsc. Takes effect by clearing the TSC cpu flag + * in cpu/common.c + */ +int __init notsc_setup(char *str) +{ + setup_clear_cpu_cap(X86_FEATURE_TSC); + return 1; +} +#endif + +__setup("notsc", notsc_setup); + +#define MAX_RETRIES 5 +#define SMI_TRESHOLD 50000 + +/* + * Read TSC and the reference counters. Take care of SMI disturbance + */ +static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) +{ + u64 t1, t2; + int i; + + for (i = 0; i < MAX_RETRIES; i++) { + t1 = get_cycles(); + if (hpet) + *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; + else + *pm = acpi_pm_read_early(); + t2 = get_cycles(); + if ((t2 - t1) < SMI_TRESHOLD) + return t2; + } + return ULLONG_MAX; +} + +/** + * native_calibrate_tsc - calibrate the tsc on boot + */ +unsigned long native_calibrate_tsc(void) +{ + unsigned long flags; + u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2; + int hpet = is_hpet_enabled(); + unsigned int tsc_khz_val = 0; + + local_irq_save(flags); + + tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); + + outb((inb(0x61) & ~0x02) | 0x01, 0x61); + + outb(0xb0, 0x43); + outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); + outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); + tr1 = get_cycles(); + while ((inb(0x61) & 0x20) == 0); + tr2 = get_cycles(); + + tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); + + local_irq_restore(flags); + + /* + * Preset the result with the raw and inaccurate PIT + * calibration value + */ + delta = (tr2 - tr1); + do_div(delta, 50); + tsc_khz_val = delta; + + /* hpet or pmtimer available ? */ + if (!hpet && !pm1 && !pm2) { + printk(KERN_INFO "TSC calibrated against PIT\n"); + goto out; + } + + /* Check, whether the sampling was disturbed by an SMI */ + if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) { + printk(KERN_WARNING "TSC calibration disturbed by SMI, " + "using PIT calibration result\n"); + goto out; + } + + tsc2 = (tsc2 - tsc1) * 1000000LL; + + if (hpet) { + printk(KERN_INFO "TSC calibrated against HPET\n"); + if (hpet2 < hpet1) + hpet2 += 0x100000000ULL; + hpet2 -= hpet1; + tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); + do_div(tsc1, 1000000); + } else { + printk(KERN_INFO "TSC calibrated against PM_TIMER\n"); + if (pm2 < pm1) + pm2 += (u64)ACPI_PM_OVRRUN; + pm2 -= pm1; + tsc1 = pm2 * 1000000000LL; + do_div(tsc1, PMTMR_TICKS_PER_SEC); + } + + do_div(tsc2, tsc1); + tsc_khz_val = tsc2; + +out: + return tsc_khz_val; +} + + +#ifdef CONFIG_X86_32 +/* Only called from the Powernow K7 cpu freq driver */ +int recalibrate_cpu_khz(void) +{ +#ifndef CONFIG_SMP + unsigned long cpu_khz_old = cpu_khz; + + if (cpu_has_tsc) { + tsc_khz = calibrate_tsc(); + cpu_khz = tsc_khz; + cpu_data(0).loops_per_jiffy = + cpufreq_scale(cpu_data(0).loops_per_jiffy, + cpu_khz_old, cpu_khz); + return 0; + } else + return -ENODEV; +#else + return -ENODEV; +#endif +} + +EXPORT_SYMBOL(recalibrate_cpu_khz); + +#endif /* CONFIG_X86_32 */ + +/* Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ + +DEFINE_PER_CPU(unsigned long, cyc2ns); + +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +{ + unsigned long long tsc_now, ns_now; + unsigned long flags, *scale; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + scale = &per_cpu(cyc2ns, cpu); + + rdtscll(tsc_now); + ns_now = __cycles_2_ns(tsc_now); + + if (cpu_khz) + *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); +} + +#ifdef CONFIG_CPU_FREQ + +/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency + * changes. + * + * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's + * not that important because current Opteron setups do not support + * scaling on SMP anyroads. + * + * Should fix up last_tsc too. Currently gettimeofday in the + * first tick after the change will be slightly wrong. + */ + +static unsigned int ref_freq; +static unsigned long loops_per_jiffy_ref; +static unsigned long tsc_khz_ref; + +static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + unsigned long *lpj, dummy; + + if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) + return 0; + + lpj = &dummy; + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) +#ifdef CONFIG_SMP + lpj = &cpu_data(freq->cpu).loops_per_jiffy; +#else + lpj = &boot_cpu_data.loops_per_jiffy; +#endif + + if (!ref_freq) { + ref_freq = freq->old; + loops_per_jiffy_ref = *lpj; + tsc_khz_ref = tsc_khz; + } + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + + tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + mark_tsc_unstable("cpufreq changes"); + } + + set_cyc2ns_scale(tsc_khz_ref, freq->cpu); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ + cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + return 0; +} + +core_initcall(cpufreq_tsc); + +#endif /* CONFIG_CPU_FREQ */ + +/* clocksource code */ + +static struct clocksource clocksource_tsc; + +/* + * We compare the TSC to the cycle_last value in the clocksource + * structure to avoid a nasty time-warp. This can be observed in a + * very small window right after one CPU updated cycle_last under + * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which + * is smaller than the cycle_last reference value due to a TSC which + * is slighty behind. This delta is nowhere else observable, but in + * that case it results in a forward time jump in the range of hours + * due to the unsigned delta calculation of the time keeping core + * code, which is necessary to support wrapping clocksources like pm + * timer. + */ +static cycle_t read_tsc(void) +{ + cycle_t ret = (cycle_t)get_cycles(); + + return ret >= clocksource_tsc.cycle_last ? + ret : clocksource_tsc.cycle_last; +} + +#ifdef CONFIG_X86_64 +static cycle_t __vsyscall_fn vread_tsc(void) +{ + cycle_t ret = (cycle_t)vget_cycles(); + + return ret >= __vsyscall_gtod_data.clock.cycle_last ? + ret : __vsyscall_gtod_data.clock.cycle_last; +} +#endif + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .shift = 22, + .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_MUST_VERIFY, +#ifdef CONFIG_X86_64 + .vread = vread_tsc, +#endif +}; + +void mark_tsc_unstable(char *reason) +{ + if (!tsc_unstable) { + tsc_unstable = 1; + printk("Marking TSC unstable due to %s\n", reason); + /* Change only the rating, when not registered */ + if (clocksource_tsc.mult) + clocksource_change_rating(&clocksource_tsc, 0); + else + clocksource_tsc.rating = 0; + } +} + +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d) +{ + printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", + d->ident); + tsc_unstable = 1; + return 0; +} + +/* List of systems that have known TSC problems */ +static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { + { + .callback = dmi_mark_tsc_unstable, + .ident = "IBM Thinkpad 380XD", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), + }, + }, + {} +}; + +/* + * Geode_LX - the OLPC CPU has a possibly a very reliable TSC + */ +#ifdef CONFIG_MGEODE_LX +/* RTSC counts during suspend */ +#define RTSC_SUSP 0x100 + +static void __init check_geode_tsc_reliable(void) +{ + unsigned long res_low, res_high; + + rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); + if (res_low & RTSC_SUSP) + clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; +} +#else +static inline void check_geode_tsc_reliable(void) { } +#endif + +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +__cpuinit int unsynchronized_tsc(void) +{ + if (!cpu_has_tsc || tsc_unstable) + return 1; + +#ifdef CONFIG_SMP + if (apic_is_clustered_box()) + return 1; +#endif + + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return 0; + /* + * Intel systems are normally all synchronized. + * Exceptions must mark TSC as unstable: + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + /* assume multi socket systems are not synchronized: */ + if (num_possible_cpus() > 1) + tsc_unstable = 1; + } + + return tsc_unstable; +} + +static void __init init_tsc_clocksource(void) +{ + clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, + clocksource_tsc.shift); + /* lower the rating if we already know its unstable: */ + if (check_tsc_unstable()) { + clocksource_tsc.rating = 0; + clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; + } + clocksource_register(&clocksource_tsc); +} + +void __init tsc_init(void) +{ + u64 lpj; + int cpu; + + if (!cpu_has_tsc) + return; + + tsc_khz = calibrate_tsc(); + cpu_khz = tsc_khz; + + if (!tsc_khz) { + mark_tsc_unstable("could not calculate TSC khz"); + return; + } + +#ifdef CONFIG_X86_64 + if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) + cpu_khz = calibrate_cpu(); +#endif + + lpj = ((u64)tsc_khz * 1000); + do_div(lpj, HZ); + lpj_fine = lpj; + + printk("Detected %lu.%03lu MHz processor.\n", + (unsigned long)cpu_khz / 1000, + (unsigned long)cpu_khz % 1000); + + /* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. (cpufreq notifiers will fix this + * up if their speed diverges) + */ + for_each_possible_cpu(cpu) + set_cyc2ns_scale(cpu_khz, cpu); + + if (tsc_disabled > 0) + return; + + /* now allow native_sched_clock() to use rdtsc */ + tsc_disabled = 0; + + use_tsc_delay(); + /* Check and install the TSC clocksource */ + dmi_check_system(bad_tsc_dmi_table); + + if (unsynchronized_tsc()) + mark_tsc_unstable("TSCs unsynchronized"); + + check_geode_tsc_reliable(); + init_tsc_clocksource(); +} + diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c deleted file mode 100644 index 6240922e497..00000000000 --- a/arch/x86/kernel/tsc_32.c +++ /dev/null @@ -1,455 +0,0 @@ -#include <linux/sched.h> -#include <linux/clocksource.h> -#include <linux/workqueue.h> -#include <linux/delay.h> -#include <linux/cpufreq.h> -#include <linux/jiffies.h> -#include <linux/init.h> -#include <linux/dmi.h> -#include <linux/percpu.h> - -#include <asm/delay.h> -#include <asm/tsc.h> -#include <asm/io.h> -#include <asm/timer.h> - -#include "mach_timer.h" - -/* native_sched_clock() is called before tsc_init(), so - we must start with the TSC soft disabled to prevent - erroneous rdtsc usage on !cpu_has_tsc processors */ -static int tsc_disabled = -1; - -/* - * On some systems the TSC frequency does not - * change with the cpu frequency. So we need - * an extra value to store the TSC freq - */ -unsigned int tsc_khz; -EXPORT_SYMBOL_GPL(tsc_khz); - -#ifdef CONFIG_X86_TSC -static int __init tsc_setup(char *str) -{ - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC completely.\n"); - tsc_disabled = 1; - return 1; -} -#else -/* - * disable flag for tsc. Takes effect by clearing the TSC cpu flag - * in cpu/common.c - */ -static int __init tsc_setup(char *str) -{ - setup_clear_cpu_cap(X86_FEATURE_TSC); - return 1; -} -#endif - -__setup("notsc", tsc_setup); - -/* - * code to mark and check if the TSC is unstable - * due to cpufreq or due to unsynced TSCs - */ -static int tsc_unstable; - -int check_tsc_unstable(void) -{ - return tsc_unstable; -} -EXPORT_SYMBOL_GPL(check_tsc_unstable); - -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ - -DEFINE_PER_CPU(unsigned long, cyc2ns); - -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) -{ - unsigned long long tsc_now, ns_now; - unsigned long flags, *scale; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - scale = &per_cpu(cyc2ns, cpu); - - rdtscll(tsc_now); - ns_now = __cycles_2_ns(tsc_now); - - if (cpu_khz) - *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; - - /* - * Start smoothly with the new frequency: - */ - sched_clock_idle_wakeup_event(0); - local_irq_restore(flags); -} - -/* - * Scheduler clock - returns current time in nanosec units. - */ -unsigned long long native_sched_clock(void) -{ - unsigned long long this_offset; - - /* - * Fall back to jiffies if there's no TSC available: - * ( But note that we still use it if the TSC is marked - * unstable. We do this because unlike Time Of Day, - * the scheduler clock tolerates small errors and it's - * very important for it to be as fast as the platform - * can achive it. ) - */ - if (unlikely(tsc_disabled)) - /* No locking but a rare wrong value is not a big deal: */ - return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); - - /* read the Time Stamp Counter: */ - rdtscll(this_offset); - - /* return the value in ns */ - return cycles_2_ns(this_offset); -} - -/* We need to define a real function for sched_clock, to override the - weak default version */ -#ifdef CONFIG_PARAVIRT -unsigned long long sched_clock(void) -{ - return paravirt_sched_clock(); -} -#else -unsigned long long sched_clock(void) - __attribute__((alias("native_sched_clock"))); -#endif - -unsigned long native_calculate_cpu_khz(void) -{ - unsigned long long start, end; - unsigned long count; - u64 delta64 = (u64)ULLONG_MAX; - int i; - unsigned long flags; - - local_irq_save(flags); - - /* run 3 times to ensure the cache is warm and to get an accurate reading */ - for (i = 0; i < 3; i++) { - mach_prepare_counter(); - rdtscll(start); - mach_countup(&count); - rdtscll(end); - - /* - * Error: ECTCNEVERSET - * The CTC wasn't reliable: we got a hit on the very first read, - * or the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. - */ - if (count <= 1) - continue; - - /* cpu freq too slow: */ - if ((end - start) <= CALIBRATE_TIME_MSEC) - continue; - - /* - * We want the minimum time of all runs in case one of them - * is inaccurate due to SMI or other delay - */ - delta64 = min(delta64, (end - start)); - } - - /* cpu freq too fast (or every run was bad): */ - if (delta64 > (1ULL<<32)) - goto err; - - delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */ - do_div(delta64,CALIBRATE_TIME_MSEC); - - local_irq_restore(flags); - return (unsigned long)delta64; -err: - local_irq_restore(flags); - return 0; -} - -int recalibrate_cpu_khz(void) -{ -#ifndef CONFIG_SMP - unsigned long cpu_khz_old = cpu_khz; - - if (cpu_has_tsc) { - cpu_khz = calculate_cpu_khz(); - tsc_khz = cpu_khz; - cpu_data(0).loops_per_jiffy = - cpufreq_scale(cpu_data(0).loops_per_jiffy, - cpu_khz_old, cpu_khz); - return 0; - } else - return -ENODEV; -#else - return -ENODEV; -#endif -} - -EXPORT_SYMBOL(recalibrate_cpu_khz); - -#ifdef CONFIG_CPU_FREQ - -/* - * if the CPU frequency is scaled, TSC-based delays will need a different - * loops_per_jiffy value to function properly. - */ -static unsigned int ref_freq; -static unsigned long loops_per_jiffy_ref; -static unsigned long cpu_khz_ref; - -static int -time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) -{ - struct cpufreq_freqs *freq = data; - - if (!ref_freq) { - if (!freq->old){ - ref_freq = freq->new; - return 0; - } - ref_freq = freq->old; - loops_per_jiffy_ref = cpu_data(freq->cpu).loops_per_jiffy; - cpu_khz_ref = cpu_khz; - } - - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - cpu_data(freq->cpu).loops_per_jiffy = - cpufreq_scale(loops_per_jiffy_ref, - ref_freq, freq->new); - - if (cpu_khz) { - - if (num_online_cpus() == 1) - cpu_khz = cpufreq_scale(cpu_khz_ref, - ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { - tsc_khz = cpu_khz; - set_cyc2ns_scale(cpu_khz, freq->cpu); - /* - * TSC based sched_clock turns - * to junk w/ cpufreq - */ - mark_tsc_unstable("cpufreq changes"); - } - } - } - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - -static int __init cpufreq_tsc(void) -{ - return cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); -} -core_initcall(cpufreq_tsc); - -#endif - -/* clock source code */ - -static struct clocksource clocksource_tsc; - -/* - * We compare the TSC to the cycle_last value in the clocksource - * structure to avoid a nasty time-warp issue. This can be observed in - * a very small window right after one CPU updated cycle_last under - * xtime lock and the other CPU reads a TSC value which is smaller - * than the cycle_last reference value due to a TSC which is slighty - * behind. This delta is nowhere else observable, but in that case it - * results in a forward time jump in the range of hours due to the - * unsigned delta calculation of the time keeping core code, which is - * necessary to support wrapping clocksources like pm timer. - */ -static cycle_t read_tsc(void) -{ - cycle_t ret; - - rdtscll(ret); - - return ret >= clocksource_tsc.cycle_last ? - ret : clocksource_tsc.cycle_last; -} - -static struct clocksource clocksource_tsc = { - .name = "tsc", - .rating = 300, - .read = read_tsc, - .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /* to be set */ - .shift = 22, - .flags = CLOCK_SOURCE_IS_CONTINUOUS | - CLOCK_SOURCE_MUST_VERIFY, -}; - -void mark_tsc_unstable(char *reason) -{ - if (!tsc_unstable) { - tsc_unstable = 1; - printk("Marking TSC unstable due to: %s.\n", reason); - /* Can be called before registration */ - if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else - clocksource_tsc.rating = 0; - } -} -EXPORT_SYMBOL_GPL(mark_tsc_unstable); - -static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d) -{ - printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", - d->ident); - tsc_unstable = 1; - return 0; -} - -/* List of systems that have known TSC problems */ -static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { - { - .callback = dmi_mark_tsc_unstable, - .ident = "IBM Thinkpad 380XD", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), - }, - }, - {} -}; - -/* - * Make an educated guess if the TSC is trustworthy and synchronized - * over all CPUs. - */ -__cpuinit int unsynchronized_tsc(void) -{ - if (!cpu_has_tsc || tsc_unstable) - return 1; - - /* Anything with constant TSC should be synchronized */ - if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - return 0; - - /* - * Intel systems are normally all synchronized. - * Exceptions must mark TSC as unstable: - */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { - /* assume multi socket systems are not synchronized: */ - if (num_possible_cpus() > 1) - tsc_unstable = 1; - } - return tsc_unstable; -} - -/* - * Geode_LX - the OLPC CPU has a possibly a very reliable TSC - */ -#ifdef CONFIG_MGEODE_LX -/* RTSC counts during suspend */ -#define RTSC_SUSP 0x100 - -static void __init check_geode_tsc_reliable(void) -{ - unsigned long res_low, res_high; - - rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); - if (res_low & RTSC_SUSP) - clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; -} -#else -static inline void check_geode_tsc_reliable(void) { } -#endif - - -void __init tsc_init(void) -{ - int cpu; - u64 lpj; - - if (!cpu_has_tsc || tsc_disabled > 0) - return; - - cpu_khz = calculate_cpu_khz(); - tsc_khz = cpu_khz; - - if (!cpu_khz) { - mark_tsc_unstable("could not calculate TSC khz"); - return; - } - - lpj = ((u64)tsc_khz * 1000); - do_div(lpj, HZ); - lpj_fine = lpj; - - /* now allow native_sched_clock() to use rdtsc */ - tsc_disabled = 0; - - printk("Detected %lu.%03lu MHz processor.\n", - (unsigned long)cpu_khz / 1000, - (unsigned long)cpu_khz % 1000); - - /* - * Secondary CPUs do not run through tsc_init(), so set up - * all the scale factors for all CPUs, assuming the same - * speed as the bootup CPU. (cpufreq notifiers will fix this - * up if their speed diverges) - */ - for_each_possible_cpu(cpu) - set_cyc2ns_scale(cpu_khz, cpu); - - use_tsc_delay(); - - /* Check and install the TSC clocksource */ - dmi_check_system(bad_tsc_dmi_table); - - unsynchronized_tsc(); - check_geode_tsc_reliable(); - clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, - clocksource_tsc.shift); - /* lower the rating if we already know its unstable: */ - if (check_tsc_unstable()) { - clocksource_tsc.rating = 0; - clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; - } - clocksource_register(&clocksource_tsc); -} diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c deleted file mode 100644 index 9898fb01edf..00000000000 --- a/arch/x86/kernel/tsc_64.c +++ /dev/null @@ -1,357 +0,0 @@ -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/clocksource.h> -#include <linux/time.h> -#include <linux/acpi.h> -#include <linux/cpufreq.h> -#include <linux/acpi_pmtmr.h> - -#include <asm/hpet.h> -#include <asm/timex.h> -#include <asm/timer.h> -#include <asm/vgtod.h> - -static int notsc __initdata = 0; - -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ -EXPORT_SYMBOL(cpu_khz); -unsigned int tsc_khz; -EXPORT_SYMBOL(tsc_khz); - -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ -DEFINE_PER_CPU(unsigned long, cyc2ns); - -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) -{ - unsigned long long tsc_now, ns_now; - unsigned long flags, *scale; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - scale = &per_cpu(cyc2ns, cpu); - - rdtscll(tsc_now); - ns_now = __cycles_2_ns(tsc_now); - - if (cpu_khz) - *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; - - sched_clock_idle_wakeup_event(0); - local_irq_restore(flags); -} - -unsigned long long native_sched_clock(void) -{ - unsigned long a = 0; - - /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, - * which means it is not completely exact and may not be monotonous - * between CPUs. But the errors should be too small to matter for - * scheduling purposes. - */ - - rdtscll(a); - return cycles_2_ns(a); -} - -/* We need to define a real function for sched_clock, to override the - weak default version */ -#ifdef CONFIG_PARAVIRT -unsigned long long sched_clock(void) -{ - return paravirt_sched_clock(); -} -#else -unsigned long long -sched_clock(void) __attribute__((alias("native_sched_clock"))); -#endif - - -static int tsc_unstable; - -int check_tsc_unstable(void) -{ - return tsc_unstable; -} -EXPORT_SYMBOL_GPL(check_tsc_unstable); - -#ifdef CONFIG_CPU_FREQ - -/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency - * changes. - * - * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's - * not that important because current Opteron setups do not support - * scaling on SMP anyroads. - * - * Should fix up last_tsc too. Currently gettimeofday in the - * first tick after the change will be slightly wrong. - */ - -static unsigned int ref_freq; -static unsigned long loops_per_jiffy_ref; -static unsigned long tsc_khz_ref; - -static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - unsigned long *lpj, dummy; - - if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) - return 0; - - lpj = &dummy; - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) -#ifdef CONFIG_SMP - lpj = &cpu_data(freq->cpu).loops_per_jiffy; -#else - lpj = &boot_cpu_data.loops_per_jiffy; -#endif - - if (!ref_freq) { - ref_freq = freq->old; - loops_per_jiffy_ref = *lpj; - tsc_khz_ref = tsc_khz; - } - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - *lpj = - cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); - - tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - mark_tsc_unstable("cpufreq changes"); - } - - set_cyc2ns_scale(tsc_khz_ref, freq->cpu); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - -static int __init cpufreq_tsc(void) -{ - cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - return 0; -} - -core_initcall(cpufreq_tsc); - -#endif - -#define MAX_RETRIES 5 -#define SMI_TRESHOLD 50000 - -/* - * Read TSC and the reference counters. Take care of SMI disturbance - */ -static unsigned long __init tsc_read_refs(unsigned long *pm, - unsigned long *hpet) -{ - unsigned long t1, t2; - int i; - - for (i = 0; i < MAX_RETRIES; i++) { - t1 = get_cycles(); - if (hpet) - *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; - else - *pm = acpi_pm_read_early(); - t2 = get_cycles(); - if ((t2 - t1) < SMI_TRESHOLD) - return t2; - } - return ULONG_MAX; -} - -/** - * tsc_calibrate - calibrate the tsc on boot - */ -void __init tsc_calibrate(void) -{ - unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2; - int hpet = is_hpet_enabled(), cpu; - - local_irq_save(flags); - - tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); - - outb((inb(0x61) & ~0x02) | 0x01, 0x61); - - outb(0xb0, 0x43); - outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); - outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); - tr1 = get_cycles(); - while ((inb(0x61) & 0x20) == 0); - tr2 = get_cycles(); - - tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); - - local_irq_restore(flags); - - /* - * Preset the result with the raw and inaccurate PIT - * calibration value - */ - tsc_khz = (tr2 - tr1) / 50; - - /* hpet or pmtimer available ? */ - if (!hpet && !pm1 && !pm2) { - printk(KERN_INFO "TSC calibrated against PIT\n"); - goto out; - } - - /* Check, whether the sampling was disturbed by an SMI */ - if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) { - printk(KERN_WARNING "TSC calibration disturbed by SMI, " - "using PIT calibration result\n"); - goto out; - } - - tsc2 = (tsc2 - tsc1) * 1000000L; - - if (hpet) { - printk(KERN_INFO "TSC calibrated against HPET\n"); - if (hpet2 < hpet1) - hpet2 += 0x100000000UL; - hpet2 -= hpet1; - tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000; - } else { - printk(KERN_INFO "TSC calibrated against PM_TIMER\n"); - if (pm2 < pm1) - pm2 += ACPI_PM_OVRRUN; - pm2 -= pm1; - tsc1 = (pm2 * 1000000000) / PMTMR_TICKS_PER_SEC; - } - - tsc_khz = tsc2 / tsc1; - -out: - for_each_possible_cpu(cpu) - set_cyc2ns_scale(tsc_khz, cpu); -} - -/* - * Make an educated guess if the TSC is trustworthy and synchronized - * over all CPUs. - */ -__cpuinit int unsynchronized_tsc(void) -{ - if (tsc_unstable) - return 1; - -#ifdef CONFIG_SMP - if (apic_is_clustered_box()) - return 1; -#endif - - if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - return 0; - - /* Assume multi socket systems are not synchronized */ - return num_present_cpus() > 1; -} - -int __init notsc_setup(char *s) -{ - notsc = 1; - return 1; -} - -__setup("notsc", notsc_setup); - -static struct clocksource clocksource_tsc; - -/* - * We compare the TSC to the cycle_last value in the clocksource - * structure to avoid a nasty time-warp. This can be observed in a - * very small window right after one CPU updated cycle_last under - * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which - * is smaller than the cycle_last reference value due to a TSC which - * is slighty behind. This delta is nowhere else observable, but in - * that case it results in a forward time jump in the range of hours - * due to the unsigned delta calculation of the time keeping core - * code, which is necessary to support wrapping clocksources like pm - * timer. - */ -static cycle_t read_tsc(void) -{ - cycle_t ret = (cycle_t)get_cycles(); - - return ret >= clocksource_tsc.cycle_last ? - ret : clocksource_tsc.cycle_last; -} - -static cycle_t __vsyscall_fn vread_tsc(void) -{ - cycle_t ret = (cycle_t)vget_cycles(); - - return ret >= __vsyscall_gtod_data.clock.cycle_last ? - ret : __vsyscall_gtod_data.clock.cycle_last; -} - -static struct clocksource clocksource_tsc = { - .name = "tsc", - .rating = 300, - .read = read_tsc, - .mask = CLOCKSOURCE_MASK(64), - .shift = 22, - .flags = CLOCK_SOURCE_IS_CONTINUOUS | - CLOCK_SOURCE_MUST_VERIFY, - .vread = vread_tsc, -}; - -void mark_tsc_unstable(char *reason) -{ - if (!tsc_unstable) { - tsc_unstable = 1; - printk("Marking TSC unstable due to %s\n", reason); - /* Change only the rating, when not registered */ - if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else - clocksource_tsc.rating = 0; - } -} -EXPORT_SYMBOL_GPL(mark_tsc_unstable); - -void __init init_tsc_clocksource(void) -{ - if (!notsc) { - clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, - clocksource_tsc.shift); - if (check_tsc_unstable()) - clocksource_tsc.rating = 0; - - clocksource_register(&clocksource_tsc); - } -} diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c new file mode 100644 index 00000000000..e94bdb6add1 --- /dev/null +++ b/arch/x86/kernel/visws_quirks.c @@ -0,0 +1,709 @@ +/* + * SGI Visual Workstation support and quirks, unmaintained. + * + * Split out from setup.c by davej@suse.de + * + * Copyright (C) 1999 Bent Hagemark, Ingo Molnar + * + * SGI Visual Workstation interrupt controller + * + * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC + * which serves as the main interrupt controller in the system. Non-legacy + * hardware in the system uses this controller directly. Legacy devices + * are connected to the PIIX4 which in turn has its 8259(s) connected to + * a of the Cobalt APIC entry. + * + * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com + * + * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru> + */ +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/smp.h> + +#include <asm/visws/cobalt.h> +#include <asm/visws/piix4.h> +#include <asm/arch_hooks.h> +#include <asm/fixmap.h> +#include <asm/reboot.h> +#include <asm/setup.h> +#include <asm/e820.h> +#include <asm/smp.h> +#include <asm/io.h> + +#include <mach_ipi.h> + +#include "mach_apic.h" + +#include <linux/init.h> +#include <linux/smp.h> + +#include <linux/kernel_stat.h> +#include <linux/interrupt.h> +#include <linux/init.h> + +#include <asm/io.h> +#include <asm/apic.h> +#include <asm/i8259.h> +#include <asm/irq_vectors.h> +#include <asm/visws/cobalt.h> +#include <asm/visws/lithium.h> +#include <asm/visws/piix4.h> + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/pci_ids.h> + +extern int no_broadcast; + +#include <asm/io.h> +#include <asm/apic.h> +#include <asm/arch_hooks.h> +#include <asm/visws/cobalt.h> +#include <asm/visws/lithium.h> + +char visws_board_type = -1; +char visws_board_rev = -1; + +int is_visws_box(void) +{ + return visws_board_type >= 0; +} + +static int __init visws_time_init_quirk(void) +{ + printk(KERN_INFO "Starting Cobalt Timer system clock\n"); + + /* Set the countdown value */ + co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ); + + /* Start the timer */ + co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN); + + /* Enable (unmask) the timer interrupt */ + co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); + + /* + * Zero return means the generic timer setup code will set up + * the standard vector: + */ + return 0; +} + +static int __init visws_pre_intr_init_quirk(void) +{ + init_VISWS_APIC_irqs(); + + /* + * We dont want ISA irqs to be set up by the generic code: + */ + return 1; +} + +/* Quirk for machine specific memory setup. */ + +#define MB (1024 * 1024) + +unsigned long sgivwfb_mem_phys; +unsigned long sgivwfb_mem_size; +EXPORT_SYMBOL(sgivwfb_mem_phys); +EXPORT_SYMBOL(sgivwfb_mem_size); + +long long mem_size __initdata = 0; + +static char * __init visws_memory_setup_quirk(void) +{ + long long gfx_mem_size = 8 * MB; + + mem_size = boot_params.alt_mem_k; + + if (!mem_size) { + printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n"); + mem_size = 128 * MB; + } + + /* + * this hardcodes the graphics memory to 8 MB + * it really should be sized dynamically (or at least + * set as a boot param) + */ + if (!sgivwfb_mem_size) { + printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n"); + sgivwfb_mem_size = 8 * MB; + } + + /* + * Trim to nearest MB + */ + sgivwfb_mem_size &= ~((1 << 20) - 1); + sgivwfb_mem_phys = mem_size - gfx_mem_size; + + e820_add_region(0, LOWMEMSIZE(), E820_RAM); + e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM); + e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED); + + return "PROM"; +} + +static void visws_machine_emergency_restart(void) +{ + /* + * Visual Workstations restart after this + * register is poked on the PIIX4 + */ + outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT); +} + +static void visws_machine_power_off(void) +{ + unsigned short pm_status; +/* extern unsigned int pci_bus0; */ + + while ((pm_status = inw(PMSTS_PORT)) & 0x100) + outw(pm_status, PMSTS_PORT); + + outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT); + + mdelay(10); + +#define PCI_CONF1_ADDRESS(bus, devfn, reg) \ + (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) + +/* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */ + outl(PIIX_SPECIAL_STOP, 0xCFC); +} + +static int __init visws_get_smp_config_quirk(unsigned int early) +{ + /* + * Prevent MP-table parsing by the generic code: + */ + return 1; +} + +extern unsigned int __cpuinitdata maxcpus; + +/* + * The Visual Workstation is Intel MP compliant in the hardware + * sense, but it doesn't have a BIOS(-configuration table). + * No problem for Linux. + */ + +static void __init MP_processor_info (struct mpc_config_processor *m) +{ + int ver, logical_apicid; + physid_mask_t apic_cpus; + + if (!(m->mpc_cpuflag & CPU_ENABLED)) + return; + + logical_apicid = m->mpc_apicid; + printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n", + m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", + m->mpc_apicid, + (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, + (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, + m->mpc_apicver); + + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) + boot_cpu_physical_apicid = m->mpc_apicid; + + ver = m->mpc_apicver; + if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) { + printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", + m->mpc_apicid, MAX_APICS); + return; + } + + apic_cpus = apicid_to_cpu_present(m->mpc_apicid); + physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); + /* + * Validate version + */ + if (ver == 0x0) { + printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! " + "fixing up to 0x10. (tell your hw vendor)\n", + m->mpc_apicid); + ver = 0x10; + } + apic_version[m->mpc_apicid] = ver; +} + +int __init visws_find_smp_config_quirk(unsigned int reserve) +{ + struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); + unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); + + if (ncpus > CO_CPU_MAX) { + printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n", + ncpus, mp); + + ncpus = CO_CPU_MAX; + } + + if (ncpus > maxcpus) + ncpus = maxcpus; + +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 1; +#endif + while (ncpus--) + MP_processor_info(mp++); + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + return 1; +} + +extern int visws_trap_init_quirk(void); + +void __init visws_early_detect(void) +{ + int raw; + + visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG) + >> PIIX_GPI_BD_SHIFT; + + if (visws_board_type < 0) + return; + + /* + * Install special quirks for timer, interrupt and memory setup: + */ + arch_time_init_quirk = visws_time_init_quirk; + arch_pre_intr_init_quirk = visws_pre_intr_init_quirk; + arch_memory_setup_quirk = visws_memory_setup_quirk; + + /* + * Fall back to generic behavior for traps: + */ + arch_intr_init_quirk = NULL; + arch_trap_init_quirk = visws_trap_init_quirk; + + /* + * Install reboot quirks: + */ + pm_power_off = visws_machine_power_off; + machine_ops.emergency_restart = visws_machine_emergency_restart; + + /* + * Do not use broadcast IPIs: + */ + no_broadcast = 0; + + /* + * Override generic MP-table parsing: + */ + mach_get_smp_config_quirk = visws_get_smp_config_quirk; + mach_find_smp_config_quirk = visws_find_smp_config_quirk; + +#ifdef CONFIG_X86_IO_APIC + /* + * Turn off IO-APIC detection and initialization: + */ + skip_ioapic_setup = 1; +#endif + + /* + * Get Board rev. + * First, we have to initialize the 307 part to allow us access + * to the GPIO registers. Let's map them at 0x0fc0 which is right + * after the PIIX4 PM section. + */ + outb_p(SIO_DEV_SEL, SIO_INDEX); + outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */ + + outb_p(SIO_DEV_MSB, SIO_INDEX); + outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */ + + outb_p(SIO_DEV_LSB, SIO_INDEX); + outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */ + + outb_p(SIO_DEV_ENB, SIO_INDEX); + outb_p(1, SIO_DATA); /* Enable GPIO registers. */ + + /* + * Now, we have to map the power management section to write + * a bit which enables access to the GPIO registers. + * What lunatic came up with this shit? + */ + outb_p(SIO_DEV_SEL, SIO_INDEX); + outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */ + + outb_p(SIO_DEV_MSB, SIO_INDEX); + outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */ + + outb_p(SIO_DEV_LSB, SIO_INDEX); + outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */ + + outb_p(SIO_DEV_ENB, SIO_INDEX); + outb_p(1, SIO_DATA); /* Enable PM registers. */ + + /* + * Now, write the PM register which enables the GPIO registers. + */ + outb_p(SIO_PM_FER2, SIO_PM_INDEX); + outb_p(SIO_PM_GP_EN, SIO_PM_DATA); + + /* + * Now, initialize the GPIO registers. + * We want them all to be inputs which is the + * power on default, so let's leave them alone. + * So, let's just read the board rev! + */ + raw = inb_p(SIO_GP_DATA1); + raw &= 0x7f; /* 7 bits of valid board revision ID. */ + + if (visws_board_type == VISWS_320) { + if (raw < 0x6) { + visws_board_rev = 4; + } else if (raw < 0xc) { + visws_board_rev = 5; + } else { + visws_board_rev = 6; + } + } else if (visws_board_type == VISWS_540) { + visws_board_rev = 2; + } else { + visws_board_rev = raw; + } + + printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n", + (visws_board_type == VISWS_320 ? "320" : + (visws_board_type == VISWS_540 ? "540" : + "unknown")), visws_board_rev); +} + +#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4) +#define BCD (LI_INTB | LI_INTC | LI_INTD) +#define ALLDEVS (A01234 | BCD) + +static __init void lithium_init(void) +{ + set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS); + set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS); + + if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || + (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { + printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A'); +/* panic("This machine is not SGI Visual Workstation 320/540"); */ + } + + if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || + (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { + printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B'); +/* panic("This machine is not SGI Visual Workstation 320/540"); */ + } + + li_pcia_write16(LI_PCI_INTEN, ALLDEVS); + li_pcib_write16(LI_PCI_INTEN, ALLDEVS); +} + +static __init void cobalt_init(void) +{ + /* + * On normal SMP PC this is used only with SMP, but we have to + * use it and set it up here to start the Cobalt clock + */ + set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); + setup_local_APIC(); + printk(KERN_INFO "Local APIC Version %#x, ID %#x\n", + (unsigned int)apic_read(APIC_LVR), + (unsigned int)apic_read(APIC_ID)); + + set_fixmap(FIX_CO_CPU, CO_CPU_PHYS); + set_fixmap(FIX_CO_APIC, CO_APIC_PHYS); + printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n", + co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID)); + + /* Enable Cobalt APIC being careful to NOT change the ID! */ + co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE); + + printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n", + co_apic_read(CO_APIC_ID)); +} + +int __init visws_trap_init_quirk(void) +{ + lithium_init(); + cobalt_init(); + + return 1; +} + +/* + * IRQ controller / APIC support: + */ + +static DEFINE_SPINLOCK(cobalt_lock); + +/* + * Set the given Cobalt APIC Redirection Table entry to point + * to the given IDT vector/index. + */ +static inline void co_apic_set(int entry, int irq) +{ + co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR)); + co_apic_write(CO_APIC_HI(entry), 0); +} + +/* + * Cobalt (IO)-APIC functions to handle PCI devices. + */ +static inline int co_apic_ide0_hack(void) +{ + extern char visws_board_type; + extern char visws_board_rev; + + if (visws_board_type == VISWS_320 && visws_board_rev == 5) + return 5; + return CO_APIC_IDE0; +} + +static int is_co_apic(unsigned int irq) +{ + if (IS_CO_APIC(irq)) + return CO_APIC(irq); + + switch (irq) { + case 0: return CO_APIC_CPU; + case CO_IRQ_IDE0: return co_apic_ide0_hack(); + case CO_IRQ_IDE1: return CO_APIC_IDE1; + default: return -1; + } +} + + +/* + * This is the SGI Cobalt (IO-)APIC: + */ + +static void enable_cobalt_irq(unsigned int irq) +{ + co_apic_set(is_co_apic(irq), irq); +} + +static void disable_cobalt_irq(unsigned int irq) +{ + int entry = is_co_apic(irq); + + co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK); + co_apic_read(CO_APIC_LO(entry)); +} + +/* + * "irq" really just serves to identify the device. Here is where we + * map this to the Cobalt APIC entry where it's physically wired. + * This is called via request_irq -> setup_irq -> irq_desc->startup() + */ +static unsigned int startup_cobalt_irq(unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&cobalt_lock, flags); + if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) + irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); + enable_cobalt_irq(irq); + spin_unlock_irqrestore(&cobalt_lock, flags); + return 0; +} + +static void ack_cobalt_irq(unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&cobalt_lock, flags); + disable_cobalt_irq(irq); + apic_write(APIC_EOI, APIC_EIO_ACK); + spin_unlock_irqrestore(&cobalt_lock, flags); +} + +static void end_cobalt_irq(unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&cobalt_lock, flags); + if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS))) + enable_cobalt_irq(irq); + spin_unlock_irqrestore(&cobalt_lock, flags); +} + +static struct irq_chip cobalt_irq_type = { + .typename = "Cobalt-APIC", + .startup = startup_cobalt_irq, + .shutdown = disable_cobalt_irq, + .enable = enable_cobalt_irq, + .disable = disable_cobalt_irq, + .ack = ack_cobalt_irq, + .end = end_cobalt_irq, +}; + + +/* + * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt + * -- not the manner expected by the code in i8259.c. + * + * there is a 'master' physical interrupt source that gets sent to + * the CPU. But in the chipset there are various 'virtual' interrupts + * waiting to be handled. We represent this to Linux through a 'master' + * interrupt controller type, and through a special virtual interrupt- + * controller. Device drivers only see the virtual interrupt sources. + */ +static unsigned int startup_piix4_master_irq(unsigned int irq) +{ + init_8259A(0); + + return startup_cobalt_irq(irq); +} + +static void end_piix4_master_irq(unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&cobalt_lock, flags); + enable_cobalt_irq(irq); + spin_unlock_irqrestore(&cobalt_lock, flags); +} + +static struct irq_chip piix4_master_irq_type = { + .typename = "PIIX4-master", + .startup = startup_piix4_master_irq, + .ack = ack_cobalt_irq, + .end = end_piix4_master_irq, +}; + + +static struct irq_chip piix4_virtual_irq_type = { + .typename = "PIIX4-virtual", + .shutdown = disable_8259A_irq, + .enable = enable_8259A_irq, + .disable = disable_8259A_irq, +}; + + +/* + * PIIX4-8259 master/virtual functions to handle interrupt requests + * from legacy devices: floppy, parallel, serial, rtc. + * + * None of these get Cobalt APIC entries, neither do they have IDT + * entries. These interrupts are purely virtual and distributed from + * the 'master' interrupt source: CO_IRQ_8259. + * + * When the 8259 interrupts its handler figures out which of these + * devices is interrupting and dispatches to its handler. + * + * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/ + * enable_irq gets the right irq. This 'master' irq is never directly + * manipulated by any driver. + */ +static irqreturn_t piix4_master_intr(int irq, void *dev_id) +{ + int realirq; + irq_desc_t *desc; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + + /* Find out what's interrupting in the PIIX4 master 8259 */ + outb(0x0c, 0x20); /* OCW3 Poll command */ + realirq = inb(0x20); + + /* + * Bit 7 == 0 means invalid/spurious + */ + if (unlikely(!(realirq & 0x80))) + goto out_unlock; + + realirq &= 7; + + if (unlikely(realirq == 2)) { + outb(0x0c, 0xa0); + realirq = inb(0xa0); + + if (unlikely(!(realirq & 0x80))) + goto out_unlock; + + realirq = (realirq & 7) + 8; + } + + /* mask and ack interrupt */ + cached_irq_mask |= 1 << realirq; + if (unlikely(realirq > 7)) { + inb(0xa1); + outb(cached_slave_mask, 0xa1); + outb(0x60 + (realirq & 7), 0xa0); + outb(0x60 + 2, 0x20); + } else { + inb(0x21); + outb(cached_master_mask, 0x21); + outb(0x60 + realirq, 0x20); + } + + spin_unlock_irqrestore(&i8259A_lock, flags); + + desc = irq_desc + realirq; + + /* + * handle this 'virtual interrupt' as a Cobalt one now. + */ + kstat_cpu(smp_processor_id()).irqs[realirq]++; + + if (likely(desc->action != NULL)) + handle_IRQ_event(realirq, desc->action); + + if (!(desc->status & IRQ_DISABLED)) + enable_8259A_irq(realirq); + + return IRQ_HANDLED; + +out_unlock: + spin_unlock_irqrestore(&i8259A_lock, flags); + return IRQ_NONE; +} + +static struct irqaction master_action = { + .handler = piix4_master_intr, + .name = "PIIX4-8259", +}; + +static struct irqaction cascade_action = { + .handler = no_action, + .name = "cascade", +}; + + +void init_VISWS_APIC_irqs(void) +{ + int i; + + for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = 0; + irq_desc[i].depth = 1; + + if (i == 0) { + irq_desc[i].chip = &cobalt_irq_type; + } + else if (i == CO_IRQ_IDE0) { + irq_desc[i].chip = &cobalt_irq_type; + } + else if (i == CO_IRQ_IDE1) { + irq_desc[i].chip = &cobalt_irq_type; + } + else if (i == CO_IRQ_8259) { + irq_desc[i].chip = &piix4_master_irq_type; + } + else if (i < CO_IRQ_APIC0) { + irq_desc[i].chip = &piix4_virtual_irq_type; + } + else if (IS_CO_APIC(i)) { + irq_desc[i].chip = &cobalt_irq_type; + } + } + + setup_irq(CO_IRQ_8259, &master_action); + setup_irq(2, &cascade_action); +} diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 946bf13b44a..b15346092b7 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -932,7 +932,7 @@ static inline int __init activate_vmi(void) pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; #endif pv_time_ops.sched_clock = vmi_sched_clock; - pv_time_ops.get_cpu_khz = vmi_cpu_khz; + pv_time_ops.get_tsc_khz = vmi_tsc_khz; /* We have true wallclock functions; disable CMOS clock sync */ no_sync_cmos_clock = 1; diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index ba7d19e102b..6953859fe28 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -69,8 +69,8 @@ unsigned long long vmi_sched_clock(void) return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); } -/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ -unsigned long vmi_cpu_khz(void) +/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */ +unsigned long vmi_tsc_khz(void) { unsigned long long khz; khz = vmi_timer_ops.get_cycle_frequency(); diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 2674f579627..cdb2363697d 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -49,16 +49,14 @@ SECTIONS _etext = .; /* End of text section */ } :text = 0x9090 + NOTES :text :note + . = ALIGN(16); /* Exception table */ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { __start___ex_table = .; *(__ex_table) __stop___ex_table = .; - } - - NOTES :text :note - - BUG_TABLE :text + } :text = 0x9090 RODATA diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fd246e22fe6..63e5c1a22e8 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -19,7 +19,7 @@ PHDRS { data PT_LOAD FLAGS(7); /* RWE */ user PT_LOAD FLAGS(7); /* RWE */ data.init PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(4); /* R__ */ + note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS { @@ -40,16 +40,14 @@ SECTIONS _etext = .; /* End of text section */ } :text = 0x9090 + NOTES :text :note + . = ALIGN(16); /* Exception table */ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { __start___ex_table = .; *(__ex_table) __stop___ex_table = .; - } - - NOTES :text :note - - BUG_TABLE :text + } :text = 0x9090 RODATA diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index c87cbd84c3e..0b8b6690a86 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -42,7 +42,8 @@ #include <asm/topology.h> #include <asm/vgtod.h> -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) +#define __vsyscall(nr) \ + __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace #define __syscall_clobber "r11","cx","memory" /* @@ -278,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) { long cpu = (long)arg; if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) - smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); return NOTIFY_DONE; } @@ -301,7 +302,7 @@ static int __init vsyscall_init(void) #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2); #endif - on_each_cpu(cpu_vsyscall_init, NULL, 0, 1); + on_each_cpu(cpu_vsyscall_init, NULL, 1); hotcpu_notifier(cpu_vsyscall_notifier, 0); return 0; } diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 2f306a82689..b545f371b5f 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -2,13 +2,20 @@ All C exports should go in the respective C files. */ #include <linux/module.h> -#include <net/checksum.h> #include <linux/smp.h> +#include <net/checksum.h> + #include <asm/processor.h> -#include <asm/uaccess.h> #include <asm/pgtable.h> +#include <asm/uaccess.h> #include <asm/desc.h> +#include <asm/ftrace.h> + +#ifdef CONFIG_FTRACE +/* mcount is defined in assembly */ +EXPORT_SYMBOL(mcount); +#endif EXPORT_SYMBOL(kernel_thread); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 540e9517907..10ce6ee4c49 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -335,7 +335,7 @@ static void vcpu_clear(struct vcpu_vmx *vmx) { if (vmx->vcpu.cpu == -1) return; - smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); + smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); vmx->launched = 0; } @@ -2968,7 +2968,7 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->vmcs) { - on_each_cpu(__vcpu_clear, vmx, 0, 1); + on_each_cpu(__vcpu_clear, vmx, 1); free_vmcs(vmx->vmcs); vmx->vmcs = NULL; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 63a77caa59f..0faa2546b1c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4044,6 +4044,6 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) * So need not to call smp_call_function_single() in that case. */ if (vcpu->guest_mode && vcpu->cpu != cpu) - smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); + smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0); put_cpu(); } diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 964dfa36d36..c70e12b1a63 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -3,7 +3,7 @@ config LGUEST_GUEST select PARAVIRT depends on X86_32 depends on !X86_PAE - depends on !(X86_VISWS || X86_VOYAGER) + depends on !X86_VOYAGER select VIRTIO select VIRTIO_RING select VIRTIO_CONSOLE diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index e72cf0793fb..50dad44fb54 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -607,7 +607,7 @@ static unsigned long lguest_get_wallclock(void) * what speed it runs at, or 0 if it's unusable as a reliable clock source. * This matches what we want here: if we return 0 from this function, the x86 * TSC clock will give up and not register itself. */ -static unsigned long lguest_cpu_khz(void) +static unsigned long lguest_tsc_khz(void) { return lguest_data.tsc_khz; } @@ -998,7 +998,7 @@ __init void lguest_init(void) /* time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; pv_time_ops.time_init = lguest_time_init; - pv_time_ops.get_cpu_khz = lguest_cpu_khz; + pv_time_ops.get_tsc_khz = lguest_tsc_khz; /* Now is a good time to look at the implementations of these functions * before returning to the rest of lguest_init(). */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 76f60f52a88..aa3fa411942 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -4,8 +4,9 @@ obj-$(CONFIG_SMP) := msr-on-cpu.o -lib-y := delay_$(BITS).o -lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o +lib-y := delay.o +lib-y += thunk_$(BITS).o +lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o ifeq ($(CONFIG_X86_32),y) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index ee1c3f63515..dfdf428975c 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -1,8 +1,10 @@ -/* Copyright 2002 Andi Kleen, SuSE Labs. +/* + * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com> + * Copyright 2002 Andi Kleen, SuSE Labs. * Subject to the GNU Public License v2. - * - * Functions to copy from and to user space. - */ + * + * Functions to copy from and to user space. + */ #include <linux/linkage.h> #include <asm/dwarf2.h> @@ -20,60 +22,88 @@ .long \orig-1f /* by default jump to orig */ 1: .section .altinstr_replacement,"ax" -2: .byte 0xe9 /* near jump with 32bit immediate */ +2: .byte 0xe9 /* near jump with 32bit immediate */ .long \alt-1b /* offset */ /* or alternatively to alt */ .previous .section .altinstructions,"a" .align 8 .quad 0b .quad 2b - .byte \feature /* when feature is set */ + .byte \feature /* when feature is set */ .byte 5 .byte 5 .previous .endm -/* Standard copy_to_user with segment limit checking */ + .macro ALIGN_DESTINATION +#ifdef FIX_ALIGNMENT + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jz 102f /* already aligned */ + subl $8,%ecx + negl %ecx + subl %ecx,%edx +100: movb (%rsi),%al +101: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 100b +102: + .section .fixup,"ax" +103: addl %r8d,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + .section __ex_table,"a" + .align 8 + .quad 100b,103b + .quad 101b,103b + .previous +#endif + .endm + +/* Standard copy_to_user with segment limit checking */ ENTRY(copy_to_user) CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rdi,%rcx addq %rdx,%rcx - jc bad_to_user - cmpq threadinfo_addr_limit(%rax),%rcx + jc bad_to_user + cmpq TI_addr_limit(%rax),%rcx jae bad_to_user - xorl %eax,%eax /* clear zero flag */ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC -ENTRY(copy_user_generic) +/* Standard copy_from_user with segment limit checking */ +ENTRY(copy_from_user) CFI_STARTPROC - movl $1,%ecx /* set zero flag */ + GET_THREAD_INFO(%rax) + movq %rsi,%rcx + addq %rdx,%rcx + jc bad_from_user + cmpq TI_addr_limit(%rax),%rcx + jae bad_from_user ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC +ENDPROC(copy_from_user) -ENTRY(__copy_from_user_inatomic) +ENTRY(copy_user_generic) CFI_STARTPROC - xorl %ecx,%ecx /* clear zero flag */ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC +ENDPROC(copy_user_generic) -/* Standard copy_from_user with segment limit checking */ -ENTRY(copy_from_user) +ENTRY(__copy_from_user_inatomic) CFI_STARTPROC - GET_THREAD_INFO(%rax) - movq %rsi,%rcx - addq %rdx,%rcx - jc bad_from_user - cmpq threadinfo_addr_limit(%rax),%rcx - jae bad_from_user - movl $1,%ecx /* set zero flag */ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC -ENDPROC(copy_from_user) - +ENDPROC(__copy_from_user_inatomic) + .section .fixup,"ax" /* must zero dest */ +ENTRY(bad_from_user) bad_from_user: CFI_STARTPROC movl %edx,%ecx @@ -81,271 +111,158 @@ bad_from_user: rep stosb bad_to_user: - movl %edx,%eax + movl %edx,%eax ret CFI_ENDPROC -END(bad_from_user) +ENDPROC(bad_from_user) .previous - - + /* * copy_user_generic_unrolled - memory copy with exception handling. - * This version is for CPUs like P4 that don't have efficient micro code for rep movsq - * - * Input: + * This version is for CPUs like P4 that don't have efficient micro + * code for rep movsq + * + * Input: * rdi destination * rsi source * rdx count - * ecx zero flag -- if true zero destination on error * - * Output: - * eax uncopied bytes or 0 if successful. + * Output: + * eax uncopied bytes or 0 if successfull. */ ENTRY(copy_user_generic_unrolled) CFI_STARTPROC - pushq %rbx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbx, 0 - pushq %rcx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rcx, 0 - xorl %eax,%eax /*zero for the exception handler */ - -#ifdef FIX_ALIGNMENT - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jnz .Lbad_alignment -.Lafter_bad_alignment: -#endif - - movq %rdx,%rcx - - movl $64,%ebx - shrq $6,%rdx - decq %rdx - js .Lhandle_tail - - .p2align 4 -.Lloop: -.Ls1: movq (%rsi),%r11 -.Ls2: movq 1*8(%rsi),%r8 -.Ls3: movq 2*8(%rsi),%r9 -.Ls4: movq 3*8(%rsi),%r10 -.Ld1: movq %r11,(%rdi) -.Ld2: movq %r8,1*8(%rdi) -.Ld3: movq %r9,2*8(%rdi) -.Ld4: movq %r10,3*8(%rdi) - -.Ls5: movq 4*8(%rsi),%r11 -.Ls6: movq 5*8(%rsi),%r8 -.Ls7: movq 6*8(%rsi),%r9 -.Ls8: movq 7*8(%rsi),%r10 -.Ld5: movq %r11,4*8(%rdi) -.Ld6: movq %r8,5*8(%rdi) -.Ld7: movq %r9,6*8(%rdi) -.Ld8: movq %r10,7*8(%rdi) - - decq %rdx - + cmpl $8,%edx + jb 20f /* less then 8 bytes, go to byte copy loop */ + ALIGN_DESTINATION + movl %edx,%ecx + andl $63,%edx + shrl $6,%ecx + jz 17f +1: movq (%rsi),%r8 +2: movq 1*8(%rsi),%r9 +3: movq 2*8(%rsi),%r10 +4: movq 3*8(%rsi),%r11 +5: movq %r8,(%rdi) +6: movq %r9,1*8(%rdi) +7: movq %r10,2*8(%rdi) +8: movq %r11,3*8(%rdi) +9: movq 4*8(%rsi),%r8 +10: movq 5*8(%rsi),%r9 +11: movq 6*8(%rsi),%r10 +12: movq 7*8(%rsi),%r11 +13: movq %r8,4*8(%rdi) +14: movq %r9,5*8(%rdi) +15: movq %r10,6*8(%rdi) +16: movq %r11,7*8(%rdi) leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi - - jns .Lloop - - .p2align 4 -.Lhandle_tail: - movl %ecx,%edx - andl $63,%ecx - shrl $3,%ecx - jz .Lhandle_7 - movl $8,%ebx - .p2align 4 -.Lloop_8: -.Ls9: movq (%rsi),%r8 -.Ld9: movq %r8,(%rdi) decl %ecx - leaq 8(%rdi),%rdi + jnz 1b +17: movl %edx,%ecx + andl $7,%edx + shrl $3,%ecx + jz 20f +18: movq (%rsi),%r8 +19: movq %r8,(%rdi) leaq 8(%rsi),%rsi - jnz .Lloop_8 - -.Lhandle_7: + leaq 8(%rdi),%rdi + decl %ecx + jnz 18b +20: andl %edx,%edx + jz 23f movl %edx,%ecx - andl $7,%ecx - jz .Lende - .p2align 4 -.Lloop_1: -.Ls10: movb (%rsi),%bl -.Ld10: movb %bl,(%rdi) - incq %rdi +21: movb (%rsi),%al +22: movb %al,(%rdi) incq %rsi + incq %rdi decl %ecx - jnz .Lloop_1 - - CFI_REMEMBER_STATE -.Lende: - popq %rcx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rcx - popq %rbx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rbx + jnz 21b +23: xor %eax,%eax ret - CFI_RESTORE_STATE -#ifdef FIX_ALIGNMENT - /* align destination */ - .p2align 4 -.Lbad_alignment: - movl $8,%r9d - subl %ecx,%r9d - movl %r9d,%ecx - cmpq %r9,%rdx - jz .Lhandle_7 - js .Lhandle_7 -.Lalign_1: -.Ls11: movb (%rsi),%bl -.Ld11: movb %bl,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .Lalign_1 - subq %r9,%rdx - jmp .Lafter_bad_alignment -#endif + .section .fixup,"ax" +30: shll $6,%ecx + addl %ecx,%edx + jmp 60f +40: lea (%rdx,%rcx,8),%rdx + jmp 60f +50: movl %ecx,%edx +60: jmp copy_user_handle_tail /* ecx is zerorest also */ + .previous - /* table sorted by exception address */ .section __ex_table,"a" .align 8 - .quad .Ls1,.Ls1e /* Ls1-Ls4 have copied zero bytes */ - .quad .Ls2,.Ls1e - .quad .Ls3,.Ls1e - .quad .Ls4,.Ls1e - .quad .Ld1,.Ls1e /* Ld1-Ld4 have copied 0-24 bytes */ - .quad .Ld2,.Ls2e - .quad .Ld3,.Ls3e - .quad .Ld4,.Ls4e - .quad .Ls5,.Ls5e /* Ls5-Ls8 have copied 32 bytes */ - .quad .Ls6,.Ls5e - .quad .Ls7,.Ls5e - .quad .Ls8,.Ls5e - .quad .Ld5,.Ls5e /* Ld5-Ld8 have copied 32-56 bytes */ - .quad .Ld6,.Ls6e - .quad .Ld7,.Ls7e - .quad .Ld8,.Ls8e - .quad .Ls9,.Le_quad - .quad .Ld9,.Le_quad - .quad .Ls10,.Le_byte - .quad .Ld10,.Le_byte -#ifdef FIX_ALIGNMENT - .quad .Ls11,.Lzero_rest - .quad .Ld11,.Lzero_rest -#endif - .quad .Le5,.Le_zero + .quad 1b,30b + .quad 2b,30b + .quad 3b,30b + .quad 4b,30b + .quad 5b,30b + .quad 6b,30b + .quad 7b,30b + .quad 8b,30b + .quad 9b,30b + .quad 10b,30b + .quad 11b,30b + .quad 12b,30b + .quad 13b,30b + .quad 14b,30b + .quad 15b,30b + .quad 16b,30b + .quad 18b,40b + .quad 19b,40b + .quad 21b,50b + .quad 22b,50b .previous - - /* eax: zero, ebx: 64 */ -.Ls1e: addl $8,%eax /* eax is bytes left uncopied within the loop (Ls1e: 64 .. Ls8e: 8) */ -.Ls2e: addl $8,%eax -.Ls3e: addl $8,%eax -.Ls4e: addl $8,%eax -.Ls5e: addl $8,%eax -.Ls6e: addl $8,%eax -.Ls7e: addl $8,%eax -.Ls8e: addl $8,%eax - addq %rbx,%rdi /* +64 */ - subq %rax,%rdi /* correct destination with computed offset */ - - shlq $6,%rdx /* loop counter * 64 (stride length) */ - addq %rax,%rdx /* add offset to loopcnt */ - andl $63,%ecx /* remaining bytes */ - addq %rcx,%rdx /* add them */ - jmp .Lzero_rest - - /* exception on quad word loop in tail handling */ - /* ecx: loopcnt/8, %edx: length, rdi: correct */ -.Le_quad: - shll $3,%ecx - andl $7,%edx - addl %ecx,%edx - /* edx: bytes to zero, rdi: dest, eax:zero */ -.Lzero_rest: - cmpl $0,(%rsp) - jz .Le_zero - movq %rdx,%rcx -.Le_byte: - xorl %eax,%eax -.Le5: rep - stosb - /* when there is another exception while zeroing the rest just return */ -.Le_zero: - movq %rdx,%rax - jmp .Lende CFI_ENDPROC -ENDPROC(copy_user_generic) +ENDPROC(copy_user_generic_unrolled) - - /* Some CPUs run faster using the string copy instructions. - This is also a lot simpler. Use them when possible. - Patch in jmps to this code instead of copying it fully - to avoid unwanted aliasing in the exception tables. */ - - /* rdi destination - * rsi source - * rdx count - * ecx zero flag - * - * Output: - * eax uncopied bytes or 0 if successfull. - * - * Only 4GB of copy is supported. This shouldn't be a problem - * because the kernel normally only writes from/to page sized chunks - * even if user space passed a longer buffer. - * And more would be dangerous because both Intel and AMD have - * errata with rep movsq > 4GB. If someone feels the need to fix - * this please consider this. - */ +/* Some CPUs run faster using the string copy instructions. + * This is also a lot simpler. Use them when possible. + * + * Only 4GB of copy is supported. This shouldn't be a problem + * because the kernel normally only writes from/to page sized chunks + * even if user space passed a longer buffer. + * And more would be dangerous because both Intel and AMD have + * errata with rep movsq > 4GB. If someone feels the need to fix + * this please consider this. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ ENTRY(copy_user_generic_string) CFI_STARTPROC - movl %ecx,%r8d /* save zero flag */ + andl %edx,%edx + jz 4f + cmpl $8,%edx + jb 2f /* less than 8 bytes, go to byte copy loop */ + ALIGN_DESTINATION movl %edx,%ecx shrl $3,%ecx - andl $7,%edx - jz 10f -1: rep - movsq - movl %edx,%ecx -2: rep - movsb -9: movl %ecx,%eax - ret - - /* multiple of 8 byte */ -10: rep + andl $7,%edx +1: rep movsq - xor %eax,%eax +2: movl %edx,%ecx +3: rep + movsb +4: xorl %eax,%eax ret - /* exception handling */ -3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */ - jmp 6f -5: movl %ecx,%eax /* exception on byte loop */ - /* eax: left over bytes */ -6: testl %r8d,%r8d /* zero flag set? */ - jz 7f - movl %eax,%ecx /* initialize x86 loop counter */ - push %rax - xorl %eax,%eax -8: rep - stosb /* zero the rest */ -11: pop %rax -7: ret - CFI_ENDPROC -END(copy_user_generic_c) + .section .fixup,"ax" +11: lea (%rdx,%rcx,8),%rcx +12: movl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous .section __ex_table,"a" - .quad 1b,3b - .quad 2b,5b - .quad 8b,11b - .quad 10b,3b + .align 8 + .quad 1b,11b + .quad 3b,12b .previous + CFI_ENDPROC +ENDPROC(copy_user_generic_string) diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S index 9d3d1ab8376..40e0e309d27 100644 --- a/arch/x86/lib/copy_user_nocache_64.S +++ b/arch/x86/lib/copy_user_nocache_64.S @@ -1,4 +1,6 @@ -/* Copyright 2002 Andi Kleen, SuSE Labs. +/* + * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com> + * Copyright 2002 Andi Kleen, SuSE Labs. * Subject to the GNU Public License v2. * * Functions to copy from and to user space. @@ -12,204 +14,125 @@ #include <asm/current.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> -#include <asm/cpufeature.h> - -/* - * copy_user_nocache - Uncached memory copy with exception handling - * This will force destination/source out of cache for more performance. - * - * Input: - * rdi destination - * rsi source - * rdx count - * rcx zero flag when 1 zero on exception - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -ENTRY(__copy_user_nocache) - CFI_STARTPROC - pushq %rbx - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbx, 0 - pushq %rcx /* save zero flag */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rcx, 0 - - xorl %eax,%eax /* zero for the exception handler */ + .macro ALIGN_DESTINATION #ifdef FIX_ALIGNMENT /* check for bad alignment of destination */ movl %edi,%ecx andl $7,%ecx - jnz .Lbad_alignment -.Lafter_bad_alignment: -#endif - - movq %rdx,%rcx - - movl $64,%ebx - shrq $6,%rdx - decq %rdx - js .Lhandle_tail - - .p2align 4 -.Lloop: -.Ls1: movq (%rsi),%r11 -.Ls2: movq 1*8(%rsi),%r8 -.Ls3: movq 2*8(%rsi),%r9 -.Ls4: movq 3*8(%rsi),%r10 -.Ld1: movnti %r11,(%rdi) -.Ld2: movnti %r8,1*8(%rdi) -.Ld3: movnti %r9,2*8(%rdi) -.Ld4: movnti %r10,3*8(%rdi) - -.Ls5: movq 4*8(%rsi),%r11 -.Ls6: movq 5*8(%rsi),%r8 -.Ls7: movq 6*8(%rsi),%r9 -.Ls8: movq 7*8(%rsi),%r10 -.Ld5: movnti %r11,4*8(%rdi) -.Ld6: movnti %r8,5*8(%rdi) -.Ld7: movnti %r9,6*8(%rdi) -.Ld8: movnti %r10,7*8(%rdi) + jz 102f /* already aligned */ + subl $8,%ecx + negl %ecx + subl %ecx,%edx +100: movb (%rsi),%al +101: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 100b +102: + .section .fixup,"ax" +103: addl %r8d,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous - dec %rdx + .section __ex_table,"a" + .align 8 + .quad 100b,103b + .quad 101b,103b + .previous +#endif + .endm +/* + * copy_user_nocache - Uncached memory copy with exception handling + * This will force destination/source out of cache for more performance. + */ +ENTRY(__copy_user_nocache) + CFI_STARTPROC + cmpl $8,%edx + jb 20f /* less then 8 bytes, go to byte copy loop */ + ALIGN_DESTINATION + movl %edx,%ecx + andl $63,%edx + shrl $6,%ecx + jz 17f +1: movq (%rsi),%r8 +2: movq 1*8(%rsi),%r9 +3: movq 2*8(%rsi),%r10 +4: movq 3*8(%rsi),%r11 +5: movnti %r8,(%rdi) +6: movnti %r9,1*8(%rdi) +7: movnti %r10,2*8(%rdi) +8: movnti %r11,3*8(%rdi) +9: movq 4*8(%rsi),%r8 +10: movq 5*8(%rsi),%r9 +11: movq 6*8(%rsi),%r10 +12: movq 7*8(%rsi),%r11 +13: movnti %r8,4*8(%rdi) +14: movnti %r9,5*8(%rdi) +15: movnti %r10,6*8(%rdi) +16: movnti %r11,7*8(%rdi) leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi - - jns .Lloop - - .p2align 4 -.Lhandle_tail: - movl %ecx,%edx - andl $63,%ecx - shrl $3,%ecx - jz .Lhandle_7 - movl $8,%ebx - .p2align 4 -.Lloop_8: -.Ls9: movq (%rsi),%r8 -.Ld9: movnti %r8,(%rdi) decl %ecx - leaq 8(%rdi),%rdi + jnz 1b +17: movl %edx,%ecx + andl $7,%edx + shrl $3,%ecx + jz 20f +18: movq (%rsi),%r8 +19: movnti %r8,(%rdi) leaq 8(%rsi),%rsi - jnz .Lloop_8 - -.Lhandle_7: + leaq 8(%rdi),%rdi + decl %ecx + jnz 18b +20: andl %edx,%edx + jz 23f movl %edx,%ecx - andl $7,%ecx - jz .Lende - .p2align 4 -.Lloop_1: -.Ls10: movb (%rsi),%bl -.Ld10: movb %bl,(%rdi) - incq %rdi +21: movb (%rsi),%al +22: movb %al,(%rdi) incq %rsi + incq %rdi decl %ecx - jnz .Lloop_1 - - CFI_REMEMBER_STATE -.Lende: - popq %rcx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE %rcx - popq %rbx - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE rbx + jnz 21b +23: xorl %eax,%eax sfence ret - CFI_RESTORE_STATE -#ifdef FIX_ALIGNMENT - /* align destination */ - .p2align 4 -.Lbad_alignment: - movl $8,%r9d - subl %ecx,%r9d - movl %r9d,%ecx - cmpq %r9,%rdx - jz .Lhandle_7 - js .Lhandle_7 -.Lalign_1: -.Ls11: movb (%rsi),%bl -.Ld11: movb %bl,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .Lalign_1 - subq %r9,%rdx - jmp .Lafter_bad_alignment -#endif + .section .fixup,"ax" +30: shll $6,%ecx + addl %ecx,%edx + jmp 60f +40: lea (%rdx,%rcx,8),%rdx + jmp 60f +50: movl %ecx,%edx +60: sfence + movl %r8d,%ecx + jmp copy_user_handle_tail + .previous - /* table sorted by exception address */ .section __ex_table,"a" - .align 8 - .quad .Ls1,.Ls1e /* .Ls[1-4] - 0 bytes copied */ - .quad .Ls2,.Ls1e - .quad .Ls3,.Ls1e - .quad .Ls4,.Ls1e - .quad .Ld1,.Ls1e /* .Ld[1-4] - 0..24 bytes coped */ - .quad .Ld2,.Ls2e - .quad .Ld3,.Ls3e - .quad .Ld4,.Ls4e - .quad .Ls5,.Ls5e /* .Ls[5-8] - 32 bytes copied */ - .quad .Ls6,.Ls5e - .quad .Ls7,.Ls5e - .quad .Ls8,.Ls5e - .quad .Ld5,.Ls5e /* .Ld[5-8] - 32..56 bytes copied */ - .quad .Ld6,.Ls6e - .quad .Ld7,.Ls7e - .quad .Ld8,.Ls8e - .quad .Ls9,.Le_quad - .quad .Ld9,.Le_quad - .quad .Ls10,.Le_byte - .quad .Ld10,.Le_byte -#ifdef FIX_ALIGNMENT - .quad .Ls11,.Lzero_rest - .quad .Ld11,.Lzero_rest -#endif - .quad .Le5,.Le_zero + .quad 1b,30b + .quad 2b,30b + .quad 3b,30b + .quad 4b,30b + .quad 5b,30b + .quad 6b,30b + .quad 7b,30b + .quad 8b,30b + .quad 9b,30b + .quad 10b,30b + .quad 11b,30b + .quad 12b,30b + .quad 13b,30b + .quad 14b,30b + .quad 15b,30b + .quad 16b,30b + .quad 18b,40b + .quad 19b,40b + .quad 21b,50b + .quad 22b,50b .previous - - /* eax: zero, ebx: 64 */ -.Ls1e: addl $8,%eax /* eax: bytes left uncopied: Ls1e: 64 .. Ls8e: 8 */ -.Ls2e: addl $8,%eax -.Ls3e: addl $8,%eax -.Ls4e: addl $8,%eax -.Ls5e: addl $8,%eax -.Ls6e: addl $8,%eax -.Ls7e: addl $8,%eax -.Ls8e: addl $8,%eax - addq %rbx,%rdi /* +64 */ - subq %rax,%rdi /* correct destination with computed offset */ - - shlq $6,%rdx /* loop counter * 64 (stride length) */ - addq %rax,%rdx /* add offset to loopcnt */ - andl $63,%ecx /* remaining bytes */ - addq %rcx,%rdx /* add them */ - jmp .Lzero_rest - - /* exception on quad word loop in tail handling */ - /* ecx: loopcnt/8, %edx: length, rdi: correct */ -.Le_quad: - shll $3,%ecx - andl $7,%edx - addl %ecx,%edx - /* edx: bytes to zero, rdi: dest, eax:zero */ -.Lzero_rest: - cmpl $0,(%rsp) /* zero flag set? */ - jz .Le_zero - movq %rdx,%rcx -.Le_byte: - xorl %eax,%eax -.Le5: rep - stosb - /* when there is another exception while zeroing the rest just return */ -.Le_zero: - movq %rdx,%rax - jmp .Lende CFI_ENDPROC ENDPROC(__copy_user_nocache) - - diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay.c index ef691316f8b..f4568605d7d 100644 --- a/arch/x86/lib/delay_32.c +++ b/arch/x86/lib/delay.c @@ -29,7 +29,7 @@ /* simple loop based delay: */ static void delay_loop(unsigned long loops) { - __asm__ __volatile__( + asm volatile( " test %0,%0 \n" " jz 3f \n" " jmp 1f \n" @@ -38,9 +38,9 @@ static void delay_loop(unsigned long loops) "1: jmp 2f \n" ".align 16 \n" - "2: decl %0 \n" + "2: dec %0 \n" " jnz 2b \n" - "3: decl %0 \n" + "3: dec %0 \n" : /* we don't need output */ :"a" (loops) @@ -98,7 +98,7 @@ void use_tsc_delay(void) int __devinit read_current_timer(unsigned long *timer_val) { if (delay_fn == delay_tsc) { - rdtscl(*timer_val); + rdtscll(*timer_val); return 0; } return -1; @@ -108,31 +108,30 @@ void __delay(unsigned long loops) { delay_fn(loops); } +EXPORT_SYMBOL(__delay); inline void __const_udelay(unsigned long xloops) { int d0; xloops *= 4; - __asm__("mull %0" + asm("mull %%edx" :"=d" (xloops), "=&a" (d0) :"1" (xloops), "0" (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4))); __delay(++xloops); } +EXPORT_SYMBOL(__const_udelay); void __udelay(unsigned long usecs) { __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ } +EXPORT_SYMBOL(__udelay); void __ndelay(unsigned long nsecs) { __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ } - -EXPORT_SYMBOL(__delay); -EXPORT_SYMBOL(__const_udelay); -EXPORT_SYMBOL(__udelay); EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c deleted file mode 100644 index 4c441be9264..00000000000 --- a/arch/x86/lib/delay_64.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Precise Delay Loops for x86-64 - * - * Copyright (C) 1993 Linus Torvalds - * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> - * - * The __delay function must _NOT_ be inlined as its execution time - * depends wildly on alignment on many x86 processors. - */ - -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/timex.h> -#include <linux/preempt.h> -#include <linux/delay.h> -#include <linux/init.h> - -#include <asm/delay.h> -#include <asm/msr.h> - -#ifdef CONFIG_SMP -#include <asm/smp.h> -#endif - -int __devinit read_current_timer(unsigned long *timer_value) -{ - rdtscll(*timer_value); - return 0; -} - -void __delay(unsigned long loops) -{ - unsigned bclock, now; - int cpu; - - preempt_disable(); - cpu = smp_processor_id(); - rdtscl(bclock); - for (;;) { - rdtscl(now); - if ((now - bclock) >= loops) - break; - - /* Allow RT tasks to run */ - preempt_enable(); - rep_nop(); - preempt_disable(); - - /* - * It is possible that we moved to another CPU, and - * since TSC's are per-cpu we need to calculate - * that. The delay must guarantee that we wait "at - * least" the amount of time. Being moved to another - * CPU could make the wait longer but we just need to - * make sure we waited long enough. Rebalance the - * counter for this CPU. - */ - if (unlikely(cpu != smp_processor_id())) { - loops -= (now - bclock); - cpu = smp_processor_id(); - rdtscl(bclock); - } - } - preempt_enable(); -} -EXPORT_SYMBOL(__delay); - -inline void __const_udelay(unsigned long xloops) -{ - __delay(((xloops * HZ * - cpu_data(raw_smp_processor_id()).loops_per_jiffy) >> 32) + 1); -} -EXPORT_SYMBOL(__const_udelay); - -void __udelay(unsigned long usecs) -{ - __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ -} -EXPORT_SYMBOL(__udelay); - -void __ndelay(unsigned long nsecs) -{ - __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ -} -EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86/lib/getuser_64.S b/arch/x86/lib/getuser.S index 5448876261f..ad374003742 100644 --- a/arch/x86/lib/getuser_64.S +++ b/arch/x86/lib/getuser.S @@ -3,6 +3,7 @@ * * (C) Copyright 1998 Linus Torvalds * (C) Copyright 2005 Andi Kleen + * (C) Copyright 2008 Glauber Costa * * These functions have a non-standard call interface * to make them more efficient, especially as they @@ -13,14 +14,13 @@ /* * __get_user_X * - * Inputs: %rcx contains the address. + * Inputs: %[r|e]ax contains the address. * The register is modified, but all changes are undone * before returning because the C code doesn't know about it. * - * Outputs: %rax is error code (0 or -EFAULT) - * %rdx contains zero-extended value - * - * %r8 is destroyed. + * Outputs: %[r|e]ax is error code (0 or -EFAULT) + * %[r|e]dx contains zero-extended value + * * * These functions should not modify any other registers, * as they get called from within inline assembly. @@ -32,78 +32,73 @@ #include <asm/errno.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> +#include <asm/asm.h> .text ENTRY(__get_user_1) CFI_STARTPROC - GET_THREAD_INFO(%r8) - cmpq threadinfo_addr_limit(%r8),%rcx + GET_THREAD_INFO(%_ASM_DX) + cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user -1: movzb (%rcx),%edx - xorl %eax,%eax +1: movzb (%_ASM_AX),%edx + xor %eax,%eax ret CFI_ENDPROC ENDPROC(__get_user_1) ENTRY(__get_user_2) CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $1,%rcx - jc 20f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 20f - decq %rcx -2: movzwl (%rcx),%edx - xorl %eax,%eax + add $1,%_ASM_AX + jc bad_get_user + GET_THREAD_INFO(%_ASM_DX) + cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user +2: movzwl -1(%_ASM_AX),%edx + xor %eax,%eax ret -20: decq %rcx - jmp bad_get_user CFI_ENDPROC ENDPROC(__get_user_2) ENTRY(__get_user_4) CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $3,%rcx - jc 30f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 30f - subq $3,%rcx -3: movl (%rcx),%edx - xorl %eax,%eax + add $3,%_ASM_AX + jc bad_get_user + GET_THREAD_INFO(%_ASM_DX) + cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user +3: mov -3(%_ASM_AX),%edx + xor %eax,%eax ret -30: subq $3,%rcx - jmp bad_get_user CFI_ENDPROC ENDPROC(__get_user_4) +#ifdef CONFIG_X86_64 ENTRY(__get_user_8) CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $7,%rcx - jc 40f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 40f - subq $7,%rcx -4: movq (%rcx),%rdx - xorl %eax,%eax + add $7,%_ASM_AX + jc bad_get_user + GET_THREAD_INFO(%_ASM_DX) + cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user +4: movq -7(%_ASM_AX),%_ASM_DX + xor %eax,%eax ret -40: subq $7,%rcx - jmp bad_get_user CFI_ENDPROC ENDPROC(__get_user_8) +#endif bad_get_user: CFI_STARTPROC - xorl %edx,%edx - movq $(-EFAULT),%rax + xor %edx,%edx + mov $(-EFAULT),%_ASM_AX ret CFI_ENDPROC END(bad_get_user) .section __ex_table,"a" - .quad 1b,bad_get_user - .quad 2b,bad_get_user - .quad 3b,bad_get_user - .quad 4b,bad_get_user -.previous + _ASM_PTR 1b,bad_get_user + _ASM_PTR 2b,bad_get_user + _ASM_PTR 3b,bad_get_user +#ifdef CONFIG_X86_64 + _ASM_PTR 4b,bad_get_user +#endif diff --git a/arch/x86/lib/getuser_32.S b/arch/x86/lib/getuser_32.S deleted file mode 100644 index 6d84b53f12a..00000000000 --- a/arch/x86/lib/getuser_32.S +++ /dev/null @@ -1,78 +0,0 @@ -/* - * __get_user functions. - * - * (C) Copyright 1998 Linus Torvalds - * - * These functions have a non-standard call interface - * to make them more efficient, especially as they - * return an error value in addition to the "real" - * return value. - */ -#include <linux/linkage.h> -#include <asm/dwarf2.h> -#include <asm/thread_info.h> - - -/* - * __get_user_X - * - * Inputs: %eax contains the address - * - * Outputs: %eax is error code (0 or -EFAULT) - * %edx contains zero-extended value - * - * These functions should not modify any other registers, - * as they get called from within inline assembly. - */ - -.text -ENTRY(__get_user_1) - CFI_STARTPROC - GET_THREAD_INFO(%edx) - cmpl TI_addr_limit(%edx),%eax - jae bad_get_user -1: movzbl (%eax),%edx - xorl %eax,%eax - ret - CFI_ENDPROC -ENDPROC(__get_user_1) - -ENTRY(__get_user_2) - CFI_STARTPROC - addl $1,%eax - jc bad_get_user - GET_THREAD_INFO(%edx) - cmpl TI_addr_limit(%edx),%eax - jae bad_get_user -2: movzwl -1(%eax),%edx - xorl %eax,%eax - ret - CFI_ENDPROC -ENDPROC(__get_user_2) - -ENTRY(__get_user_4) - CFI_STARTPROC - addl $3,%eax - jc bad_get_user - GET_THREAD_INFO(%edx) - cmpl TI_addr_limit(%edx),%eax - jae bad_get_user -3: movl -3(%eax),%edx - xorl %eax,%eax - ret - CFI_ENDPROC -ENDPROC(__get_user_4) - -bad_get_user: - CFI_STARTPROC - xorl %edx,%edx - movl $-14,%eax - ret - CFI_ENDPROC -END(bad_get_user) - -.section __ex_table,"a" - .long 1b,bad_get_user - .long 2b,bad_get_user - .long 3b,bad_get_user -.previous diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c index 57d043fa893..d5a2b39f882 100644 --- a/arch/x86/lib/msr-on-cpu.c +++ b/arch/x86/lib/msr-on-cpu.c @@ -30,10 +30,10 @@ static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe) rv.msr_no = msr_no; if (safe) { - smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 0, 1); + smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); err = rv.err; } else { - smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 0, 1); + smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); } *l = rv.l; *h = rv.h; @@ -64,10 +64,10 @@ static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe) rv.l = l; rv.h = h; if (safe) { - smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 0, 1); + smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); err = rv.err; } else { - smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 0, 1); + smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); } return err; diff --git a/arch/x86/lib/putuser_32.S b/arch/x86/lib/putuser.S index f58fba109d1..36b0d15ae6e 100644 --- a/arch/x86/lib/putuser_32.S +++ b/arch/x86/lib/putuser.S @@ -2,6 +2,8 @@ * __put_user functions. * * (C) Copyright 2005 Linus Torvalds + * (C) Copyright 2005 Andi Kleen + * (C) Copyright 2008 Glauber Costa * * These functions have a non-standard call interface * to make them more efficient, especially as they @@ -11,6 +13,8 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> #include <asm/thread_info.h> +#include <asm/errno.h> +#include <asm/asm.h> /* @@ -26,73 +30,68 @@ */ #define ENTER CFI_STARTPROC ; \ - pushl %ebx ; \ - CFI_ADJUST_CFA_OFFSET 4 ; \ - CFI_REL_OFFSET ebx, 0 ; \ - GET_THREAD_INFO(%ebx) -#define EXIT popl %ebx ; \ - CFI_ADJUST_CFA_OFFSET -4 ; \ - CFI_RESTORE ebx ; \ - ret ; \ + GET_THREAD_INFO(%_ASM_BX) +#define EXIT ret ; \ CFI_ENDPROC .text ENTRY(__put_user_1) ENTER - cmpl TI_addr_limit(%ebx),%ecx + cmp TI_addr_limit(%_ASM_BX),%_ASM_CX jae bad_put_user -1: movb %al,(%ecx) - xorl %eax,%eax +1: movb %al,(%_ASM_CX) + xor %eax,%eax EXIT ENDPROC(__put_user_1) ENTRY(__put_user_2) ENTER - movl TI_addr_limit(%ebx),%ebx - subl $1,%ebx - cmpl %ebx,%ecx + mov TI_addr_limit(%_ASM_BX),%_ASM_BX + sub $1,%_ASM_BX + cmp %_ASM_BX,%_ASM_CX jae bad_put_user -2: movw %ax,(%ecx) - xorl %eax,%eax +2: movw %ax,(%_ASM_CX) + xor %eax,%eax EXIT ENDPROC(__put_user_2) ENTRY(__put_user_4) ENTER - movl TI_addr_limit(%ebx),%ebx - subl $3,%ebx - cmpl %ebx,%ecx + mov TI_addr_limit(%_ASM_BX),%_ASM_BX + sub $3,%_ASM_BX + cmp %_ASM_BX,%_ASM_CX jae bad_put_user -3: movl %eax,(%ecx) - xorl %eax,%eax +3: movl %eax,(%_ASM_CX) + xor %eax,%eax EXIT ENDPROC(__put_user_4) ENTRY(__put_user_8) ENTER - movl TI_addr_limit(%ebx),%ebx - subl $7,%ebx - cmpl %ebx,%ecx + mov TI_addr_limit(%_ASM_BX),%_ASM_BX + sub $7,%_ASM_BX + cmp %_ASM_BX,%_ASM_CX jae bad_put_user -4: movl %eax,(%ecx) -5: movl %edx,4(%ecx) - xorl %eax,%eax +4: mov %_ASM_AX,(%_ASM_CX) +#ifdef CONFIG_X86_32 +5: movl %edx,4(%_ASM_CX) +#endif + xor %eax,%eax EXIT ENDPROC(__put_user_8) bad_put_user: - CFI_STARTPROC simple - CFI_DEF_CFA esp, 2*4 - CFI_OFFSET eip, -1*4 - CFI_OFFSET ebx, -2*4 - movl $-14,%eax + CFI_STARTPROC + movl $-EFAULT,%eax EXIT END(bad_put_user) .section __ex_table,"a" - .long 1b,bad_put_user - .long 2b,bad_put_user - .long 3b,bad_put_user - .long 4b,bad_put_user - .long 5b,bad_put_user + _ASM_PTR 1b,bad_put_user + _ASM_PTR 2b,bad_put_user + _ASM_PTR 3b,bad_put_user + _ASM_PTR 4b,bad_put_user +#ifdef CONFIG_X86_32 + _ASM_PTR 5b,bad_put_user +#endif .previous diff --git a/arch/x86/lib/putuser_64.S b/arch/x86/lib/putuser_64.S deleted file mode 100644 index 4989f5a8fa9..00000000000 --- a/arch/x86/lib/putuser_64.S +++ /dev/null @@ -1,106 +0,0 @@ -/* - * __put_user functions. - * - * (C) Copyright 1998 Linus Torvalds - * (C) Copyright 2005 Andi Kleen - * - * These functions have a non-standard call interface - * to make them more efficient, especially as they - * return an error value in addition to the "real" - * return value. - */ - -/* - * __put_user_X - * - * Inputs: %rcx contains the address - * %rdx contains new value - * - * Outputs: %rax is error code (0 or -EFAULT) - * - * %r8 is destroyed. - * - * These functions should not modify any other registers, - * as they get called from within inline assembly. - */ - -#include <linux/linkage.h> -#include <asm/dwarf2.h> -#include <asm/page.h> -#include <asm/errno.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> - - .text -ENTRY(__put_user_1) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - cmpq threadinfo_addr_limit(%r8),%rcx - jae bad_put_user -1: movb %dl,(%rcx) - xorl %eax,%eax - ret - CFI_ENDPROC -ENDPROC(__put_user_1) - -ENTRY(__put_user_2) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $1,%rcx - jc 20f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 20f - decq %rcx -2: movw %dx,(%rcx) - xorl %eax,%eax - ret -20: decq %rcx - jmp bad_put_user - CFI_ENDPROC -ENDPROC(__put_user_2) - -ENTRY(__put_user_4) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $3,%rcx - jc 30f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 30f - subq $3,%rcx -3: movl %edx,(%rcx) - xorl %eax,%eax - ret -30: subq $3,%rcx - jmp bad_put_user - CFI_ENDPROC -ENDPROC(__put_user_4) - -ENTRY(__put_user_8) - CFI_STARTPROC - GET_THREAD_INFO(%r8) - addq $7,%rcx - jc 40f - cmpq threadinfo_addr_limit(%r8),%rcx - jae 40f - subq $7,%rcx -4: movq %rdx,(%rcx) - xorl %eax,%eax - ret -40: subq $7,%rcx - jmp bad_put_user - CFI_ENDPROC -ENDPROC(__put_user_8) - -bad_put_user: - CFI_STARTPROC - movq $(-EFAULT),%rax - ret - CFI_ENDPROC -END(bad_put_user) - -.section __ex_table,"a" - .quad 1b,bad_put_user - .quad 2b,bad_put_user - .quad 3b,bad_put_user - .quad 4b,bad_put_user -.previous diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S new file mode 100644 index 00000000000..650b11e00ec --- /dev/null +++ b/arch/x86/lib/thunk_32.S @@ -0,0 +1,47 @@ +/* + * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) + * Copyright 2008 by Steven Rostedt, Red Hat, Inc + * (inspired by Andi Kleen's thunk_64.S) + * Subject to the GNU public license, v.2. No warranty of any kind. + */ + + #include <linux/linkage.h> + +#define ARCH_TRACE_IRQS_ON \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_on; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#define ARCH_TRACE_IRQS_OFF \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_off; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#ifdef CONFIG_TRACE_IRQFLAGS + /* put return address in eax (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + pushl %eax + pushl %ecx + pushl %edx + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + call \func + popl %edx + popl %ecx + popl %eax + ret + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller +#endif diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index e009251d4e9..bf9a7d5a542 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -2,6 +2,7 @@ * Save registers before calling assembly functions. This avoids * disturbance of register allocation in some inline assembly constructs. * Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. * Subject to the GNU public license, v.2. No warranty of any kind. */ @@ -42,8 +43,22 @@ #endif #ifdef CONFIG_TRACE_IRQFLAGS - thunk trace_hardirqs_on_thunk,trace_hardirqs_on - thunk trace_hardirqs_off_thunk,trace_hardirqs_off + /* put return address in rdi (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + CFI_STARTPROC + SAVE_ARGS + /* SAVE_ARGS pushs 9 elements */ + /* the next element would be the rip */ + movq 9*8(%rsp), %rdi + call \func + jmp restore + CFI_ENDPROC + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 0c89d1bb028..f4df6e7c718 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -158,3 +158,26 @@ unsigned long copy_in_user(void __user *to, const void __user *from, unsigned le } EXPORT_SYMBOL(copy_in_user); +/* + * Try to copy last bytes and clear the rest if needed. + * Since protection fault in copy_from/to_user is not a normal situation, + * it is not necessary to optimize tail handling. + */ +unsigned long +copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest) +{ + char c; + unsigned zero_len; + + for (; len; --len) { + if (__get_user_nocheck(c, from++, sizeof(char))) + break; + if (__put_user_nocheck(c, to++, sizeof(char))) + break; + } + + for (c = 0, zero_len = len; zerorest && zero_len; --zero_len) + if (__put_user_nocheck(c, to++, sizeof(char))) + break; + return len; +} diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 2f5e277686b..48278fa7d3d 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -10,6 +10,14 @@ #include <asm/e820.h> #include <asm/setup.h> +/* + * Any quirks to be performed to initialize timers/irqs/etc? + */ +int (*arch_time_init_quirk)(void); +int (*arch_pre_intr_init_quirk)(void); +int (*arch_intr_init_quirk)(void); +int (*arch_trap_init_quirk)(void); + #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) #else @@ -29,6 +37,10 @@ int no_broadcast=DEFAULT_SEND_IPI; **/ void __init pre_intr_init_hook(void) { + if (arch_pre_intr_init_quirk) { + if (arch_pre_intr_init_quirk()) + return; + } init_ISA_irqs(); } @@ -52,6 +64,10 @@ static struct irqaction irq2 = { **/ void __init intr_init_hook(void) { + if (arch_intr_init_quirk) { + if (arch_intr_init_quirk()) + return; + } #ifdef CONFIG_X86_LOCAL_APIC apic_intr_init(); #endif @@ -65,7 +81,7 @@ void __init intr_init_hook(void) * * Description: * generally used to activate any machine specific identification - * routines that may be needed before setup_arch() runs. On VISWS + * routines that may be needed before setup_arch() runs. On Voyager * this is used to get the board revision and type. **/ void __init pre_setup_arch_hook(void) @@ -81,6 +97,10 @@ void __init pre_setup_arch_hook(void) **/ void __init trap_init_hook(void) { + if (arch_trap_init_quirk) { + if (arch_trap_init_quirk()) + return; + } } static struct irqaction irq0 = { @@ -99,6 +119,16 @@ static struct irqaction irq0 = { **/ void __init time_init_hook(void) { + if (arch_time_init_quirk) { + /* + * A nonzero return code does not mean failure, it means + * that the architecture quirk does not want any + * generic (timer) setup to be performed after this: + */ + if (arch_time_init_quirk()) + return; + } + irq0.mask = cpumask_of_cpu(0); setup_irq(0, &irq0); } diff --git a/arch/x86/mach-visws/Makefile b/arch/x86/mach-visws/Makefile deleted file mode 100644 index 835fd96ad76..00000000000 --- a/arch/x86/mach-visws/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# -# Makefile for the linux kernel. -# - -obj-y := setup.o traps.o reboot.o - -obj-$(CONFIG_X86_VISWS_APIC) += visws_apic.o -obj-$(CONFIG_X86_LOCAL_APIC) += mpparse.o diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c deleted file mode 100644 index a2fb78c0d15..00000000000 --- a/arch/x86/mach-visws/mpparse.c +++ /dev/null @@ -1,85 +0,0 @@ - -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/smp.h> -#include <asm/io.h> - -#include "cobalt.h" -#include "mach_apic.h" - -extern unsigned int __cpuinitdata maxcpus; - -/* - * The Visual Workstation is Intel MP compliant in the hardware - * sense, but it doesn't have a BIOS(-configuration table). - * No problem for Linux. - */ - -static void __init MP_processor_info (struct mpc_config_processor *m) -{ - int ver, logical_apicid; - physid_mask_t apic_cpus; - - if (!(m->mpc_cpuflag & CPU_ENABLED)) - return; - - logical_apicid = m->mpc_apicid; - printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n", - m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", - m->mpc_apicid, - (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, - (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, - m->mpc_apicver); - - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) - boot_cpu_physical_apicid = m->mpc_apicid; - - ver = m->mpc_apicver; - if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) { - printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", - m->mpc_apicid, MAX_APICS); - return; - } - - apic_cpus = apicid_to_cpu_present(m->mpc_apicid); - physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); - /* - * Validate version - */ - if (ver == 0x0) { - printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - m->mpc_apicid); - ver = 0x10; - } - apic_version[m->mpc_apicid] = ver; -} - -void __init find_smp_config(void) -{ - struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); - unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); - - if (ncpus > CO_CPU_MAX) { - printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n", - ncpus, mp); - - ncpus = CO_CPU_MAX; - } - - if (ncpus > maxcpus) - ncpus = maxcpus; - -#ifdef CONFIG_X86_LOCAL_APIC - smp_found_config = 1; -#endif - while (ncpus--) - MP_processor_info(mp++); - - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; -} - -void __init get_smp_config (void) -{ -} diff --git a/arch/x86/mach-visws/reboot.c b/arch/x86/mach-visws/reboot.c deleted file mode 100644 index 99332abfad4..00000000000 --- a/arch/x86/mach-visws/reboot.c +++ /dev/null @@ -1,55 +0,0 @@ -#include <linux/module.h> -#include <linux/smp.h> -#include <linux/delay.h> - -#include <asm/io.h> -#include "piix4.h" - -void (*pm_power_off)(void); -EXPORT_SYMBOL(pm_power_off); - -void machine_shutdown(void) -{ -#ifdef CONFIG_SMP - smp_send_stop(); -#endif -} - -void machine_emergency_restart(void) -{ - /* - * Visual Workstations restart after this - * register is poked on the PIIX4 - */ - outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT); -} - -void machine_restart(char * __unused) -{ - machine_shutdown(); - machine_emergency_restart(); -} - -void machine_power_off(void) -{ - unsigned short pm_status; - extern unsigned int pci_bus0; - - while ((pm_status = inw(PMSTS_PORT)) & 0x100) - outw(pm_status, PMSTS_PORT); - - outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT); - - mdelay(10); - -#define PCI_CONF1_ADDRESS(bus, devfn, reg) \ - (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) - - outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); - outl(PIIX_SPECIAL_STOP, 0xCFC); -} - -void machine_halt(void) -{ -} - diff --git a/arch/x86/mach-visws/setup.c b/arch/x86/mach-visws/setup.c deleted file mode 100644 index d67868ec9b7..00000000000 --- a/arch/x86/mach-visws/setup.c +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Unmaintained SGI Visual Workstation support. - * Split out from setup.c by davej@suse.de - */ - -#include <linux/smp.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/module.h> - -#include <asm/fixmap.h> -#include <asm/arch_hooks.h> -#include <asm/io.h> -#include <asm/e820.h> -#include <asm/setup.h> -#include "cobalt.h" -#include "piix4.h" - -int no_broadcast; - -char visws_board_type = -1; -char visws_board_rev = -1; - -void __init visws_get_board_type_and_rev(void) -{ - int raw; - - visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG) - >> PIIX_GPI_BD_SHIFT; - /* - * Get Board rev. - * First, we have to initialize the 307 part to allow us access - * to the GPIO registers. Let's map them at 0x0fc0 which is right - * after the PIIX4 PM section. - */ - outb_p(SIO_DEV_SEL, SIO_INDEX); - outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */ - - outb_p(SIO_DEV_MSB, SIO_INDEX); - outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */ - - outb_p(SIO_DEV_LSB, SIO_INDEX); - outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */ - - outb_p(SIO_DEV_ENB, SIO_INDEX); - outb_p(1, SIO_DATA); /* Enable GPIO registers. */ - - /* - * Now, we have to map the power management section to write - * a bit which enables access to the GPIO registers. - * What lunatic came up with this shit? - */ - outb_p(SIO_DEV_SEL, SIO_INDEX); - outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */ - - outb_p(SIO_DEV_MSB, SIO_INDEX); - outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */ - - outb_p(SIO_DEV_LSB, SIO_INDEX); - outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */ - - outb_p(SIO_DEV_ENB, SIO_INDEX); - outb_p(1, SIO_DATA); /* Enable PM registers. */ - - /* - * Now, write the PM register which enables the GPIO registers. - */ - outb_p(SIO_PM_FER2, SIO_PM_INDEX); - outb_p(SIO_PM_GP_EN, SIO_PM_DATA); - - /* - * Now, initialize the GPIO registers. - * We want them all to be inputs which is the - * power on default, so let's leave them alone. - * So, let's just read the board rev! - */ - raw = inb_p(SIO_GP_DATA1); - raw &= 0x7f; /* 7 bits of valid board revision ID. */ - - if (visws_board_type == VISWS_320) { - if (raw < 0x6) { - visws_board_rev = 4; - } else if (raw < 0xc) { - visws_board_rev = 5; - } else { - visws_board_rev = 6; - } - } else if (visws_board_type == VISWS_540) { - visws_board_rev = 2; - } else { - visws_board_rev = raw; - } - - printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n", - (visws_board_type == VISWS_320 ? "320" : - (visws_board_type == VISWS_540 ? "540" : - "unknown")), visws_board_rev); -} - -void __init pre_intr_init_hook(void) -{ - init_VISWS_APIC_irqs(); -} - -void __init intr_init_hook(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - apic_intr_init(); -#endif -} - -void __init pre_setup_arch_hook() -{ - visws_get_board_type_and_rev(); -} - -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_IRQPOLL, - .name = "timer", -}; - -void __init time_init_hook(void) -{ - printk(KERN_INFO "Starting Cobalt Timer system clock\n"); - - /* Set the countdown value */ - co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ); - - /* Start the timer */ - co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN); - - /* Enable (unmask) the timer interrupt */ - co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); - - /* Wire cpu IDT entry to s/w handler (and Cobalt APIC to IDT) */ - setup_irq(0, &irq0); -} - -/* Hook for machine specific memory setup. */ - -#define MB (1024 * 1024) - -unsigned long sgivwfb_mem_phys; -unsigned long sgivwfb_mem_size; -EXPORT_SYMBOL(sgivwfb_mem_phys); -EXPORT_SYMBOL(sgivwfb_mem_size); - -long long mem_size __initdata = 0; - -char * __init machine_specific_memory_setup(void) -{ - long long gfx_mem_size = 8 * MB; - - mem_size = boot_params.alt_mem_k; - - if (!mem_size) { - printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n"); - mem_size = 128 * MB; - } - - /* - * this hardcodes the graphics memory to 8 MB - * it really should be sized dynamically (or at least - * set as a boot param) - */ - if (!sgivwfb_mem_size) { - printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n"); - sgivwfb_mem_size = 8 * MB; - } - - /* - * Trim to nearest MB - */ - sgivwfb_mem_size &= ~((1 << 20) - 1); - sgivwfb_mem_phys = mem_size - gfx_mem_size; - - e820_add_region(0, LOWMEMSIZE(), E820_RAM); - e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM); - e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED); - - return "PROM"; -} diff --git a/arch/x86/mach-visws/traps.c b/arch/x86/mach-visws/traps.c deleted file mode 100644 index bfac6ba10f8..00000000000 --- a/arch/x86/mach-visws/traps.c +++ /dev/null @@ -1,69 +0,0 @@ -/* VISWS traps */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/pci.h> -#include <linux/pci_ids.h> - -#include <asm/io.h> -#include <asm/arch_hooks.h> -#include <asm/apic.h> -#include "cobalt.h" -#include "lithium.h" - - -#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4) -#define BCD (LI_INTB | LI_INTC | LI_INTD) -#define ALLDEVS (A01234 | BCD) - -static __init void lithium_init(void) -{ - set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS); - set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS); - - if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || - (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { - printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A'); - panic("This machine is not SGI Visual Workstation 320/540"); - } - - if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || - (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { - printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B'); - panic("This machine is not SGI Visual Workstation 320/540"); - } - - li_pcia_write16(LI_PCI_INTEN, ALLDEVS); - li_pcib_write16(LI_PCI_INTEN, ALLDEVS); -} - -static __init void cobalt_init(void) -{ - /* - * On normal SMP PC this is used only with SMP, but we have to - * use it and set it up here to start the Cobalt clock - */ - set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); - setup_local_APIC(); - printk(KERN_INFO "Local APIC Version %#x, ID %#x\n", - (unsigned int)apic_read(APIC_LVR), - (unsigned int)apic_read(APIC_ID)); - - set_fixmap(FIX_CO_CPU, CO_CPU_PHYS); - set_fixmap(FIX_CO_APIC, CO_APIC_PHYS); - printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n", - co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID)); - - /* Enable Cobalt APIC being careful to NOT change the ID! */ - co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE); - - printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n", - co_apic_read(CO_APIC_ID)); -} - -void __init trap_init_hook(void) -{ - lithium_init(); - cobalt_init(); -} diff --git a/arch/x86/mach-visws/visws_apic.c b/arch/x86/mach-visws/visws_apic.c deleted file mode 100644 index d8b2cfd85d9..00000000000 --- a/arch/x86/mach-visws/visws_apic.c +++ /dev/null @@ -1,296 +0,0 @@ -/* - * Copyright (C) 1999 Bent Hagemark, Ingo Molnar - * - * SGI Visual Workstation interrupt controller - * - * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC - * which serves as the main interrupt controller in the system. Non-legacy - * hardware in the system uses this controller directly. Legacy devices - * are connected to the PIIX4 which in turn has its 8259(s) connected to - * a of the Cobalt APIC entry. - * - * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com - * - * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru> - */ - -#include <linux/kernel_stat.h> -#include <linux/interrupt.h> -#include <linux/init.h> - -#include <asm/io.h> -#include <asm/apic.h> -#include <asm/i8259.h> -#include <asm/irq_vectors.h> - -#include "cobalt.h" - -static DEFINE_SPINLOCK(cobalt_lock); - -/* - * Set the given Cobalt APIC Redirection Table entry to point - * to the given IDT vector/index. - */ -static inline void co_apic_set(int entry, int irq) -{ - co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR)); - co_apic_write(CO_APIC_HI(entry), 0); -} - -/* - * Cobalt (IO)-APIC functions to handle PCI devices. - */ -static inline int co_apic_ide0_hack(void) -{ - extern char visws_board_type; - extern char visws_board_rev; - - if (visws_board_type == VISWS_320 && visws_board_rev == 5) - return 5; - return CO_APIC_IDE0; -} - -static int is_co_apic(unsigned int irq) -{ - if (IS_CO_APIC(irq)) - return CO_APIC(irq); - - switch (irq) { - case 0: return CO_APIC_CPU; - case CO_IRQ_IDE0: return co_apic_ide0_hack(); - case CO_IRQ_IDE1: return CO_APIC_IDE1; - default: return -1; - } -} - - -/* - * This is the SGI Cobalt (IO-)APIC: - */ - -static void enable_cobalt_irq(unsigned int irq) -{ - co_apic_set(is_co_apic(irq), irq); -} - -static void disable_cobalt_irq(unsigned int irq) -{ - int entry = is_co_apic(irq); - - co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK); - co_apic_read(CO_APIC_LO(entry)); -} - -/* - * "irq" really just serves to identify the device. Here is where we - * map this to the Cobalt APIC entry where it's physically wired. - * This is called via request_irq -> setup_irq -> irq_desc->startup() - */ -static unsigned int startup_cobalt_irq(unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&cobalt_lock, flags); - if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) - irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); - enable_cobalt_irq(irq); - spin_unlock_irqrestore(&cobalt_lock, flags); - return 0; -} - -static void ack_cobalt_irq(unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&cobalt_lock, flags); - disable_cobalt_irq(irq); - apic_write(APIC_EOI, APIC_EIO_ACK); - spin_unlock_irqrestore(&cobalt_lock, flags); -} - -static void end_cobalt_irq(unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&cobalt_lock, flags); - if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS))) - enable_cobalt_irq(irq); - spin_unlock_irqrestore(&cobalt_lock, flags); -} - -static struct irq_chip cobalt_irq_type = { - .typename = "Cobalt-APIC", - .startup = startup_cobalt_irq, - .shutdown = disable_cobalt_irq, - .enable = enable_cobalt_irq, - .disable = disable_cobalt_irq, - .ack = ack_cobalt_irq, - .end = end_cobalt_irq, -}; - - -/* - * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt - * -- not the manner expected by the code in i8259.c. - * - * there is a 'master' physical interrupt source that gets sent to - * the CPU. But in the chipset there are various 'virtual' interrupts - * waiting to be handled. We represent this to Linux through a 'master' - * interrupt controller type, and through a special virtual interrupt- - * controller. Device drivers only see the virtual interrupt sources. - */ -static unsigned int startup_piix4_master_irq(unsigned int irq) -{ - init_8259A(0); - - return startup_cobalt_irq(irq); -} - -static void end_piix4_master_irq(unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&cobalt_lock, flags); - enable_cobalt_irq(irq); - spin_unlock_irqrestore(&cobalt_lock, flags); -} - -static struct irq_chip piix4_master_irq_type = { - .typename = "PIIX4-master", - .startup = startup_piix4_master_irq, - .ack = ack_cobalt_irq, - .end = end_piix4_master_irq, -}; - - -static struct irq_chip piix4_virtual_irq_type = { - .typename = "PIIX4-virtual", - .shutdown = disable_8259A_irq, - .enable = enable_8259A_irq, - .disable = disable_8259A_irq, -}; - - -/* - * PIIX4-8259 master/virtual functions to handle interrupt requests - * from legacy devices: floppy, parallel, serial, rtc. - * - * None of these get Cobalt APIC entries, neither do they have IDT - * entries. These interrupts are purely virtual and distributed from - * the 'master' interrupt source: CO_IRQ_8259. - * - * When the 8259 interrupts its handler figures out which of these - * devices is interrupting and dispatches to its handler. - * - * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/ - * enable_irq gets the right irq. This 'master' irq is never directly - * manipulated by any driver. - */ -static irqreturn_t piix4_master_intr(int irq, void *dev_id) -{ - int realirq; - irq_desc_t *desc; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - - /* Find out what's interrupting in the PIIX4 master 8259 */ - outb(0x0c, 0x20); /* OCW3 Poll command */ - realirq = inb(0x20); - - /* - * Bit 7 == 0 means invalid/spurious - */ - if (unlikely(!(realirq & 0x80))) - goto out_unlock; - - realirq &= 7; - - if (unlikely(realirq == 2)) { - outb(0x0c, 0xa0); - realirq = inb(0xa0); - - if (unlikely(!(realirq & 0x80))) - goto out_unlock; - - realirq = (realirq & 7) + 8; - } - - /* mask and ack interrupt */ - cached_irq_mask |= 1 << realirq; - if (unlikely(realirq > 7)) { - inb(0xa1); - outb(cached_slave_mask, 0xa1); - outb(0x60 + (realirq & 7), 0xa0); - outb(0x60 + 2, 0x20); - } else { - inb(0x21); - outb(cached_master_mask, 0x21); - outb(0x60 + realirq, 0x20); - } - - spin_unlock_irqrestore(&i8259A_lock, flags); - - desc = irq_desc + realirq; - - /* - * handle this 'virtual interrupt' as a Cobalt one now. - */ - kstat_cpu(smp_processor_id()).irqs[realirq]++; - - if (likely(desc->action != NULL)) - handle_IRQ_event(realirq, desc->action); - - if (!(desc->status & IRQ_DISABLED)) - enable_8259A_irq(realirq); - - return IRQ_HANDLED; - -out_unlock: - spin_unlock_irqrestore(&i8259A_lock, flags); - return IRQ_NONE; -} - -static struct irqaction master_action = { - .handler = piix4_master_intr, - .name = "PIIX4-8259", -}; - -static struct irqaction cascade_action = { - .handler = no_action, - .name = "cascade", -}; - - -void init_VISWS_APIC_irqs(void) -{ - int i; - - for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = 0; - irq_desc[i].depth = 1; - - if (i == 0) { - irq_desc[i].chip = &cobalt_irq_type; - } - else if (i == CO_IRQ_IDE0) { - irq_desc[i].chip = &cobalt_irq_type; - } - else if (i == CO_IRQ_IDE1) { - irq_desc[i].chip = &cobalt_irq_type; - } - else if (i == CO_IRQ_8259) { - irq_desc[i].chip = &piix4_master_irq_type; - } - else if (i < CO_IRQ_APIC0) { - irq_desc[i].chip = &piix4_virtual_irq_type; - } - else if (IS_CO_APIC(i)) { - irq_desc[i].chip = &cobalt_irq_type; - } - } - - setup_irq(CO_IRQ_8259, &master_action); - setup_irq(2, &cascade_action); -} diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 8dedd01e909..ee0fba09215 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -950,94 +950,24 @@ static void smp_stop_cpu_function(void *dummy) halt(); } -static DEFINE_SPINLOCK(call_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - volatile unsigned long started; - volatile unsigned long finished; - int wait; -}; - -static struct call_data_struct *call_data; - /* execute a thread on a new CPU. The function to be called must be * previously set up. This is used to schedule a function for * execution on all CPUs - set up the function then broadcast a * function_interrupt CPI to come here on each CPU */ static void smp_call_function_interrupt(void) { - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - /* must take copy of wait because call_data may be replaced - * unless the function is waiting for us to finish */ - int wait = call_data->wait; - __u8 cpu = smp_processor_id(); - - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - if (!test_and_clear_bit(cpu, &call_data->started)) { - /* If the bit wasn't set, this could be a replay */ - printk(KERN_WARNING "VOYAGER SMP: CPU %d received call funtion" - " with no call pending\n", cpu); - return; - } - /* - * At this point the info structure may be out of scope unless wait==1 - */ irq_enter(); - (*func) (info); + generic_smp_call_function_interrupt(); __get_cpu_var(irq_stat).irq_call_count++; irq_exit(); - if (wait) { - mb(); - clear_bit(cpu, &call_data->finished); - } } -static int -voyager_smp_call_function_mask(cpumask_t cpumask, - void (*func) (void *info), void *info, int wait) +static void smp_call_function_single_interrupt(void) { - struct call_data_struct data; - u32 mask = cpus_addr(cpumask)[0]; - - mask &= ~(1 << smp_processor_id()); - - if (!mask) - return 0; - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - data.func = func; - data.info = info; - data.started = mask; - data.wait = wait; - if (wait) - data.finished = mask; - - spin_lock(&call_lock); - call_data = &data; - wmb(); - /* Send a message to all other CPUs and wait for them to respond */ - send_CPI(mask, VIC_CALL_FUNCTION_CPI); - - /* Wait for response */ - while (data.started) - barrier(); - - if (wait) - while (data.finished) - barrier(); - - spin_unlock(&call_lock); - - return 0; + irq_enter(); + generic_smp_call_function_single_interrupt(); + __get_cpu_var(irq_stat).irq_call_count++; + irq_exit(); } /* Sorry about the name. In an APIC based system, the APICs @@ -1094,6 +1024,12 @@ void smp_qic_call_function_interrupt(struct pt_regs *regs) smp_call_function_interrupt(); } +void smp_qic_call_function_single_interrupt(struct pt_regs *regs) +{ + ack_QIC_CPI(QIC_CALL_FUNCTION_SINGLE_CPI); + smp_call_function_single_interrupt(); +} + void smp_vic_cpi_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -1114,6 +1050,8 @@ void smp_vic_cpi_interrupt(struct pt_regs *regs) smp_enable_irq_interrupt(); if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu])) smp_call_function_interrupt(); + if (test_and_clear_bit(VIC_CALL_FUNCTION_SINGLE_CPI, &vic_cpi_mailbox[cpu])) + smp_call_function_single_interrupt(); set_irq_regs(old_regs); } @@ -1129,7 +1067,7 @@ static void do_flush_tlb_all(void *info) /* flush the TLB of every active CPU in the system */ void flush_tlb_all(void) { - on_each_cpu(do_flush_tlb_all, 0, 1, 1); + on_each_cpu(do_flush_tlb_all, 0, 1); } /* send a reschedule CPI to one CPU by physical CPU number*/ @@ -1161,7 +1099,7 @@ int safe_smp_processor_id(void) /* broadcast a halt to all other CPUs */ static void voyager_smp_send_stop(void) { - smp_call_function(smp_stop_cpu_function, NULL, 1, 1); + smp_call_function(smp_stop_cpu_function, NULL, 1); } /* this function is triggered in time.c when a clock tick fires @@ -1848,5 +1786,7 @@ struct smp_ops smp_ops = { .smp_send_stop = voyager_smp_send_stop, .smp_send_reschedule = voyager_smp_send_reschedule, - .smp_call_function_mask = voyager_smp_call_function_mask, + + .send_call_func_ipi = native_send_call_func_ipi, + .send_call_func_single_ipi = native_send_call_func_single_ipi, }; diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index c107641cd39..9873716e9f7 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -8,6 +8,11 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o +obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o +obj-$(CONFIG_MMIOTRACE) += mmiotrace.o +mmiotrace-y := pf_in.o mmio-mod.o +obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o + ifeq ($(CONFIG_X86_32),y) obj-$(CONFIG_NUMA) += discontig_32.o else diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index d0f5fce77d9..455f3fe67b4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -10,6 +10,7 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/ptrace.h> +#include <linux/mmiotrace.h> #include <linux/mman.h> #include <linux/mm.h> #include <linux/smp.h> @@ -49,6 +50,16 @@ #define PF_RSVD (1<<3) #define PF_INSTR (1<<4) +static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) +{ +#ifdef CONFIG_MMIOTRACE_HOOKS + if (unlikely(is_kmmio_active())) + if (kmmio_handler(regs, addr) == 1) + return -1; +#endif + return 0; +} + static inline int notify_page_fault(struct pt_regs *regs) { #ifdef CONFIG_KPROBES @@ -598,6 +609,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (notify_page_fault(regs)) return; + if (unlikely(kmmio_fault(regs, address))) + return; /* * We fault-in kernel-space virtual memory on-demand. The diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index b5a0fd5f4c5..9689a5138e6 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -50,6 +50,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; +unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); @@ -1034,6 +1035,8 @@ void mark_rodata_ro(void) unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; +#ifndef CONFIG_DYNAMIC_FTRACE + /* Dynamic tracing modifies the kernel text section */ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel text: %luk\n", size >> 10); @@ -1046,6 +1049,8 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: write protecting again\n"); set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); #endif +#endif /* CONFIG_DYNAMIC_FTRACE */ + start += size; size = (unsigned long)__end_rodata - start; set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 77d129d62c9..306049edd55 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -53,6 +53,7 @@ * The direct mapping extends to max_pfn_mapped, so that we can directly access * apertures, ACPI and other tables without having to play with fixmaps. */ +unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; static unsigned long dma_reserve __initdata; @@ -202,6 +203,46 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) } /* + * Create large page table mappings for a range of physical addresses. + */ +static void __init __init_extra_mapping(unsigned long phys, unsigned long size, + pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); + for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { + pgd = pgd_offset_k((unsigned long)__va(phys)); + if (pgd_none(*pgd)) { + pud = (pud_t *) spp_getpage(); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE | + _PAGE_USER)); + } + pud = pud_offset(pgd, (unsigned long)__va(phys)); + if (pud_none(*pud)) { + pmd = (pmd_t *) spp_getpage(); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | + _PAGE_USER)); + } + pmd = pmd_offset(pud, phys); + BUG_ON(!pmd_none(*pmd)); + set_pmd(pmd, __pmd(phys | pgprot_val(prot))); + } +} + +void __init init_extra_mapping_wb(unsigned long phys, unsigned long size) +{ + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE); +} + +void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) +{ + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE); +} + +/* * The head.S code sets up the kernel high mapping: * * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) @@ -262,11 +303,13 @@ static __meminit void unmap_low_page(void *adr) early_iounmap(adr, PAGE_SIZE); } -static void __meminit +static unsigned long __meminit phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) { unsigned pages = 0; + unsigned long last_map_addr = end; int i; + pte_t *pte = pte_page + pte_index(addr); for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { @@ -286,23 +329,28 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) printk(" pte=%p addr=%lx pte=%016lx\n", pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL)); + last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; pages++; } update_page_count(PG_LEVEL_4K, pages); + + return last_map_addr; } -static void __meminit +static unsigned long __meminit phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) { pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); - phys_pte_init(pte, address, end); + return phys_pte_init(pte, address, end); } static unsigned long __meminit -phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, + unsigned long page_size_mask) { unsigned long pages = 0; + unsigned long last_map_addr = end; int i = pmd_index(address); @@ -321,42 +369,46 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) if (pmd_val(*pmd)) { if (!pmd_large(*pmd)) - phys_pte_update(pmd, address, end); + last_map_addr = phys_pte_update(pmd, address, + end); continue; } - if (cpu_has_pse) { + if (page_size_mask & (1<<PG_LEVEL_2M)) { pages++; set_pte((pte_t *)pmd, pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + last_map_addr = (address & PMD_MASK) + PMD_SIZE; continue; } pte = alloc_low_page(&pte_phys); - phys_pte_init(pte, address, end); + last_map_addr = phys_pte_init(pte, address, end); unmap_low_page(pte); pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); } update_page_count(PG_LEVEL_2M, pages); - return address; + return last_map_addr; } static unsigned long __meminit -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, + unsigned long page_size_mask) { pmd_t *pmd = pmd_offset(pud, 0); unsigned long last_map_addr; spin_lock(&init_mm.page_table_lock); - last_map_addr = phys_pmd_init(pmd, address, end); + last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); spin_unlock(&init_mm.page_table_lock); __flush_tlb_all(); return last_map_addr; } static unsigned long __meminit -phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, + unsigned long page_size_mask) { unsigned long pages = 0; unsigned long last_map_addr = end; @@ -378,11 +430,12 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) if (pud_val(*pud)) { if (!pud_large(*pud)) - last_map_addr = phys_pmd_update(pud, addr, end); + last_map_addr = phys_pmd_update(pud, addr, end, + page_size_mask); continue; } - if (direct_gbpages) { + if (page_size_mask & (1<<PG_LEVEL_1G)) { pages++; set_pte((pte_t *)pud, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); @@ -393,7 +446,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) pmd = alloc_low_page(&pmd_phys); spin_lock(&init_mm.page_table_lock); - last_map_addr = phys_pmd_init(pmd, addr, end); + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask); unmap_low_page(pmd); pud_populate(&init_mm, pud, __va(pmd_phys)); spin_unlock(&init_mm.page_table_lock); @@ -406,29 +459,37 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) } static unsigned long __meminit -phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end) +phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, + unsigned long page_size_mask) { pud_t *pud; pud = (pud_t *)pgd_page_vaddr(*pgd); - return phys_pud_init(pud, addr, end); + return phys_pud_init(pud, addr, end, page_size_mask); } static void __init find_early_table_space(unsigned long end) { - unsigned long puds, tables, start; + unsigned long puds, pmds, ptes, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); - if (!direct_gbpages) { - unsigned long pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); - } - if (!cpu_has_pse) { - unsigned long ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); - } + if (direct_gbpages) { + unsigned long extra; + extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); + pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); + + if (cpu_has_pse) { + unsigned long extra; + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); /* * RED-PEN putting page tables only on node 0 could @@ -568,29 +629,12 @@ static void __init early_memtest(unsigned long start, unsigned long end) } #endif -/* - * Setup the direct mapping of the physical memory at PAGE_OFFSET. - * This runs before bootmem is initialized and gets pages directly from - * the physical memory. To access them they are temporarily mapped. - */ -unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) +static unsigned long __init kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask) { - unsigned long next, last_map_addr = end; - unsigned long start_phys = start, end_phys = end; - printk(KERN_INFO "init_memory_mapping\n"); - - /* - * Find space for the kernel direct mapping tables. - * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ - if (!after_bootmem) { - init_gbpages(); - find_early_table_space(end); - } + unsigned long next, last_map_addr = end; start = (unsigned long)__va(start); end = (unsigned long)__va(end); @@ -600,12 +644,13 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon unsigned long pud_phys; pud_t *pud; - next = start + PGDIR_SIZE; + next = (start + PGDIR_SIZE) & PGDIR_MASK; if (next > end) next = end; if (pgd_val(*pgd)) { - last_map_addr = phys_pud_update(pgd, __pa(start), __pa(end)); + last_map_addr = phys_pud_update(pgd, __pa(start), + __pa(end), page_size_mask); continue; } @@ -614,22 +659,151 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon else pud = alloc_low_page(&pud_phys); - last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), + page_size_mask); unmap_low_page(pud); pgd_populate(&init_mm, pgd_offset_k(start), __va(pud_phys)); } + return last_map_addr; +} + +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +#define NR_RANGE_MR 5 + +static int save_mr(struct map_range *mr, int nr_range, + unsigned long start_pfn, unsigned long end_pfn, + unsigned long page_size_mask) +{ + + if (start_pfn < end_pfn) { + if (nr_range >= NR_RANGE_MR) + panic("run out of range for init_memory_mapping\n"); + mr[nr_range].start = start_pfn<<PAGE_SHIFT; + mr[nr_range].end = end_pfn<<PAGE_SHIFT; + mr[nr_range].page_size_mask = page_size_mask; + nr_range++; + } + + return nr_range; +} + +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. + */ +unsigned long __init_refok init_memory_mapping(unsigned long start, + unsigned long end) +{ + unsigned long last_map_addr = 0; + unsigned long page_size_mask = 0; + unsigned long start_pfn, end_pfn; + + struct map_range mr[NR_RANGE_MR]; + int nr_range, i; + + printk(KERN_INFO "init_memory_mapping\n"); + + /* + * Find space for the kernel direct mapping tables. + * + * Later we should allocate these tables in the local node of the + * memory mapped. Unfortunately this is done currently before the + * nodes are discovered. + */ + if (!after_bootmem) + init_gbpages(); + + if (direct_gbpages) + page_size_mask |= 1 << PG_LEVEL_1G; + if (cpu_has_pse) + page_size_mask |= 1 << PG_LEVEL_2M; + + memset(mr, 0, sizeof(mr)); + nr_range = 0; + + /* head if not big page alignment ?*/ + start_pfn = start >> PAGE_SHIFT; + end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + + /* big page (2M) range*/ + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT))) + end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)); + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<<PG_LEVEL_2M)); + + /* big page (1G) range */ + start_pfn = end_pfn; + end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & + ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); + + /* tail is not big page (1G) alignment */ + start_pfn = end_pfn; + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<<PG_LEVEL_2M)); + + /* tail is not big page (2M) alignment */ + start_pfn = end_pfn; + end_pfn = end>>PAGE_SHIFT; + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + + /* try to merge same page size and continuous */ + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { + unsigned long old_start; + if (mr[i].end != mr[i+1].start || + mr[i].page_size_mask != mr[i+1].page_size_mask) + continue; + /* move it */ + old_start = mr[i].start; + memmove(&mr[i], &mr[i+1], + (nr_range - 1 - i) * sizeof (struct map_range)); + mr[i].start = old_start; + nr_range--; + } + + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG " %010lx - %010lx page %s\n", + mr[i].start, mr[i].end, + (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( + (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); + + if (!after_bootmem) + find_early_table_space(end); + + for (i = 0; i < nr_range; i++) + last_map_addr = kernel_physical_mapping_init( + mr[i].start, mr[i].end, + mr[i].page_size_mask); + if (!after_bootmem) mmu_cr4_features = read_cr4(); __flush_tlb_all(); - if (!after_bootmem) + if (!after_bootmem && table_end > table_start) reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE"); + printk(KERN_INFO "last_map_addr: %lx end: %lx\n", + last_map_addr, end); + if (!after_bootmem) - early_memtest(start_phys, end_phys); + early_memtest(start, end); return last_map_addr >> PAGE_SHIFT; } @@ -817,6 +991,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data); void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); + unsigned long rodata_start = + ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + +#ifdef CONFIG_DYNAMIC_FTRACE + /* Dynamic tracing modifies the kernel text section */ + start = rodata_start; +#endif printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -826,8 +1007,7 @@ void mark_rodata_ro(void) * The rodata section (but not the kernel text!) should also be * not-executable. */ - start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; - set_memory_nx(start, (end - start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); rodata_test(); @@ -1036,9 +1216,6 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) PAGE_KERNEL_LARGE); set_pmd(pmd, __pmd(pte_val(entry))); - addr_end = addr + PMD_SIZE; - p_end = p + PMD_SIZE; - /* check to see if we have contiguous blocks */ if (p_end != p || node_start != node) { if (p_start) @@ -1048,6 +1225,9 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) node_start = node; p_start = p; } + + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; } else vmemmap_verify((pte_t *)pmd, node, addr, next); } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 115f13ee40c..24c1d3c3018 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/mmiotrace.h> #include <asm/cacheflush.h> #include <asm/e820.h> @@ -122,10 +123,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, { unsigned long pfn, offset, vaddr; resource_size_t last_addr; + const resource_size_t unaligned_phys_addr = phys_addr; + const unsigned long unaligned_size = size; struct vm_struct *area; unsigned long new_prot_val; pgprot_t prot; int retval; + void __iomem *ret_addr; /* Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; @@ -233,7 +237,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return NULL; } - return (void __iomem *) (vaddr + offset); + ret_addr = (void __iomem *) (vaddr + offset); + mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); + + return ret_addr; } /** @@ -348,6 +355,8 @@ void iounmap(volatile void __iomem *addr) addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); + mmiotrace_iounmap(addr); + /* Use the vm area unlocked, assuming the caller ensures there isn't another iounmap for the same address in parallel. Reuse of the virtual address is prevented by diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c new file mode 100644 index 00000000000..93d82038af4 --- /dev/null +++ b/arch/x86/mm/kmmio.c @@ -0,0 +1,510 @@ +/* Support for MMIO probes. + * Benfit many code from kprobes + * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>. + * 2007 Alexander Eichner + * 2008 Pekka Paalanen <pq@iki.fi> + */ + +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/spinlock.h> +#include <linux/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/uaccess.h> +#include <linux/ptrace.h> +#include <linux/preempt.h> +#include <linux/percpu.h> +#include <linux/kdebug.h> +#include <linux/mutex.h> +#include <linux/io.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <linux/errno.h> +#include <asm/debugreg.h> +#include <linux/mmiotrace.h> + +#define KMMIO_PAGE_HASH_BITS 4 +#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS) + +struct kmmio_fault_page { + struct list_head list; + struct kmmio_fault_page *release_next; + unsigned long page; /* location of the fault page */ + + /* + * Number of times this page has been registered as a part + * of a probe. If zero, page is disarmed and this may be freed. + * Used only by writers (RCU). + */ + int count; +}; + +struct kmmio_delayed_release { + struct rcu_head rcu; + struct kmmio_fault_page *release_list; +}; + +struct kmmio_context { + struct kmmio_fault_page *fpage; + struct kmmio_probe *probe; + unsigned long saved_flags; + unsigned long addr; + int active; +}; + +static DEFINE_SPINLOCK(kmmio_lock); + +/* Protected by kmmio_lock */ +unsigned int kmmio_count; + +/* Read-protected by RCU, write-protected by kmmio_lock. */ +static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; +static LIST_HEAD(kmmio_probes); + +static struct list_head *kmmio_page_list(unsigned long page) +{ + return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; +} + +/* Accessed per-cpu */ +static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); + +/* + * this is basically a dynamic stabbing problem: + * Could use the existing prio tree code or + * Possible better implementations: + * The Interval Skip List: A Data Structure for Finding All Intervals That + * Overlap a Point (might be simple) + * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup + */ +/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */ +static struct kmmio_probe *get_kmmio_probe(unsigned long addr) +{ + struct kmmio_probe *p; + list_for_each_entry_rcu(p, &kmmio_probes, list) { + if (addr >= p->addr && addr <= (p->addr + p->len)) + return p; + } + return NULL; +} + +/* You must be holding RCU read lock. */ +static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) +{ + struct list_head *head; + struct kmmio_fault_page *p; + + page &= PAGE_MASK; + head = kmmio_page_list(page); + list_for_each_entry_rcu(p, head, list) { + if (p->page == page) + return p; + } + return NULL; +} + +static void set_page_present(unsigned long addr, bool present, + unsigned int *pglevel) +{ + pteval_t pteval; + pmdval_t pmdval; + unsigned int level; + pmd_t *pmd; + pte_t *pte = lookup_address(addr, &level); + + if (!pte) { + pr_err("kmmio: no pte for page 0x%08lx\n", addr); + return; + } + + if (pglevel) + *pglevel = level; + + switch (level) { + case PG_LEVEL_2M: + pmd = (pmd_t *)pte; + pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; + if (present) + pmdval |= _PAGE_PRESENT; + set_pmd(pmd, __pmd(pmdval)); + break; + + case PG_LEVEL_4K: + pteval = pte_val(*pte) & ~_PAGE_PRESENT; + if (present) + pteval |= _PAGE_PRESENT; + set_pte_atomic(pte, __pte(pteval)); + break; + + default: + pr_err("kmmio: unexpected page level 0x%x.\n", level); + return; + } + + __flush_tlb_one(addr); +} + +/** Mark the given page as not present. Access to it will trigger a fault. */ +static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) +{ + set_page_present(page & PAGE_MASK, false, pglevel); +} + +/** Mark the given page as present. */ +static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) +{ + set_page_present(page & PAGE_MASK, true, pglevel); +} + +/* + * This is being called from do_page_fault(). + * + * We may be in an interrupt or a critical section. Also prefecthing may + * trigger a page fault. We may be in the middle of process switch. + * We cannot take any locks, because we could be executing especially + * within a kmmio critical section. + * + * Local interrupts are disabled, so preemption cannot happen. + * Do not enable interrupts, do not sleep, and watch out for other CPUs. + */ +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate + * and they remain disabled thorough out this function. + */ +int kmmio_handler(struct pt_regs *regs, unsigned long addr) +{ + struct kmmio_context *ctx; + struct kmmio_fault_page *faultpage; + int ret = 0; /* default to fault not handled */ + + /* + * Preemption is now disabled to prevent process switch during + * single stepping. We can only handle one active kmmio trace + * per cpu, so ensure that we finish it before something else + * gets to run. We also hold the RCU read lock over single + * stepping to avoid looking up the probe and kmmio_fault_page + * again. + */ + preempt_disable(); + rcu_read_lock(); + + faultpage = get_kmmio_fault_page(addr); + if (!faultpage) { + /* + * Either this page fault is not caused by kmmio, or + * another CPU just pulled the kmmio probe from under + * our feet. The latter case should not be possible. + */ + goto no_kmmio; + } + + ctx = &get_cpu_var(kmmio_ctx); + if (ctx->active) { + disarm_kmmio_fault_page(faultpage->page, NULL); + if (addr == ctx->addr) { + /* + * On SMP we sometimes get recursive probe hits on the + * same address. Context is already saved, fall out. + */ + pr_debug("kmmio: duplicate probe hit on CPU %d, for " + "address 0x%08lx.\n", + smp_processor_id(), addr); + ret = 1; + goto no_kmmio_ctx; + } + /* + * Prevent overwriting already in-flight context. + * This should not happen, let's hope disarming at least + * prevents a panic. + */ + pr_emerg("kmmio: recursive probe hit on CPU %d, " + "for address 0x%08lx. Ignoring.\n", + smp_processor_id(), addr); + pr_emerg("kmmio: previous hit was at 0x%08lx.\n", + ctx->addr); + goto no_kmmio_ctx; + } + ctx->active++; + + ctx->fpage = faultpage; + ctx->probe = get_kmmio_probe(addr); + ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); + ctx->addr = addr; + + if (ctx->probe && ctx->probe->pre_handler) + ctx->probe->pre_handler(ctx->probe, regs, addr); + + /* + * Enable single-stepping and disable interrupts for the faulting + * context. Local interrupts must not get enabled during stepping. + */ + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + + /* Now we set present bit in PTE and single step. */ + disarm_kmmio_fault_page(ctx->fpage->page, NULL); + + /* + * If another cpu accesses the same page while we are stepping, + * the access will not be caught. It will simply succeed and the + * only downside is we lose the event. If this becomes a problem, + * the user should drop to single cpu before tracing. + */ + + put_cpu_var(kmmio_ctx); + return 1; /* fault handled */ + +no_kmmio_ctx: + put_cpu_var(kmmio_ctx); +no_kmmio: + rcu_read_unlock(); + preempt_enable_no_resched(); + return ret; +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate + * and they remain disabled thorough out this function. + * This must always get called as the pair to kmmio_handler(). + */ +static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) +{ + int ret = 0; + struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); + + if (!ctx->active) { + pr_debug("kmmio: spurious debug trap on CPU %d.\n", + smp_processor_id()); + goto out; + } + + if (ctx->probe && ctx->probe->post_handler) + ctx->probe->post_handler(ctx->probe, condition, regs); + + arm_kmmio_fault_page(ctx->fpage->page, NULL); + + regs->flags &= ~X86_EFLAGS_TF; + regs->flags |= ctx->saved_flags; + + /* These were acquired in kmmio_handler(). */ + ctx->active--; + BUG_ON(ctx->active); + rcu_read_unlock(); + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, flags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (!(regs->flags & X86_EFLAGS_TF)) + ret = 1; +out: + put_cpu_var(kmmio_ctx); + return ret; +} + +/* You must be holding kmmio_lock. */ +static int add_kmmio_fault_page(unsigned long page) +{ + struct kmmio_fault_page *f; + + page &= PAGE_MASK; + f = get_kmmio_fault_page(page); + if (f) { + if (!f->count) + arm_kmmio_fault_page(f->page, NULL); + f->count++; + return 0; + } + + f = kmalloc(sizeof(*f), GFP_ATOMIC); + if (!f) + return -1; + + f->count = 1; + f->page = page; + list_add_rcu(&f->list, kmmio_page_list(f->page)); + + arm_kmmio_fault_page(f->page, NULL); + + return 0; +} + +/* You must be holding kmmio_lock. */ +static void release_kmmio_fault_page(unsigned long page, + struct kmmio_fault_page **release_list) +{ + struct kmmio_fault_page *f; + + page &= PAGE_MASK; + f = get_kmmio_fault_page(page); + if (!f) + return; + + f->count--; + BUG_ON(f->count < 0); + if (!f->count) { + disarm_kmmio_fault_page(f->page, NULL); + f->release_next = *release_list; + *release_list = f; + } +} + +/* + * With page-unaligned ioremaps, one or two armed pages may contain + * addresses from outside the intended mapping. Events for these addresses + * are currently silently dropped. The events may result only from programming + * mistakes by accessing addresses before the beginning or past the end of a + * mapping. + */ +int register_kmmio_probe(struct kmmio_probe *p) +{ + unsigned long flags; + int ret = 0; + unsigned long size = 0; + const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); + + spin_lock_irqsave(&kmmio_lock, flags); + if (get_kmmio_probe(p->addr)) { + ret = -EEXIST; + goto out; + } + kmmio_count++; + list_add_rcu(&p->list, &kmmio_probes); + while (size < size_lim) { + if (add_kmmio_fault_page(p->addr + size)) + pr_err("kmmio: Unable to set page fault.\n"); + size += PAGE_SIZE; + } +out: + spin_unlock_irqrestore(&kmmio_lock, flags); + /* + * XXX: What should I do here? + * Here was a call to global_flush_tlb(), but it does not exist + * anymore. It seems it's not needed after all. + */ + return ret; +} +EXPORT_SYMBOL(register_kmmio_probe); + +static void rcu_free_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *p = dr->release_list; + while (p) { + struct kmmio_fault_page *next = p->release_next; + BUG_ON(p->count); + kfree(p); + p = next; + } + kfree(dr); +} + +static void remove_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *p = dr->release_list; + struct kmmio_fault_page **prevp = &dr->release_list; + unsigned long flags; + spin_lock_irqsave(&kmmio_lock, flags); + while (p) { + if (!p->count) + list_del_rcu(&p->list); + else + *prevp = p->release_next; + prevp = &p->release_next; + p = p->release_next; + } + spin_unlock_irqrestore(&kmmio_lock, flags); + /* This is the real RCU destroy call. */ + call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); +} + +/* + * Remove a kmmio probe. You have to synchronize_rcu() before you can be + * sure that the callbacks will not be called anymore. Only after that + * you may actually release your struct kmmio_probe. + * + * Unregistering a kmmio fault page has three steps: + * 1. release_kmmio_fault_page() + * Disarm the page, wait a grace period to let all faults finish. + * 2. remove_kmmio_fault_pages() + * Remove the pages from kmmio_page_table. + * 3. rcu_free_kmmio_fault_pages() + * Actally free the kmmio_fault_page structs as with RCU. + */ +void unregister_kmmio_probe(struct kmmio_probe *p) +{ + unsigned long flags; + unsigned long size = 0; + const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); + struct kmmio_fault_page *release_list = NULL; + struct kmmio_delayed_release *drelease; + + spin_lock_irqsave(&kmmio_lock, flags); + while (size < size_lim) { + release_kmmio_fault_page(p->addr + size, &release_list); + size += PAGE_SIZE; + } + list_del_rcu(&p->list); + kmmio_count--; + spin_unlock_irqrestore(&kmmio_lock, flags); + + drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); + if (!drelease) { + pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); + return; + } + drelease->release_list = release_list; + + /* + * This is not really RCU here. We have just disarmed a set of + * pages so that they cannot trigger page faults anymore. However, + * we cannot remove the pages from kmmio_page_table, + * because a probe hit might be in flight on another CPU. The + * pages are collected into a list, and they will be removed from + * kmmio_page_table when it is certain that no probe hit related to + * these pages can be in flight. RCU grace period sounds like a + * good choice. + * + * If we removed the pages too early, kmmio page fault handler might + * not find the respective kmmio_fault_page and determine it's not + * a kmmio fault, when it actually is. This would lead to madness. + */ + call_rcu(&drelease->rcu, remove_kmmio_fault_pages); +} +EXPORT_SYMBOL(unregister_kmmio_probe); + +static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, + void *args) +{ + struct die_args *arg = args; + + if (val == DIE_DEBUG && (arg->err & DR_STEP)) + if (post_kmmio_handler(arg->err, arg->regs) == 1) + return NOTIFY_STOP; + + return NOTIFY_DONE; +} + +static struct notifier_block nb_die = { + .notifier_call = kmmio_die_notifier +}; + +static int __init init_kmmio(void) +{ + int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + return register_die_notifier(&nb_die); +} +fs_initcall(init_kmmio); /* should be before device_initcall() */ diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c new file mode 100644 index 00000000000..e7397e108be --- /dev/null +++ b/arch/x86/mm/mmio-mod.c @@ -0,0 +1,515 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005 + * Jeff Muizelaar, 2006, 2007 + * Pekka Paalanen, 2008 <pq@iki.fi> + * + * Derived from the read-mod example from relay-examples by Tom Zanussi. + */ +#define DEBUG 1 + +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/version.h> +#include <linux/kallsyms.h> +#include <asm/pgtable.h> +#include <linux/mmiotrace.h> +#include <asm/e820.h> /* for ISA_START_ADDRESS */ +#include <asm/atomic.h> +#include <linux/percpu.h> +#include <linux/cpu.h> + +#include "pf_in.h" + +#define NAME "mmiotrace: " + +struct trap_reason { + unsigned long addr; + unsigned long ip; + enum reason_type type; + int active_traces; +}; + +struct remap_trace { + struct list_head list; + struct kmmio_probe probe; + resource_size_t phys; + unsigned long id; +}; + +/* Accessed per-cpu. */ +static DEFINE_PER_CPU(struct trap_reason, pf_reason); +static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); + +#if 0 /* XXX: no way gather this info anymore */ +/* Access to this is not per-cpu. */ +static DEFINE_PER_CPU(atomic_t, dropped); +#endif + +static struct dentry *marker_file; + +static DEFINE_MUTEX(mmiotrace_mutex); +static DEFINE_SPINLOCK(trace_lock); +static atomic_t mmiotrace_enabled; +static LIST_HEAD(trace_list); /* struct remap_trace */ + +/* + * Locking in this file: + * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections. + * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex + * and trace_lock. + * - Routines depending on is_enabled() must take trace_lock. + * - trace_list users must hold trace_lock. + * - is_enabled() guarantees that mmio_trace_record is allowed. + * - pre/post callbacks assume the effect of is_enabled() being true. + */ + +/* module parameters */ +static unsigned long filter_offset; +static int nommiotrace; +static int trace_pc; + +module_param(filter_offset, ulong, 0); +module_param(nommiotrace, bool, 0); +module_param(trace_pc, bool, 0); + +MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); +MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); +MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); + +static bool is_enabled(void) +{ + return atomic_read(&mmiotrace_enabled); +} + +#if 0 /* XXX: needs rewrite */ +/* + * Write callback for the debugfs entry: + * Read a marker and write it to the mmio trace log + */ +static ssize_t write_marker(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *event = NULL; + struct mm_io_header *headp; + ssize_t len = (count > 65535) ? 65535 : count; + + event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); + if (!event) + return -ENOMEM; + + headp = (struct mm_io_header *)event; + headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); + headp->data_len = len; + + if (copy_from_user(event + sizeof(*headp), buffer, len)) { + kfree(event); + return -EFAULT; + } + + spin_lock_irq(&trace_lock); +#if 0 /* XXX: convert this to use tracing */ + if (is_enabled()) + relay_write(chan, event, sizeof(*headp) + len); + else +#endif + len = -EINVAL; + spin_unlock_irq(&trace_lock); + kfree(event); + return len; +} +#endif + +static void print_pte(unsigned long address) +{ + unsigned int level; + pte_t *pte = lookup_address(address, &level); + + if (!pte) { + pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", + __func__, address); + return; + } + + if (level == PG_LEVEL_2M) { + pr_emerg(NAME "4MB pages are not currently supported: " + "0x%08lx\n", address); + BUG(); + } + pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address, + (unsigned long long)pte_val(*pte), + (unsigned long long)pte_val(*pte) & _PAGE_PRESENT); +} + +/* + * For some reason the pre/post pairs have been called in an + * unmatched order. Report and die. + */ +static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) +{ + const struct trap_reason *my_reason = &get_cpu_var(pf_reason); + pr_emerg(NAME "unexpected fault for address: 0x%08lx, " + "last fault for address: 0x%08lx\n", + addr, my_reason->addr); + print_pte(addr); + print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); +#ifdef __i386__ + pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->ax, regs->bx, regs->cx, regs->dx); + pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#else + pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", + regs->ax, regs->cx, regs->dx); + pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#endif + put_cpu_var(pf_reason); + BUG(); +} + +static void pre(struct kmmio_probe *p, struct pt_regs *regs, + unsigned long addr) +{ + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); + const unsigned long instptr = instruction_pointer(regs); + const enum reason_type type = get_ins_type(instptr); + struct remap_trace *trace = p->private; + + /* it doesn't make sense to have more than one active trace per cpu */ + if (my_reason->active_traces) + die_kmmio_nesting_error(regs, addr); + else + my_reason->active_traces++; + + my_reason->type = type; + my_reason->addr = addr; + my_reason->ip = instptr; + + my_trace->phys = addr - trace->probe.addr + trace->phys; + my_trace->map_id = trace->id; + + /* + * Only record the program counter when requested. + * It may taint clean-room reverse engineering. + */ + if (trace_pc) + my_trace->pc = instptr; + else + my_trace->pc = 0; + + /* + * XXX: the timestamp recorded will be *after* the tracing has been + * done, not at the time we hit the instruction. SMP implications + * on event ordering? + */ + + switch (type) { + case REG_READ: + my_trace->opcode = MMIO_READ; + my_trace->width = get_ins_mem_width(instptr); + break; + case REG_WRITE: + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_reg_val(instptr, regs); + break; + case IMM_WRITE: + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_imm_val(instptr); + break; + default: + { + unsigned char *ip = (unsigned char *)instptr; + my_trace->opcode = MMIO_UNKNOWN_OP; + my_trace->width = 0; + my_trace->value = (*ip) << 16 | *(ip + 1) << 8 | + *(ip + 2); + } + } + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); +} + +static void post(struct kmmio_probe *p, unsigned long condition, + struct pt_regs *regs) +{ + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); + + /* this should always return the active_trace count to 0 */ + my_reason->active_traces--; + if (my_reason->active_traces) { + pr_emerg(NAME "unexpected post handler"); + BUG(); + } + + switch (my_reason->type) { + case REG_READ: + my_trace->value = get_ins_reg_val(my_reason->ip, regs); + break; + default: + break; + } + + mmio_trace_rw(my_trace); + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); +} + +static void ioremap_trace_core(resource_size_t offset, unsigned long size, + void __iomem *addr) +{ + static atomic_t next_id; + struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); + /* These are page-unaligned. */ + struct mmiotrace_map map = { + .phys = offset, + .virt = (unsigned long)addr, + .len = size, + .opcode = MMIO_PROBE + }; + + if (!trace) { + pr_err(NAME "kmalloc failed in ioremap\n"); + return; + } + + *trace = (struct remap_trace) { + .probe = { + .addr = (unsigned long)addr, + .len = size, + .pre_handler = pre, + .post_handler = post, + .private = trace + }, + .phys = offset, + .id = atomic_inc_return(&next_id) + }; + map.map_id = trace->id; + + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + + mmio_trace_mapping(&map); + list_add_tail(&trace->list, &trace_list); + if (!nommiotrace) + register_kmmio_probe(&trace->probe); + +not_enabled: + spin_unlock_irq(&trace_lock); +} + +void mmiotrace_ioremap(resource_size_t offset, unsigned long size, + void __iomem *addr) +{ + if (!is_enabled()) /* recheck and proper locking in *_core() */ + return; + + pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n", + (unsigned long long)offset, size, addr); + if ((filter_offset) && (offset != filter_offset)) + return; + ioremap_trace_core(offset, size, addr); +} + +static void iounmap_trace_core(volatile void __iomem *addr) +{ + struct mmiotrace_map map = { + .phys = 0, + .virt = (unsigned long)addr, + .len = 0, + .opcode = MMIO_UNPROBE + }; + struct remap_trace *trace; + struct remap_trace *tmp; + struct remap_trace *found_trace = NULL; + + pr_debug(NAME "Unmapping %p.\n", addr); + + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + if ((unsigned long)addr == trace->probe.addr) { + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + list_del(&trace->list); + found_trace = trace; + break; + } + } + map.map_id = (found_trace) ? found_trace->id : -1; + mmio_trace_mapping(&map); + +not_enabled: + spin_unlock_irq(&trace_lock); + if (found_trace) { + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + kfree(found_trace); + } +} + +void mmiotrace_iounmap(volatile void __iomem *addr) +{ + might_sleep(); + if (is_enabled()) /* recheck and proper locking in *_core() */ + iounmap_trace_core(addr); +} + +static void clear_trace_list(void) +{ + struct remap_trace *trace; + struct remap_trace *tmp; + + /* + * No locking required, because the caller ensures we are in a + * critical section via mutex, and is_enabled() is false, + * i.e. nothing can traverse or modify this list. + * Caller also ensures is_enabled() cannot change. + */ + list_for_each_entry(trace, &trace_list, list) { + pr_notice(NAME "purging non-iounmapped " + "trace @0x%08lx, size 0x%lx.\n", + trace->probe.addr, trace->probe.len); + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + } + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + list_del(&trace->list); + kfree(trace); + } +} + +#ifdef CONFIG_HOTPLUG_CPU +static cpumask_t downed_cpus; + +static void enter_uniprocessor(void) +{ + int cpu; + int err; + + get_online_cpus(); + downed_cpus = cpu_online_map; + cpu_clear(first_cpu(cpu_online_map), downed_cpus); + if (num_online_cpus() > 1) + pr_notice(NAME "Disabling non-boot CPUs...\n"); + put_online_cpus(); + + for_each_cpu_mask(cpu, downed_cpus) { + err = cpu_down(cpu); + if (!err) + pr_info(NAME "CPU%d is down.\n", cpu); + else + pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); + } + if (num_online_cpus() > 1) + pr_warning(NAME "multiple CPUs still online, " + "may miss events.\n"); +} + +static void leave_uniprocessor(void) +{ + int cpu; + int err; + + if (cpus_weight(downed_cpus) == 0) + return; + pr_notice(NAME "Re-enabling CPUs...\n"); + for_each_cpu_mask(cpu, downed_cpus) { + err = cpu_up(cpu); + if (!err) + pr_info(NAME "enabled CPU%d.\n", cpu); + else + pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err); + } +} + +#else /* !CONFIG_HOTPLUG_CPU */ +static void enter_uniprocessor(void) +{ + if (num_online_cpus() > 1) + pr_warning(NAME "multiple CPUs are online, may miss events. " + "Suggest booting with maxcpus=1 kernel argument.\n"); +} + +static void leave_uniprocessor(void) +{ +} +#endif + +#if 0 /* XXX: out of order */ +static struct file_operations fops_marker = { + .owner = THIS_MODULE, + .write = write_marker +}; +#endif + +void enable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (is_enabled()) + goto out; + +#if 0 /* XXX: tracing does not support text entries */ + marker_file = debugfs_create_file("marker", 0660, dir, NULL, + &fops_marker); + if (!marker_file) + pr_err(NAME "marker file creation failed.\n"); +#endif + + if (nommiotrace) + pr_info(NAME "MMIO tracing disabled.\n"); + enter_uniprocessor(); + spin_lock_irq(&trace_lock); + atomic_inc(&mmiotrace_enabled); + spin_unlock_irq(&trace_lock); + pr_info(NAME "enabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} + +void disable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (!is_enabled()) + goto out; + + spin_lock_irq(&trace_lock); + atomic_dec(&mmiotrace_enabled); + BUG_ON(is_enabled()); + spin_unlock_irq(&trace_lock); + + clear_trace_list(); /* guarantees: no more kmmio callbacks */ + leave_uniprocessor(); + if (marker_file) { + debugfs_remove(marker_file); + marker_file = NULL; + } + + pr_info(NAME "disabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index afd40054d15..65c6e46bf05 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -141,7 +141,7 @@ static void cpa_flush_all(unsigned long cache) { BUG_ON(irqs_disabled()); - on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); + on_each_cpu(__cpa_flush_all, (void *) cache, 1); } static void __cpa_flush_range(void *arg) @@ -162,7 +162,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) BUG_ON(irqs_disabled()); WARN_ON(PAGE_ALIGN(start) != start); - on_each_cpu(__cpa_flush_range, NULL, 1, 1); + on_each_cpu(__cpa_flush_range, NULL, 1); if (!cache) return; @@ -262,6 +262,7 @@ pte_t *lookup_address(unsigned long address, unsigned int *level) return pte_offset_kernel(pmd, address); } +EXPORT_SYMBOL_GPL(lookup_address); /* * Set the new pmd in all the pgds we know about: @@ -536,8 +537,14 @@ static int split_large_page(pte_t *kpte, unsigned long address) set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); if (address >= (unsigned long)__va(0) && + address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT)) + split_page_count(level); + +#ifdef CONFIG_X86_64 + if (address >= (unsigned long)__va(1UL<<32) && address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) split_page_count(level); +#endif /* * Install the new, split up pagetable. Important details here: @@ -652,15 +659,24 @@ static int cpa_process_alias(struct cpa_data *cpa) struct cpa_data alias_cpa; int ret = 0; - if (cpa->pfn > max_pfn_mapped) + if (cpa->pfn >= max_pfn_mapped) return 0; +#ifdef CONFIG_X86_64 + if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT))) + return 0; +#endif /* * No need to redo, when the primary call touched the direct * mapping already: */ - if (!within(cpa->vaddr, PAGE_OFFSET, - PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { + if (!(within(cpa->vaddr, PAGE_OFFSET, + PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) +#ifdef CONFIG_X86_64 + || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32), + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) +#endif + )) { alias_cpa = *cpa; alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index a885a1019b8..d4585077977 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -449,7 +449,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (retval < 0) return 0; - if (pfn <= max_pfn_mapped && + if (((pfn < max_low_pfn_mapped) || + (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) && ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { free_memtype(offset, offset + size); printk(KERN_INFO diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c new file mode 100644 index 00000000000..efa1911e20c --- /dev/null +++ b/arch/x86/mm/pf_in.c @@ -0,0 +1,489 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. + * + */ + +/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp + * Copyright by Intel Crop., 2002 + * Louis Zhuang (louis.zhuang@intel.com) + * + * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007 + */ + +#include <linux/module.h> +#include <linux/ptrace.h> /* struct pt_regs */ +#include "pf_in.h" + +#ifdef __i386__ +/* IA32 Manual 3, 2-1 */ +static unsigned char prefix_codes[] = { + 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, + 0x65, 0x2E, 0x3E, 0x66, 0x67 +}; +/* IA32 Manual 3, 3-432*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +/* IA32 Manual 3, 3-432*/ +static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; +static unsigned int rw32[] = { + 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; +static unsigned int mw64[] = {}; +#else /* not __i386__ */ +static unsigned char prefix_codes[] = { + 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36, + 0xF0, 0xF3, 0xF2, + /* REX Prefixes */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f +}; +/* AMD64 Manual 3, Appendix A*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +static unsigned int rw8[] = { 0xC6, 0x88, 0x8A }; +static unsigned int rw32[] = { + 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +/* 8 bit only */ +static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; +/* 16 bit only */ +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +/* 16 or 32 bit */ +static unsigned int mw32[] = { 0xC7 }; +/* 16, 32 or 64 bit */ +static unsigned int mw64[] = { 0x89, 0x8B }; +#endif /* not __i386__ */ + +static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, + int *rexr) +{ + int i; + unsigned char *p = addr; + *shorted = 0; + *enlarged = 0; + *rexr = 0; + +restart: + for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { + if (*p == prefix_codes[i]) { + if (*p == 0x66) + *shorted = 1; +#ifdef __amd64__ + if ((*p & 0xf8) == 0x48) + *enlarged = 1; + if ((*p & 0xf4) == 0x44) + *rexr = 1; +#endif + p++; + goto restart; + } + } + + return (p - addr); +} + +static int get_opcode(unsigned char *addr, unsigned int *opcode) +{ + int len; + + if (*addr == 0x0F) { + /* 0x0F is extension instruction */ + *opcode = *(unsigned short *)addr; + len = 2; + } else { + *opcode = *addr; + len = 1; + } + + return len; +} + +#define CHECK_OP_TYPE(opcode, array, type) \ + for (i = 0; i < ARRAY_SIZE(array); i++) { \ + if (array[i] == opcode) { \ + rv = type; \ + goto exit; \ + } \ + } + +enum reason_type get_ins_type(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int shorted, enlarged, rexr; + int i; + enum reason_type rv = OTHERS; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + CHECK_OP_TYPE(opcode, reg_rop, REG_READ); + CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE); + CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE); + +exit: + return rv; +} +#undef CHECK_OP_TYPE + +static unsigned int get_ins_reg_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int i, shorted, enlarged, rexr; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(rw8); i++) + if (rw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(rw32); i++) + if (rw32[i] == opcode) + return (shorted ? 2 : (enlarged ? 8 : 4)); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +unsigned int get_ins_mem_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int i, shorted, enlarged, rexr; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(mw8); i++) + if (mw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(mw16); i++) + if (mw16[i] == opcode) + return 2; + + for (i = 0; i < ARRAY_SIZE(mw32); i++) + if (mw32[i] == opcode) + return shorted ? 2 : 4; + + for (i = 0; i < ARRAY_SIZE(mw64); i++) + if (mw64[i] == opcode) + return shorted ? 2 : (enlarged ? 8 : 4); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +/* + * Define register ident in mod/rm byte. + * Note: these are NOT the same as in ptrace-abi.h. + */ +enum { + arg_AL = 0, + arg_CL = 1, + arg_DL = 2, + arg_BL = 3, + arg_AH = 4, + arg_CH = 5, + arg_DH = 6, + arg_BH = 7, + + arg_AX = 0, + arg_CX = 1, + arg_DX = 2, + arg_BX = 3, + arg_SP = 4, + arg_BP = 5, + arg_SI = 6, + arg_DI = 7, +#ifdef __amd64__ + arg_R8 = 8, + arg_R9 = 9, + arg_R10 = 10, + arg_R11 = 11, + arg_R12 = 12, + arg_R13 = 13, + arg_R14 = 14, + arg_R15 = 15 +#endif +}; + +static unsigned char *get_reg_w8(int no, struct pt_regs *regs) +{ + unsigned char *rv = NULL; + + switch (no) { + case arg_AL: + rv = (unsigned char *)®s->ax; + break; + case arg_BL: + rv = (unsigned char *)®s->bx; + break; + case arg_CL: + rv = (unsigned char *)®s->cx; + break; + case arg_DL: + rv = (unsigned char *)®s->dx; + break; + case arg_AH: + rv = 1 + (unsigned char *)®s->ax; + break; + case arg_BH: + rv = 1 + (unsigned char *)®s->bx; + break; + case arg_CH: + rv = 1 + (unsigned char *)®s->cx; + break; + case arg_DH: + rv = 1 + (unsigned char *)®s->dx; + break; +#ifdef __amd64__ + case arg_R8: + rv = (unsigned char *)®s->r8; + break; + case arg_R9: + rv = (unsigned char *)®s->r9; + break; + case arg_R10: + rv = (unsigned char *)®s->r10; + break; + case arg_R11: + rv = (unsigned char *)®s->r11; + break; + case arg_R12: + rv = (unsigned char *)®s->r12; + break; + case arg_R13: + rv = (unsigned char *)®s->r13; + break; + case arg_R14: + rv = (unsigned char *)®s->r14; + break; + case arg_R15: + rv = (unsigned char *)®s->r15; + break; +#endif + default: + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + break; + } + return rv; +} + +static unsigned long *get_reg_w32(int no, struct pt_regs *regs) +{ + unsigned long *rv = NULL; + + switch (no) { + case arg_AX: + rv = ®s->ax; + break; + case arg_BX: + rv = ®s->bx; + break; + case arg_CX: + rv = ®s->cx; + break; + case arg_DX: + rv = ®s->dx; + break; + case arg_SP: + rv = ®s->sp; + break; + case arg_BP: + rv = ®s->bp; + break; + case arg_SI: + rv = ®s->si; + break; + case arg_DI: + rv = ®s->di; + break; +#ifdef __amd64__ + case arg_R8: + rv = ®s->r8; + break; + case arg_R9: + rv = ®s->r9; + break; + case arg_R10: + rv = ®s->r10; + break; + case arg_R11: + rv = ®s->r11; + break; + case arg_R12: + rv = ®s->r12; + break; + case arg_R13: + rv = ®s->r13; + break; + case arg_R14: + rv = ®s->r14; + break; + case arg_R15: + rv = ®s->r15; + break; +#endif + default: + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + } + + return rv; +} + +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) +{ + unsigned int opcode; + unsigned char mod_rm; + int reg; + unsigned char *p; + int i, shorted, enlarged, rexr; + unsigned long rv; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(reg_rop); i++) + if (reg_rop[i] == opcode) { + rv = REG_READ; + goto do_work; + } + + for (i = 0; i < ARRAY_SIZE(reg_wop); i++) + if (reg_wop[i] == opcode) { + rv = REG_WRITE; + goto do_work; + } + + printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + mod_rm = *p; + reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *get_reg_w8(reg, regs); + + case 2: + return *(unsigned short *)get_reg_w32(reg, regs); + + case 4: + return *(unsigned int *)get_reg_w32(reg, regs); + +#ifdef __amd64__ + case 8: + return *(unsigned long *)get_reg_w32(reg, regs); +#endif + + default: + printk(KERN_ERR "mmiotrace: Error width# %d\n", reg); + } + +err: + return 0; +} + +unsigned long get_ins_imm_val(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char mod_rm; + unsigned char mod; + unsigned char *p; + int i, shorted, enlarged, rexr; + unsigned long rv; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(imm_wop); i++) + if (imm_wop[i] == opcode) { + rv = IMM_WRITE; + goto do_work; + } + + printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + mod_rm = *p; + mod = mod_rm >> 6; + p++; + switch (mod) { + case 0: + /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */ + /* AMD64: XXX Check for address size prefix? */ + if ((mod_rm & 0x7) == 0x5) + p += 4; + break; + + case 1: + p += 1; + break; + + case 2: + p += 4; + break; + + case 3: + default: + printk(KERN_ERR "mmiotrace: not a memory access instruction " + "at 0x%lx, rm_mod=0x%02x\n", + ins_addr, mod_rm); + } + + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *(unsigned char *)p; + + case 2: + return *(unsigned short *)p; + + case 4: + return *(unsigned int *)p; + +#ifdef __amd64__ + case 8: + return *(unsigned long *)p; +#endif + + default: + printk(KERN_ERR "mmiotrace: Error: width.\n"); + } + +err: + return 0; +} diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h new file mode 100644 index 00000000000..e05341a51a2 --- /dev/null +++ b/arch/x86/mm/pf_in.h @@ -0,0 +1,39 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. + * + */ + +#ifndef __PF_H_ +#define __PF_H_ + +enum reason_type { + NOT_ME, /* page fault is not in regions */ + NOTHING, /* access others point in regions */ + REG_READ, /* read from addr to reg */ + REG_WRITE, /* write from reg to addr */ + IMM_WRITE, /* write from imm to addr */ + OTHERS /* Other instructions can not intercept */ +}; + +enum reason_type get_ins_type(unsigned long ins_addr); +unsigned int get_ins_mem_width(unsigned long ins_addr); +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs); +unsigned long get_ins_imm_val(unsigned long ins_addr); + +#endif /* __PF_H_ */ diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 828907d001e..b4becbf8c57 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -141,7 +141,6 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) __flush_tlb_one(vaddr); } -static int fixmaps; unsigned long __FIXADDR_TOP = 0xfffff000; EXPORT_SYMBOL(__FIXADDR_TOP); diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index f41d67f8f83..1eb2973a301 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -156,10 +156,9 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) num_memory_chunks++; - printk(KERN_DEBUG "Memory range %08lx to %08lx (type %x)" + printk(KERN_DEBUG "Memory range %08lx to %08lx" " in proximity domain %02x %s\n", start_pfn, end_pfn, - memory_affinity->memory_type, pxm, ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? "enabled and removable" : "enabled" ) ); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 0fd67b81a8b..1b4763e26ea 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -100,7 +100,19 @@ static __init inline int srat_disabled(void) /* Callback for SLIT parsing */ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { - acpi_slit = slit; + unsigned length; + unsigned long phys; + + length = slit->header.length; + phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length, + PAGE_SIZE); + + if (phys == -1L) + panic(" Can not save slit!\n"); + + acpi_slit = __va(phys); + memcpy(acpi_slit, slit, length); + reserve_early(phys, phys + length, "ACPI SLIT"); } /* Callback for Proximity Domain -> LAPIC mapping */ diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c new file mode 100644 index 00000000000..d877c5b423e --- /dev/null +++ b/arch/x86/mm/testmmiotrace.c @@ -0,0 +1,71 @@ +/* + * Written by Pekka Paalanen, 2008 <pq@iki.fi> + */ +#include <linux/module.h> +#include <linux/io.h> + +#define MODULE_NAME "testmmiotrace" + +static unsigned long mmio_address; +module_param(mmio_address, ulong, 0); +MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); + +static void do_write_test(void __iomem *p) +{ + unsigned int i; + for (i = 0; i < 256; i++) + iowrite8(i, p + i); + for (i = 1024; i < (5 * 1024); i += 2) + iowrite16(i * 12 + 7, p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) + iowrite32(i * 212371 + 13, p + i); +} + +static void do_read_test(void __iomem *p) +{ + unsigned int i; + for (i = 0; i < 256; i++) + ioread8(p + i); + for (i = 1024; i < (5 * 1024); i += 2) + ioread16(p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) + ioread32(p + i); +} + +static void do_test(void) +{ + void __iomem *p = ioremap_nocache(mmio_address, 0x4000); + if (!p) { + pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); + return; + } + do_write_test(p); + do_read_test(p); + iounmap(p); +} + +static int __init init(void) +{ + if (mmio_address == 0) { + pr_err(MODULE_NAME ": you have to use the module argument " + "mmio_address.\n"); + pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" + " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); + return -ENXIO; + } + + pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " + "in PCI address space, and writing " + "rubbish in there.\n", mmio_address); + do_test(); + return 0; +} + +static void __exit cleanup(void) +{ + pr_debug(MODULE_NAME ": unloaded.\n"); +} + +module_init(init); +module_exit(cleanup); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 2b6ad5b9f9d..7f3329b55d2 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -218,8 +218,8 @@ static int nmi_setup(void) } } - on_each_cpu(nmi_save_registers, NULL, 0, 1); - on_each_cpu(nmi_cpu_setup, NULL, 0, 1); + on_each_cpu(nmi_save_registers, NULL, 1); + on_each_cpu(nmi_cpu_setup, NULL, 1); nmi_enabled = 1; return 0; } @@ -271,7 +271,7 @@ static void nmi_shutdown(void) { struct op_msrs *msrs = &get_cpu_var(cpu_msrs); nmi_enabled = 0; - on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); + on_each_cpu(nmi_cpu_shutdown, NULL, 1); unregister_die_notifier(&profile_exceptions_nb); model->shutdown(msrs); free_msrs(); @@ -286,7 +286,7 @@ static void nmi_cpu_start(void *dummy) static int nmi_start(void) { - on_each_cpu(nmi_cpu_start, NULL, 0, 1); + on_each_cpu(nmi_cpu_start, NULL, 1); return 0; } @@ -298,7 +298,7 @@ static void nmi_cpu_stop(void *dummy) static void nmi_stop(void) { - on_each_cpu(nmi_cpu_stop, NULL, 0, 1); + on_each_cpu(nmi_cpu_stop, NULL, 1); } struct op_counter_config counter_config[OP_MAX_COUNTER]; diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index c5c8e485fc4..e515e8db842 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -1,5 +1,17 @@ -ifeq ($(CONFIG_X86_32),y) -include ${srctree}/arch/x86/pci/Makefile_32 -else -include ${srctree}/arch/x86/pci/Makefile_64 -endif +obj-y := i386.o init.o + +obj-$(CONFIG_PCI_BIOS) += pcbios.o +obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o +obj-$(CONFIG_PCI_DIRECT) += direct.o +obj-$(CONFIG_PCI_OLPC) += olpc.o + +pci-y := fixup.o +pci-$(CONFIG_ACPI) += acpi.o +pci-y += legacy.o irq.o + +pci-$(CONFIG_X86_VISWS) += visws.o + +pci-$(CONFIG_X86_NUMAQ) += numa.o + +obj-y += $(pci-y) common.o early.o +obj-y += amd_bus.o diff --git a/arch/x86/pci/Makefile_32 b/arch/x86/pci/Makefile_32 deleted file mode 100644 index a34fbf55792..00000000000 --- a/arch/x86/pci/Makefile_32 +++ /dev/null @@ -1,26 +0,0 @@ -obj-y := i386.o init.o - -obj-$(CONFIG_PCI_BIOS) += pcbios.o -obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_32.o direct.o mmconfig-shared.o -obj-$(CONFIG_PCI_DIRECT) += direct.o -obj-$(CONFIG_PCI_OLPC) += olpc.o - -pci-y := fixup.o - -# Do not change the ordering here. There is a nasty init function -# ordering dependency which breaks when you move acpi.o below -# legacy/irq.o -pci-$(CONFIG_ACPI) += acpi.o -pci-y += legacy.o irq.o - -# Careful: VISWS overrule the pci-y above. The colons are -# therefor correct. This needs a proper fix by distangling the code. -pci-$(CONFIG_X86_VISWS) := visws.o fixup.o - -pci-$(CONFIG_X86_NUMAQ) += numa.o - -# Necessary for NUMAQ as well -pci-$(CONFIG_NUMA) += mp_bus_to_node.o - -obj-y += $(pci-y) common.o early.o -obj-y += amd_bus.o diff --git a/arch/x86/pci/Makefile_64 b/arch/x86/pci/Makefile_64 deleted file mode 100644 index fd47068c95d..00000000000 --- a/arch/x86/pci/Makefile_64 +++ /dev/null @@ -1,17 +0,0 @@ -# -# Makefile for X86_64 specific PCI routines -# -# Reuse the i386 PCI subsystem -# -EXTRA_CFLAGS += -Iarch/x86/pci - -obj-y := i386.o -obj-$(CONFIG_PCI_DIRECT)+= direct.o -obj-y += fixup.o init.o -obj-$(CONFIG_ACPI) += acpi.o -obj-y += legacy.o irq.o common.o early.o -# mmconfig has a 64bit special -obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_64.o direct.o mmconfig-shared.o - -obj-y += amd_bus.o - diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 4fa52d3dc84..19af06927fb 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -223,7 +223,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do return bus; } -static int __init pci_acpi_init(void) +int __init pci_acpi_init(void) { struct pci_dev *dev = NULL; @@ -257,4 +257,3 @@ static int __init pci_acpi_init(void) return 0; } -subsys_initcall(pci_acpi_init); diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index d02c598451e..dbf53236971 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -1,44 +1,25 @@ #include <linux/init.h> #include <linux/pci.h> +#include <linux/topology.h> #include "pci.h" #ifdef CONFIG_X86_64 - #include <asm/pci-direct.h> #include <asm/mpspec.h> #include <linux/cpumask.h> -#include <linux/topology.h> +#endif /* * This discovers the pcibus <-> node mapping on AMD K8. * also get peer root bus resource for io,mmio */ - -/* - * sub bus (transparent) will use entres from 3 to store extra from root, - * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES? - */ -#define RES_NUM 16 -struct pci_root_info { - char name[12]; - unsigned int res_num; - struct resource res[RES_NUM]; - int bus_min; - int bus_max; - int node; - int link; -}; - -/* 4 at this time, it may become to 32 */ -#define PCI_ROOT_NR 4 -static int pci_root_num; -static struct pci_root_info pci_root_info[PCI_ROOT_NR]; - #ifdef CONFIG_NUMA #define BUS_NR 256 +#ifdef CONFIG_X86_64 + static int mp_bus_to_node[BUS_NR]; void set_mp_bus_to_node(int busnum, int node) @@ -65,7 +46,52 @@ int get_mp_bus_to_node(int busnum) return node; } -#endif + +#else /* CONFIG_X86_32 */ + +static unsigned char mp_bus_to_node[BUS_NR]; + +void set_mp_bus_to_node(int busnum, int node) +{ + if (busnum >= 0 && busnum < BUS_NR) + mp_bus_to_node[busnum] = (unsigned char) node; +} + +int get_mp_bus_to_node(int busnum) +{ + int node; + + if (busnum < 0 || busnum > (BUS_NR - 1)) + return 0; + node = mp_bus_to_node[busnum]; + return node; +} + +#endif /* CONFIG_X86_32 */ + +#endif /* CONFIG_NUMA */ + +#ifdef CONFIG_X86_64 + +/* + * sub bus (transparent) will use entres from 3 to store extra from root, + * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES? + */ +#define RES_NUM 16 +struct pci_root_info { + char name[12]; + unsigned int res_num; + struct resource res[RES_NUM]; + int bus_min; + int bus_max; + int node; + int link; +}; + +/* 4 at this time, it may become to 32 */ +#define PCI_ROOT_NR 4 +static int pci_root_num; +static struct pci_root_info pci_root_info[PCI_ROOT_NR]; void set_pci_bus_resources_arch_default(struct pci_bus *b) { @@ -552,7 +578,7 @@ static int __init enable_pci_io_ecs(void) /* assume all cpus from fam10h have IO ECS */ if (boot_cpu_data.x86 < 0x10) return 0; - on_each_cpu(enable_pci_io_ecs_per_cpu, NULL, 1, 1); + on_each_cpu(enable_pci_io_ecs_per_cpu, NULL, 1); pci_probe |= PCI_HAS_IO_ECS; return 0; } diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 00a319cd5be..1485a26ddce 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -20,6 +20,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | PCI_PROBE_MMCONF; +unsigned int pci_early_dump_regs; static int pci_bf_sort; int pci_routeirq; int noioapicquirk; @@ -33,7 +34,7 @@ struct pci_raw_ops *raw_pci_ext_ops; int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val) { - if (reg < 256 && raw_pci_ops) + if (domain == 0 && reg < 256 && raw_pci_ops) return raw_pci_ops->read(domain, bus, devfn, reg, len, val); if (raw_pci_ext_ops) return raw_pci_ext_ops->read(domain, bus, devfn, reg, len, val); @@ -43,7 +44,7 @@ int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn, int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 val) { - if (reg < 256 && raw_pci_ops) + if (domain == 0 && reg < 256 && raw_pci_ops) return raw_pci_ops->write(domain, bus, devfn, reg, len, val); if (raw_pci_ext_ops) return raw_pci_ext_ops->write(domain, bus, devfn, reg, len, val); @@ -123,6 +124,21 @@ void __init dmi_check_skip_isa_align(void) dmi_check_system(can_skip_pciprobe_dmi_table); } +static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) +{ + struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; + + if (pci_probe & PCI_NOASSIGN_ROMS) { + if (rom_r->parent) + return; + if (rom_r->start) { + /* we deal with BIOS assigned ROM later */ + return; + } + rom_r->start = rom_r->end = rom_r->flags = 0; + } +} + /* * Called after each bus is probed, but before its children * are examined. @@ -130,7 +146,11 @@ void __init dmi_check_skip_isa_align(void) void __devinit pcibios_fixup_bus(struct pci_bus *b) { + struct pci_dev *dev; + pci_read_bridge_bases(b); + list_for_each_entry(dev, &b->devices, bus_list) + pcibios_fixup_device_resources(dev); } /* @@ -386,7 +406,7 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) extern u8 pci_cache_line_size; -static int __init pcibios_init(void) +int __init pcibios_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -413,8 +433,6 @@ static int __init pcibios_init(void) return 0; } -subsys_initcall(pcibios_init); - char * __devinit pcibios_setup(char *str) { if (!strcmp(str, "off")) { @@ -485,12 +503,18 @@ char * __devinit pcibios_setup(char *str) else if (!strcmp(str, "rom")) { pci_probe |= PCI_ASSIGN_ROMS; return NULL; + } else if (!strcmp(str, "norom")) { + pci_probe |= PCI_NOASSIGN_ROMS; + return NULL; } else if (!strcmp(str, "assign-busses")) { pci_probe |= PCI_ASSIGN_ALL_BUSSES; return NULL; } else if (!strcmp(str, "use_crs")) { pci_probe |= PCI_USE__CRS; return NULL; + } else if (!strcmp(str, "earlydump")) { + pci_early_dump_regs = 1; + return NULL; } else if (!strcmp(str, "routeirq")) { pci_routeirq = 1; return NULL; diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index 42df4b6606d..858dbe3399f 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -49,7 +49,14 @@ void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) { PDprintk("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); - outb(val, 0xcfc); + outb(val, 0xcfc + (offset&3)); +} + +void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val) +{ + PDprintk("%x writing to %x: %x\n", slot, offset, val); + outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); + outw(val, 0xcfc + (offset&2)); } int early_pci_allowed(void) @@ -57,3 +64,54 @@ int early_pci_allowed(void) return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) == PCI_PROBE_CONF1; } + +void early_dump_pci_device(u8 bus, u8 slot, u8 func) +{ + int i; + int j; + u32 val; + + printk("PCI: %02x:%02x:%02x", bus, slot, func); + + for (i = 0; i < 256; i += 4) { + if (!(i & 0x0f)) + printk("\n%04x:",i); + + val = read_pci_config(bus, slot, func, i); + for (j = 0; j < 4; j++) { + printk(" %02x", val & 0xff); + val >>= 8; + } + } + printk("\n"); +} + +void early_dump_pci_devices(void) +{ + unsigned bus, slot, func; + + if (!early_pci_allowed()) + return; + + for (bus = 0; bus < 256; bus++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class; + u8 type; + class = read_pci_config(bus, slot, func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + early_dump_pci_device(bus, slot, func); + + /* No multi-function device? */ + type = read_pci_config_byte(bus, slot, func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } +} + diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 6ccd7a108cd..2aafb67dc5f 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -334,7 +334,9 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, flags = new_flags; } - if (vma->vm_pgoff <= max_pfn_mapped && + if (((vma->vm_pgoff < max_low_pfn_mapped) || + (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) && + vma->vm_pgoff < max_pfn_mapped)) && ioremap_change_attr((unsigned long)__va(addr), len, flags)) { free_memtype(addr, addr + len); return -EINVAL; diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index b821f4462d9..d6c950f8185 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -4,7 +4,7 @@ /* arch_initcall has too random ordering, so call the initializers in the right sequence from here. */ -static __init int pci_access_init(void) +static __init int pci_arch_init(void) { #ifdef CONFIG_PCI_DIRECT int type = 0; @@ -40,4 +40,4 @@ static __init int pci_access_init(void) return 0; } -arch_initcall(pci_access_init); +arch_initcall(pci_arch_init); diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index f0859de23e2..6a06a2eb059 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -45,7 +45,8 @@ struct irq_router { char *name; u16 vendor, device; int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq); - int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new); + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, + int new); }; struct irq_router_handler { @@ -77,7 +78,8 @@ static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr) for (i = 0; i < rt->size; i++) sum += addr[i]; if (!sum) { - DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt); + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", + rt); return rt; } return NULL; @@ -183,7 +185,8 @@ static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, return (nr & 1) ? (x >> 4) : (x & 0xf); } -static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val) +static void write_config_nybble(struct pci_dev *router, unsigned offset, + unsigned nr, unsigned int val) { u8 x; unsigned reg = offset + (nr >> 1); @@ -467,7 +470,8 @@ static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int return inb(0xc01) & 0xf; } -static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, + int pirq, int irq) { outb(pirq, 0xc00); outb(irq, 0xc01); @@ -660,7 +664,8 @@ static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router } -static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +static __init int serverworks_router_probe(struct irq_router *r, + struct pci_dev *router, u16 device) { switch (device) { case PCI_DEVICE_ID_SERVERWORKS_OSB4: @@ -827,10 +832,12 @@ static void __init pirq_find_router(struct irq_router *r) for (h = pirq_routers; h->vendor; h++) { /* First look for a router match */ - if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device)) + if (rt->rtr_vendor == h->vendor && + h->probe(r, pirq_router_dev, rt->rtr_device)) break; /* Fall back to a device match */ - if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device)) + if (pirq_router_dev->vendor == h->vendor && + h->probe(r, pirq_router_dev, pirq_router_dev->device)) break; } printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", @@ -845,11 +852,13 @@ static void __init pirq_find_router(struct irq_router *r) static struct irq_info *pirq_get_info(struct pci_dev *dev) { struct irq_routing_table *rt = pirq_table; - int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); + int entries = (rt->size - sizeof(struct irq_routing_table)) / + sizeof(struct irq_info); struct irq_info *info; for (info = rt->slots; entries--; info++) - if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) + if (info->bus == dev->bus->number && + PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) return info; return NULL; } @@ -890,7 +899,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) DBG(" -> not routed\n" KERN_DEBUG); return 0; } - DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs); + DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, + pirq_table->exclusive_irqs); mask &= pcibios_irq_mask; /* Work around broken HP Pavilion Notebooks which assign USB to @@ -903,7 +913,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) } /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */ - if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) { + if (acer_tm360_irqrouting && dev->irq == 11 && + dev->vendor == PCI_VENDOR_ID_O2) { pirq = 0x68; mask = 0x400; dev->irq = r->get(pirq_router_dev, dev, pirq); @@ -920,15 +931,16 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) newirq = 0; else printk("\n" KERN_WARNING - "PCI: IRQ %i for device %s doesn't match PIRQ mask " - "- try pci=usepirqmask\n" KERN_DEBUG, newirq, - pci_name(dev)); + "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n" + KERN_DEBUG, newirq, + pci_name(dev)); } if (!newirq && assign) { for (i = 0; i < 16; i++) { if (!(mask & (1 << i))) continue; - if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED)) + if (pirq_penalty[i] < pirq_penalty[newirq] && + can_request_irq(i, IRQF_SHARED)) newirq = i; } } @@ -944,7 +956,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) DBG(" -> got IRQ %d\n", irq); msg = "Found"; eisa_set_level_irq(irq); - } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { + } else if (newirq && r->set && + (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { DBG(" -> assigning IRQ %d", newirq); if (r->set(pirq_router_dev, dev, pirq, newirq)) { eisa_set_level_irq(newirq); @@ -962,7 +975,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) } else return 0; } - printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev)); + printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, + pci_name(dev)); /* Update IRQ for all devices with the same pirq value */ while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { @@ -974,7 +988,10 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) if (!info) continue; if (info->irq[pin].link == pirq) { - /* We refuse to override the dev->irq information. Give a warning! */ + /* + * We refuse to override the dev->irq + * information. Give a warning! + */ if (dev2->irq && dev2->irq != irq && \ (!(pci_probe & PCI_USE_PIRQ_MASK) || \ ((1 << dev2->irq) & mask))) { @@ -987,7 +1004,9 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) dev2->irq = irq; pirq_penalty[irq]++; if (dev != dev2) - printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2)); + printk(KERN_INFO + "PCI: Sharing IRQ %d with %s\n", + irq, pci_name(dev2)); } } return 1; @@ -1001,15 +1020,21 @@ static void __init pcibios_fixup_irqs(void) DBG(KERN_DEBUG "PCI: IRQ fixup\n"); while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { /* - * If the BIOS has set an out of range IRQ number, just ignore it. - * Also keep track of which IRQ's are already in use. + * If the BIOS has set an out of range IRQ number, just + * ignore it. Also keep track of which IRQ's are + * already in use. */ if (dev->irq >= 16) { - DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq); + DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", + pci_name(dev), dev->irq); dev->irq = 0; } - /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */ - if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000) + /* + * If the IRQ is already assigned to a PCI device, + * ignore its ISA use penalty + */ + if (pirq_penalty[dev->irq] >= 100 && + pirq_penalty[dev->irq] < 100000) pirq_penalty[dev->irq] = 0; pirq_penalty[dev->irq]++; } @@ -1025,8 +1050,13 @@ static void __init pcibios_fixup_irqs(void) int irq; if (pin) { - pin--; /* interrupt pins are numbered starting from 1 */ - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); + /* + * interrupt pins are numbered starting + * from 1 + */ + pin--; + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, + PCI_SLOT(dev->devfn), pin); /* * Busses behind bridges are typically not listed in the MP-table. * In this case we have to look up the IRQ based on the parent bus, @@ -1067,7 +1097,8 @@ static int __init fix_broken_hp_bios_irq9(const struct dmi_system_id *d) { if (!broken_hp_bios_irq9) { broken_hp_bios_irq9 = 1; - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", + d->ident); } return 0; } @@ -1080,7 +1111,8 @@ static int __init fix_acer_tm360_irqrouting(const struct dmi_system_id *d) { if (!acer_tm360_irqrouting) { acer_tm360_irqrouting = 1; - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", + d->ident); } return 0; } @@ -1092,7 +1124,8 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"), - DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"), + DMI_MATCH(DMI_PRODUCT_VERSION, + "HP Pavilion Notebook Model GE"), DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"), }, }, @@ -1107,7 +1140,7 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = { { } }; -static int __init pcibios_irq_init(void) +int __init pcibios_irq_init(void) { DBG(KERN_DEBUG "PCI: IRQ init\n"); @@ -1131,7 +1164,10 @@ static int __init pcibios_irq_init(void) if (!(pirq_table->exclusive_irqs & (1 << i))) pirq_penalty[i] += 100; } - /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */ + /* + * If we're using the I/O APIC, avoid using the PCI IRQ + * routing table + */ if (io_apic_assign_pci_irqs) pirq_table = NULL; } @@ -1142,9 +1178,6 @@ static int __init pcibios_irq_init(void) return 0; } -subsys_initcall(pcibios_irq_init); - - static void pirq_penalize_isa_irq(int irq, int active) { /* @@ -1178,7 +1211,7 @@ static int pirq_enable_irq(struct pci_dev *dev) if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { char *msg = ""; - pin--; /* interrupt pins are numbered starting from 1 */ + pin--; /* interrupt pins are numbered starting from 1 */ if (io_apic_assign_pci_irqs) { int irq; @@ -1198,13 +1231,16 @@ static int pirq_enable_irq(struct pci_dev *dev) irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, PCI_SLOT(bridge->devfn), pin); if (irq >= 0) - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", - pci_name(bridge), 'A' + pin, irq); + printk(KERN_WARNING + "PCI: using PPB %s[%c] to get irq %d\n", + pci_name(bridge), + 'A' + pin, irq); dev = bridge; } dev = temp_dev; if (irq >= 0) { - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", + printk(KERN_INFO + "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", pci_name(dev), 'A' + pin, irq); dev->irq = irq; return 0; @@ -1215,12 +1251,17 @@ static int pirq_enable_irq(struct pci_dev *dev) else msg = " Please try using pci=biosirq."; - /* With IDE legacy devices the IRQ lookup failure is not a problem.. */ - if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5)) + /* + * With IDE legacy devices the IRQ lookup failure is not + * a problem.. + */ + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && + !(dev->class & 0x5)) return 0; - printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", - 'A' + pin, pci_name(dev), msg); + printk(KERN_WARNING + "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", + 'A' + pin, pci_name(dev), msg); } return 0; } diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index a67921ce60a..132876cc6fc 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -55,4 +55,18 @@ static int __init pci_legacy_init(void) return 0; } -subsys_initcall(pci_legacy_init); +int __init pci_subsys_init(void) +{ +#ifdef CONFIG_ACPI + pci_acpi_init(); +#endif + pci_legacy_init(); + pcibios_irq_init(); +#ifdef CONFIG_X86_NUMAQ + pci_numa_init(); +#endif + pcibios_init(); + + return 0; +} +subsys_initcall(pci_subsys_init); diff --git a/arch/x86/pci/mp_bus_to_node.c b/arch/x86/pci/mp_bus_to_node.c deleted file mode 100644 index 022943999b8..00000000000 --- a/arch/x86/pci/mp_bus_to_node.c +++ /dev/null @@ -1,23 +0,0 @@ -#include <linux/pci.h> -#include <linux/init.h> -#include <linux/topology.h> - -#define BUS_NR 256 - -static unsigned char mp_bus_to_node[BUS_NR]; - -void set_mp_bus_to_node(int busnum, int node) -{ - if (busnum >= 0 && busnum < BUS_NR) - mp_bus_to_node[busnum] = (unsigned char) node; -} - -int get_mp_bus_to_node(int busnum) -{ - int node; - - if (busnum < 0 || busnum > (BUS_NR - 1)) - return 0; - node = mp_bus_to_node[busnum]; - return node; -} diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numa.c index 99f1ecd485b..8b5ca196673 100644 --- a/arch/x86/pci/numa.c +++ b/arch/x86/pci/numa.c @@ -151,7 +151,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); -static int __init pci_numa_init(void) +int __init pci_numa_init(void) { int quad; @@ -176,5 +176,3 @@ static int __init pci_numa_init(void) } return 0; } - -subsys_initcall(pci_numa_init); diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index ba263e626a6..3e25deb821a 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -28,6 +28,7 @@ #define PCI_USE__CRS 0x10000 #define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 #define PCI_HAS_IO_ECS 0x40000 +#define PCI_NOASSIGN_ROMS 0x80000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; @@ -39,9 +40,6 @@ enum pci_bf_sort_state { pci_dmi_bf, }; -extern void __init dmi_check_pciprobe(void); -extern void __init dmi_check_skip_isa_align(void); - /* pci-i386.c */ extern unsigned int pcibios_max_latency; @@ -99,10 +97,19 @@ extern struct pci_raw_ops *raw_pci_ext_ops; extern struct pci_raw_ops pci_direct_conf1; +/* arch_initcall level */ extern int pci_direct_probe(void); extern void pci_direct_init(int type); extern void pci_pcbios_init(void); extern int pci_olpc_init(void); +extern void __init dmi_check_pciprobe(void); +extern void __init dmi_check_skip_isa_align(void); + +/* some common used subsys_initcalls */ +extern int __init pci_acpi_init(void); +extern int __init pcibios_irq_init(void); +extern int __init pci_numa_init(void); +extern int __init pcibios_init(void); /* pci-mmconfig.c */ diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index c2df4e97eed..1a7bed492bb 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -8,18 +8,19 @@ #include <linux/pci.h> #include <linux/init.h> -#include "cobalt.h" -#include "lithium.h" +#include <asm/setup.h> +#include <asm/visws/cobalt.h> +#include <asm/visws/lithium.h> #include "pci.h" static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; } static void pci_visws_disable_irq(struct pci_dev *dev) { } -int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq; -void (*pcibios_disable_irq)(struct pci_dev *dev) = &pci_visws_disable_irq; +/* int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq; */ +/* void (*pcibios_disable_irq)(struct pci_dev *dev) = &pci_visws_disable_irq; */ -void __init pcibios_penalize_isa_irq(int irq, int active) {} +/* void __init pcibios_penalize_isa_irq(int irq, int active) {} */ unsigned int pci_bus0, pci_bus1; @@ -85,7 +86,7 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq) pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); } -static int __init pcibios_init(void) +static int __init pci_visws_init(void) { /* The VISWS supports configuration access type 1 only */ pci_probe = (pci_probe | PCI_PROBE_CONF1) & @@ -105,4 +106,17 @@ static int __init pcibios_init(void) return 0; } -subsys_initcall(pcibios_init); +static __init int pci_subsys_init(void) +{ + if (!is_visws_box()) + return -1; + + pcibios_enable_irq = &pci_visws_enable_irq; + pcibios_disable_irq = &pci_visws_disable_irq; + + pci_visws_init(); + pcibios_init(); + + return 0; +} +subsys_initcall(pci_subsys_init); diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index efa2ba7c600..1ef0f90813d 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -23,7 +23,7 @@ #define gtod vdso_vsyscall_gtod_data -static long vdso_fallback_gettime(long clock, struct timespec *ts) +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; asm("syscall" : "=a" (ret) : @@ -31,7 +31,7 @@ static long vdso_fallback_gettime(long clock, struct timespec *ts) return ret; } -static inline long vgetns(void) +notrace static inline long vgetns(void) { long v; cycles_t (*vread)(void); @@ -40,7 +40,7 @@ static inline long vgetns(void) return (v * gtod->clock.mult) >> gtod->clock.shift; } -static noinline int do_realtime(struct timespec *ts) +notrace static noinline int do_realtime(struct timespec *ts) { unsigned long seq, ns; do { @@ -54,7 +54,8 @@ static noinline int do_realtime(struct timespec *ts) } /* Copy of the version in kernel/time.c which we cannot directly access */ -static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) +notrace static void +vset_normalized_timespec(struct timespec *ts, long sec, long nsec) { while (nsec >= NSEC_PER_SEC) { nsec -= NSEC_PER_SEC; @@ -68,7 +69,7 @@ static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) ts->tv_nsec = nsec; } -static noinline int do_monotonic(struct timespec *ts) +notrace static noinline int do_monotonic(struct timespec *ts) { unsigned long seq, ns, secs; do { @@ -82,7 +83,7 @@ static noinline int do_monotonic(struct timespec *ts) return 0; } -int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { if (likely(gtod->sysctl_enabled && gtod->clock.vread)) switch (clock) { @@ -96,7 +97,7 @@ int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias("__vdso_clock_gettime"))); -int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { long ret; if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index cf058fecfce..0bce5429a51 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -203,20 +203,11 @@ static struct page *vdso32_pages[1]; #ifdef CONFIG_X86_64 -static int use_sysenter __read_mostly = -1; - -#define vdso32_sysenter() (use_sysenter > 0) +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) /* May not be __init: called during resume */ void syscall32_cpu_init(void) { - if (use_sysenter < 0) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - use_sysenter = 1; - if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) - use_sysenter = 1; - } - /* Load these always in case some future AMD CPU supports SYSENTER from compat mode too. */ checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c index c8097f17f8a..9fbc6b20026 100644 --- a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/vdso/vgetcpu.c @@ -13,7 +13,8 @@ #include <asm/vgtod.h> #include "vextern.h" -long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +notrace long +__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { unsigned int p; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 3b980831602..bb508456ef5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1062,7 +1062,7 @@ static const struct pv_time_ops xen_time_ops __initdata = { .set_wallclock = xen_set_wallclock, .get_wallclock = xen_get_wallclock, - .get_cpu_khz = xen_cpu_khz, + .get_tsc_khz = xen_tsc_khz, .sched_clock = xen_sched_clock, }; @@ -1214,7 +1214,9 @@ static const struct smp_ops xen_smp_ops __initdata = { .smp_send_stop = xen_smp_send_stop, .smp_send_reschedule = xen_smp_send_reschedule, - .smp_call_function_mask = xen_smp_call_function_mask, + + .send_call_func_ipi = xen_smp_send_call_function_ipi, + .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, }; #endif /* CONFIG_SMP */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 42b3b9ed641..ff0aa74afaa 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -796,7 +796,7 @@ static void drop_mm_ref(struct mm_struct *mm) } if (!cpus_empty(mask)) - xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); + smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); } #else static void drop_mm_ref(struct mm_struct *mm) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index d2e3c20127d..233156f39b7 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -36,27 +36,14 @@ #include "mmu.h" cpumask_t xen_cpu_initialized_map; -static DEFINE_PER_CPU(int, resched_irq) = -1; -static DEFINE_PER_CPU(int, callfunc_irq) = -1; -static DEFINE_PER_CPU(int, debug_irq) = -1; - -/* - * Structure and data for smp_call_function(). This is designed to minimise - * static memory requirements. It also looks cleaner. - */ -static DEFINE_SPINLOCK(call_lock); -struct call_data_struct { - void (*func) (void *info); - void *info; - atomic_t started; - atomic_t finished; - int wait; -}; +static DEFINE_PER_CPU(int, resched_irq); +static DEFINE_PER_CPU(int, callfunc_irq); +static DEFINE_PER_CPU(int, callfuncsingle_irq); +static DEFINE_PER_CPU(int, debug_irq) = -1; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); - -static struct call_data_struct *call_data; +static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); /* * Reschedule call back. Nothing to do, @@ -128,6 +115,17 @@ static int xen_smp_intr_init(unsigned int cpu) goto fail; per_cpu(debug_irq, cpu) = rc; + callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, + cpu, + xen_call_function_single_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + callfunc_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(callfuncsingle_irq, cpu) = rc; + return 0; fail: @@ -137,6 +135,9 @@ static int xen_smp_intr_init(unsigned int cpu) unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); if (per_cpu(debug_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); + if (per_cpu(callfuncsingle_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); + return rc; } @@ -336,7 +337,7 @@ static void stop_self(void *v) void xen_smp_send_stop(void) { - smp_call_function(stop_self, NULL, 0, 0); + smp_call_function(stop_self, NULL, 0); } void xen_smp_send_reschedule(int cpu) @@ -344,7 +345,6 @@ void xen_smp_send_reschedule(int cpu) xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } - static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) { unsigned cpu; @@ -355,83 +355,42 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) xen_send_IPI_one(cpu, vector); } +void xen_smp_send_call_function_ipi(cpumask_t mask) +{ + int cpu; + + xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); + + /* Make sure other vcpus get a chance to run if they need to. */ + for_each_cpu_mask(cpu, mask) { + if (xen_vcpu_stolen(cpu)) { + HYPERVISOR_sched_op(SCHEDOP_yield, 0); + break; + } + } +} + +void xen_smp_send_call_function_single_ipi(int cpu) +{ + xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); +} + static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) { - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; - - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(&call_data->started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ irq_enter(); - (*func)(info); + generic_smp_call_function_interrupt(); __get_cpu_var(irq_stat).irq_call_count++; irq_exit(); - if (wait) { - mb(); /* commit everything before setting finished */ - atomic_inc(&call_data->finished); - } - return IRQ_HANDLED; } -int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), - void *info, int wait) +static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) { - struct call_data_struct data; - int cpus, cpu; - bool yield; - - /* Holding any lock stops cpus from going down. */ - spin_lock(&call_lock); - - cpu_clear(smp_processor_id(), mask); - - cpus = cpus_weight(mask); - if (!cpus) { - spin_unlock(&call_lock); - return 0; - } - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - mb(); /* write everything before IPI */ - - /* Send a message to other CPUs and wait for them to respond */ - xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); - - /* Make sure other vcpus get a chance to run if they need to. */ - yield = false; - for_each_cpu_mask(cpu, mask) - if (xen_vcpu_stolen(cpu)) - yield = true; - - if (yield) - HYPERVISOR_sched_op(SCHEDOP_yield, 0); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus || - (wait && atomic_read(&data.finished) != cpus)) - cpu_relax(); - - spin_unlock(&call_lock); + irq_enter(); + generic_smp_call_function_single_interrupt(); + __get_cpu_var(irq_stat).irq_call_count++; + irq_exit(); - return 0; + return IRQ_HANDLED; } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 64f0038b955..685b77470fc 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -197,8 +197,8 @@ unsigned long long xen_sched_clock(void) } -/* Get the CPU speed from Xen */ -unsigned long xen_cpu_khz(void) +/* Get the TSC speed from Xen */ +unsigned long xen_tsc_khz(void) { u64 xen_khz = 1000000ULL << 32; const struct pvclock_vcpu_time_info *info = diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9a055592a30..6f4b1045c1c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -32,7 +32,7 @@ void __init xen_build_dynamic_phys_to_machine(void); void xen_setup_timer(int cpu); void xen_setup_cpu_clockevents(void); -unsigned long xen_cpu_khz(void); +unsigned long xen_tsc_khz(void); void __init xen_time_init(void); unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); @@ -55,13 +55,8 @@ void xen_smp_cpus_done(unsigned int max_cpus); void xen_smp_send_stop(void); void xen_smp_send_reschedule(int cpu); -int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait); -int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait); - -int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), - void *info, int wait); +void xen_smp_send_call_function_ipi(cpumask_t mask); +void xen_smp_send_call_function_single_ipi(int cpu); extern cpumask_t xen_cpu_initialized_map; |