Diffstat (limited to 'arch/x86/kernel')
67 files changed, 2581 insertions, 1682 deletions
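A large part of this series wires ftrace into x86: the new arch/x86/kernel/ftrace.c (further down in this diff) rewrites every mcount call site at runtime into either a 5-byte near call (opcode 0xe8 plus a rel32 displacement) or a 5-byte NOP. A minimal standalone sketch of that instruction encoding, reusing the ftrace_code_union layout from the diff; the call-site and target addresses are made up purely for illustration:

#include <stdio.h>
#include <string.h>

#define MCOUNT_INSN_SIZE 5	/* one 0xe8 byte plus a 32-bit displacement */

union ftrace_code_union {
	unsigned char code[MCOUNT_INSN_SIZE];
	struct {
		unsigned char e8;
		int offset;
	} __attribute__((packed));
};

/* rel32 is measured from the end of the call instruction, as in
 * ftrace_calc_offset() in the diff */
static int ftrace_calc_offset(long ip, long addr)
{
	return (int)(addr - ip);
}

static void build_call_insn(unsigned long ip, unsigned long addr,
			    unsigned char *out)
{
	union ftrace_code_union calc;

	calc.e8 = 0xe8;	/* near call, relative */
	calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
	memcpy(out, calc.code, MCOUNT_INSN_SIZE);
}

int main(void)
{
	/* hypothetical call site and target, only to show the byte layout */
	unsigned long ip = 0x1000, target = 0x2000;
	unsigned char insn[MCOUNT_INSN_SIZE];
	int i;

	build_call_insn(ip, target, insn);
	for (i = 0; i < MCOUNT_INSN_SIZE; i++)
		printf("%02x ", insn[i]);
	printf("\n");	/* prints: e8 fb 0f 00 00 */
	return 0;
}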
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 54829e2b516..da140611bb5 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -6,6 +6,12 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) +ifdef CONFIG_FTRACE +# Do not profile debug utilities +CFLAGS_REMOVE_tsc.o = -pg +CFLAGS_REMOVE_rtc.o = -pg +endif + # # vsyscalls (which work on the user stack) should have # no stack-protector checks: @@ -13,12 +19,13 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) CFLAGS_hpet.o := $(nostackp) -CFLAGS_tsc_64.o := $(nostackp) +CFLAGS_tsc.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o +obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o @@ -26,7 +33,7 @@ obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o -obj-y += tsc_$(BITS).o io_delay.o rtc.o +obj-y += tsc.o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o @@ -56,6 +63,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 5c0107602b6..f489d7a9be9 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -37,6 +37,7 @@ #include <asm/pgtable.h> #include <asm/io_apic.h> #include <asm/apic.h> +#include <asm/genapic.h> #include <asm/io.h> #include <asm/mpspec.h> #include <asm/smp.h> @@ -83,8 +84,6 @@ int acpi_lapic; int acpi_ioapic; int acpi_strict; -static int disable_irq0_through_ioapic __initdata; - u8 acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; int acpi_skip_timer_override __initdata; @@ -108,21 +107,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; */ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; -#ifdef CONFIG_X86_64 - -/* rely on all ACPI tables being in the direct mapping */ -char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size) -{ - if (!phys_addr || !size) - return NULL; - - if (phys_addr+size <= (max_pfn_mapped << PAGE_SHIFT) + PAGE_SIZE) - return __va(phys_addr); - - return NULL; -} - -#else /* * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, @@ -141,11 +125,15 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) unsigned long base, offset, mapped_size; int idx; - if (phys + size < 8 * 1024 * 1024) + if (!phys || !size) + return NULL; + + if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT)) return __va(phys); offset = phys & (PAGE_SIZE - 1); mapped_size = PAGE_SIZE - offset; + clear_fixmap(FIX_ACPI_END); set_fixmap(FIX_ACPI_END, phys); base = fix_to_virt(FIX_ACPI_END); @@ -157,13 +145,13 @@ char *__init __acpi_map_table(unsigned long phys, unsigned 
long size) if (--idx < FIX_ACPI_BEGIN) return NULL; /* cannot handle this */ phys += PAGE_SIZE; + clear_fixmap(idx); set_fixmap(idx, phys); mapped_size += PAGE_SIZE; } return ((unsigned char *)base + offset); } -#endif #ifdef CONFIG_PCI_MMCONFIG /* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ @@ -992,10 +980,6 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) int pin; struct mp_config_intsrc mp_irq; - /* Skip the 8254 timer interrupt (IRQ 0) if requested. */ - if (bus_irq == 0 && disable_irq0_through_ioapic) - return; - /* * Convert 'gsi' to 'ioapic.pin'. */ @@ -1062,10 +1046,6 @@ void __init mp_config_acpi_legacy_irqs(void) for (i = 0; i < 16; i++) { int idx; - /* Skip the 8254 timer interrupt (IRQ 0) if requested. */ - if (i == 0 && disable_irq0_through_ioapic) - continue; - for (idx = 0; idx < mp_irq_entries; idx++) { struct mp_config_intsrc *irq = mp_irqs + idx; @@ -1373,8 +1353,6 @@ static void __init acpi_process_madt(void) return; } -#ifdef __i386__ - static int __init disable_acpi_irq(const struct dmi_system_id *d) { if (!acpi_force) { @@ -1425,13 +1403,12 @@ static int __init force_acpi_ht(const struct dmi_system_id *d) } /* - * Don't register any I/O APIC entries for the 8254 timer IRQ. + * Force ignoring BIOS IRQ0 pin2 override */ -static int __init -dmi_disable_irq0_through_ioapic(const struct dmi_system_id *d) +static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) { - pr_notice("%s detected: disabling IRQ 0 through I/O APIC\n", d->ident); - disable_irq0_through_ioapic = 1; + pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident); + acpi_skip_timer_override = 1; return 0; } @@ -1609,11 +1586,11 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { * is enabled. This input is incorrectly designated the * ISA IRQ 0 via an interrupt source override even though * it is wired to the output of the master 8259A and INTIN0 - * is not connected at all. Abandon any attempts to route - * IRQ 0 through the I/O APIC therefore. + * is not connected at all. Force ignoring BIOS IRQ0 pin2 + * override in that cases. */ { - .callback = dmi_disable_irq0_through_ioapic, + .callback = dmi_ignore_irq0_timer_override, .ident = "HP NX6125 laptop", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), @@ -1621,7 +1598,7 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { }, }, { - .callback = dmi_disable_irq0_through_ioapic, + .callback = dmi_ignore_irq0_timer_override, .ident = "HP NX6325 laptop", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), @@ -1631,8 +1608,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { {} }; -#endif /* __i386__ */ - /* * acpi_boot_table_init() and acpi_boot_init() * called from setup_arch(), always. 
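The reworked __acpi_map_table() in the hunks above now serves both 32-bit and 64-bit: ranges already covered by the low direct mapping are returned via __va(), and anything else is mapped page by page through the FIX_ACPI_BEGIN..FIX_ACPI_END fixmap slots, starting at the page containing phys. A standalone sketch of the offset/slot arithmetic it relies on; the page size and test addresses are illustrative stand-ins, not kernel values:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* How many fixmap slots a table at physical address phys spanning size
 * bytes consumes: the first slot covers PAGE_SIZE - offset bytes, each
 * further slot a full page, mirroring the loop in __acpi_map_table(). */
static unsigned long fixmap_slots_needed(unsigned long phys,
					 unsigned long size)
{
	unsigned long offset = phys & (PAGE_SIZE - 1);
	unsigned long mapped_size = PAGE_SIZE - offset;
	unsigned long slots = 1;

	while (mapped_size < size) {
		mapped_size += PAGE_SIZE;
		slots++;
	}
	return slots;
}

int main(void)
{
	/* a table starting 8 bytes before a page boundary, 40 bytes long,
	 * straddles two pages and therefore needs two slots */
	printf("%lu\n", fixmap_slots_needed(0x7fff8, 40));	/* 2 */
	printf("%lu\n", fixmap_slots_needed(0x80000, 4096));	/* 1 */
	return 0;
}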
@@ -1660,9 +1635,7 @@ int __init acpi_boot_table_init(void) { int error; -#ifdef __i386__ dmi_check_system(acpi_dmi_table); -#endif /* * If acpi_disabled, bail out diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index de2d2e4ebad..7c074eec39f 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -56,6 +56,12 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_ACPI)) buf[2] |= ACPI_PDC_T_FFH; + /* + * If mwait/monitor is unsupported, C2/C3_FFH will be disabled + */ + if (!cpu_has(c, X86_FEATURE_MWAIT)) + buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); + obj->type = ACPI_TYPE_BUFFER; obj->buffer.length = 12; obj->buffer.pointer = (u8 *) buf; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index e6a4b564cca..868de3d5c39 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -23,6 +23,15 @@ static unsigned long acpi_realmode; static char temp_stack[10240]; #endif +/* XXX: this macro should move to asm-x86/segment.h and be shared with the + boot code... */ +#define GDT_ENTRY(flags, base, limit) \ + (((u64)(base & 0xff000000) << 32) | \ + ((u64)flags << 40) | \ + ((u64)(limit & 0x00ff0000) << 32) | \ + ((u64)(base & 0x00ffffff) << 16) | \ + ((u64)(limit & 0x0000ffff))) + /** * acpi_save_state_mem - save kernel state * @@ -51,18 +60,27 @@ int acpi_save_state_mem(void) header->video_mode = saved_video_mode; header->wakeup_jmp_seg = acpi_wakeup_address >> 4; + + /* + * Set up the wakeup GDT. We set these up as Big Real Mode, + * that is, with limits set to 4 GB. At least the Lenovo + * Thinkpad X61 is known to need this for the video BIOS + * initialization quirk to work; this is likely to also + * be the case for other laptops or integrated video devices. 
+ */ + /* GDT[0]: GDT self-pointer */ header->wakeup_gdt[0] = (u64)(sizeof(header->wakeup_gdt) - 1) + ((u64)(acpi_wakeup_address + ((char *)&header->wakeup_gdt - (char *)acpi_realmode)) << 16); - /* GDT[1]: real-mode-like code segment */ - header->wakeup_gdt[1] = (0x009bULL << 40) + - ((u64)acpi_wakeup_address << 16) + 0xffff; - /* GDT[2]: real-mode-like data segment */ - header->wakeup_gdt[2] = (0x0093ULL << 40) + - ((u64)acpi_wakeup_address << 16) + 0xffff; + /* GDT[1]: big real mode-like code segment */ + header->wakeup_gdt[1] = + GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); + /* GDT[2]: big real mode-like data segment */ + header->wakeup_gdt[2] = + GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff); #ifndef CONFIG_64BIT store_gdt((struct desc_ptr *)&header->pmode_gdt); @@ -140,6 +158,8 @@ static int __init acpi_sleep_setup(char *str) acpi_realmode_flags |= 2; if (strncmp(str, "s3_beep", 7) == 0) acpi_realmode_flags |= 4; + if (strncmp(str, "old_ordering", 12) == 0) + acpi_old_suspend_ordering(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 65c7857a90d..2763cb37b55 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,6 +1,6 @@ #include <linux/module.h> #include <linux/sched.h> -#include <linux/spinlock.h> +#include <linux/mutex.h> #include <linux/list.h> #include <linux/kprobes.h> #include <linux/mm.h> @@ -143,7 +143,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { #ifdef CONFIG_X86_64 extern char __vsyscall_0; -static inline const unsigned char*const * find_nop_table(void) +const unsigned char *const *find_nop_table(void) { return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || boot_cpu_data.x86 < 6 ? 
k8_nops : p6_nops; @@ -162,7 +162,7 @@ static const struct nop { { -1, NULL } }; -static const unsigned char*const * find_nop_table(void) +const unsigned char *const *find_nop_table(void) { const unsigned char *const *noptable = intel_nops; int i; @@ -279,7 +279,7 @@ struct smp_alt_module { struct list_head next; }; static LIST_HEAD(smp_alt_modules); -static DEFINE_SPINLOCK(smp_alt); +static DEFINE_MUTEX(smp_alt); static int smp_mode = 1; /* protected by smp_alt */ void alternatives_smp_module_add(struct module *mod, char *name, @@ -312,12 +312,12 @@ void alternatives_smp_module_add(struct module *mod, char *name, __func__, smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); - spin_lock(&smp_alt); + mutex_lock(&smp_alt); list_add_tail(&smp->next, &smp_alt_modules); if (boot_cpu_has(X86_FEATURE_UP)) alternatives_smp_unlock(smp->locks, smp->locks_end, smp->text, smp->text_end); - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); } void alternatives_smp_module_del(struct module *mod) @@ -327,17 +327,17 @@ void alternatives_smp_module_del(struct module *mod) if (smp_alt_once || noreplace_smp) return; - spin_lock(&smp_alt); + mutex_lock(&smp_alt); list_for_each_entry(item, &smp_alt_modules, next) { if (mod != item->mod) continue; list_del(&item->next); - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); DPRINTK("%s: %s\n", __func__, item->name); kfree(item); return; } - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); } void alternatives_smp_switch(int smp) @@ -359,7 +359,7 @@ void alternatives_smp_switch(int smp) return; BUG_ON(!smp && (num_online_cpus() > 1)); - spin_lock(&smp_alt); + mutex_lock(&smp_alt); /* * Avoid unnecessary switches because it forces JIT based VMs to @@ -383,7 +383,7 @@ void alternatives_smp_switch(int smp) mod->text, mod->text_end); } smp_mode = smp; - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); } #endif diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 6dea8306d8c..a437d027f20 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -82,6 +82,11 @@ int pic_mode; /* Have we found an MP table */ int smp_found_config; +static struct resource lapic_resource = { + .name = "Local APIC", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + static unsigned int calibration_result; static int lapic_next_event(unsigned long delta, @@ -969,7 +974,7 @@ void __cpuinit setup_local_APIC(void) * Double-check whether this APIC is really registered. */ if (!apic_id_registered()) - BUG(); + WARN_ON_ONCE(1); /* * Intel recommends to set DFR, LDR and TPR before enabling @@ -1335,6 +1340,10 @@ void __init smp_intr_init(void) /* IPI for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* IPI for single call function */ + set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); } #endif @@ -1720,3 +1729,21 @@ static int __init apic_set_verbosity(char *str) } __setup("apic=", apic_set_verbosity); +static int __init lapic_insert_resource(void) +{ + if (!apic_phys) + return -1; + + /* Put local APIC into the resource map. 
*/ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + + return 0; +} + +/* + * need call insert after e820_reserve_resources() + * that is using request_resource + */ +late_initcall(lapic_insert_resource); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 00e6d137095..bf9b441331e 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -204,6 +204,7 @@ #include <linux/module.h> #include <linux/poll.h> +#include <linux/smp_lock.h> #include <linux/types.h> #include <linux/stddef.h> #include <linux/timer.h> @@ -1212,9 +1213,9 @@ static int suspend(int vetoable) if (err != APM_SUCCESS) apm_error("suspend", err); err = (err == APM_SUCCESS) ? 0 : -EIO; - device_power_up(); + device_power_up(PMSG_RESUME); local_irq_enable(); - device_resume(); + device_resume(PMSG_RESUME); queue_event(APM_NORMAL_RESUME, NULL); spin_lock(&user_list_lock); for (as = user_list; as != NULL; as = as->next) { @@ -1239,7 +1240,7 @@ static void standby(void) apm_error("standby", err); local_irq_disable(); - device_power_up(); + device_power_up(PMSG_RESUME); local_irq_enable(); } @@ -1325,7 +1326,7 @@ static void check_events(void) ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - device_resume(); + device_resume(PMSG_RESUME); queue_event(event, NULL); } ignore_normal_resume = 0; @@ -1549,10 +1550,12 @@ static int do_open(struct inode *inode, struct file *filp) { struct apm_user *as; + lock_kernel(); as = kmalloc(sizeof(*as), GFP_KERNEL); if (as == NULL) { printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", sizeof(*as)); + unlock_kernel(); return -ENOMEM; } as->magic = APM_BIOS_MAGIC; @@ -1574,6 +1577,7 @@ static int do_open(struct inode *inode, struct file *filp) user_list = as; spin_unlock(&user_list_lock); filp->private_data = as; + unlock_kernel(); return 0; } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 3295e7c08fe..bacf5deeec2 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -34,7 +34,7 @@ int main(void) ENTRY(pid); BLANK(); #undef ENTRY -#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) +#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry)) ENTRY(flags); ENTRY(addr_limit); ENTRY(preempt_count); diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 958526d6a74..7c36fb8a28d 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -199,10 +199,15 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * Don't do it for gbpages because there seems very little * benefit in doing so. 
*/ - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) && - (tseg >> PMD_SHIFT) < - (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT))) + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + printk(KERN_DEBUG "tseg: %010llx\n", tseg); + if ((tseg>>PMD_SHIFT) < + (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || + ((tseg>>PMD_SHIFT) < + (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && + (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) set_memory_4k((unsigned long)__va(tseg), 1); + } } } diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c index 13526fd5cce..1d181c40e2e 100644 --- a/arch/x86/kernel/cpu/centaur_64.c +++ b/arch/x86/kernel/cpu/centaur_64.c @@ -10,20 +10,12 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) { if (c->x86 == 0x6 && c->x86_model >= 0xf) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + + set_cpu_cap(c, X86_FEATURE_SYSENTER32); } static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { - /* Cache sizes */ - unsigned n; - - n = c->extended_cpuid_level; - if (n >= 0x80000008) { - unsigned eax = cpuid_eax(0x80000008); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } - if (c->x86 == 0x6 && c->x86_model >= 0xf) { c->x86_cache_alignment = c->x86_clflush_size * 2; set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 75185023529..7b8cc72feb4 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -98,7 +98,7 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c) void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { - unsigned int n, dummy, eax, ebx, ecx, edx; + unsigned int n, dummy, ebx, ecx, edx; n = c->extended_cpuid_level; @@ -121,11 +121,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", c->x86_cache_size, ecx & 0xFF); } - if (n >= 0x80000008) { - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } } void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -314,6 +309,16 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); + if (c->extended_cpuid_level >= 0x80000008) { + u32 eax = cpuid_eax(0x80000008); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } + + /* Assume all 64-bit CPUs support 32-bit syscall */ + set_cpu_cap(c, X86_FEATURE_SYSCALL32); + if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fe9224c51d3..70609efdf1d 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -226,6 +226,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has_bts) ds_init_intel(c); + +#ifdef CONFIG_X86_NUMAQ + numaq_tsc_disable(); +#endif } static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c index fcb1cc9d75c..1019c58d39f 100644 --- a/arch/x86/kernel/cpu/intel_64.c +++ b/arch/x86/kernel/cpu/intel_64.c @@ -12,6 +12,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + + set_cpu_cap(c, X86_FEATURE_SYSENTER32); } /* @@ -52,9 +54,6 @@ static void 
__cpuinit srat_detect_node(void) static void __cpuinit init_intel(struct cpuinfo_x86 *c) { - /* Cache sizes */ - unsigned n; - init_intel_cacheinfo(c); if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); @@ -76,13 +75,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has_bts) ds_init_intel(c); - n = c->extended_cpuid_level; - if (n >= 0x80000008) { - unsigned eax = cpuid_eax(0x80000008); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } - if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; if (c->x86 == 6) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 501ca1cea27..c4a7ec31394 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -9,6 +9,7 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/smp_lock.h> #include <linux/string.h> #include <linux/rcupdate.h> #include <linux/kallsyms.h> @@ -363,7 +364,7 @@ static void mcheck_check_cpu(void *info) static void mcheck_timer(struct work_struct *work) { - on_each_cpu(mcheck_check_cpu, NULL, 1, 1); + on_each_cpu(mcheck_check_cpu, NULL, 1); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -532,10 +533,12 @@ static int open_exclu; /* already open exclusive? */ static int mce_open(struct inode *inode, struct file *file) { + lock_kernel(); spin_lock(&mce_state_lock); if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { spin_unlock(&mce_state_lock); + unlock_kernel(); return -EBUSY; } @@ -544,6 +547,7 @@ static int mce_open(struct inode *inode, struct file *file) open_count++; spin_unlock(&mce_state_lock); + unlock_kernel(); return nonseekable_open(inode, file); } @@ -617,7 +621,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, * Collect entries that were still getting written before the * synchronize. 
*/ - on_each_cpu(collect_tscs, cpu_tsc, 1, 1); + on_each_cpu(collect_tscs, cpu_tsc, 1); for (i = next; i < MCE_LOG_LEN; i++) { if (mcelog.entry[i].finished && mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { @@ -742,7 +746,7 @@ static void mce_restart(void) if (next_interval) cancel_delayed_work(&mcheck_work); /* Timer race is harmless here */ - on_each_cpu(mce_init, NULL, 1, 1); + on_each_cpu(mce_init, NULL, 1); next_interval = check_interval * HZ; if (next_interval) schedule_delayed_work(&mcheck_work, diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index 00ccb6c14ec..cc1fccdd31e 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -59,7 +59,7 @@ static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); static void mce_work_fn(struct work_struct *work) { - on_each_cpu(mce_checkregs, NULL, 1, 1); + on_each_cpu(mce_checkregs, NULL, 1); schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 105afe12beb..6f23969c8fa 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -223,7 +223,7 @@ static void set_mtrr(unsigned int reg, unsigned long base, atomic_set(&data.gate,0); /* Start the ball rolling on other CPUs */ - if (smp_call_function(ipi_handler, &data, 1, 0) != 0) + if (smp_call_function(ipi_handler, &data, 0) != 0) panic("mtrr: timed out waiting for other CPUs\n"); local_irq_save(flags); @@ -1682,7 +1682,7 @@ void mtrr_ap_init(void) */ void mtrr_save_state(void) { - smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1); + smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); } static int __init mtrr_init_finialize(void) diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 2e9bef6e3aa..6d4bdc02388 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -189,7 +189,7 @@ void disable_lapic_nmi_watchdog(void) if (atomic_read(&nmi_active) <= 0) return; - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); + on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); if (wd_ops) wd_ops->unreserve(); @@ -213,7 +213,7 @@ void enable_lapic_nmi_watchdog(void) return; } - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + on_each_cpu(setup_apic_nmi_watchdog, NULL, 1); touch_nmi_watchdog(); } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index daff52a6224..2de5fa2bbf7 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -33,6 +33,7 @@ #include <linux/init.h> #include <linux/poll.h> #include <linux/smp.h> +#include <linux/smp_lock.h> #include <linux/major.h> #include <linux/fs.h> #include <linux/smp_lock.h> @@ -95,7 +96,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, for (; count; count -= 16) { cmd.eax = pos; cmd.ecx = pos >> 32; - smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); + smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); if (copy_to_user(tmp, &cmd, 16)) return -EFAULT; tmp += 16; @@ -107,15 +108,23 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, static int cpuid_open(struct inode *inode, struct file *file) { - unsigned int cpu = iminor(file->f_path.dentry->d_inode); - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (cpu >= NR_CPUS || !cpu_online(cpu)) - return -ENXIO; /* No such CPU */ + unsigned int cpu; + struct cpuinfo_x86 *c; + int ret = 0; + + lock_kernel(); + + cpu = 
iminor(file->f_path.dentry->d_inode); + if (cpu >= NR_CPUS || !cpu_online(cpu)) { + ret = -ENXIO; /* No such CPU */ + goto out; + } + c = &cpu_data(cpu); if (c->cpuid_level < 0) - return -EIO; /* CPUID not supported */ - - return 0; + ret = -EIO; /* CPUID not supported */ +out: + unlock_kernel(); + return ret; } /* diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d0335853ff5..28c29180b38 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -19,6 +19,7 @@ #include <linux/mm.h> #include <linux/pfn.h> #include <linux/suspend.h> +#include <linux/firmware-map.h> #include <asm/pgtable.h> #include <asm/page.h> @@ -27,7 +28,22 @@ #include <asm/setup.h> #include <asm/trampoline.h> +/* + * The e820 map is the map that gets modified e.g. with command line parameters + * and that is also registered with modifications in the kernel resource tree + * with the iomem_resource as parent. + * + * The e820_saved is directly saved after the BIOS-provided memory map is + * copied. It doesn't get modified afterwards. It's registered for the + * /sys/firmware/memmap interface. + * + * That memory map is not modified and is used as base for kexec. The kexec'd + * kernel should get the same memory map as the firmware provides. Then the + * user can e.g. boot the original kernel with mem=1G while still booting the + * next kernel with full memory. + */ struct e820map e820; +struct e820map e820_saved; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -398,8 +414,9 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map) return __append_e820_map(biosmap, nr_map); } -u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, - unsigned new_type) +static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, + u64 size, unsigned old_type, + unsigned new_type) { int i; u64 real_updated_size = 0; @@ -410,7 +427,7 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, size = ULLONG_MAX - start; for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + struct e820entry *ei = &e820x->map[i]; u64 final_start, final_end; if (ei->type != old_type) continue; @@ -438,6 +455,19 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, return real_updated_size; } +u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, + unsigned new_type) +{ + return e820_update_range_map(&e820, start, size, old_type, new_type); +} + +static u64 __init e820_update_range_saved(u64 start, u64 size, + unsigned old_type, unsigned new_type) +{ + return e820_update_range_map(&e820_saved, start, size, old_type, + new_type); +} + /* make e820 not cover the range */ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, int checktype) @@ -487,6 +517,15 @@ void __init update_e820(void) printk(KERN_INFO "modified physical RAM map:\n"); e820_print_map("modified"); } +static void __init update_e820_saved(void) +{ + int nr_map; + + nr_map = e820_saved.nr_map; + if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) + return; + e820_saved.nr_map = nr_map; +} #define MAX_GAP_END 0x100000000ull /* * Search for a gap in the e820 memory space from start_addr to end_addr. 
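The comment block above introduces the split between the live e820 map and e820_saved, with e820_update_range_map() as the common worker so the same range update can be applied to either map (early_reserve_e820() in the next hunk updates both). A minimal sketch of that two-map pattern with a trimmed-down e820 structure; unlike the real worker it only retypes fully covered entries, and the sizes and types are illustrative:

#include <stdio.h>

enum { E820_RAM = 1, E820_RESERVED = 2 };

struct e820entry {
	unsigned long long addr, size;
	unsigned type;
};

struct e820map {
	int nr_map;
	struct e820entry map[8];
};

static struct e820map e820;		/* live map, modified by mem=/memmap= */
static struct e820map e820_saved;	/* pristine BIOS map, kept for kexec */

/* common worker: retype entries of old_type fully inside [start, start+size) */
static void update_range_map(struct e820map *e820x, unsigned long long start,
			     unsigned long long size, unsigned old_type,
			     unsigned new_type)
{
	int i;

	for (i = 0; i < e820x->nr_map; i++) {
		struct e820entry *ei = &e820x->map[i];

		if (ei->type != old_type)
			continue;
		if (ei->addr >= start && ei->addr + ei->size <= start + size)
			ei->type = new_type;
	}
}

int main(void)
{
	e820.nr_map = 1;
	e820.map[0] = (struct e820entry){ 0x100000, 0x1000, E820_RAM };
	e820_saved = e820;

	/* an early reservation is applied to both maps, as in the diff */
	update_range_map(&e820, 0x100000, 0x1000, E820_RAM, E820_RESERVED);
	update_range_map(&e820_saved, 0x100000, 0x1000, E820_RAM, E820_RESERVED);

	printf("type now %u / %u\n", e820.map[0].type,
	       e820_saved.map[0].type);	/* 2 / 2 */
	return 0;
}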
@@ -991,8 +1030,10 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) addr = round_down(start + size - sizet, align); e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); + e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); printk(KERN_INFO "update e820 for early_reserve_e820\n"); update_e820(); + update_e820_saved(); return addr; } @@ -1008,30 +1049,51 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) #endif /* - * Last pfn which the user wants to use. - */ -unsigned long __initdata end_user_pfn = MAX_ARCH_PFN; - -/* * Find the highest page frame number we have available */ -unsigned long __init e820_end_of_ram(void) +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) { - unsigned long last_pfn; + int i; + unsigned long last_pfn = 0; unsigned long max_arch_pfn = MAX_ARCH_PFN; - last_pfn = find_max_pfn_with_active_regions(); + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long start_pfn; + unsigned long end_pfn; + + if (ei->type != type) + continue; + + start_pfn = ei->addr >> PAGE_SHIFT; + end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT; + + if (start_pfn >= limit_pfn) + continue; + if (end_pfn > limit_pfn) { + last_pfn = limit_pfn; + break; + } + if (end_pfn > last_pfn) + last_pfn = end_pfn; + } if (last_pfn > max_arch_pfn) last_pfn = max_arch_pfn; - if (last_pfn > end_user_pfn) - last_pfn = end_user_pfn; printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", last_pfn, max_arch_pfn); return last_pfn; } +unsigned long __init e820_end_of_ram_pfn(void) +{ + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); +} +unsigned long __init e820_end_of_low_ram_pfn(void) +{ + return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); +} /* * Finds an active region in the address range from start_pfn to last_pfn and * returns its range in ei_startpfn and ei_endpfn for the e820 entry. @@ -1062,12 +1124,6 @@ int __init e820_find_active_region(const struct e820entry *ei, if (*ei_endpfn > last_pfn) *ei_endpfn = last_pfn; - /* Obey end_user_pfn to save on memmap */ - if (*ei_startpfn >= end_user_pfn) - return 0; - if (*ei_endpfn > end_user_pfn) - *ei_endpfn = end_user_pfn; - return 1; } @@ -1113,6 +1169,8 @@ static void early_panic(char *msg) panic(msg); } +static int userdef __initdata; + /* "mem=nopentium" disables the 4MB page tables. */ static int __init parse_memopt(char *p) { @@ -1128,22 +1186,22 @@ static int __init parse_memopt(char *p) } #endif + userdef = 1; mem_size = memparse(p, &p); - end_user_pfn = mem_size>>PAGE_SHIFT; - e820_update_range(mem_size, ULLONG_MAX - mem_size, - E820_RAM, E820_RESERVED); + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); return 0; } early_param("mem", parse_memopt); -static int userdef __initdata; - static int __init parse_memmap_opt(char *p) { char *oldp; u64 start_at, mem_size; + if (!p) + return -EINVAL; + if (!strcmp(p, "exactmap")) { #ifdef CONFIG_CRASH_DUMP /* @@ -1151,9 +1209,7 @@ static int __init parse_memmap_opt(char *p) * the real mem size before original memory map is * reset. 
*/ - e820_register_active_regions(0, 0, -1UL); - saved_max_pfn = e820_end_of_ram(); - remove_all_active_ranges(); + saved_max_pfn = e820_end_of_ram_pfn(); #endif e820.nr_map = 0; userdef = 1; @@ -1175,11 +1231,9 @@ static int __init parse_memmap_opt(char *p) } else if (*p == '$') { start_at = memparse(p+1, &p); e820_add_region(start_at, mem_size, E820_RESERVED); - } else { - end_user_pfn = (mem_size >> PAGE_SHIFT); - e820_update_range(mem_size, ULLONG_MAX - mem_size, - E820_RAM, E820_RESERVED); - } + } else + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + return *p == '\0' ? 0 : -EINVAL; } early_param("memmap", parse_memmap_opt); @@ -1198,6 +1252,17 @@ void __init finish_e820_parsing(void) } } +static inline const char *e820_type_to_string(int e820_type) +{ + switch (e820_type) { + case E820_RESERVED_KERN: + case E820_RAM: return "System RAM"; + case E820_ACPI: return "ACPI Tables"; + case E820_NVS: return "ACPI Non-volatile Storage"; + default: return "reserved"; + } +} + /* * Mark e820 reserved areas as busy for the resource manager. */ @@ -1209,13 +1274,6 @@ void __init e820_reserve_resources(void) res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); for (i = 0; i < e820.nr_map; i++) { - switch (e820.map[i].type) { - case E820_RESERVED_KERN: - case E820_RAM: res->name = "System RAM"; break; - case E820_ACPI: res->name = "ACPI Tables"; break; - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; - default: res->name = "reserved"; - } end = e820.map[i].addr + e820.map[i].size - 1; #ifndef CONFIG_RESOURCES_64BIT if (end > 0x100000000ULL) { @@ -1223,6 +1281,7 @@ void __init e820_reserve_resources(void) continue; } #endif + res->name = e820_type_to_string(e820.map[i].type); res->start = e820.map[i].addr; res->end = end; @@ -1230,8 +1289,20 @@ void __init e820_reserve_resources(void) insert_resource(&iomem_resource, res); res++; } + + for (i = 0; i < e820_saved.nr_map; i++) { + struct e820entry *entry = &e820_saved.map[i]; + firmware_map_add_early(entry->addr, + entry->addr + entry->size - 1, + e820_type_to_string(entry->type)); + } } +/* + * Non-standard memory setup can be specified via this quirk: + */ +char * (*arch_memory_setup_quirk)(void); + char *__init default_machine_specific_memory_setup(void) { char *who = "BIOS-e820"; @@ -1272,6 +1343,12 @@ char *__init default_machine_specific_memory_setup(void) char *__init __attribute__((weak)) machine_specific_memory_setup(void) { + if (arch_memory_setup_quirk) { + char *who = arch_memory_setup_quirk(); + + if (who) + return who; + } return default_machine_specific_memory_setup(); } @@ -1283,8 +1360,12 @@ char * __init __attribute__((weak)) memory_setup(void) void __init setup_memory_map(void) { + char *who; + + who = memory_setup(); + memcpy(&e820_saved, &e820, sizeof(struct e820map)); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map(memory_setup()); + e820_print_map(who); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index a4665f37cfc..a0e11c0cc87 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -120,7 +120,18 @@ static struct chipset early_qrk[] __initdata = { {} }; -static void __init check_dev_quirk(int num, int slot, int func) +/** + * check_dev_quirk - apply early quirks to a given PCI device + * @num: bus number + * @slot: slot number + * @func: PCI function + * + * Check the vendor & device ID against the early quirks table. 
+ * + * If the device is single function, let early_quirks() know so we don't + * poke at this device again. + */ +static int __init check_dev_quirk(int num, int slot, int func) { u16 class; u16 vendor; @@ -131,7 +142,7 @@ static void __init check_dev_quirk(int num, int slot, int func) class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); if (class == 0xffff) - return; + return -1; /* no class, treat as single function */ vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID); @@ -154,7 +165,9 @@ static void __init check_dev_quirk(int num, int slot, int func) type = read_pci_config_byte(num, slot, func, PCI_HEADER_TYPE); if (!(type & 0x80)) - return; + return -1; + + return 0; } void __init early_quirks(void) @@ -167,6 +180,9 @@ void __init early_quirks(void) /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) for (slot = 0; slot < 32; slot++) - for (func = 0; func < 8; func++) - check_dev_quirk(num, slot, func); + for (func = 0; func < 8; func++) { + /* Only probe function 0 on single fn devices */ + if (check_dev_quirk(num, slot, func)) + break; + } } diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 643fd861b72..ff9e7350da5 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -196,7 +196,7 @@ static struct console simnow_console = { static struct console *early_console = &early_vga_console; static int early_console_initialized; -void early_printk(const char *fmt, ...) +asmlinkage void early_printk(const char *fmt, ...) { char buf[512]; int n; diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 94382faeadb..06cc8d4254b 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -473,7 +473,7 @@ void __init efi_enter_virtual_mode(void) size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; - if (PFN_UP(end) <= max_pfn_mapped) + if (PFN_UP(end) <= max_low_pfn_mapped) va = __va(md->phys_addr); else va = efi_ioremap(md->phys_addr, size); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 53393c306e1..6bc07f0f120 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -51,6 +51,7 @@ #include <asm/percpu.h> #include <asm/dwarf2.h> #include <asm/processor-flags.h> +#include <asm/ftrace.h> #include <asm/irq_vectors.h> /* @@ -1024,6 +1025,7 @@ ENTRY(xen_sysenter_target) RING0_INT_FRAME addl $5*4, %esp /* remove xen-provided frame */ jmp sysenter_past_esp + CFI_ENDPROC ENTRY(xen_hypervisor_callback) CFI_STARTPROC @@ -1110,6 +1112,77 @@ ENDPROC(xen_failsafe_callback) #endif /* CONFIG_XEN */ +#ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + subl $MCOUNT_INSN_SIZE, %eax + +.globl mcount_call +mcount_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +#else /* ! 
CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $ftrace_stub, ftrace_trace_function + jnz trace +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ + .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 07d69f26233..ae63e584c34 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -51,9 +51,115 @@ #include <asm/page.h> #include <asm/irqflags.h> #include <asm/paravirt.h> +#include <asm/ftrace.h> .code64 +#ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +ENTRY(mcount) + + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + subq $MCOUNT_INSN_SIZE, %rdi + +.globl mcount_call +mcount_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + retq +END(mcount) + +ENTRY(ftrace_caller) + + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + +.globl ftrace_stub +ftrace_stub: + retq +END(ftrace_caller) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ +ENTRY(mcount) + cmpq $ftrace_stub, ftrace_trace_function + jnz trace +.globl ftrace_stub +ftrace_stub: + retq + +trace: + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi + + call *ftrace_trace_function + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif @@ -168,13 +274,13 @@ ENTRY(ret_from_fork) CFI_ADJUST_CFA_OFFSET -4 call schedule_tail GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) jnz rff_trace rff_action: RESTORE_REST testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? 
je int_ret_from_sys_call - testl $_TIF_IA32,threadinfo_flags(%rcx) + testl $_TIF_IA32,TI_flags(%rcx) jnz int_ret_from_sys_call RESTORE_TOP_OF_STACK %rdi,ARGOFFSET jmp ret_from_sys_call @@ -243,7 +349,8 @@ ENTRY(system_call_after_swapgs) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ + TI_flags(%rcx) jnz tracesys cmpq $__NR_syscall_max,%rax ja badsys @@ -262,7 +369,7 @@ sysret_check: GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - movl threadinfo_flags(%rcx),%edx + movl TI_flags(%rcx),%edx andl %edi,%edx jnz sysret_careful CFI_REMEMBER_STATE @@ -305,7 +412,7 @@ sysret_signal: leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $_TIF_WORK_MASK,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ DISABLE_INTERRUPTS(CLBR_NONE) @@ -347,10 +454,10 @@ int_ret_from_sys_call: int_with_check: LOCKDEP_SYS_EXIT_IRQ GET_THREAD_INFO(%rcx) - movl threadinfo_flags(%rcx),%edx + movl TI_flags(%rcx),%edx andl %edi,%edx jnz int_careful - andl $~TS_COMPAT,threadinfo_status(%rcx) + andl $~TS_COMPAT,TI_status(%rcx) jmp retint_swapgs /* Either reschedule or signal or syscall exit tracking needed. */ @@ -393,7 +500,7 @@ int_signal: movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $_TIF_WORK_MASK,%edi int_restore_rest: RESTORE_REST DISABLE_INTERRUPTS(CLBR_NONE) @@ -558,7 +665,7 @@ retint_with_reschedule: movl $_TIF_WORK_MASK,%edi retint_check: LOCKDEP_SYS_EXIT_IRQ - movl threadinfo_flags(%rcx),%edx + movl TI_flags(%rcx),%edx andl %edi,%edx CFI_REMEMBER_STATE jnz retint_careful @@ -646,17 +753,16 @@ retint_signal: RESTORE_REST DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - movl $_TIF_NEED_RESCHED,%edi GET_THREAD_INFO(%rcx) - jmp retint_check + jmp retint_with_reschedule #ifdef CONFIG_PREEMPT /* Returning to kernel space. Check if we need preemption */ /* rcx: threadinfo. interrupts off. */ ENTRY(retint_kernel) - cmpl $0,threadinfo_preempt_count(%rcx) + cmpl $0,TI_preempt_count(%rcx) jnz retint_restore_args - bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) + bt $TIF_NEED_RESCHED,TI_flags(%rcx) jnc retint_restore_args bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? 
*/ jnc retint_restore_args @@ -710,6 +816,9 @@ END(invalidate_interrupt\num) ENTRY(call_function_interrupt) apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt END(call_function_interrupt) +ENTRY(call_function_single_interrupt) + apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt +END(call_function_single_interrupt) ENTRY(irq_move_cleanup_interrupt) apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt END(irq_move_cleanup_interrupt) @@ -819,7 +928,7 @@ paranoid_restore\trace: jmp irq_return paranoid_userspace\trace: GET_THREAD_INFO(%rcx) - movl threadinfo_flags(%rcx),%ebx + movl TI_flags(%rcx),%ebx andl $_TIF_WORK_MASK,%ebx jz paranoid_swapgs\trace movq %rsp,%rdi /* &pt_regs */ @@ -917,7 +1026,7 @@ error_exit: testl %eax,%eax jne retint_kernel LOCKDEP_SYS_EXIT_IRQ - movl threadinfo_flags(%rcx),%edx + movl TI_flags(%rcx),%edx movl $_TIF_WORK_MASK,%edi andl %edi,%edx jnz retint_careful diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c new file mode 100644 index 00000000000..ab115cd15fd --- /dev/null +++ b/arch/x86/kernel/ftrace.c @@ -0,0 +1,141 @@ +/* + * Code for replacing ftrace calls with jumps. + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * + * Thanks goes to Ingo Molnar, for suggesting the idea. + * Mathieu Desnoyers, for suggesting postponing the modifications. + * Arjan van de Ven, for keeping me straight, and explaining to me + * the dangers of modifying code on the run. + */ + +#include <linux/spinlock.h> +#include <linux/hardirq.h> +#include <linux/ftrace.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/list.h> + +#include <asm/alternative.h> +#include <asm/ftrace.h> + + +/* Long is fine, even if it is only 4 bytes ;-) */ +static long *ftrace_nop; + +union ftrace_code_union { + char code[MCOUNT_INSN_SIZE]; + struct { + char e8; + int offset; + } __attribute__((packed)); +}; + + +static int notrace ftrace_calc_offset(long ip, long addr) +{ + return (int)(addr - ip); +} + +notrace unsigned char *ftrace_nop_replace(void) +{ + return (char *)ftrace_nop; +} + +notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + static union ftrace_code_union calc; + + calc.e8 = 0xe8; + calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); + + /* + * No locking needed, this must be called via kstop_machine + * which in essence is like running on a uniprocessor machine. + */ + return calc.code; +} + +notrace int +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code) +{ + unsigned replaced; + unsigned old = *(unsigned *)old_code; /* 4 bytes */ + unsigned new = *(unsigned *)new_code; /* 4 bytes */ + unsigned char newch = new_code[4]; + int faulted = 0; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. + * + * No real locking needed, this code is run through + * kstop_machine. 
+ */ + asm volatile ( + "1: lock\n" + " cmpxchg %3, (%2)\n" + " jnz 2f\n" + " movb %b4, 4(%2)\n" + "2:\n" + ".section .fixup, \"ax\"\n" + "3: movl $1, %0\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : "=r"(faulted), "=a"(replaced) + : "r"(ip), "r"(new), "c"(newch), + "0"(faulted), "a"(old) + : "memory"); + sync_core(); + + if (replaced != old && replaced != new) + faulted = 2; + + return faulted; +} + +notrace int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char old[MCOUNT_INSN_SIZE], *new; + int ret; + + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = ftrace_modify_code(ip, old, new); + + return ret; +} + +notrace int ftrace_mcount_set(unsigned long *data) +{ + unsigned long ip = (long)(&mcount_call); + unsigned long *addr = data; + unsigned char old[MCOUNT_INSN_SIZE], *new; + + /* + * Replace the mcount stub with a pointer to the + * ip recorder function. + */ + memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); + new = ftrace_call_replace(ip, *addr); + *addr = ftrace_modify_code(ip, old, new); + + return 0; +} + +int __init ftrace_dyn_arch_init(void *data) +{ + const unsigned char *const *noptable = find_nop_table(); + + /* This is running in kstop_machine */ + + ftrace_mcount_set(data); + + ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE]; + + return 0; +} diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 45e84acca8a..711f11c30b0 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -8,6 +8,7 @@ * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. */ +#include <linux/kernel.h> #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/string.h> @@ -20,6 +21,7 @@ #include <asm/smp.h> #include <asm/ipi.h> #include <asm/genapic.h> +#include <asm/pgtable.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> @@ -208,14 +210,79 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) BUG(); } +static __init void map_low_mmrs(void) +{ + init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); + init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); +} + +enum map_type {map_wb, map_uc}; + +static void map_high(char *id, unsigned long base, int shift, enum map_type map_type) +{ + unsigned long bytes, paddr; + + paddr = base << shift; + bytes = (1UL << shift); + printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, + paddr + bytes); + if (map_type == map_uc) + init_extra_mapping_uc(paddr, bytes); + else + init_extra_mapping_wb(paddr, bytes); + +} +static __init void map_gru_high(int max_pnode) +{ + union uvh_rh_gam_gru_overlay_config_mmr_u gru; + int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; + + gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); + if (gru.s.enable) + map_high("GRU", gru.s.base, shift, map_wb); +} + +static __init void map_config_high(int max_pnode) +{ + union uvh_rh_gam_cfg_overlay_config_mmr_u cfg; + int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT; + + cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); + if (cfg.s.enable) + map_high("CONFIG", cfg.s.base, shift, map_uc); +} + +static __init void map_mmr_high(int max_pnode) +{ + union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; + int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; + + mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); + if (mmr.s.enable) + map_high("MMR", mmr.s.base, 
shift, map_uc); +} + +static __init void map_mmioh_high(int max_pnode) +{ + union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; + int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + + mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); + if (mmioh.s.enable) + map_high("MMIOH", mmioh.s.base, shift, map_uc); +} + static __init void uv_system_init(void) { union uvh_si_addr_map_config_u m_n_config; union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; + int max_pnode = 0; unsigned long mmr_base, present; + map_low_mmrs(); + m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; @@ -281,12 +348,18 @@ static __init void uv_system_init(void) uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; + max_pnode = max(pnode, max_pnode); - printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, pnode %d, nid %d, " + printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " "lcpu %d, blade %d\n", cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, lcpu, blade); } + + map_gru_high(max_pnode); + map_mmr_high(max_pnode); + map_config_high(max_pnode); + map_mmioh_high(max_pnode); } /* diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ea230ec6905..0ea6a19bfdf 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -36,26 +36,15 @@ static inline void hpet_writel(unsigned long d, unsigned long a) } #ifdef CONFIG_X86_64 - #include <asm/pgtable.h> - -static inline void hpet_set_mapping(void) -{ - set_fixmap_nocache(FIX_HPET_BASE, hpet_address); - __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); - hpet_virt_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); -} - -static inline void hpet_clear_mapping(void) -{ - hpet_virt_address = NULL; -} - -#else +#endif static inline void hpet_set_mapping(void) { hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); +#ifdef CONFIG_X86_64 + __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); +#endif } static inline void hpet_clear_mapping(void) @@ -63,7 +52,6 @@ static inline void hpet_clear_mapping(void) iounmap(hpet_virt_address); hpet_virt_address = NULL; } -#endif /* * HPET command line enable / disable diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index deb43785e92..dd7ebee446a 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -1,7 +1,14 @@ #include <linux/module.h> + #include <asm/checksum.h> -#include <asm/desc.h> #include <asm/pgtable.h> +#include <asm/desc.h> +#include <asm/ftrace.h> + +#ifdef CONFIG_FTRACE +/* mcount is defined in assembly */ +EXPORT_SYMBOL(mcount); +#endif /* Networking helper routines. 
*/ EXPORT_SYMBOL(csum_partial_copy_generic); diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 337ec3438a8..558abf4c796 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1569,7 +1569,7 @@ void /*__init*/ print_local_APIC(void *dummy) void print_all_local_APICs(void) { - on_each_cpu(print_local_APIC, NULL, 1, 1); + on_each_cpu(print_local_APIC, NULL, 1); } void /*__init*/ print_PIC(void) @@ -2020,7 +2020,7 @@ static inline void init_IO_APIC_traps(void) * The local APIC irq-chip implementation: */ -static void ack_apic(unsigned int irq) +static void ack_lapic_irq(unsigned int irq) { ack_APIC_irq(); } @@ -2045,9 +2045,17 @@ static struct irq_chip lapic_chip __read_mostly = { .name = "local-APIC", .mask = mask_lapic_irq, .unmask = unmask_lapic_irq, - .eoi = ack_apic, + .ack = ack_lapic_irq, }; +static void lapic_register_intr(int irq, int vector) +{ + irq_desc[irq].status &= ~IRQ_LEVEL; + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + "edge"); + set_intr_gate(vector, interrupt[irq]); +} + static void __init setup_nmi(void) { /* @@ -2247,8 +2255,7 @@ static inline void __init check_timer(void) printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); - set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, - "fasteoi"); + lapic_register_intr(0, vector); apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); @@ -2280,11 +2287,21 @@ out: } /* - * - * IRQ's that are handled by the PIC in the MPS IOAPIC case. - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. - * Linux doesn't really care, as it's not actually used - * for any interrupt handling anyway. + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available + * to devices. However there may be an I/O APIC pin available for + * this interrupt regardless. The pin may be left unconnected, but + * typically it will be reused as an ExtINT cascade interrupt for + * the master 8259A. In the MPS case such a pin will normally be + * reported as an ExtINT interrupt in the MP table. With ACPI + * there is no provision for ExtINT interrupts, and in the absence + * of an override it would be treated as an ordinary ISA I/O APIC + * interrupt, that is edge-triggered and unmasked by default. We + * used to do this, but it caused problems on some systems because + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using + * the same ExtINT cascade interrupt to drive the local APIC of the + * bootstrap processor. Therefore we refrain from routing IRQ2 to + * the I/O APIC in all cases now. No actual device should request + * it anyway. 
--macro */ #define PIC_IRQS (1 << PIC_CASCADE_IR) @@ -2298,10 +2315,7 @@ void __init setup_IO_APIC(void) enable_IO_APIC(); - if (acpi_ioapic) - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ - else - io_apic_irqs = ~PIC_IRQS; + io_apic_irqs = ~PIC_IRQS; printk("ENABLING IO-APIC IRQs\n"); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 2b4c40bc12c..6510cde36b3 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1160,7 +1160,7 @@ void __apicdebuginit print_local_APIC(void * dummy) void print_all_local_APICs (void) { - on_each_cpu(print_local_APIC, NULL, 1, 1); + on_each_cpu(print_local_APIC, NULL, 1); } void __apicdebuginit print_PIC(void) @@ -1554,7 +1554,7 @@ static inline void init_IO_APIC_traps(void) } } -static void enable_lapic_irq (unsigned int irq) +static void unmask_lapic_irq(unsigned int irq) { unsigned long v; @@ -1562,7 +1562,7 @@ static void enable_lapic_irq (unsigned int irq) apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } -static void disable_lapic_irq (unsigned int irq) +static void mask_lapic_irq(unsigned int irq) { unsigned long v; @@ -1575,19 +1575,20 @@ static void ack_lapic_irq (unsigned int irq) ack_APIC_irq(); } -static void end_lapic_irq (unsigned int i) { /* nothing */ } - -static struct hw_interrupt_type lapic_irq_type __read_mostly = { - .name = "local-APIC", - .typename = "local-APIC-edge", - .startup = NULL, /* startup_irq() not used for IRQ0 */ - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ - .enable = enable_lapic_irq, - .disable = disable_lapic_irq, - .ack = ack_lapic_irq, - .end = end_lapic_irq, +static struct irq_chip lapic_chip __read_mostly = { + .name = "local-APIC", + .mask = mask_lapic_irq, + .unmask = unmask_lapic_irq, + .ack = ack_lapic_irq, }; +static void lapic_register_intr(int irq) +{ + irq_desc[irq].status &= ~IRQ_LEVEL; + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + "edge"); +} + static void __init setup_nmi(void) { /* @@ -1714,11 +1715,6 @@ static inline void __init check_timer(void) apic2 = apic1; } - replace_pin_at_irq(0, 0, 0, apic1, pin1); - apic1 = 0; - pin1 = 0; - setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); - if (pin1 != -1) { /* * Ok, does IRQ0 through the IOAPIC work? @@ -1779,7 +1775,7 @@ static inline void __init check_timer(void) apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); - irq_desc[0].chip = &lapic_irq_type; + lapic_register_intr(0); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); @@ -1817,11 +1813,21 @@ static int __init notimercheck(char *s) __setup("no_timer_check", notimercheck); /* - * - * IRQs that are handled by the PIC in the MPS IOAPIC case. - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. - * Linux doesn't really care, as it's not actually used - * for any interrupt handling anyway. + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available + * to devices. However there may be an I/O APIC pin available for + * this interrupt regardless. The pin may be left unconnected, but + * typically it will be reused as an ExtINT cascade interrupt for + * the master 8259A. In the MPS case such a pin will normally be + * reported as an ExtINT interrupt in the MP table. With ACPI + * there is no provision for ExtINT interrupts, and in the absence + * of an override it would be treated as an ordinary ISA I/O APIC + * interrupt, that is edge-triggered and unmasked by default. 
We + * used to do this, but it caused problems on some systems because + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using + * the same ExtINT cascade interrupt to drive the local APIC of the + * bootstrap processor. Therefore we refrain from routing IRQ2 to + * the I/O APIC in all cases now. No actual device should request + * it anyway. --macro */ #define PIC_IRQS (1<<2) @@ -1832,10 +1838,7 @@ void __init setup_IO_APIC(void) * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - if (acpi_ioapic) - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ - else - io_apic_irqs = ~PIC_IRQS; + io_apic_irqs = ~PIC_IRQS; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 31f49e8f46a..0373e88de95 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -199,6 +199,10 @@ void __init native_init_IRQ(void) /* IPI for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + /* IPI for generic single function call */ + alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); + /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); #endif diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 21f2bae98c1..a8449571858 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -68,7 +68,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) load_LDT(pc); mask = cpumask_of_cpu(smp_processor_id()); if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, current->mm, 1, 1); + smp_call_function(flush_ldt, current->mm, 1); preempt_enable(); #else load_LDT(pc); diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index f4960171bc6..8864230d55a 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -11,6 +11,8 @@ #include <linux/delay.h> #include <linux/init.h> #include <linux/numa.h> +#include <linux/ftrace.h> + #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> @@ -107,6 +109,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) unsigned long page_list[PAGES_NR]; void *control_page; + tracer_disable(); + /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 7830dc4a838..9dd9262693a 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -11,6 +11,8 @@ #include <linux/string.h> #include <linux/reboot.h> #include <linux/numa.h> +#include <linux/ftrace.h> + #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -184,6 +186,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) unsigned long page_list[PAGES_NR]; void *control_page; + tracer_disable(); + /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 9758fea87c5..56b933119a0 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -76,6 +76,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/sched.h> +#include <linux/smp_lock.h> #include <linux/cpumask.h> #include <linux/module.h> #include <linux/slab.h> @@ -423,6 +424,7 @@ out: static int microcode_open (struct inode *unused1, struct file *unused2) { + cycle_kernel_lock(); return capable(CAP_SYS_RAWIO) 
? 0 : -EPERM; } @@ -489,7 +491,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); #define microcode_dev_exit() do { } while(0) #endif -static long get_next_ucode_from_buffer(void **mc, void *buf, +static long get_next_ucode_from_buffer(void **mc, const u8 *buf, unsigned long size, long offset) { microcode_header_t *mc_header; @@ -523,7 +525,7 @@ static int cpu_request_microcode(int cpu) char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; - void *buf; + const u8 *buf; unsigned long size; long offset = 0; int error; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 8b6b1e05c30..3b25e49380c 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -726,12 +726,22 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) static struct intel_mp_floating *mpf_found; /* + * Machine specific quirk for finding the SMP config before other setup + * activities destroy the table: + */ +int (*mach_get_smp_config_quirk)(unsigned int early); + +/* * Scan the memory blocks for an SMP configuration block. */ -static void __init __get_smp_config(unsigned early) +static void __init __get_smp_config(unsigned int early) { struct intel_mp_floating *mpf = mpf_found; + if (mach_get_smp_config_quirk) { + if (mach_get_smp_config_quirk(early)) + return; + } if (acpi_lapic && early) return; /* @@ -889,10 +899,16 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, return 0; } -static void __init __find_smp_config(unsigned reserve) +int (*mach_find_smp_config_quirk)(unsigned int reserve); + +static void __init __find_smp_config(unsigned int reserve) { unsigned int address; + if (mach_find_smp_config_quirk) { + if (mach_find_smp_config_quirk(reserve)) + return; + } /* * FIXME: Linux assumes you have 640K of base ram.. * this continues the error... 
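The mach_get_smp_config_quirk/mach_find_smp_config_quirk pointers introduced above give a subarchitecture first refusal on MP-table discovery. A hypothetical platform would wire its hooks up early in boot along these lines (the myplat_* names are illustrative, not taken from this patch); returning nonzero tells the generic code to skip its own scan:

/* Illustrative only: a subarch claiming SMP-config discovery. */
static int myplat_find_smp_config(unsigned int reserve)
{
	/* locate/reserve the platform's private config table here */
	return 1;			/* nonzero: generic scan is skipped */
}

static int myplat_get_smp_config(unsigned int early)
{
	/* parse the platform's config instead of the MP table */
	return 1;
}

static void __init myplat_early_detect(void)
{
	mach_find_smp_config_quirk = myplat_find_smp_config;
	mach_get_smp_config_quirk = myplat_get_smp_config;
}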
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 1f3abe048e9..a153b3905f6 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -117,12 +117,20 @@ static int msr_open(struct inode *inode, struct file *file) { unsigned int cpu = iminor(file->f_path.dentry->d_inode); struct cpuinfo_x86 *c = &cpu_data(cpu); + int ret = 0; - if (cpu >= NR_CPUS || !cpu_online(cpu)) - return -ENXIO; /* No such CPU */ - if (!cpu_has(c, X86_FEATURE_MSR)) - return -EIO; /* MSR not supported */ + lock_kernel(); + cpu = iminor(file->f_path.dentry->d_inode); + if (cpu >= NR_CPUS || !cpu_online(cpu)) { + ret = -ENXIO; /* No such CPU */ + goto out; + } + c = &cpu_data(cpu); + if (!cpu_has(c, X86_FEATURE_MSR)) + ret = -EIO; /* MSR not supported */ +out: + unlock_kernel(); return ret; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 8dfe9db87a9..ec024b3baad 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -130,7 +130,7 @@ int __init check_nmi_watchdog(void) #ifdef CONFIG_SMP if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); #endif for_each_possible_cpu(cpu) @@ -171,6 +171,9 @@ int __init check_nmi_watchdog(void) error: if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259) disable_8259A_irq(0); +#ifdef CONFIG_X86_32 + timer_ack = 0; +#endif return -1; } @@ -269,7 +272,7 @@ static void __acpi_nmi_enable(void *__unused) void acpi_nmi_enable(void) { if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_enable, NULL, 1); } static void __acpi_nmi_disable(void *__unused) @@ -283,7 +286,7 @@ static void __acpi_nmi_disable(void *__unused) void acpi_nmi_disable(void) { if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_disable, NULL, 1); } void setup_apic_nmi_watchdog(void *unused) diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index f0f1de1c4a1..a23e8233b9a 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -93,12 +93,13 @@ int __init get_memcfg_numaq(void) return 1; } -static int __init numaq_tsc_disable(void) +void __init numaq_tsc_disable(void) { + if (!found_numaq) + return; + if (num_online_nodes() > 1) { printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); setup_clear_cpu_cap(X86_FEATURE_TSC); } - return 0; } -arch_initcall(numaq_tsc_disable); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e7e5652f65b..e0f571d58c1 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -285,7 +285,7 @@ struct pv_time_ops pv_time_ops = { .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, .sched_clock = native_sched_clock, - .get_cpu_khz = native_calculate_cpu_khz, + .get_tsc_khz = native_calibrate_tsc, }; struct pv_irq_ops pv_irq_ops = { diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index d0d18db5d2a..c3fe78406d1 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -630,6 +630,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) struct pci_dev *dev; void *gatt; int i, error; + unsigned long start_pfn, end_pfn; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n") ; aper_size = aper_base = info->aper_size = 0; @@ -674,6 +675,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info) printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
aper_base, aper_size>>10); + + /* need to map that range */ + end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + if (end_pfn > max_low_pfn_mapped) { + start_pfn = (aper_base>>PAGE_SHIFT); + init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); + } return 0; nommu: diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4061d63aabe..4d629c62f4f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -7,6 +7,12 @@ #include <linux/module.h> #include <linux/pm.h> #include <linux/clockchips.h> +#include <asm/system.h> + +unsigned long idle_halt; +EXPORT_SYMBOL(idle_halt); +unsigned long idle_nomwait; +EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; @@ -132,7 +138,7 @@ void cpu_idle_wait(void) { smp_mb(); /* kick all the CPUs so that they exit out of pm_idle */ - smp_call_function(do_nothing, NULL, 0, 1); + smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(cpu_idle_wait); @@ -325,7 +331,27 @@ static int __init idle_setup(char *str) pm_idle = poll_idle; } else if (!strcmp(str, "mwait")) force_mwait = 1; - else + else if (!strcmp(str, "halt")) { + /* + * When the boot option of idle=halt is added, halt is + * forced to be used for CPU idle. In such case CPU C2/C3 + * won't be used again. + * To continue to load the CPU idle driver, don't touch + * the boot_option_idle_override. + */ + pm_idle = default_idle; + idle_halt = 1; + return 0; + } else if (!strcmp(str, "nomwait")) { + /* + * If the boot option of "idle=nomwait" is added, + * it means that mwait will be disabled for CPU C2/C3 + * states. In such case it won't touch the variable + * of boot_option_idle_override. + */ + idle_nomwait = 1; + return 0; + } else return -1; boot_option_idle_override = 1; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9a139f6c9df..0c3927accb0 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -142,7 +142,10 @@ void cpu_idle(void) local_irq_disable(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; + /* Don't trace irqs off for idle */ + stop_critical_timings(); pm_idle(); + start_critical_timings(); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index db5eb963e4d..a8e53626ac9 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -134,7 +134,10 @@ void cpu_idle(void) */ local_irq_disable(); enter_idle(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); pm_idle(); + start_critical_timings(); /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. 
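The stop_critical_timings()/start_critical_timings() pair added around pm_idle() in both idle loops exists for the irqsoff tracer: idle runs with interrupts disabled until the halt instruction re-enables them, so without the bracketing every idle period would be reported as an enormous interrupts-off critical section. The resulting loop shape, condensed from the two hunks above:

/* Condensed idle-loop shape after this change. */
while (1) {
	local_irq_disable();
	stop_critical_timings();	/* exclude idle from irqsoff tracing */
	pm_idle();			/* e.g. safe_halt(): sti; hlt */
	start_critical_timings();
}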
*/ diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 79bdcd11c66..d1385881810 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -266,6 +266,8 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev) hpet_print_force_info(); } +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1, + old_ich_force_enable_hpet_user); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, old_ich_force_enable_hpet_user); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cfcfbefee0b..531b55b8e81 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -394,11 +394,10 @@ static void __init parse_setup_data(void) } } -static void __init reserve_setup_data(void) +static void __init e820_reserve_setup_data(void) { struct setup_data *data; u64 pa_data; - char buf[32]; int found = 0; if (boot_params.hdr.version < 0x0209) @@ -406,8 +405,6 @@ static void __init reserve_setup_data(void) pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_ioremap(pa_data, sizeof(*data)); - sprintf(buf, "setup data %x", data->type); - reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); e820_update_range(pa_data, sizeof(*data)+data->len, E820_RAM, E820_RESERVED_KERN); found = 1; @@ -418,10 +415,29 @@ static void __init reserve_setup_data(void) return; sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + memcpy(&e820_saved, &e820, sizeof(struct e820map)); printk(KERN_INFO "extended physical RAM map:\n"); e820_print_map("reserve setup_data"); } +static void __init reserve_early_setup_data(void) +{ + struct setup_data *data; + u64 pa_data; + char buf[32]; + + if (boot_params.hdr.version < 0x0209) + return; + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_ioremap(pa_data, sizeof(*data)); + sprintf(buf, "setup data %x", data->type); + reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); + pa_data = data->next; + early_iounmap(data, sizeof(*data)); + } +} + /* * --------- Crashkernel reservation ------------------------------ */ @@ -580,6 +596,7 @@ void __init setup_arch(char **cmdline_p) { #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); + visws_early_detect(); pre_setup_arch_hook(); early_cpu_init(); #else @@ -626,6 +643,8 @@ void __init setup_arch(char **cmdline_p) setup_memory_map(); parse_setup_data(); + /* update the e820_saved too */ + e820_reserve_setup_data(); copy_edd(); @@ -656,7 +675,7 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); /* after early param, so could get panic from serial */ - reserve_setup_data(); + reserve_early_setup_data(); if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC @@ -665,6 +684,11 @@ void __init setup_arch(char **cmdline_p) clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); } +#ifdef CONFIG_PCI + if (pci_early_dump_regs) + early_dump_pci_devices(); +#endif + finish_e820_parsing(); #ifdef CONFIG_X86_32 @@ -691,22 +715,18 @@ void __init setup_arch(char **cmdline_p) early_gart_iommu_check(); #endif - e820_register_active_regions(0, 0, -1UL); /* * partially used pages are not usable - thus * we are rounding upwards: */ - max_pfn = e820_end_of_ram(); + max_pfn = e820_end_of_ram_pfn(); /* preallocate 4k for mptable mpc */ early_reserve_e820_mpc_new(); /* update e820 for memory not covered by WB MTRRs */ mtrr_bp_init(); - if (mtrr_trim_uncached_memory(max_pfn)) { - 
remove_all_active_ranges(); - e820_register_active_regions(0, 0, -1UL); - max_pfn = e820_end_of_ram(); - } + if (mtrr_trim_uncached_memory(max_pfn)) + max_pfn = e820_end_of_ram_pfn(); #ifdef CONFIG_X86_32 /* max_low_pfn get updated here */ @@ -718,12 +738,26 @@ void __init setup_arch(char **cmdline_p) /* How many end-of-memory variables you have, grandma! */ /* need this before calling reserve_initrd */ - max_low_pfn = max_pfn; + if (max_pfn > (1UL<<(32 - PAGE_SHIFT))) + max_low_pfn = e820_end_of_low_ram_pfn(); + else + max_low_pfn = max_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; #endif /* max_pfn_mapped is updated here */ - max_pfn_mapped = init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT)); + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); + max_pfn_mapped = max_low_pfn_mapped; + +#ifdef CONFIG_X86_64 + if (max_pfn > max_low_pfn) { + max_pfn_mapped = init_memory_mapping(1UL<<32, + max_pfn<<PAGE_SHIFT); + /* can we preserve max_low_pfn? */ + max_low_pfn = max_pfn; + } +#endif /* * NOTE: On x86-32, only from this point on, fixmaps are ready for use. @@ -749,9 +783,6 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_table_init(); - /* Remove active ranges so rediscovery with NUMA-awareness happens */ - remove_all_active_ranges(); - #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. @@ -823,6 +854,14 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); #endif +#ifdef CONFIG_X86_NUMAQ + /* + * need to check online nodes num, call it + * here before time_init/tsc_init + */ + numaq_tsc_disable(); +#endif + init_apic_mappings(); ioapic_init_mappings(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5fc310f746f..cac68430d31 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -343,23 +343,23 @@ static const cpumask_t cpu_mask_none; /* * Returns a pointer to the bitmask of CPUs on Node 'node'. */ -cpumask_t *_node_to_cpumask_ptr(int node) +const cpumask_t *_node_to_cpumask_ptr(int node) { if (node_to_cpumask_map == NULL) { printk(KERN_WARNING "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", node); dump_stack(); - return &cpu_online_map; + return (const cpumask_t *)&cpu_online_map; } if (node >= nr_node_ids) { printk(KERN_WARNING "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n", node, nr_node_ids); dump_stack(); - return (cpumask_t *)&cpu_mask_none; + return &cpu_mask_none; } - return (cpumask_t *)&node_to_cpumask_map[node]; + return &node_to_cpumask_map[node]; } EXPORT_SYMBOL(_node_to_cpumask_ptr); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 0cb7aadc87c..361b7a4c640 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -121,132 +121,23 @@ static void native_smp_send_reschedule(int cpu) send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } -/* - * Structure and data for smp_call_function(). This is designed to minimise - * static memory requirements. It also looks cleaner.
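To make the new mapping split concrete: with 4 KB pages the pfn of the 4 GB boundary is 1UL << (32 - PAGE_SHIFT), so a machine with more RAM than that gets its direct map built in two init_memory_mapping() calls, one below 4 GB and one above. A small user-space arithmetic check (illustrative only, not kernel code):

#include <stdio.h>

int main(void)
{
	const int PAGE_SHIFT = 12;
	unsigned long max_pfn = 6UL << (30 - PAGE_SHIFT);	/* 6 GB box */
	unsigned long cutoff  = 1UL << (32 - PAGE_SHIFT);	/* pfn of 4 GB */

	if (max_pfn > cutoff)
		printf("map pfns 0x0-0x%lx, then 0x%lx-0x%lx\n",
		       cutoff, cutoff, max_pfn);
	return 0;
}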
- */ -static DEFINE_SPINLOCK(call_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - atomic_t started; - atomic_t finished; - int wait; -}; - -void lock_ipi_call_lock(void) +void native_send_call_func_single_ipi(int cpu) { - spin_lock_irq(&call_lock); -} - -void unlock_ipi_call_lock(void) -{ - spin_unlock_irq(&call_lock); -} - -static struct call_data_struct *call_data; - -static void __smp_call_function(void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - struct call_data_struct data; - int cpus = num_online_cpus() - 1; - - if (!cpus) - return; - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - mb(); - - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_allbutself(CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (wait) - while (atomic_read(&data.finished) != cpus) - cpu_relax(); + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR); } - -/** - * smp_call_function_mask(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on. Must not include the current cpu. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -static int -native_smp_call_function_mask(cpumask_t mask, - void (*func)(void *), void *info, - int wait) +void native_send_call_func_ipi(cpumask_t mask) { - struct call_data_struct data; cpumask_t allbutself; - int cpus; - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - /* Holding any lock stops cpus from going down. 
*/ - spin_lock(&call_lock); allbutself = cpu_online_map; cpu_clear(smp_processor_id(), allbutself); - cpus_and(mask, mask, allbutself); - cpus = cpus_weight(mask); - - if (!cpus) { - spin_unlock(&call_lock); - return 0; - } - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - wmb(); - - /* Send a message to other CPUs */ if (cpus_equal(mask, allbutself) && cpus_equal(cpu_online_map, cpu_callout_map)) send_IPI_allbutself(CALL_FUNCTION_VECTOR); else send_IPI_mask(mask, CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (wait) - while (atomic_read(&data.finished) != cpus) - cpu_relax(); - spin_unlock(&call_lock); - - return 0; } static void stop_this_cpu(void *dummy) @@ -268,18 +159,13 @@ static void stop_this_cpu(void *dummy) static void native_smp_send_stop(void) { - int nolock; unsigned long flags; if (reboot_force) return; - /* Don't deadlock on the call lock in panic */ - nolock = !spin_trylock(&call_lock); + smp_call_function(stop_this_cpu, NULL, 0); local_irq_save(flags); - __smp_call_function(stop_this_cpu, NULL, 0, 0); - if (!nolock) - spin_unlock(&call_lock); disable_local_APIC(); local_irq_restore(flags); } @@ -301,33 +187,28 @@ void smp_reschedule_interrupt(struct pt_regs *regs) void smp_call_function_interrupt(struct pt_regs *regs) { - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; - ack_APIC_irq(); - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(&call_data->started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ irq_enter(); - (*func)(info); + generic_smp_call_function_interrupt(); #ifdef CONFIG_X86_32 __get_cpu_var(irq_stat).irq_call_count++; #else add_pda(irq_call_count, 1); #endif irq_exit(); +} - if (wait) { - mb(); - atomic_inc(&call_data->finished); - } +void smp_call_function_single_interrupt(struct pt_regs *regs) +{ + ack_APIC_irq(); + irq_enter(); + generic_smp_call_function_single_interrupt(); +#ifdef CONFIG_X86_32 + __get_cpu_var(irq_stat).irq_call_count++; +#else + add_pda(irq_call_count, 1); +#endif + irq_exit(); } struct smp_ops smp_ops = { @@ -338,7 +219,8 @@ struct smp_ops smp_ops = { .smp_send_stop = native_smp_send_stop, .smp_send_reschedule = native_smp_send_reschedule, - .smp_call_function_mask = native_smp_call_function_mask, + + .send_call_func_ipi = native_send_call_func_ipi, + .send_call_func_single_ipi = native_send_call_func_single_ipi, }; EXPORT_SYMBOL_GPL(smp_ops); - diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e1200b202ed..687376ab07e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -327,12 +327,12 @@ static void __cpuinit start_secondary(void *unused) * lock helps us to not include this cpu in a currently in progress * smp_call_function(). */ - lock_ipi_call_lock(); + ipi_call_lock_irq(); #ifdef CONFIG_X86_IO_APIC setup_vector_irq(smp_processor_id()); #endif cpu_set(smp_processor_id(), cpu_online_map); - unlock_ipi_call_lock(); + ipi_call_unlock_irq(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; setup_secondary_clock(); @@ -939,9 +939,9 @@ do_rest: inquire_remote_apic(apicid); } } - +#ifdef CONFIG_X86_64 restore_state: - +#endif if (boot_error) { /* Try to put things back the way they were before ... 
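With the private call_lock/call_data machinery deleted, x86 now relies on the generic cross-call helpers in kernel/smp.c and only supplies the IPI senders; callers throughout this series likewise lose the unused nonatomic argument. A minimal usage sketch of the post-change API, assuming the generic smp_call_function() that this patch converts x86 to:

static void bump(void *info)
{
	atomic_inc(info);		/* runs on each targeted CPU */
}

static atomic_t hits = ATOMIC_INIT(0);

static void example(void)	/* must not be called with irqs disabled */
{
	/* before this series: smp_call_function(bump, &hits, 0, 1); */
	smp_call_function(bump, &hits, 1);	/* all others, wait == 1 */
	on_each_cpu(bump, &hits, 1);		/* all CPUs including self */
	smp_call_function_single(0, bump, &hits, 1);	/* just CPU 0 */
}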
*/ numa_remove_cpu(cpu); /* was set by numa_add_cpu */ diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c index 3449064d141..99941b37eca 100644 --- a/arch/x86/kernel/smpcommon.c +++ b/arch/x86/kernel/smpcommon.c @@ -25,59 +25,3 @@ __cpuinit void init_gdt(int cpu) per_cpu(cpu_number, cpu) = cpu; } #endif - -/** - * smp_call_function(): Run a function on all other CPUs. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Unused. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -int smp_call_function(void (*func) (void *info), void *info, int nonatomic, - int wait) -{ - return smp_call_function_mask(cpu_online_map, func, info, wait); -} -EXPORT_SYMBOL(smp_call_function); - -/** - * smp_call_function_single - Run a function on a specific CPU - * @cpu: The target CPU. Cannot be the calling CPU. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Unused. - * @wait: If true, wait until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - */ -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - /* prevent preemption and reschedule on another processor */ - int ret; - int me = get_cpu(); - if (cpu == me) { - local_irq_disable(); - func(info); - local_irq_enable(); - put_cpu(); - return 0; - } - - ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); - - put_cpu(); - return ret; -} -EXPORT_SYMBOL(smp_call_function_single); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index c28c342c162..a03e7f6d90c 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -74,6 +74,7 @@ void save_stack_trace(struct stack_trace *trace) if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } +EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { @@ -81,3 +82,4 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } +EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 5f29f12da50..059ca6ee59b 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -39,9 +39,6 @@ #include "do_timer.h" -unsigned int cpu_khz; /* Detected as we calibrate the TSC */ -EXPORT_SYMBOL(cpu_khz); - int timer_ack; unsigned long profile_pc(struct pt_regs *regs) diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 39ae8511a13..e3d49c553af 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -56,7 +56,7 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id) /* calibrate_cpu is used on systems with fixed rate TSCs to determine * processor frequency */ #define 
TICK_COUNT 100000000 -unsigned long __init native_calculate_cpu_khz(void) +unsigned long __init calibrate_cpu(void) { int tsc_start, tsc_now; int i, no_ctr_free; @@ -116,25 +116,11 @@ void __init hpet_time_init(void) void __init time_init(void) { - tsc_calibrate(); - - cpu_khz = tsc_khz; - if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && - (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) - cpu_khz = calculate_cpu_khz(); - - lpj_fine = ((unsigned long)tsc_khz * 1000)/HZ; - - if (unsynchronized_tsc()) - mark_tsc_unstable("TSCs unsynchronized"); - + tsc_init(); if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) vgetcpu_mode = VGETCPU_RDTSCP; else vgetcpu_mode = VGETCPU_LSL; - printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - init_tsc_clocksource(); late_time_init = choose_time_init(); } diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index 9bb2363851a..fec1ecedc9b 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -238,6 +238,6 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); + on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index 5039d0f097a..dcbf7a1159e 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -275,5 +275,5 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); + on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index d7cc292691f..8a768973c4f 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1,5 +1,6 @@ /* * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 @@ -60,8 +61,6 @@ #include "mach_traps.h" -int panic_on_unrecovered_nmi; - DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); @@ -98,19 +97,22 @@ asmlinkage void alignment_check(void); asmlinkage void spurious_interrupt_bug(void); asmlinkage void machine_check(void); +int panic_on_unrecovered_nmi; int kstack_depth_to_print = 24; static unsigned int code_bytes = 64; +static int ignore_nmis; +static int die_counter; void printk_address(unsigned long address, int reliable) { #ifdef CONFIG_KALLSYMS - char namebuf[KSYM_NAME_LEN]; unsigned long offset = 0; unsigned long symsize; const char *symname; - char reliab[4] = ""; - char *delim = ":"; char *modname; + char *delim = ":"; + char namebuf[KSYM_NAME_LEN]; + char reliab[4] = ""; symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); @@ -130,22 +132,23 @@ void printk_address(unsigned long address, int reliable) #endif } -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size) +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size) { - return p > (void *)tinfo && - p <= (void *)tinfo + THREAD_SIZE - size; + void *t = tinfo; + return p > t && p <= t + THREAD_SIZE - size; } /* The form of the top of the frame on the stack */ struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; + struct stack_frame *next_frame; + unsigned long return_address; }; static inline unsigned long print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) + unsigned long *stack, unsigned 
long bp, + const struct stacktrace_ops *ops, void *data) { struct stack_frame *frame = (struct stack_frame *)bp; @@ -167,8 +170,6 @@ print_context_stack(struct thread_info *tinfo, return bp; } -#define MSG(msg) ops->warning(data, msg) - void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) @@ -178,7 +179,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!stack) { unsigned long dummy; - stack = &dummy; if (task != current) stack = (unsigned long *)task->thread.sp; @@ -196,7 +196,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, } #endif - while (1) { + for (;;) { struct thread_info *context; context = (struct thread_info *) @@ -248,10 +248,10 @@ static void print_trace_address(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, }; static void @@ -351,15 +351,14 @@ void show_registers(struct pt_regs *regs) printk(KERN_EMERG "Code: "); ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { /* try starting at EIP */ ip = (u8 *)regs->ip; code_len = code_len - code_prologue + 1; } for (i = 0; i < code_len; i++, ip++) { if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { + probe_kernel_address(ip, c)) { printk(" Bad EIP value."); break; } @@ -384,8 +383,6 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } -static int die_counter; - int __kprobes __die(const char *str, struct pt_regs *regs, long err) { unsigned short ss; @@ -402,26 +399,22 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) { - - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - sp = (unsigned long) (&regs->sp); - savesegment(ss, ss); - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss & 0xffff; - } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; - return 0; + show_registers(regs); + /* Executive summary in case the oops scrolled away */ + sp = (unsigned long) (&regs->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; } - - return 1; + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); + return 0; } /* @@ -546,7 +539,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ { \ trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + == NOTIFY_STOP) \ return; \ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -562,7 +555,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + == NOTIFY_STOP) \ return; \ do_trap(trapnr, signr, str, 0, regs, error_code, &info);
\ } @@ -571,7 +564,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ void do_##name(struct pt_regs *regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + == NOTIFY_STOP) \ return; \ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ } @@ -586,27 +579,29 @@ void do_##name(struct pt_regs *regs, long error_code) \ info.si_addr = (void __user *)siaddr; \ trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + == NOTIFY_STOP) \ return; \ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ } -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) #ifndef CONFIG_KPROBES DO_VM86_ERROR(3, SIGTRAP, "int3", int3) #endif DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) -void __kprobes do_general_protection(struct pt_regs *regs, long error_code) +void __kprobes +do_general_protection(struct pt_regs *regs, long error_code) { + struct task_struct *tsk; struct thread_struct *thread; struct tss_struct *tss; int cpu; @@ -647,23 +642,24 @@ void __kprobes do_general_protection(struct pt_regs *regs, long error_code) if (regs->flags & X86_VM_MASK) goto gp_in_vm86; + tsk = current; if (!user_mode(regs)) goto gp_in_kernel; - current->thread.error_code = error_code; - current->thread.trap_no = 13; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; - if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && - printk_ratelimit()) { + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", - current->comm, task_pid_nr(current), - regs->ip, regs->sp, error_code); + "%s[%d] general protection ip:%lx sp:%lx error:%lx", + tsk->comm, task_pid_nr(tsk), + regs->ip, regs->sp, error_code); print_vma_addr(" in ", regs->ip); printk("\n"); } - force_sig(SIGSEGV, current); + force_sig(SIGSEGV, tsk); return; gp_in_vm86: @@ -672,14 +668,15 @@ gp_in_vm86: return; gp_in_kernel: - if (!fixup_exception(regs)) { - current->thread.error_code = error_code; - current->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, + if (fixup_exception(regs)) + return; + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); - } + return; + die("general protection fault", regs, error_code); } static notrace __kprobes void @@ -792,14 +789,17 @@ void notrace __kprobes die_nmi(char 
*str, struct pt_regs *regs, int do_panic) static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; + int cpu; + + cpu = smp_processor_id(); - /* Only the BSP gets external NMIs from the system: */ - if (!smp_processor_id()) + /* Only the BSP gets external NMIs from the system. */ + if (!cpu) reason = get_nmi_reason(); if (!(reason & 0xc0)) { if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) + == NOTIFY_STOP) return; #ifdef CONFIG_X86_LOCAL_APIC /* @@ -808,7 +808,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) */ if (nmi_watchdog_tick(regs, reason)) return; - if (!do_nmi_callback(regs, smp_processor_id())) + if (!do_nmi_callback(regs, cpu)) unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); @@ -818,6 +818,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; + + /* AK: following checks seem to be broken on modern chipsets. FIXME */ if (reason & 0x80) mem_parity_error(reason, regs); if (reason & 0x40) @@ -829,8 +831,6 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) reassert_nmi(); } -static int ignore_nmis; - notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { int cpu; @@ -915,7 +915,7 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) tsk->thread.debugctlmsr = 0; if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) + SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ if (regs->flags & X86_EFLAGS_IF) @@ -976,9 +976,8 @@ clear_TF_reenable: void math_error(void __user *ip) { struct task_struct *task; - unsigned short cwd; - unsigned short swd; siginfo_t info; + unsigned short cwd, swd; /* * Save the info for the exception handler and clear the error. @@ -997,7 +996,7 @@ void math_error(void __user *ip) * C1 reg you need in case of a stack fault, 0x040 is the stack * fault bit. We should only be taking one exception at a time, * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't syncronizing its FPU usage + * then we have a bad program that isn't synchronizing its FPU usage * and it will suffer the consequences since we won't be able to * fully reproduce the context of the exception */ @@ -1006,7 +1005,7 @@ void math_error(void __user *ip) switch (swd & ~cwd & 0x3f) { case 0x000: /* No unmasked exception */ return; - default: /* Multiple exceptions */ + default: /* Multiple exceptions */ break; case 0x001: /* Invalid Op */ /* @@ -1042,8 +1041,8 @@ void do_coprocessor_error(struct pt_regs *regs, long error_code) static void simd_math_error(void __user *ip) { struct task_struct *task; - unsigned short mxcsr; siginfo_t info; + unsigned short mxcsr; /* * Save the info for the exception handler and clear the error. 
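A note on the swd & ~cwd & 0x3f test used by math_error() here: the low six bits of the FPU status word are the pending exception flags (IE/DE/ZE/OE/UE/PE), and the same bit positions in the control word are their mask bits, so the expression isolates exceptions that are both pending and unmasked. A worked example with illustrative register values:

/* Worked example of the decode (values are illustrative). */
unsigned short cwd = 0x037e;	/* everything masked except IE (bit 0) */
unsigned short swd = 0x0041;	/* IE pending + stack fault (SF, 0x40) */
unsigned short ex  = swd & ~cwd & 0x3f;	/* == 0x001 -> FPE_FLTINV */
/* and (swd & 0x240) == 0x040 would then indicate stack underflow */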
@@ -1198,16 +1197,16 @@ void __init trap_init(void) early_iounmap(p, 4); #endif - set_trap_gate(0, &divide_error); - set_intr_gate(1, &debug); - set_intr_gate(2, &nmi); - set_system_intr_gate(3, &int3); /* int3/4 can be called from all */ - set_system_gate(4, &overflow); - set_trap_gate(5, &bounds); - set_trap_gate(6, &invalid_op); - set_trap_gate(7, &device_not_available); - set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); - set_trap_gate(9, &coprocessor_segment_overrun); + set_trap_gate(0, &divide_error); + set_intr_gate(1, &debug); + set_intr_gate(2, &nmi); + set_system_intr_gate(3, &int3); /* int3 can be called from all */ + set_system_gate(4, &overflow); /* int4 can be called from all */ + set_trap_gate(5, &bounds); + set_trap_gate(6, &invalid_op); + set_trap_gate(7, &device_not_available); + set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); + set_trap_gate(9, &coprocessor_segment_overrun); set_trap_gate(10, &invalid_TSS); set_trap_gate(11, &segment_not_present); set_trap_gate(12, &stack_segment); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 80ba6d37bfe..2696a683778 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -10,49 +10,49 @@ * 'Traps.c' handles hardware traps and faults after we have saved some * state in 'entry.S'. */ -#include <linux/sched.h> +#include <linux/moduleparam.h> +#include <linux/interrupt.h> +#include <linux/kallsyms.h> +#include <linux/spinlock.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/utsname.h> +#include <linux/kdebug.h> #include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ptrace.h> #include <linux/string.h> +#include <linux/unwind.h> +#include <linux/delay.h> #include <linux/errno.h> -#include <linux/ptrace.h> +#include <linux/kexec.h> +#include <linux/sched.h> #include <linux/timer.h> -#include <linux/mm.h> #include <linux/init.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/interrupt.h> -#include <linux/kallsyms.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/nmi.h> -#include <linux/kprobes.h> -#include <linux/kexec.h> -#include <linux/unwind.h> -#include <linux/uaccess.h> #include <linux/bug.h> -#include <linux/kdebug.h> -#include <linux/utsname.h> - -#include <mach_traps.h> +#include <linux/nmi.h> +#include <linux/mm.h> #if defined(CONFIG_EDAC) #include <linux/edac.h> #endif -#include <asm/system.h> -#include <asm/io.h> -#include <asm/atomic.h> +#include <asm/stacktrace.h> +#include <asm/processor.h> #include <asm/debugreg.h> +#include <asm/atomic.h> +#include <asm/system.h> +#include <asm/unwind.h> #include <asm/desc.h> #include <asm/i387.h> -#include <asm/processor.h> -#include <asm/unwind.h> +#include <asm/nmi.h> #include <asm/smp.h> +#include <asm/io.h> #include <asm/pgalloc.h> -#include <asm/pda.h> #include <asm/proto.h> -#include <asm/nmi.h> -#include <asm/stacktrace.h> +#include <asm/pda.h> + +#include <mach_traps.h> asmlinkage void divide_error(void); asmlinkage void debug(void); @@ -72,12 +72,14 @@ asmlinkage void page_fault(void); asmlinkage void coprocessor_error(void); asmlinkage void simd_coprocessor_error(void); asmlinkage void alignment_check(void); -asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); +asmlinkage void machine_check(void); int panic_on_unrecovered_nmi; +int kstack_depth_to_print = 12; static unsigned int code_bytes = 64; -static unsigned ignore_nmis; +static int ignore_nmis; +static int die_counter; static inline void conditional_sti(struct pt_regs
*regs) { @@ -101,34 +103,9 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) dec_preempt_count(); } -int kstack_depth_to_print = 12; - void printk_address(unsigned long address, int reliable) { -#ifdef CONFIG_KALLSYMS - unsigned long offset = 0, symsize; - const char *symname; - char *modname; - char *delim = ":"; - char namebuf[KSYM_NAME_LEN]; - char reliab[4] = ""; - - symname = kallsyms_lookup(address, &symsize, &offset, - &modname, namebuf); - if (!symname) { - printk(" [<%016lx>]\n", address); - return; - } - if (!reliable) - strcpy(reliab, "? "); - - if (!modname) - modname = delim = ""; - printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n", - address, reliab, delim, modname, delim, symname, offset, symsize); -#else - printk(" [<%016lx>]\n", address); -#endif + printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address); } static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, @@ -205,8 +182,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, return NULL; } -#define MSG(txt) ops->warning(data, txt) - /* * x86-64 can have up to three kernel stacks: * process stack @@ -233,11 +208,11 @@ struct stack_frame { unsigned long return_address; }; - -static inline unsigned long print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) +static inline unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end) { struct stack_frame *frame = (struct stack_frame *)bp; @@ -259,7 +234,7 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo, return bp; } -void dump_trace(struct task_struct *tsk, struct pt_regs *regs, +void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { @@ -268,36 +243,34 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned used = 0; struct thread_info *tinfo; - if (!tsk) - tsk = current; - tinfo = task_thread_info(tsk); + if (!task) + task = current; if (!stack) { unsigned long dummy; stack = &dummy; - if (tsk && tsk != current) - stack = (unsigned long *)tsk->thread.sp; + if (task && task != current) + stack = (unsigned long *)task->thread.sp; } #ifdef CONFIG_FRAME_POINTER if (!bp) { - if (tsk == current) { + if (task == current) { /* Grab bp right from our regs */ - asm("movq %%rbp, %0" : "=r" (bp):); + asm("movq %%rbp, %0" : "=r" (bp) :); } else { /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) tsk->thread.sp; + bp = *(unsigned long *) task->thread.sp; } } #endif - - /* * Print function call entries in all stacks, starting at the * current stack address. 
If the stacks consist of nested * exceptions */ + tinfo = task_thread_info(task); for (;;) { char *id; unsigned long *estack_end; @@ -382,18 +355,17 @@ static const struct stacktrace_ops print_trace_ops = { .address = print_trace_address, }; -void -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, - unsigned long bp) +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) { printk("\nCall Trace:\n"); - dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL); + dump_trace(task, regs, stack, bp, &print_trace_ops, NULL); printk("\n"); } static void -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, - unsigned long bp) +_show_stack(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp) { unsigned long *stack; int i; @@ -405,14 +377,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, // back trace for this cpu. if (sp == NULL) { - if (tsk) - sp = (unsigned long *)tsk->thread.sp; + if (task) + sp = (unsigned long *)task->thread.sp; else sp = (unsigned long *)&sp; } stack = sp; - for(i=0; i < kstack_depth_to_print; i++) { + for (i = 0; i < kstack_depth_to_print; i++) { if (stack >= irqstack && stack <= irqstack_end) { if (stack == irqstack_end) { stack = (unsigned long *) (irqstack_end[-1]); @@ -427,12 +399,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, printk(" %016lx", *stack++); touch_nmi_watchdog(); } - show_trace(tsk, regs, sp, bp); + show_trace(task, regs, sp, bp); } -void show_stack(struct task_struct *tsk, unsigned long * sp) +void show_stack(struct task_struct *task, unsigned long *sp) { - _show_stack(tsk, NULL, sp, 0); + _show_stack(task, NULL, sp, 0); } /* @@ -440,8 +412,8 @@ void show_stack(struct task_struct *tsk, unsigned long * sp) */ void dump_stack(void) { - unsigned long dummy; unsigned long bp = 0; + unsigned long stack; #ifdef CONFIG_FRAME_POINTER if (!bp) @@ -453,7 +425,7 @@ void dump_stack(void) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(NULL, NULL, &dummy, bp); + show_trace(NULL, NULL, &stack, bp); } EXPORT_SYMBOL(dump_stack); @@ -464,12 +436,8 @@ void show_registers(struct pt_regs *regs) unsigned long sp; const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; - u8 *ip; - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; sp = regs->sp; - ip = (u8 *) regs->ip - code_prologue; printk("CPU %d ", cpu); __show_regs(regs); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", @@ -480,15 +448,21 @@ void show_registers(struct pt_regs *regs) * time of the fault.. 
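For the "Code:" dump logic in show_registers(): code_prologue = code_bytes * 43 / 64 places roughly two thirds of the dumped window before the faulting instruction, and probe_kernel_address() is what keeps the dump itself from faulting on unmapped text. With the default code_bytes of 64 the numbers work out as follows:

/* Worked numbers for the default dump window. */
unsigned int code_bytes    = 64;
unsigned int code_prologue = code_bytes * 43 / 64;	/* 43 bytes before RIP */
unsigned int code_rest     = code_bytes - code_prologue;	/* 21 from RIP on */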
*/ if (!user_mode(regs)) { + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; unsigned char c; + u8 *ip; + printk("Stack: "); _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); printk("\n"); printk(KERN_EMERG "Code: "); + + ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { /* try starting at RIP */ - ip = (u8 *) regs->ip; + ip = (u8 *)regs->ip; code_len = code_len - code_prologue + 1; } for (i = 0; i < code_len; i++, ip++) { @@ -504,7 +478,7 @@ void show_registers(struct pt_regs *regs) } } printk("\n"); -} +} int is_valid_bugaddr(unsigned long ip) { @@ -562,10 +536,9 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) do_exit(signr); } -int __kprobes __die(const char * str, struct pt_regs * regs, long err) +int __kprobes __die(const char *str, struct pt_regs *regs, long err) { - static int die_counter; - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); #endif @@ -576,8 +549,10 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) return 1; + show_registers(regs); add_taint(TAINT_DIE); /* Executive summary in case the oops scrolled away */ @@ -589,7 +564,7 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err) return 0; } -void die(const char * str, struct pt_regs * regs, long err) +void die(const char *str, struct pt_regs *regs, long err) { unsigned long flags = oops_begin(); @@ -606,8 +581,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) { unsigned long flags; - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == - NOTIFY_STOP) + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) return; flags = oops_begin(); @@ -629,44 +603,44 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) do_exit(SIGBUS); } -static void __kprobes do_trap(int trapnr, int signr, char *str, - struct pt_regs * regs, long error_code, - siginfo_t *info) +static void __kprobes +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, + long error_code, siginfo_t *info) { struct task_struct *tsk = current; - if (user_mode(regs)) { - /* - * We want error_code and trap_no set for userspace - * faults and kernelspace faults which result in - * die(), but not kernelspace faults which are fixed - * up. die() gives the process no chance to handle - * the signal and notice the kernel fault information, - * so that won't result in polluting the information - * about previously queued, but not yet delivered, - * faults. See also do_general_protection below. 
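The do_trap() restructure above keeps the old semantics behind a flatter layout: user-mode traps record error_code/trap_no on the task and deliver a signal, while kernel-mode traps first try the exception table and only die() when nothing fixes them up. Condensed control flow:

/* Condensed control flow of do_trap() after this change. */
if (user_mode(regs)) {
	tsk->thread.error_code = error_code;	/* for the signal handler */
	tsk->thread.trap_no = trapnr;
	force_sig(signr, tsk);			/* or force_sig_info() */
	return;
}
if (fixup_exception(regs))	/* __ex_table entry, e.g. a *_user() access */
	return;
die(str, regs, error_code);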
- */ - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - - if (show_unhandled_signals && unhandled_signal(tsk, signr) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } + if (!user_mode(regs)) + goto kernel_trap; - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; + /* + * We want error_code and trap_no set for userspace faults and + * kernelspace faults which result in die(), but not + * kernelspace faults which are fixed up. die() gives the + * process no chance to handle the signal and notice the + * kernel fault information, so that won't result in polluting + * the information about previously queued, but not yet + * delivered, faults. See also do_general_protection below. + */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); } + if (info) + force_sig_info(signr, info, tsk); + else + force_sig(signr, tsk); + return; +kernel_trap: if (!fixup_exception(regs)) { tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; @@ -676,38 +650,38 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, } #define DO_ERROR(trapnr, signr, str, name) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ + do_trap(trapnr, signr, str, regs, error_code, NULL); \ } -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + trace_hardirqs_fixup(); \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ + do_trap(trapnr, signr, str, regs, error_code, &info); \ } -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_ERROR( 4, SIGSEGV, "overflow", overflow) -DO_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_ERROR(4, SIGSEGV, "overflow", overflow) +DO_ERROR(5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(6, SIGILL, "invalid 
opcode", invalid_op, ILL_ILLOPN, regs->ip) +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) /* Runs on IST stack */ @@ -738,31 +712,34 @@ asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) die(str, regs, error_code); } -asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, - long error_code) +asmlinkage void __kprobes +do_general_protection(struct pt_regs *regs, long error_code) { - struct task_struct *tsk = current; + struct task_struct *tsk; conditional_sti(regs); - if (user_mode(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } + tsk = current; + if (!user_mode(regs)) + goto gp_in_kernel; - force_sig(SIGSEGV, tsk); - return; - } + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] general protection ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + + force_sig(SIGSEGV, tsk); + return; + +gp_in_kernel: if (fixup_exception(regs)) return; @@ -775,14 +752,14 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, } static notrace __kprobes void -mem_parity_error(unsigned char reason, struct pt_regs * regs) +mem_parity_error(unsigned char reason, struct pt_regs *regs) { printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", reason); printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); #if defined(CONFIG_EDAC) - if(edac_handler_set()) { + if (edac_handler_set()) { edac_atomic_assert_error(); return; } @@ -799,7 +776,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) } static notrace __kprobes void -io_check_error(unsigned char reason, struct pt_regs * regs) +io_check_error(unsigned char reason, struct pt_regs *regs) { printk("NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); @@ -829,14 +806,14 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs) /* Runs on IST stack. This code must keep interrupts off all the time. Nested NMIs are prevented by the CPU. */ -asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) +asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int cpu; cpu = smp_processor_id(); - /* Only the BSP gets external NMIs from the system. */ + /* Only the BSP gets external NMIs from the system. */ if (!cpu) reason = get_nmi_reason(); @@ -848,18 +825,17 @@ asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. 
*/ - if (nmi_watchdog_tick(regs,reason)) + if (nmi_watchdog_tick(regs, reason)) return; - if (!do_nmi_callback(regs,cpu)) + if (!do_nmi_callback(regs, cpu)) unknown_nmi_error(reason, regs); return; } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; + return; /* AK: following checks seem to be broken on modern chipsets. FIXME */ - if (reason & 0x80) mem_parity_error(reason, regs); if (reason & 0x40) @@ -870,9 +846,12 @@ asmlinkage notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { nmi_enter(); + add_pda(__nmi_count, 1); + if (!ignore_nmis) default_do_nmi(regs); + nmi_exit(); } @@ -889,13 +868,14 @@ void restart_nmi(void) } /* runs on IST stack. */ -asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) +asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { trace_hardirqs_fixup(); - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) return; - } + preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); @@ -926,8 +906,8 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) asmlinkage void __kprobes do_debug(struct pt_regs * regs, unsigned long error_code) { - unsigned long condition; struct task_struct *tsk = current; + unsigned long condition; siginfo_t info; trace_hardirqs_fixup(); @@ -948,21 +928,19 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) { + if (!tsk->thread.debugreg7) goto clear_dr7; - } } tsk->thread.debugreg6 = condition; - /* * Single-stepping through TF: make sure we ignore any events in * kernel space (but re-enable TF when returning to user mode). 
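 * (clear_TF_reenable below sets TIF_SINGLESTEP and clears TF in the
 * saved flags, so the exit-to-user path can re-arm the trap flag for
 * the user context while the kernel itself runs with TF off.)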
*/ if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; + if (!user_mode(regs)) + goto clear_TF_reenable; } /* Ok, finally something we can handle */ @@ -975,7 +953,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, force_sig_info(SIGTRAP, &info, tsk); clear_dr7: - set_debugreg(0UL, 7); + set_debugreg(0, 7); preempt_conditional_cli(regs); return; @@ -983,6 +961,7 @@ clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; preempt_conditional_cli(regs); + return; } static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) @@ -1005,7 +984,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) asmlinkage void do_coprocessor_error(struct pt_regs *regs) { void __user *ip = (void __user *)(regs->ip); - struct task_struct * task; + struct task_struct *task; siginfo_t info; unsigned short cwd, swd; @@ -1038,30 +1017,30 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); switch (swd & ~cwd & 0x3f) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; + case 0x000: /* No unmasked exception */ + default: /* Multiple exceptions */ + break; + case 0x001: /* Invalid Op */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; } force_sig_info(SIGFPE, &info, task); } @@ -1074,7 +1053,7 @@ asmlinkage void bad_intr(void) asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) { void __user *ip = (void __user *)(regs->ip); - struct task_struct * task; + struct task_struct *task; siginfo_t info; unsigned short mxcsr; @@ -1102,25 +1081,25 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) */ mxcsr = get_fpu_mxcsr(task); switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; } force_sig_info(SIGFPE, &info, 
task); } @@ -1138,7 +1117,7 @@ asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) } /* - * 'math_state_restore()' saves the current math information in the + * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task * * Careful.. There are problems with IBM-designed IRQ13 behaviour. @@ -1163,7 +1142,7 @@ asmlinkage void math_state_restore(void) local_irq_disable(); } - clts(); /* Allow maths ops (or we recurse) */ + clts(); /* Allow maths ops (or we recurse) */ restore_fpu_checking(&me->thread.xstate->fxsave); task_thread_info(me)->status |= TS_USEDFPU; me->fpu_counter++; @@ -1172,64 +1151,61 @@ EXPORT_SYMBOL_GPL(math_state_restore); void __init trap_init(void) { - set_intr_gate(0,÷_error); - set_intr_gate_ist(1,&debug,DEBUG_STACK); - set_intr_gate_ist(2,&nmi,NMI_STACK); - set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */ - set_system_gate(4,&overflow); /* int4 can be called from all */ - set_intr_gate(5,&bounds); - set_intr_gate(6,&invalid_op); - set_intr_gate(7,&device_not_available); - set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK); - set_intr_gate(9,&coprocessor_segment_overrun); - set_intr_gate(10,&invalid_TSS); - set_intr_gate(11,&segment_not_present); - set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK); - set_intr_gate(13,&general_protection); - set_intr_gate(14,&page_fault); - set_intr_gate(15,&spurious_interrupt_bug); - set_intr_gate(16,&coprocessor_error); - set_intr_gate(17,&alignment_check); + set_intr_gate(0, ÷_error); + set_intr_gate_ist(1, &debug, DEBUG_STACK); + set_intr_gate_ist(2, &nmi, NMI_STACK); + set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */ + set_system_gate(4, &overflow); /* int4 can be called from all */ + set_intr_gate(5, &bounds); + set_intr_gate(6, &invalid_op); + set_intr_gate(7, &device_not_available); + set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); + set_intr_gate(9, &coprocessor_segment_overrun); + set_intr_gate(10, &invalid_TSS); + set_intr_gate(11, &segment_not_present); + set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); + set_intr_gate(13, &general_protection); + set_intr_gate(14, &page_fault); + set_intr_gate(15, &spurious_interrupt_bug); + set_intr_gate(16, &coprocessor_error); + set_intr_gate(17, &alignment_check); #ifdef CONFIG_X86_MCE - set_intr_gate_ist(18,&machine_check, MCE_STACK); + set_intr_gate_ist(18, &machine_check, MCE_STACK); #endif - set_intr_gate(19,&simd_coprocessor_error); + set_intr_gate(19, &simd_coprocessor_error); #ifdef CONFIG_IA32_EMULATION set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); #endif - /* * initialize the per thread extended state: */ - init_thread_xstate(); + init_thread_xstate(); /* - * Should be a barrier for any external CPU state. 
+ * Should be a barrier for any external CPU state: */ cpu_init(); } - static int __init oops_setup(char *s) -{ +{ if (!s) return -EINVAL; if (!strcmp(s, "panic")) panic_on_oops = 1; return 0; -} +} early_param("oops", oops_setup); static int __init kstack_setup(char *s) { if (!s) return -EINVAL; - kstack_depth_to_print = simple_strtoul(s,NULL,0); + kstack_depth_to_print = simple_strtoul(s, NULL, 0); return 0; } early_param("kstack", kstack_setup); - static int __init code_bytes_setup(char *s) { code_bytes = simple_strtoul(s, NULL, 0); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c new file mode 100644 index 00000000000..7603c055390 --- /dev/null +++ b/arch/x86/kernel/tsc.c @@ -0,0 +1,535 @@ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/timer.h> +#include <linux/acpi_pmtmr.h> +#include <linux/cpufreq.h> +#include <linux/dmi.h> +#include <linux/delay.h> +#include <linux/clocksource.h> +#include <linux/percpu.h> + +#include <asm/hpet.h> +#include <asm/timer.h> +#include <asm/vgtod.h> +#include <asm/time.h> +#include <asm/delay.h> + +unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +EXPORT_SYMBOL(cpu_khz); +unsigned int tsc_khz; +EXPORT_SYMBOL(tsc_khz); + +/* + * TSC can be unstable due to cpufreq or due to unsynced TSCs + */ +static int tsc_unstable; + +/* native_sched_clock() is called before tsc_init(), so + we must start with the TSC soft disabled to prevent + erroneous rdtsc usage on !cpu_has_tsc processors */ +static int tsc_disabled = -1; + +/* + * Scheduler clock - returns current time in nanosec units. + */ +u64 native_sched_clock(void) +{ + u64 this_offset; + + /* + * Fall back to jiffies if there's no TSC available: + * ( But note that we still use it if the TSC is marked + * unstable. We do this because unlike Time Of Day, + * the scheduler clock tolerates small errors and it's + * very important for it to be as fast as the platform + * can achive it. ) + */ + if (unlikely(tsc_disabled)) { + /* No locking but a rare wrong value is not a big deal: */ + return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); + } + + /* read the Time Stamp Counter: */ + rdtscll(this_offset); + + /* return the value in ns */ + return cycles_2_ns(this_offset); +} + +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#else +unsigned long long +sched_clock(void) __attribute__((alias("native_sched_clock"))); +#endif + +int check_tsc_unstable(void) +{ + return tsc_unstable; +} +EXPORT_SYMBOL_GPL(check_tsc_unstable); + +#ifdef CONFIG_X86_TSC +int __init notsc_setup(char *str) +{ + printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " + "cannot disable TSC completely.\n"); + tsc_disabled = 1; + return 1; +} +#else +/* + * disable flag for tsc. Takes effect by clearing the TSC cpu flag + * in cpu/common.c + */ +int __init notsc_setup(char *str) +{ + setup_clear_cpu_cap(X86_FEATURE_TSC); + return 1; +} +#endif + +__setup("notsc", notsc_setup); + +#define MAX_RETRIES 5 +#define SMI_TRESHOLD 50000 + +/* + * Read TSC and the reference counters. 
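+ * (Each attempt below brackets one reference-counter read between
+ * two TSC reads; the sample is trusted only when the pair completed
+ * within SMI_TRESHOLD cycles, and after MAX_RETRIES failures
+ * ULLONG_MAX is returned so the caller falls back to the raw PIT
+ * result.)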
Take care of SMI disturbance + */ +static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) +{ + u64 t1, t2; + int i; + + for (i = 0; i < MAX_RETRIES; i++) { + t1 = get_cycles(); + if (hpet) + *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; + else + *pm = acpi_pm_read_early(); + t2 = get_cycles(); + if ((t2 - t1) < SMI_TRESHOLD) + return t2; + } + return ULLONG_MAX; +} + +/** + * native_calibrate_tsc - calibrate the tsc on boot + */ +unsigned long native_calibrate_tsc(void) +{ + unsigned long flags; + u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2; + int hpet = is_hpet_enabled(); + unsigned int tsc_khz_val = 0; + + local_irq_save(flags); + + tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); + + outb((inb(0x61) & ~0x02) | 0x01, 0x61); + + outb(0xb0, 0x43); + outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); + outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); + tr1 = get_cycles(); + while ((inb(0x61) & 0x20) == 0); + tr2 = get_cycles(); + + tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); + + local_irq_restore(flags); + + /* + * Preset the result with the raw and inaccurate PIT + * calibration value + */ + delta = (tr2 - tr1); + do_div(delta, 50); + tsc_khz_val = delta; + + /* hpet or pmtimer available ? */ + if (!hpet && !pm1 && !pm2) { + printk(KERN_INFO "TSC calibrated against PIT\n"); + goto out; + } + + /* Check, whether the sampling was disturbed by an SMI */ + if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) { + printk(KERN_WARNING "TSC calibration disturbed by SMI, " + "using PIT calibration result\n"); + goto out; + } + + tsc2 = (tsc2 - tsc1) * 1000000LL; + + if (hpet) { + printk(KERN_INFO "TSC calibrated against HPET\n"); + if (hpet2 < hpet1) + hpet2 += 0x100000000ULL; + hpet2 -= hpet1; + tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); + do_div(tsc1, 1000000); + } else { + printk(KERN_INFO "TSC calibrated against PM_TIMER\n"); + if (pm2 < pm1) + pm2 += (u64)ACPI_PM_OVRRUN; + pm2 -= pm1; + tsc1 = pm2 * 1000000000LL; + do_div(tsc1, PMTMR_TICKS_PER_SEC); + } + + do_div(tsc2, tsc1); + tsc_khz_val = tsc2; + +out: + return tsc_khz_val; +} + + +#ifdef CONFIG_X86_32 +/* Only called from the Powernow K7 cpu freq driver */ +int recalibrate_cpu_khz(void) +{ +#ifndef CONFIG_SMP + unsigned long cpu_khz_old = cpu_khz; + + if (cpu_has_tsc) { + tsc_khz = calibrate_tsc(); + cpu_khz = tsc_khz; + cpu_data(0).loops_per_jiffy = + cpufreq_scale(cpu_data(0).loops_per_jiffy, + cpu_khz_old, cpu_khz); + return 0; + } else + return -ENODEV; +#else + return -ENODEV; +#endif +} + +EXPORT_SYMBOL(recalibrate_cpu_khz); + +#endif /* CONFIG_X86_32 */ + +/* Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
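+ *
+ * Worked example (assuming CYC2NS_SCALE_FACTOR == 10, as in this
+ * tree's <asm/timer.h>): for a 2 GHz CPU, cpu_khz = 2000000, so
+ * cyc2ns_scale = (1000000 << 10) / 2000000 = 512 and
+ * ns = (cycles * 512) >> 10 = cycles / 2, i.e. 0.5 ns per cycle.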
+ */ + +DEFINE_PER_CPU(unsigned long, cyc2ns); + +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +{ + unsigned long long tsc_now, ns_now; + unsigned long flags, *scale; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + scale = &per_cpu(cyc2ns, cpu); + + rdtscll(tsc_now); + ns_now = __cycles_2_ns(tsc_now); + + if (cpu_khz) + *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); +} + +#ifdef CONFIG_CPU_FREQ + +/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency + * changes. + * + * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's + * not that important because current Opteron setups do not support + * scaling on SMP anyroads. + * + * Should fix up last_tsc too. Currently gettimeofday in the + * first tick after the change will be slightly wrong. + */ + +static unsigned int ref_freq; +static unsigned long loops_per_jiffy_ref; +static unsigned long tsc_khz_ref; + +static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + unsigned long *lpj, dummy; + + if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) + return 0; + + lpj = &dummy; + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) +#ifdef CONFIG_SMP + lpj = &cpu_data(freq->cpu).loops_per_jiffy; +#else + lpj = &boot_cpu_data.loops_per_jiffy; +#endif + + if (!ref_freq) { + ref_freq = freq->old; + loops_per_jiffy_ref = *lpj; + tsc_khz_ref = tsc_khz; + } + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + + tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + mark_tsc_unstable("cpufreq changes"); + } + + set_cyc2ns_scale(tsc_khz_ref, freq->cpu); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ + cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + return 0; +} + +core_initcall(cpufreq_tsc); + +#endif /* CONFIG_CPU_FREQ */ + +/* clocksource code */ + +static struct clocksource clocksource_tsc; + +/* + * We compare the TSC to the cycle_last value in the clocksource + * structure to avoid a nasty time-warp. This can be observed in a + * very small window right after one CPU updated cycle_last under + * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which + * is smaller than the cycle_last reference value due to a TSC which + * is slighty behind. This delta is nowhere else observable, but in + * that case it results in a forward time jump in the range of hours + * due to the unsigned delta calculation of the time keeping core + * code, which is necessary to support wrapping clocksources like pm + * timer. + */ +static cycle_t read_tsc(void) +{ + cycle_t ret = (cycle_t)get_cycles(); + + return ret >= clocksource_tsc.cycle_last ? + ret : clocksource_tsc.cycle_last; +} + +#ifdef CONFIG_X86_64 +static cycle_t __vsyscall_fn vread_tsc(void) +{ + cycle_t ret = (cycle_t)vget_cycles(); + + return ret >= __vsyscall_gtod_data.clock.cycle_last ? 
+ ret : __vsyscall_gtod_data.clock.cycle_last; +} +#endif + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .shift = 22, + .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_MUST_VERIFY, +#ifdef CONFIG_X86_64 + .vread = vread_tsc, +#endif +}; + +void mark_tsc_unstable(char *reason) +{ + if (!tsc_unstable) { + tsc_unstable = 1; + printk("Marking TSC unstable due to %s\n", reason); + /* Change only the rating, when not registered */ + if (clocksource_tsc.mult) + clocksource_change_rating(&clocksource_tsc, 0); + else + clocksource_tsc.rating = 0; + } +} + +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d) +{ + printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", + d->ident); + tsc_unstable = 1; + return 0; +} + +/* List of systems that have known TSC problems */ +static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { + { + .callback = dmi_mark_tsc_unstable, + .ident = "IBM Thinkpad 380XD", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), + }, + }, + {} +}; + +/* + * Geode_LX - the OLPC CPU has a possibly a very reliable TSC + */ +#ifdef CONFIG_MGEODE_LX +/* RTSC counts during suspend */ +#define RTSC_SUSP 0x100 + +static void __init check_geode_tsc_reliable(void) +{ + unsigned long res_low, res_high; + + rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); + if (res_low & RTSC_SUSP) + clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; +} +#else +static inline void check_geode_tsc_reliable(void) { } +#endif + +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +__cpuinit int unsynchronized_tsc(void) +{ + if (!cpu_has_tsc || tsc_unstable) + return 1; + +#ifdef CONFIG_SMP + if (apic_is_clustered_box()) + return 1; +#endif + + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return 0; + /* + * Intel systems are normally all synchronized. + * Exceptions must mark TSC as unstable: + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + /* assume multi socket systems are not synchronized: */ + if (num_possible_cpus() > 1) + tsc_unstable = 1; + } + + return tsc_unstable; +} + +static void __init init_tsc_clocksource(void) +{ + clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, + clocksource_tsc.shift); + /* lower the rating if we already know its unstable: */ + if (check_tsc_unstable()) { + clocksource_tsc.rating = 0; + clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; + } + clocksource_register(&clocksource_tsc); +} + +void __init tsc_init(void) +{ + u64 lpj; + int cpu; + + if (!cpu_has_tsc) + return; + + tsc_khz = calibrate_tsc(); + cpu_khz = tsc_khz; + + if (!tsc_khz) { + mark_tsc_unstable("could not calculate TSC khz"); + return; + } + +#ifdef CONFIG_X86_64 + if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) + cpu_khz = calibrate_cpu(); +#endif + + lpj = ((u64)tsc_khz * 1000); + do_div(lpj, HZ); + lpj_fine = lpj; + + printk("Detected %lu.%03lu MHz processor.\n", + (unsigned long)cpu_khz / 1000, + (unsigned long)cpu_khz % 1000); + + /* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. 
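+ * A per-cpu cyc2ns entry is written even for CPUs that are not
+ * online yet, so a CPU brought up later does not start from an
+ * uninitialized scale.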
(cpufreq notifiers will fix this + * up if their speed diverges) + */ + for_each_possible_cpu(cpu) + set_cyc2ns_scale(cpu_khz, cpu); + + if (tsc_disabled > 0) + return; + + /* now allow native_sched_clock() to use rdtsc */ + tsc_disabled = 0; + + use_tsc_delay(); + /* Check and install the TSC clocksource */ + dmi_check_system(bad_tsc_dmi_table); + + if (unsynchronized_tsc()) + mark_tsc_unstable("TSCs unsynchronized"); + + check_geode_tsc_reliable(); + init_tsc_clocksource(); +} + diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c deleted file mode 100644 index 6240922e497..00000000000 --- a/arch/x86/kernel/tsc_32.c +++ /dev/null @@ -1,455 +0,0 @@ -#include <linux/sched.h> -#include <linux/clocksource.h> -#include <linux/workqueue.h> -#include <linux/delay.h> -#include <linux/cpufreq.h> -#include <linux/jiffies.h> -#include <linux/init.h> -#include <linux/dmi.h> -#include <linux/percpu.h> - -#include <asm/delay.h> -#include <asm/tsc.h> -#include <asm/io.h> -#include <asm/timer.h> - -#include "mach_timer.h" - -/* native_sched_clock() is called before tsc_init(), so - we must start with the TSC soft disabled to prevent - erroneous rdtsc usage on !cpu_has_tsc processors */ -static int tsc_disabled = -1; - -/* - * On some systems the TSC frequency does not - * change with the cpu frequency. So we need - * an extra value to store the TSC freq - */ -unsigned int tsc_khz; -EXPORT_SYMBOL_GPL(tsc_khz); - -#ifdef CONFIG_X86_TSC -static int __init tsc_setup(char *str) -{ - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC completely.\n"); - tsc_disabled = 1; - return 1; -} -#else -/* - * disable flag for tsc. Takes effect by clearing the TSC cpu flag - * in cpu/common.c - */ -static int __init tsc_setup(char *str) -{ - setup_clear_cpu_cap(X86_FEATURE_TSC); - return 1; -} -#endif - -__setup("notsc", tsc_setup); - -/* - * code to mark and check if the TSC is unstable - * due to cpufreq or due to unsynced TSCs - */ -static int tsc_unstable; - -int check_tsc_unstable(void) -{ - return tsc_unstable; -} -EXPORT_SYMBOL_GPL(check_tsc_unstable); - -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ - -DEFINE_PER_CPU(unsigned long, cyc2ns); - -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) -{ - unsigned long long tsc_now, ns_now; - unsigned long flags, *scale; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - scale = &per_cpu(cyc2ns, cpu); - - rdtscll(tsc_now); - ns_now = __cycles_2_ns(tsc_now); - - if (cpu_khz) - *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; - - /* - * Start smoothly with the new frequency: - */ - sched_clock_idle_wakeup_event(0); - local_irq_restore(flags); -} - -/* - * Scheduler clock - returns current time in nanosec units. 
- */ -unsigned long long native_sched_clock(void) -{ - unsigned long long this_offset; - - /* - * Fall back to jiffies if there's no TSC available: - * ( But note that we still use it if the TSC is marked - * unstable. We do this because unlike Time Of Day, - * the scheduler clock tolerates small errors and it's - * very important for it to be as fast as the platform - * can achive it. ) - */ - if (unlikely(tsc_disabled)) - /* No locking but a rare wrong value is not a big deal: */ - return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); - - /* read the Time Stamp Counter: */ - rdtscll(this_offset); - - /* return the value in ns */ - return cycles_2_ns(this_offset); -} - -/* We need to define a real function for sched_clock, to override the - weak default version */ -#ifdef CONFIG_PARAVIRT -unsigned long long sched_clock(void) -{ - return paravirt_sched_clock(); -} -#else -unsigned long long sched_clock(void) - __attribute__((alias("native_sched_clock"))); -#endif - -unsigned long native_calculate_cpu_khz(void) -{ - unsigned long long start, end; - unsigned long count; - u64 delta64 = (u64)ULLONG_MAX; - int i; - unsigned long flags; - - local_irq_save(flags); - - /* run 3 times to ensure the cache is warm and to get an accurate reading */ - for (i = 0; i < 3; i++) { - mach_prepare_counter(); - rdtscll(start); - mach_countup(&count); - rdtscll(end); - - /* - * Error: ECTCNEVERSET - * The CTC wasn't reliable: we got a hit on the very first read, - * or the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. - */ - if (count <= 1) - continue; - - /* cpu freq too slow: */ - if ((end - start) <= CALIBRATE_TIME_MSEC) - continue; - - /* - * We want the minimum time of all runs in case one of them - * is inaccurate due to SMI or other delay - */ - delta64 = min(delta64, (end - start)); - } - - /* cpu freq too fast (or every run was bad): */ - if (delta64 > (1ULL<<32)) - goto err; - - delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */ - do_div(delta64,CALIBRATE_TIME_MSEC); - - local_irq_restore(flags); - return (unsigned long)delta64; -err: - local_irq_restore(flags); - return 0; -} - -int recalibrate_cpu_khz(void) -{ -#ifndef CONFIG_SMP - unsigned long cpu_khz_old = cpu_khz; - - if (cpu_has_tsc) { - cpu_khz = calculate_cpu_khz(); - tsc_khz = cpu_khz; - cpu_data(0).loops_per_jiffy = - cpufreq_scale(cpu_data(0).loops_per_jiffy, - cpu_khz_old, cpu_khz); - return 0; - } else - return -ENODEV; -#else - return -ENODEV; -#endif -} - -EXPORT_SYMBOL(recalibrate_cpu_khz); - -#ifdef CONFIG_CPU_FREQ - -/* - * if the CPU frequency is scaled, TSC-based delays will need a different - * loops_per_jiffy value to function properly. 
- */ -static unsigned int ref_freq; -static unsigned long loops_per_jiffy_ref; -static unsigned long cpu_khz_ref; - -static int -time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) -{ - struct cpufreq_freqs *freq = data; - - if (!ref_freq) { - if (!freq->old){ - ref_freq = freq->new; - return 0; - } - ref_freq = freq->old; - loops_per_jiffy_ref = cpu_data(freq->cpu).loops_per_jiffy; - cpu_khz_ref = cpu_khz; - } - - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - cpu_data(freq->cpu).loops_per_jiffy = - cpufreq_scale(loops_per_jiffy_ref, - ref_freq, freq->new); - - if (cpu_khz) { - - if (num_online_cpus() == 1) - cpu_khz = cpufreq_scale(cpu_khz_ref, - ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { - tsc_khz = cpu_khz; - set_cyc2ns_scale(cpu_khz, freq->cpu); - /* - * TSC based sched_clock turns - * to junk w/ cpufreq - */ - mark_tsc_unstable("cpufreq changes"); - } - } - } - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - -static int __init cpufreq_tsc(void) -{ - return cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); -} -core_initcall(cpufreq_tsc); - -#endif - -/* clock source code */ - -static struct clocksource clocksource_tsc; - -/* - * We compare the TSC to the cycle_last value in the clocksource - * structure to avoid a nasty time-warp issue. This can be observed in - * a very small window right after one CPU updated cycle_last under - * xtime lock and the other CPU reads a TSC value which is smaller - * than the cycle_last reference value due to a TSC which is slighty - * behind. This delta is nowhere else observable, but in that case it - * results in a forward time jump in the range of hours due to the - * unsigned delta calculation of the time keeping core code, which is - * necessary to support wrapping clocksources like pm timer. - */ -static cycle_t read_tsc(void) -{ - cycle_t ret; - - rdtscll(ret); - - return ret >= clocksource_tsc.cycle_last ? - ret : clocksource_tsc.cycle_last; -} - -static struct clocksource clocksource_tsc = { - .name = "tsc", - .rating = 300, - .read = read_tsc, - .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /* to be set */ - .shift = 22, - .flags = CLOCK_SOURCE_IS_CONTINUOUS | - CLOCK_SOURCE_MUST_VERIFY, -}; - -void mark_tsc_unstable(char *reason) -{ - if (!tsc_unstable) { - tsc_unstable = 1; - printk("Marking TSC unstable due to: %s.\n", reason); - /* Can be called before registration */ - if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else - clocksource_tsc.rating = 0; - } -} -EXPORT_SYMBOL_GPL(mark_tsc_unstable); - -static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d) -{ - printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", - d->ident); - tsc_unstable = 1; - return 0; -} - -/* List of systems that have known TSC problems */ -static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { - { - .callback = dmi_mark_tsc_unstable, - .ident = "IBM Thinkpad 380XD", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), - }, - }, - {} -}; - -/* - * Make an educated guess if the TSC is trustworthy and synchronized - * over all CPUs. 
- */ -__cpuinit int unsynchronized_tsc(void) -{ - if (!cpu_has_tsc || tsc_unstable) - return 1; - - /* Anything with constant TSC should be synchronized */ - if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - return 0; - - /* - * Intel systems are normally all synchronized. - * Exceptions must mark TSC as unstable: - */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { - /* assume multi socket systems are not synchronized: */ - if (num_possible_cpus() > 1) - tsc_unstable = 1; - } - return tsc_unstable; -} - -/* - * Geode_LX - the OLPC CPU has a possibly a very reliable TSC - */ -#ifdef CONFIG_MGEODE_LX -/* RTSC counts during suspend */ -#define RTSC_SUSP 0x100 - -static void __init check_geode_tsc_reliable(void) -{ - unsigned long res_low, res_high; - - rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); - if (res_low & RTSC_SUSP) - clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; -} -#else -static inline void check_geode_tsc_reliable(void) { } -#endif - - -void __init tsc_init(void) -{ - int cpu; - u64 lpj; - - if (!cpu_has_tsc || tsc_disabled > 0) - return; - - cpu_khz = calculate_cpu_khz(); - tsc_khz = cpu_khz; - - if (!cpu_khz) { - mark_tsc_unstable("could not calculate TSC khz"); - return; - } - - lpj = ((u64)tsc_khz * 1000); - do_div(lpj, HZ); - lpj_fine = lpj; - - /* now allow native_sched_clock() to use rdtsc */ - tsc_disabled = 0; - - printk("Detected %lu.%03lu MHz processor.\n", - (unsigned long)cpu_khz / 1000, - (unsigned long)cpu_khz % 1000); - - /* - * Secondary CPUs do not run through tsc_init(), so set up - * all the scale factors for all CPUs, assuming the same - * speed as the bootup CPU. (cpufreq notifiers will fix this - * up if their speed diverges) - */ - for_each_possible_cpu(cpu) - set_cyc2ns_scale(cpu_khz, cpu); - - use_tsc_delay(); - - /* Check and install the TSC clocksource */ - dmi_check_system(bad_tsc_dmi_table); - - unsynchronized_tsc(); - check_geode_tsc_reliable(); - clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, - clocksource_tsc.shift); - /* lower the rating if we already know its unstable: */ - if (check_tsc_unstable()) { - clocksource_tsc.rating = 0; - clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; - } - clocksource_register(&clocksource_tsc); -} diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c deleted file mode 100644 index 9898fb01edf..00000000000 --- a/arch/x86/kernel/tsc_64.c +++ /dev/null @@ -1,357 +0,0 @@ -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/clocksource.h> -#include <linux/time.h> -#include <linux/acpi.h> -#include <linux/cpufreq.h> -#include <linux/acpi_pmtmr.h> - -#include <asm/hpet.h> -#include <asm/timex.h> -#include <asm/timer.h> -#include <asm/vgtod.h> - -static int notsc __initdata = 0; - -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ -EXPORT_SYMBOL(cpu_khz); -unsigned int tsc_khz; -EXPORT_SYMBOL(tsc_khz); - -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. 
- * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ -DEFINE_PER_CPU(unsigned long, cyc2ns); - -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) -{ - unsigned long long tsc_now, ns_now; - unsigned long flags, *scale; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - scale = &per_cpu(cyc2ns, cpu); - - rdtscll(tsc_now); - ns_now = __cycles_2_ns(tsc_now); - - if (cpu_khz) - *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; - - sched_clock_idle_wakeup_event(0); - local_irq_restore(flags); -} - -unsigned long long native_sched_clock(void) -{ - unsigned long a = 0; - - /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, - * which means it is not completely exact and may not be monotonous - * between CPUs. But the errors should be too small to matter for - * scheduling purposes. - */ - - rdtscll(a); - return cycles_2_ns(a); -} - -/* We need to define a real function for sched_clock, to override the - weak default version */ -#ifdef CONFIG_PARAVIRT -unsigned long long sched_clock(void) -{ - return paravirt_sched_clock(); -} -#else -unsigned long long -sched_clock(void) __attribute__((alias("native_sched_clock"))); -#endif - - -static int tsc_unstable; - -int check_tsc_unstable(void) -{ - return tsc_unstable; -} -EXPORT_SYMBOL_GPL(check_tsc_unstable); - -#ifdef CONFIG_CPU_FREQ - -/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency - * changes. - * - * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's - * not that important because current Opteron setups do not support - * scaling on SMP anyroads. - * - * Should fix up last_tsc too. Currently gettimeofday in the - * first tick after the change will be slightly wrong. - */ - -static unsigned int ref_freq; -static unsigned long loops_per_jiffy_ref; -static unsigned long tsc_khz_ref; - -static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - unsigned long *lpj, dummy; - - if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) - return 0; - - lpj = &dummy; - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) -#ifdef CONFIG_SMP - lpj = &cpu_data(freq->cpu).loops_per_jiffy; -#else - lpj = &boot_cpu_data.loops_per_jiffy; -#endif - - if (!ref_freq) { - ref_freq = freq->old; - loops_per_jiffy_ref = *lpj; - tsc_khz_ref = tsc_khz; - } - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - *lpj = - cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); - - tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - mark_tsc_unstable("cpufreq changes"); - } - - set_cyc2ns_scale(tsc_khz_ref, freq->cpu); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - -static int __init cpufreq_tsc(void) -{ - cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - return 0; -} - -core_initcall(cpufreq_tsc); - -#endif - -#define MAX_RETRIES 5 -#define SMI_TRESHOLD 50000 - -/* - * Read TSC and the reference counters. 
Take care of SMI disturbance - */ -static unsigned long __init tsc_read_refs(unsigned long *pm, - unsigned long *hpet) -{ - unsigned long t1, t2; - int i; - - for (i = 0; i < MAX_RETRIES; i++) { - t1 = get_cycles(); - if (hpet) - *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; - else - *pm = acpi_pm_read_early(); - t2 = get_cycles(); - if ((t2 - t1) < SMI_TRESHOLD) - return t2; - } - return ULONG_MAX; -} - -/** - * tsc_calibrate - calibrate the tsc on boot - */ -void __init tsc_calibrate(void) -{ - unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2; - int hpet = is_hpet_enabled(), cpu; - - local_irq_save(flags); - - tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); - - outb((inb(0x61) & ~0x02) | 0x01, 0x61); - - outb(0xb0, 0x43); - outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); - outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); - tr1 = get_cycles(); - while ((inb(0x61) & 0x20) == 0); - tr2 = get_cycles(); - - tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); - - local_irq_restore(flags); - - /* - * Preset the result with the raw and inaccurate PIT - * calibration value - */ - tsc_khz = (tr2 - tr1) / 50; - - /* hpet or pmtimer available ? */ - if (!hpet && !pm1 && !pm2) { - printk(KERN_INFO "TSC calibrated against PIT\n"); - goto out; - } - - /* Check, whether the sampling was disturbed by an SMI */ - if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) { - printk(KERN_WARNING "TSC calibration disturbed by SMI, " - "using PIT calibration result\n"); - goto out; - } - - tsc2 = (tsc2 - tsc1) * 1000000L; - - if (hpet) { - printk(KERN_INFO "TSC calibrated against HPET\n"); - if (hpet2 < hpet1) - hpet2 += 0x100000000UL; - hpet2 -= hpet1; - tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000; - } else { - printk(KERN_INFO "TSC calibrated against PM_TIMER\n"); - if (pm2 < pm1) - pm2 += ACPI_PM_OVRRUN; - pm2 -= pm1; - tsc1 = (pm2 * 1000000000) / PMTMR_TICKS_PER_SEC; - } - - tsc_khz = tsc2 / tsc1; - -out: - for_each_possible_cpu(cpu) - set_cyc2ns_scale(tsc_khz, cpu); -} - -/* - * Make an educated guess if the TSC is trustworthy and synchronized - * over all CPUs. - */ -__cpuinit int unsynchronized_tsc(void) -{ - if (tsc_unstable) - return 1; - -#ifdef CONFIG_SMP - if (apic_is_clustered_box()) - return 1; -#endif - - if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - return 0; - - /* Assume multi socket systems are not synchronized */ - return num_present_cpus() > 1; -} - -int __init notsc_setup(char *s) -{ - notsc = 1; - return 1; -} - -__setup("notsc", notsc_setup); - -static struct clocksource clocksource_tsc; - -/* - * We compare the TSC to the cycle_last value in the clocksource - * structure to avoid a nasty time-warp. This can be observed in a - * very small window right after one CPU updated cycle_last under - * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which - * is smaller than the cycle_last reference value due to a TSC which - * is slighty behind. This delta is nowhere else observable, but in - * that case it results in a forward time jump in the range of hours - * due to the unsigned delta calculation of the time keeping core - * code, which is necessary to support wrapping clocksources like pm - * timer. - */ -static cycle_t read_tsc(void) -{ - cycle_t ret = (cycle_t)get_cycles(); - - return ret >= clocksource_tsc.cycle_last ? - ret : clocksource_tsc.cycle_last; -} - -static cycle_t __vsyscall_fn vread_tsc(void) -{ - cycle_t ret = (cycle_t)vget_cycles(); - - return ret >= __vsyscall_gtod_data.clock.cycle_last ? 
- ret : __vsyscall_gtod_data.clock.cycle_last; -} - -static struct clocksource clocksource_tsc = { - .name = "tsc", - .rating = 300, - .read = read_tsc, - .mask = CLOCKSOURCE_MASK(64), - .shift = 22, - .flags = CLOCK_SOURCE_IS_CONTINUOUS | - CLOCK_SOURCE_MUST_VERIFY, - .vread = vread_tsc, -}; - -void mark_tsc_unstable(char *reason) -{ - if (!tsc_unstable) { - tsc_unstable = 1; - printk("Marking TSC unstable due to %s\n", reason); - /* Change only the rating, when not registered */ - if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else - clocksource_tsc.rating = 0; - } -} -EXPORT_SYMBOL_GPL(mark_tsc_unstable); - -void __init init_tsc_clocksource(void) -{ - if (!notsc) { - clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, - clocksource_tsc.shift); - if (check_tsc_unstable()) - clocksource_tsc.rating = 0; - - clocksource_register(&clocksource_tsc); - } -} diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c new file mode 100644 index 00000000000..e94bdb6add1 --- /dev/null +++ b/arch/x86/kernel/visws_quirks.c @@ -0,0 +1,709 @@ +/* + * SGI Visual Workstation support and quirks, unmaintained. + * + * Split out from setup.c by davej@suse.de + * + * Copyright (C) 1999 Bent Hagemark, Ingo Molnar + * + * SGI Visual Workstation interrupt controller + * + * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC + * which serves as the main interrupt controller in the system. Non-legacy + * hardware in the system uses this controller directly. Legacy devices + * are connected to the PIIX4 which in turn has its 8259(s) connected to + * a of the Cobalt APIC entry. + * + * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com + * + * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru> + */ +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/smp.h> + +#include <asm/visws/cobalt.h> +#include <asm/visws/piix4.h> +#include <asm/arch_hooks.h> +#include <asm/fixmap.h> +#include <asm/reboot.h> +#include <asm/setup.h> +#include <asm/e820.h> +#include <asm/smp.h> +#include <asm/io.h> + +#include <mach_ipi.h> + +#include "mach_apic.h" + +#include <linux/init.h> +#include <linux/smp.h> + +#include <linux/kernel_stat.h> +#include <linux/interrupt.h> +#include <linux/init.h> + +#include <asm/io.h> +#include <asm/apic.h> +#include <asm/i8259.h> +#include <asm/irq_vectors.h> +#include <asm/visws/cobalt.h> +#include <asm/visws/lithium.h> +#include <asm/visws/piix4.h> + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/pci_ids.h> + +extern int no_broadcast; + +#include <asm/io.h> +#include <asm/apic.h> +#include <asm/arch_hooks.h> +#include <asm/visws/cobalt.h> +#include <asm/visws/lithium.h> + +char visws_board_type = -1; +char visws_board_rev = -1; + +int is_visws_box(void) +{ + return visws_board_type >= 0; +} + +static int __init visws_time_init_quirk(void) +{ + printk(KERN_INFO "Starting Cobalt Timer system clock\n"); + + /* Set the countdown value */ + co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ); + + /* Start the timer */ + co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN); + + /* Enable (unmask) the timer interrupt */ + co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); + + /* + * Zero return means the generic timer setup code will set up + * the standard vector: + */ + return 0; +} + +static int __init visws_pre_intr_init_quirk(void) +{ + init_VISWS_APIC_irqs(); + + /* + * We 
dont want ISA irqs to be set up by the generic code: + */ + return 1; +} + +/* Quirk for machine specific memory setup. */ + +#define MB (1024 * 1024) + +unsigned long sgivwfb_mem_phys; +unsigned long sgivwfb_mem_size; +EXPORT_SYMBOL(sgivwfb_mem_phys); +EXPORT_SYMBOL(sgivwfb_mem_size); + +long long mem_size __initdata = 0; + +static char * __init visws_memory_setup_quirk(void) +{ + long long gfx_mem_size = 8 * MB; + + mem_size = boot_params.alt_mem_k; + + if (!mem_size) { + printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n"); + mem_size = 128 * MB; + } + + /* + * this hardcodes the graphics memory to 8 MB + * it really should be sized dynamically (or at least + * set as a boot param) + */ + if (!sgivwfb_mem_size) { + printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n"); + sgivwfb_mem_size = 8 * MB; + } + + /* + * Trim to nearest MB + */ + sgivwfb_mem_size &= ~((1 << 20) - 1); + sgivwfb_mem_phys = mem_size - gfx_mem_size; + + e820_add_region(0, LOWMEMSIZE(), E820_RAM); + e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM); + e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED); + + return "PROM"; +} + +static void visws_machine_emergency_restart(void) +{ + /* + * Visual Workstations restart after this + * register is poked on the PIIX4 + */ + outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT); +} + +static void visws_machine_power_off(void) +{ + unsigned short pm_status; +/* extern unsigned int pci_bus0; */ + + while ((pm_status = inw(PMSTS_PORT)) & 0x100) + outw(pm_status, PMSTS_PORT); + + outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT); + + mdelay(10); + +#define PCI_CONF1_ADDRESS(bus, devfn, reg) \ + (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) + +/* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */ + outl(PIIX_SPECIAL_STOP, 0xCFC); +} + +static int __init visws_get_smp_config_quirk(unsigned int early) +{ + /* + * Prevent MP-table parsing by the generic code: + */ + return 1; +} + +extern unsigned int __cpuinitdata maxcpus; + +/* + * The Visual Workstation is Intel MP compliant in the hardware + * sense, but it doesn't have a BIOS(-configuration table). + * No problem for Linux. + */ + +static void __init MP_processor_info (struct mpc_config_processor *m) +{ + int ver, logical_apicid; + physid_mask_t apic_cpus; + + if (!(m->mpc_cpuflag & CPU_ENABLED)) + return; + + logical_apicid = m->mpc_apicid; + printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n", + m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", + m->mpc_apicid, + (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, + (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, + m->mpc_apicver); + + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) + boot_cpu_physical_apicid = m->mpc_apicid; + + ver = m->mpc_apicver; + if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) { + printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", + m->mpc_apicid, MAX_APICS); + return; + } + + apic_cpus = apicid_to_cpu_present(m->mpc_apicid); + physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); + /* + * Validate version + */ + if (ver == 0x0) { + printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! " + "fixing up to 0x10. 
(tell your hw vendor)\n", + m->mpc_apicid); + ver = 0x10; + } + apic_version[m->mpc_apicid] = ver; +} + +int __init visws_find_smp_config_quirk(unsigned int reserve) +{ + struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); + unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); + + if (ncpus > CO_CPU_MAX) { + printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n", + ncpus, mp); + + ncpus = CO_CPU_MAX; + } + + if (ncpus > maxcpus) + ncpus = maxcpus; + +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 1; +#endif + while (ncpus--) + MP_processor_info(mp++); + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + return 1; +} + +extern int visws_trap_init_quirk(void); + +void __init visws_early_detect(void) +{ + int raw; + + visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG) + >> PIIX_GPI_BD_SHIFT; + + if (visws_board_type < 0) + return; + + /* + * Install special quirks for timer, interrupt and memory setup: + */ + arch_time_init_quirk = visws_time_init_quirk; + arch_pre_intr_init_quirk = visws_pre_intr_init_quirk; + arch_memory_setup_quirk = visws_memory_setup_quirk; + + /* + * Fall back to generic behavior for traps: + */ + arch_intr_init_quirk = NULL; + arch_trap_init_quirk = visws_trap_init_quirk; + + /* + * Install reboot quirks: + */ + pm_power_off = visws_machine_power_off; + machine_ops.emergency_restart = visws_machine_emergency_restart; + + /* + * Do not use broadcast IPIs: + */ + no_broadcast = 0; + + /* + * Override generic MP-table parsing: + */ + mach_get_smp_config_quirk = visws_get_smp_config_quirk; + mach_find_smp_config_quirk = visws_find_smp_config_quirk; + +#ifdef CONFIG_X86_IO_APIC + /* + * Turn off IO-APIC detection and initialization: + */ + skip_ioapic_setup = 1; +#endif + + /* + * Get Board rev. + * First, we have to initialize the 307 part to allow us access + * to the GPIO registers. Let's map them at 0x0fc0 which is right + * after the PIIX4 PM section. + */ + outb_p(SIO_DEV_SEL, SIO_INDEX); + outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */ + + outb_p(SIO_DEV_MSB, SIO_INDEX); + outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */ + + outb_p(SIO_DEV_LSB, SIO_INDEX); + outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */ + + outb_p(SIO_DEV_ENB, SIO_INDEX); + outb_p(1, SIO_DATA); /* Enable GPIO registers. */ + + /* + * Now, we have to map the power management section to write + * a bit which enables access to the GPIO registers. + * What lunatic came up with this shit? + */ + outb_p(SIO_DEV_SEL, SIO_INDEX); + outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */ + + outb_p(SIO_DEV_MSB, SIO_INDEX); + outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */ + + outb_p(SIO_DEV_LSB, SIO_INDEX); + outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */ + + outb_p(SIO_DEV_ENB, SIO_INDEX); + outb_p(1, SIO_DATA); /* Enable PM registers. */ + + /* + * Now, write the PM register which enables the GPIO registers. + */ + outb_p(SIO_PM_FER2, SIO_PM_INDEX); + outb_p(SIO_PM_GP_EN, SIO_PM_DATA); + + /* + * Now, initialize the GPIO registers. + * We want them all to be inputs which is the + * power on default, so let's leave them alone. + * So, let's just read the board rev! + */ + raw = inb_p(SIO_GP_DATA1); + raw &= 0x7f; /* 7 bits of valid board revision ID. 
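+	 * (Decode used below: on a 320, raw < 0x6 is rev 4, raw < 0xc
+	 * is rev 5, anything else rev 6; a 540 is always rev 2; other
+	 * boards report raw unchanged.)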
*/ + + if (visws_board_type == VISWS_320) { + if (raw < 0x6) { + visws_board_rev = 4; + } else if (raw < 0xc) { + visws_board_rev = 5; + } else { + visws_board_rev = 6; + } + } else if (visws_board_type == VISWS_540) { + visws_board_rev = 2; + } else { + visws_board_rev = raw; + } + + printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n", + (visws_board_type == VISWS_320 ? "320" : + (visws_board_type == VISWS_540 ? "540" : + "unknown")), visws_board_rev); +} + +#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4) +#define BCD (LI_INTB | LI_INTC | LI_INTD) +#define ALLDEVS (A01234 | BCD) + +static __init void lithium_init(void) +{ + set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS); + set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS); + + if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || + (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { + printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A'); +/* panic("This machine is not SGI Visual Workstation 320/540"); */ + } + + if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || + (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { + printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B'); +/* panic("This machine is not SGI Visual Workstation 320/540"); */ + } + + li_pcia_write16(LI_PCI_INTEN, ALLDEVS); + li_pcib_write16(LI_PCI_INTEN, ALLDEVS); +} + +static __init void cobalt_init(void) +{ + /* + * On normal SMP PC this is used only with SMP, but we have to + * use it and set it up here to start the Cobalt clock + */ + set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); + setup_local_APIC(); + printk(KERN_INFO "Local APIC Version %#x, ID %#x\n", + (unsigned int)apic_read(APIC_LVR), + (unsigned int)apic_read(APIC_ID)); + + set_fixmap(FIX_CO_CPU, CO_CPU_PHYS); + set_fixmap(FIX_CO_APIC, CO_APIC_PHYS); + printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n", + co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID)); + + /* Enable Cobalt APIC being careful to NOT change the ID! */ + co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE); + + printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n", + co_apic_read(CO_APIC_ID)); +} + +int __init visws_trap_init_quirk(void) +{ + lithium_init(); + cobalt_init(); + + return 1; +} + +/* + * IRQ controller / APIC support: + */ + +static DEFINE_SPINLOCK(cobalt_lock); + +/* + * Set the given Cobalt APIC Redirection Table entry to point + * to the given IDT vector/index. + */ +static inline void co_apic_set(int entry, int irq) +{ + co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR)); + co_apic_write(CO_APIC_HI(entry), 0); +} + +/* + * Cobalt (IO)-APIC functions to handle PCI devices. 
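+ * (co_apic_ide0_hack() below returns entry 5, rather than the usual
+ * CO_APIC_IDE0, on rev-5 320 boards, where IDE0 is wired to a
+ * different Cobalt APIC input.)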
+ */ +static inline int co_apic_ide0_hack(void) +{ + extern char visws_board_type; + extern char visws_board_rev; + + if (visws_board_type == VISWS_320 && visws_board_rev == 5) + return 5; + return CO_APIC_IDE0; +} + +static int is_co_apic(unsigned int irq) +{ + if (IS_CO_APIC(irq)) + return CO_APIC(irq); + + switch (irq) { + case 0: return CO_APIC_CPU; + case CO_IRQ_IDE0: return co_apic_ide0_hack(); + case CO_IRQ_IDE1: return CO_APIC_IDE1; + default: return -1; + } +} + + +/* + * This is the SGI Cobalt (IO-)APIC: + */ + +static void enable_cobalt_irq(unsigned int irq) +{ + co_apic_set(is_co_apic(irq), irq); +} + +static void disable_cobalt_irq(unsigned int irq) +{ + int entry = is_co_apic(irq); + + co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK); + co_apic_read(CO_APIC_LO(entry)); +} + +/* + * "irq" really just serves to identify the device. Here is where we + * map this to the Cobalt APIC entry where it's physically wired. + * This is called via request_irq -> setup_irq -> irq_desc->startup() + */ +static unsigned int startup_cobalt_irq(unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&cobalt_lock, flags); + if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) + irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); + enable_cobalt_irq(irq); + spin_unlock_irqrestore(&cobalt_lock, flags); + return 0; +} + +static void ack_cobalt_irq(unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&cobalt_lock, flags); + disable_cobalt_irq(irq); + apic_write(APIC_EOI, APIC_EIO_ACK); + spin_unlock_irqrestore(&cobalt_lock, flags); +} + +static void end_cobalt_irq(unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&cobalt_lock, flags); + if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS))) + enable_cobalt_irq(irq); + spin_unlock_irqrestore(&cobalt_lock, flags); +} + +static struct irq_chip cobalt_irq_type = { + .typename = "Cobalt-APIC", + .startup = startup_cobalt_irq, + .shutdown = disable_cobalt_irq, + .enable = enable_cobalt_irq, + .disable = disable_cobalt_irq, + .ack = ack_cobalt_irq, + .end = end_cobalt_irq, +}; + + +/* + * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt + * -- not the manner expected by the code in i8259.c. + * + * there is a 'master' physical interrupt source that gets sent to + * the CPU. But in the chipset there are various 'virtual' interrupts + * waiting to be handled. We represent this to Linux through a 'master' + * interrupt controller type, and through a special virtual interrupt- + * controller. Device drivers only see the virtual interrupt sources. + */ +static unsigned int startup_piix4_master_irq(unsigned int irq) +{ + init_8259A(0); + + return startup_cobalt_irq(irq); +} + +static void end_piix4_master_irq(unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&cobalt_lock, flags); + enable_cobalt_irq(irq); + spin_unlock_irqrestore(&cobalt_lock, flags); +} + +static struct irq_chip piix4_master_irq_type = { + .typename = "PIIX4-master", + .startup = startup_piix4_master_irq, + .ack = ack_cobalt_irq, + .end = end_piix4_master_irq, +}; + + +static struct irq_chip piix4_virtual_irq_type = { + .typename = "PIIX4-virtual", + .shutdown = disable_8259A_irq, + .enable = enable_8259A_irq, + .disable = disable_8259A_irq, +}; + + +/* + * PIIX4-8259 master/virtual functions to handle interrupt requests + * from legacy devices: floppy, parallel, serial, rtc. + * + * None of these get Cobalt APIC entries, neither do they have IDT + * entries. 
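+ * (piix4_master_intr() below issues an OCW3 poll command to ask the
+ * 8259 pair which line fired, masks and acks it by hand, and then
+ * runs the handlers registered on that virtual irq.)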
+/*
+ * PIIX4-8259 master/virtual functions to handle interrupt requests
+ * from legacy devices: floppy, parallel, serial, rtc.
+ *
+ * None of these get Cobalt APIC entries, neither do they have IDT
+ * entries.  These interrupts are purely virtual and distributed from
+ * the 'master' interrupt source: CO_IRQ_8259.
+ *
+ * When the 8259 signals an interrupt, the master handler figures out
+ * which of these devices is interrupting and dispatches to that
+ * device's handler.
+ *
+ * CAREFUL: devices see the 'virtual' interrupt only.  Thus disable/
+ * enable_irq gets the right irq.  This 'master' irq is never directly
+ * manipulated by any driver.
+ */
+static irqreturn_t piix4_master_intr(int irq, void *dev_id)
+{
+	int realirq;
+	irq_desc_t *desc;
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+
+	/* Find out what's interrupting in the PIIX4 master 8259 */
+	outb(0x0c, 0x20);		/* OCW3 Poll command */
+	realirq = inb(0x20);
+
+	/*
+	 * Bit 7 == 0 means invalid/spurious
+	 */
+	if (unlikely(!(realirq & 0x80)))
+		goto out_unlock;
+
+	realirq &= 7;
+
+	if (unlikely(realirq == 2)) {
+		outb(0x0c, 0xa0);
+		realirq = inb(0xa0);
+
+		if (unlikely(!(realirq & 0x80)))
+			goto out_unlock;
+
+		realirq = (realirq & 7) + 8;
+	}
+
+	/* mask and ack interrupt */
+	cached_irq_mask |= 1 << realirq;
+	if (unlikely(realirq > 7)) {
+		inb(0xa1);
+		outb(cached_slave_mask, 0xa1);
+		outb(0x60 + (realirq & 7), 0xa0);
+		outb(0x60 + 2, 0x20);
+	} else {
+		inb(0x21);
+		outb(cached_master_mask, 0x21);
+		outb(0x60 + realirq, 0x20);
+	}
+
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+
+	desc = irq_desc + realirq;
+
+	/*
+	 * handle this 'virtual interrupt' as a Cobalt one now
+	 */
+	kstat_cpu(smp_processor_id()).irqs[realirq]++;
+
+	if (likely(desc->action != NULL))
+		handle_IRQ_event(realirq, desc->action);
+
+	if (!(desc->status & IRQ_DISABLED))
+		enable_8259A_irq(realirq);
+
+	return IRQ_HANDLED;
+
+out_unlock:
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+	return IRQ_NONE;
+}
+
+static struct irqaction master_action = {
+	.handler	= piix4_master_intr,
+	.name		= "PIIX4-8259",
+};
+
+static struct irqaction cascade_action = {
+	.handler	= no_action,
+	.name		= "cascade",
+};
+
+
+void init_VISWS_APIC_irqs(void)
+{
+	int i;
+
+	for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
+		irq_desc[i].status = IRQ_DISABLED;
+		irq_desc[i].action = NULL;
+		irq_desc[i].depth = 1;
+
+		if (i == 0) {
+			irq_desc[i].chip = &cobalt_irq_type;
+		} else if (i == CO_IRQ_IDE0) {
+			irq_desc[i].chip = &cobalt_irq_type;
+		} else if (i == CO_IRQ_IDE1) {
+			irq_desc[i].chip = &cobalt_irq_type;
+		} else if (i == CO_IRQ_8259) {
+			irq_desc[i].chip = &piix4_master_irq_type;
+		} else if (i < CO_IRQ_APIC0) {
+			irq_desc[i].chip = &piix4_virtual_irq_type;
+		} else if (IS_CO_APIC(i)) {
+			irq_desc[i].chip = &cobalt_irq_type;
+		}
+	}
+
+	setup_irq(CO_IRQ_8259, &master_action);
+	setup_irq(2, &cascade_action);
+}
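The poll-mode decoding that piix4_master_intr() applies can be exercised in isolation. In this sketch the bytes fed to decode_poll() are hypothetical stand-ins for what inb(0x20) and inb(0xa0) would return after the OCW3 poll command:

#include <stdio.h>

/*
 * Decode one 8259 poll-mode byte pair the way piix4_master_intr() does:
 * bit 7 clear means spurious, the low three bits give the level, and
 * level 2 on the master cascades to the slave (irqs 8-15).
 */
static int decode_poll(unsigned char master, unsigned char slave)
{
	int realirq;

	if (!(master & 0x80))		/* bit 7 == 0: invalid/spurious */
		return -1;

	realirq = master & 7;

	if (realirq == 2) {		/* IRQ2 is the cascade to the slave */
		if (!(slave & 0x80))
			return -1;
		realirq = (slave & 7) + 8;
	}
	return realirq;
}

int main(void)
{
	printf("%d\n", decode_poll(0x82, 0x86));	/* cascade, slave 6 -> 14 */
	printf("%d\n", decode_poll(0x83, 0x00));	/* master irq 3 */
	printf("%d\n", decode_poll(0x03, 0x00));	/* spurious -> -1 */
	return 0;
}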
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 946bf13b44a..b15346092b7 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -932,7 +932,7 @@ static inline int __init activate_vmi(void)
 		pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
 #endif
 	pv_time_ops.sched_clock = vmi_sched_clock;
-	pv_time_ops.get_cpu_khz = vmi_cpu_khz;
+	pv_time_ops.get_tsc_khz = vmi_tsc_khz;
 
 	/* We have true wallclock functions; disable CMOS clock sync */
 	no_sync_cmos_clock = 1;
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index ba7d19e102b..6953859fe28 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -69,8 +69,8 @@ unsigned long long vmi_sched_clock(void)
 	return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
 }
 
-/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
-unsigned long vmi_cpu_khz(void)
+/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */
+unsigned long vmi_tsc_khz(void)
 {
 	unsigned long long khz;
 
 	khz = vmi_timer_ops.get_cycle_frequency();
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 2674f579627..cdb2363697d 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -49,16 +49,14 @@ SECTIONS
 		_etext = .;			/* End of text section */
 	} :text = 0x9090
 
+	NOTES :text :note
+
 	. = ALIGN(16);		/* Exception table */
 	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
 		__start___ex_table = .;
 		*(__ex_table)
 		__stop___ex_table = .;
-	}
-
-	NOTES :text :note
-
-	BUG_TABLE :text
+	} :text = 0x9090
 
 	RODATA
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fd246e22fe6..63e5c1a22e8 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -19,7 +19,7 @@ PHDRS {
 	data PT_LOAD FLAGS(7);	/* RWE */
 	user PT_LOAD FLAGS(7);	/* RWE */
 	data.init PT_LOAD FLAGS(7);	/* RWE */
-	note PT_NOTE FLAGS(4);	/* R__ */
+	note PT_NOTE FLAGS(0);	/* ___ */
 }
 SECTIONS
 {
@@ -40,16 +40,14 @@ SECTIONS
 		_etext = .;			/* End of text section */
 	} :text = 0x9090
 
+	NOTES :text :note
+
 	. = ALIGN(16);		/* Exception table */
 	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
 		__start___ex_table = .;
 		*(__ex_table)
 		__stop___ex_table = .;
-	}
-
-	NOTES :text :note
-
-	BUG_TABLE :text
+	} :text = 0x9090
 
 	RODATA
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index c87cbd84c3e..0b8b6690a86 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -42,7 +42,8 @@
 #include <asm/topology.h>
 #include <asm/vgtod.h>
 
-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __vsyscall(nr) \
+	__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
 #define __syscall_clobber "r11","cx","memory"
 
 /*
@@ -278,7 +279,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
 {
 	long cpu = (long)arg;
 	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
-		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
+		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
 	return NOTIFY_DONE;
 }
 
@@ -301,7 +302,7 @@ static int __init vsyscall_init(void)
 #ifdef CONFIG_SYSCTL
 	register_sysctl_table(kernel_root_table2);
 #endif
-	on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
+	on_each_cpu(cpu_vsyscall_init, NULL, 1);
 	hotcpu_notifier(cpu_vsyscall_notifier, 0);
 	return 0;
 }
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 2f306a82689..b545f371b5f 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -2,13 +2,20 @@
    All C exports should go in the respective C files. */
 
 #include <linux/module.h>
-#include <net/checksum.h>
 #include <linux/smp.h>
 
+#include <net/checksum.h>
+
 #include <asm/processor.h>
-#include <asm/uaccess.h>
 #include <asm/pgtable.h>
+#include <asm/uaccess.h>
 #include <asm/desc.h>
+#include <asm/ftrace.h>
+
+#ifdef CONFIG_FTRACE
+/* mcount is defined in assembly */
+EXPORT_SYMBOL(mcount);
+#endif
 
 EXPORT_SYMBOL(kernel_thread);
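A closing note on the notrace annotation added to __vsyscall() above: in the kernel it expands to __attribute__((no_instrument_function)), which keeps tracer hooks out of code the tracer itself depends on. The sketch below shows the attribute's effect using gcc's -finstrument-functions hooks, an analogous userspace mechanism rather than the kernel's -pg/mcount path:

#include <stdio.h>

#define notrace __attribute__((no_instrument_function))

/* The hooks themselves must be notrace, or they would recurse. */
notrace void __cyg_profile_func_enter(void *fn, void *site)
{
	fprintf(stderr, "enter %p\n", fn);
}

notrace void __cyg_profile_func_exit(void *fn, void *site)
{
	fprintf(stderr, "exit  %p\n", fn);
}

static notrace int quiet(int x)		/* no enter/exit hooks emitted */
{
	return x * 2;
}

static int loud(int x)			/* hooks fire around this one */
{
	return x + 1;
}

int main(void)	/* build with: gcc -finstrument-functions demo.c */
{
	return loud(quiet(20)) == 41 ? 0 : 1;
}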