diff options
Diffstat (limited to 'arch/x86/kernel')
133 files changed, 11944 insertions, 6819 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 3db651fc8ec..0d41f0343dc 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -10,7 +10,7 @@ ifdef CONFIG_FTRACE # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg -CFLAGS_REMOVE_paravirt.o = -pg +CFLAGS_REMOVE_paravirt-spinlocks.o = -pg endif # @@ -23,7 +23,7 @@ CFLAGS_hpet.o := $(nostackp) CFLAGS_tsc.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o -obj-y += traps_$(BITS).o irq_$(BITS).o +obj-y += traps.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o @@ -38,7 +38,7 @@ obj-y += tsc.o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o -obj-y += i387.o +obj-y += i387.o xsave.o obj-y += ptrace.o obj-y += ds.o obj-$(CONFIG_X86_32) += tls.o @@ -51,7 +51,6 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o @@ -69,6 +68,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o +obj-$(CONFIG_X86_ES7000) += es7000_32.o obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o obj-y += vsmp_64.o obj-$(CONFIG_KPROBES) += kprobes.o @@ -89,7 +89,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o @@ -99,11 +99,18 @@ scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +microcode-y := microcode_core.o +microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o +microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o +obj-$(CONFIG_MICROCODE) += microcode.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o obj-y += bios_uv.o + obj-y += genx2apic_cluster.o + obj-y += genx2apic_phys.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index bfd10fd211c..eb875cdc736 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 #include <asm/proto.h> -#include <asm/genapic.h> #else /* X86 */ @@ -97,8 +96,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #warning ACPI uses CMPXCHG, i486 and later hardware #endif -static int acpi_mcfg_64bit_base_addr __initdata = FALSE; - /* -------------------------------------------------------------------------- Boot-time Configuration -------------------------------------------------------------------------- */ @@ -160,6 +157,8 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) struct acpi_mcfg_allocation *pci_mmcfg_config; int pci_mmcfg_config_num; +static int acpi_mcfg_64bit_base_addr __initdata = FALSE; + static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) { if (!strcmp(mcfg->header.oem_id, "SGI")) @@ -253,10 +252,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) return; } -#ifdef CONFIG_X86_32 if (boot_cpu_physical_apicid != -1U) ver = apic_version[boot_cpu_physical_apicid]; -#endif generic_processor_info(id, ver); } @@ -775,11 +772,9 @@ static void __init acpi_register_lapic_address(unsigned long address) set_fixmap_nocache(FIX_APIC_BASE, address); if (boot_cpu_physical_apicid == -1U) { - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); -#ifdef CONFIG_X86_32 + boot_cpu_physical_apicid = read_apic_id(); apic_version[boot_cpu_physical_apicid] = GET_APIC_VERSION(apic_read(APIC_LVR)); -#endif } } @@ -1351,7 +1346,9 @@ static void __init acpi_process_madt(void) acpi_ioapic = 1; smp_found_config = 1; +#ifdef CONFIG_X86_32 setup_apic_routing(); +#endif } } if (error == -EINVAL) { @@ -1421,8 +1418,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d) */ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) { - pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident); - acpi_skip_timer_override = 1; + /* + * The ati_ixp4x0_rev() early PCI quirk should have set + * the acpi_skip_timer_override flag already: + */ + if (!acpi_skip_timer_override) { + WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); + pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", + d->ident); + acpi_skip_timer_override = 1; + } return 0; } @@ -1605,6 +1610,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { */ { .callback = dmi_ignore_irq0_timer_override, + .ident = "HP nx6115 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"), + }, + }, + { + .callback = dmi_ignore_irq0_timer_override, .ident = "HP NX6125 laptop", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), @@ -1619,6 +1632,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), }, }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP 6715b laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), + }, + }, {} }; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2763cb37b55..a84ac7b570e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -145,35 +145,25 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { extern char __vsyscall_0; const unsigned char *const *find_nop_table(void) { - return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_has(X86_FEATURE_NOPL)) + return p6_nops; + else + return k8_nops; } #else /* CONFIG_X86_64 */ -static const struct nop { - int cpuid; - const unsigned char *const *noptable; -} noptypes[] = { - { X86_FEATURE_K8, k8_nops }, - { X86_FEATURE_K7, k7_nops }, - { X86_FEATURE_P4, p6_nops }, - { X86_FEATURE_P3, p6_nops }, - { -1, NULL } -}; - const unsigned char *const *find_nop_table(void) { - const unsigned char *const *noptable = intel_nops; - int i; - - for (i = 0; noptypes[i].cpuid >= 0; i++) { - if (boot_cpu_has(noptypes[i].cpuid)) { - noptable = noptypes[i].noptable; - break; - } - } - return noptable; + if (boot_cpu_has(X86_FEATURE_K8)) + return k8_nops; + else if (boot_cpu_has(X86_FEATURE_K7)) + return k7_nops; + else if (boot_cpu_has(X86_FEATURE_NOPL)) + return p6_nops; + else + return intel_nops; } #endif /* CONFIG_X86_64 */ @@ -241,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) continue; if (*ptr > text_end) continue; - text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */ + /* turn DS segment override prefix into lock prefix */ + text_poke(*ptr, ((unsigned char []){0xf0}), 1); }; } static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) { u8 **ptr; - char insn[1]; if (noreplace_smp) return; - add_nops(insn, 1); for (ptr = start; ptr < end; ptr++) { if (*ptr < text) continue; if (*ptr > text_end) continue; - text_poke(*ptr, insn, 1); + /* turn lock prefix into DS segment override prefix */ + text_poke(*ptr, ((unsigned char []){0x3E}), 1); }; } @@ -454,7 +444,7 @@ void __init alternative_instructions(void) _text, _etext); /* Only switch to UP mode if we don't immediately boot others */ - if (num_possible_cpus() == 1 || setup_max_cpus <= 1) + if (num_present_cpus() == 1 || setup_max_cpus <= 1) alternatives_smp_switch(0); } #endif diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index de39e1f2ede..a8fd9ebdc8e 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -33,6 +33,10 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); +/* A list of preallocated protection domains */ +static LIST_HEAD(iommu_pd_list); +static DEFINE_SPINLOCK(iommu_pd_list_lock); + /* * general struct to manage commands send to an IOMMU */ @@ -51,6 +55,102 @@ static int iommu_has_npcache(struct amd_iommu *iommu) /**************************************************************************** * + * Interrupt handling functions + * + ****************************************************************************/ + +static void iommu_print_event(void *__evt) +{ + u32 *event = __evt; + int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; + int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; + int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK; + int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; + u64 address = (u64)(((u64)event[3]) << 32) | event[2]; + + printk(KERN_ERR "AMD IOMMU: Event logged ["); + + switch (type) { + case EVENT_TYPE_ILL_DEV: + printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x " + "address=0x%016llx flags=0x%04x]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address, flags); + break; + case EVENT_TYPE_IO_FAULT: + printk("IO_PAGE_FAULT device=%02x:%02x.%x " + "domain=0x%04x address=0x%016llx flags=0x%04x]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + domid, address, flags); + break; + case EVENT_TYPE_DEV_TAB_ERR: + printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x " + "address=0x%016llx flags=0x%04x]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address, flags); + break; + case EVENT_TYPE_PAGE_TAB_ERR: + printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x " + "domain=0x%04x address=0x%016llx flags=0x%04x]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + domid, address, flags); + break; + case EVENT_TYPE_ILL_CMD: + printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); + break; + case EVENT_TYPE_CMD_HARD_ERR: + printk("COMMAND_HARDWARE_ERROR address=0x%016llx " + "flags=0x%04x]\n", address, flags); + break; + case EVENT_TYPE_IOTLB_INV_TO: + printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x " + "address=0x%016llx]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address); + break; + case EVENT_TYPE_INV_DEV_REQ: + printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x " + "address=0x%016llx flags=0x%04x]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address, flags); + break; + default: + printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type); + } +} + +static void iommu_poll_events(struct amd_iommu *iommu) +{ + u32 head, tail; + unsigned long flags; + + spin_lock_irqsave(&iommu->lock, flags); + + head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); + + while (head != tail) { + iommu_print_event(iommu->evt_buf + head); + head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; + } + + writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); + + spin_unlock_irqrestore(&iommu->lock, flags); +} + +irqreturn_t amd_iommu_int_handler(int irq, void *data) +{ + struct amd_iommu *iommu; + + list_for_each_entry(iommu, &amd_iommu_list, list) + iommu_poll_events(iommu); + + return IRQ_HANDLED; +} + +/**************************************************************************** + * * IOMMU command queuing functions * ****************************************************************************/ @@ -65,7 +165,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) u8 *target; tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); - target = (iommu->cmd_buf + tail); + target = iommu->cmd_buf + tail; memcpy_toio(target, cmd, sizeof(*cmd)); tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); @@ -101,10 +201,10 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) */ static int iommu_completion_wait(struct amd_iommu *iommu) { - int ret, ready = 0; + int ret = 0, ready = 0; unsigned status = 0; struct iommu_cmd cmd; - unsigned long i = 0; + unsigned long flags, i = 0; memset(&cmd, 0, sizeof(cmd)); cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; @@ -112,10 +212,12 @@ static int iommu_completion_wait(struct amd_iommu *iommu) iommu->need_sync = 0; - ret = iommu_queue_command(iommu, &cmd); + spin_lock_irqsave(&iommu->lock, flags); + + ret = __iommu_queue_command(iommu, &cmd); if (ret) - return ret; + goto out; while (!ready && (i < EXIT_LOOP_COUNT)) { ++i; @@ -130,6 +232,8 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); +out: + spin_unlock_irqrestore(&iommu->lock, flags); return 0; } @@ -140,6 +244,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) { struct iommu_cmd cmd; + int ret; BUG_ON(iommu == NULL); @@ -147,9 +252,11 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); cmd.data[0] = devid; + ret = iommu_queue_command(iommu, &cmd); + iommu->need_sync = 1; - return iommu_queue_command(iommu, &cmd); + return ret; } /* @@ -159,6 +266,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, u64 address, u16 domid, int pde, int s) { struct iommu_cmd cmd; + int ret; memset(&cmd, 0, sizeof(cmd)); address &= PAGE_MASK; @@ -171,9 +279,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + ret = iommu_queue_command(iommu, &cmd); + iommu->need_sync = 1; - return iommu_queue_command(iommu, &cmd); + return ret; } /* @@ -185,7 +295,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, u64 address, size_t size) { int s = 0; - unsigned pages = iommu_num_pages(address, size); + unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); address &= PAGE_MASK; @@ -203,6 +313,14 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, return 0; } +/* Flush the whole IO/TLB for a given protection domain */ +static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) +{ + u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + + iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); +} + /**************************************************************************** * * The functions below are used the create the page table mappings for @@ -362,11 +480,6 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, * efficient allocator. * ****************************************************************************/ -static unsigned long dma_mask_to_pages(unsigned long mask) -{ - return (mask >> PAGE_SHIFT) + - (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); -} /* * The address allocator core function. @@ -375,25 +488,31 @@ static unsigned long dma_mask_to_pages(unsigned long mask) */ static unsigned long dma_ops_alloc_addresses(struct device *dev, struct dma_ops_domain *dom, - unsigned int pages) + unsigned int pages, + unsigned long align_mask, + u64 dma_mask) { - unsigned long limit = dma_mask_to_pages(*dev->dma_mask); + unsigned long limit; unsigned long address; - unsigned long size = dom->aperture_size >> PAGE_SHIFT; unsigned long boundary_size; boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; - limit = limit < size ? limit : size; + limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, + dma_mask >> PAGE_SHIFT); - if (dom->next_bit >= limit) + if (dom->next_bit >= limit) { dom->next_bit = 0; + dom->need_flush = true; + } address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, - 0 , boundary_size, 0); - if (address == -1) + 0 , boundary_size, align_mask); + if (address == -1) { address = iommu_area_alloc(dom->bitmap, limit, 0, pages, - 0, boundary_size, 0); + 0, boundary_size, align_mask); + dom->need_flush = true; + } if (likely(address != -1)) { dom->next_bit = address + pages; @@ -459,7 +578,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, if (start_page + pages > last_page) pages = last_page - start_page; - set_bit_string(dom->bitmap, start_page, pages); + iommu_area_reserve(dom->bitmap, start_page, pages); } static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) @@ -553,12 +672,16 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->bitmap[0] = 1; dma_dom->next_bit = 0; + dma_dom->need_flush = false; + dma_dom->target_dev = 0xffff; + /* Intialize the exclusion range if necessary */ if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; int pages = iommu_num_pages(iommu->exclusion_start, - iommu->exclusion_length); + iommu->exclusion_length, + PAGE_SIZE); dma_ops_reserve_addresses(dma_dom, startpage, pages); } @@ -623,12 +746,13 @@ static void set_device_domain(struct amd_iommu *iommu, u64 pte_root = virt_to_phys(domain->pt_root); - pte_root |= (domain->mode & 0x07) << 9; - pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2; + pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) + << DEV_ENTRY_MODE_SHIFT; + pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - amd_iommu_dev_table[devid].data[0] = pte_root; - amd_iommu_dev_table[devid].data[1] = pte_root >> 32; + amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); + amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); amd_iommu_dev_table[devid].data[2] = domain->id; amd_iommu_pd_table[devid] = domain; @@ -646,6 +770,45 @@ static void set_device_domain(struct amd_iommu *iommu, *****************************************************************************/ /* + * This function checks if the driver got a valid device from the caller to + * avoid dereferencing invalid pointers. + */ +static bool check_device(struct device *dev) +{ + if (!dev || !dev->dma_mask) + return false; + + return true; +} + +/* + * In this function the list of preallocated protection domains is traversed to + * find the domain for a specific device + */ +static struct dma_ops_domain *find_protection_domain(u16 devid) +{ + struct dma_ops_domain *entry, *ret = NULL; + unsigned long flags; + + if (list_empty(&iommu_pd_list)) + return NULL; + + spin_lock_irqsave(&iommu_pd_list_lock, flags); + + list_for_each_entry(entry, &iommu_pd_list, list) { + if (entry->target_dev == devid) { + ret = entry; + list_del(&ret->list); + break; + } + } + + spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + + return ret; +} + +/* * In the dma_ops path we only have the struct device. This function * finds the corresponding IOMMU, the protection domain and the * requestor id for a given device. @@ -661,27 +824,30 @@ static int get_device_resources(struct device *dev, struct pci_dev *pcidev; u16 _bdf; - BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); + *iommu = NULL; + *domain = NULL; + *bdf = 0xffff; + + if (dev->bus != &pci_bus_type) + return 0; pcidev = to_pci_dev(dev); _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); /* device not translated by any IOMMU in the system? */ - if (_bdf > amd_iommu_last_bdf) { - *iommu = NULL; - *domain = NULL; - *bdf = 0xffff; + if (_bdf > amd_iommu_last_bdf) return 0; - } *bdf = amd_iommu_alias_table[_bdf]; *iommu = amd_iommu_rlookup_table[*bdf]; if (*iommu == NULL) return 0; - dma_dom = (*iommu)->default_dom; *domain = domain_for_device(*bdf); if (*domain == NULL) { + dma_dom = find_protection_domain(*bdf); + if (!dma_dom) + dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; set_device_domain(*iommu, *domain, *bdf); printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " @@ -760,17 +926,24 @@ static dma_addr_t __map_single(struct device *dev, struct dma_ops_domain *dma_dom, phys_addr_t paddr, size_t size, - int dir) + int dir, + bool align, + u64 dma_mask) { dma_addr_t offset = paddr & ~PAGE_MASK; dma_addr_t address, start; unsigned int pages; + unsigned long align_mask = 0; int i; - pages = iommu_num_pages(paddr, size); + pages = iommu_num_pages(paddr, size, PAGE_SIZE); paddr &= PAGE_MASK; - address = dma_ops_alloc_addresses(dev, dma_dom, pages); + if (align) + align_mask = (1UL << get_order(size)) - 1; + + address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, + dma_mask); if (unlikely(address == bad_dma_address)) goto out; @@ -782,6 +955,12 @@ static dma_addr_t __map_single(struct device *dev, } address += offset; + if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { + iommu_flush_tlb(iommu, dma_dom->domain.id); + dma_dom->need_flush = false; + } else if (unlikely(iommu_has_npcache(iommu))) + iommu_flush_pages(iommu, dma_dom->domain.id, address, size); + out: return address; } @@ -802,7 +981,7 @@ static void __unmap_single(struct amd_iommu *iommu, if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) return; - pages = iommu_num_pages(dma_addr, size); + pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); dma_addr &= PAGE_MASK; start = dma_addr; @@ -812,6 +991,9 @@ static void __unmap_single(struct amd_iommu *iommu, } dma_ops_free_addresses(dma_dom, dma_addr, pages); + + if (amd_iommu_unmap_flush) + iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); } /* @@ -825,6 +1007,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, struct protection_domain *domain; u16 devid; dma_addr_t addr; + u64 dma_mask; + + if (!check_device(dev)) + return bad_dma_address; + + dma_mask = *dev->dma_mask; get_device_resources(dev, &iommu, &domain, &devid); @@ -833,14 +1021,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, return (dma_addr_t)paddr; spin_lock_irqsave(&domain->lock, flags); - addr = __map_single(dev, iommu, domain->priv, paddr, size, dir); + addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, + dma_mask); if (addr == bad_dma_address) goto out; - if (iommu_has_npcache(iommu)) - iommu_flush_pages(iommu, domain->id, addr, size); - - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); out: @@ -860,7 +1046,8 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, struct protection_domain *domain; u16 devid; - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!check_device(dev) || + !get_device_resources(dev, &iommu, &domain, &devid)) /* device not handled by any AMD IOMMU */ return; @@ -868,9 +1055,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, __unmap_single(iommu, domain->priv, dma_addr, size, dir); - iommu_flush_pages(iommu, domain->id, dma_addr, size); - - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); spin_unlock_irqrestore(&domain->lock, flags); @@ -909,6 +1094,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, struct scatterlist *s; phys_addr_t paddr; int mapped_elems = 0; + u64 dma_mask; + + if (!check_device(dev)) + return 0; + + dma_mask = *dev->dma_mask; get_device_resources(dev, &iommu, &domain, &devid); @@ -921,19 +1112,17 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, paddr = sg_phys(s); s->dma_address = __map_single(dev, iommu, domain->priv, - paddr, s->length, dir); + paddr, s->length, dir, false, + dma_mask); if (s->dma_address) { s->dma_length = s->length; mapped_elems++; } else goto unmap; - if (iommu_has_npcache(iommu)) - iommu_flush_pages(iommu, domain->id, s->dma_address, - s->dma_length); } - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); out: @@ -967,7 +1156,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, u16 devid; int i; - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!check_device(dev) || + !get_device_resources(dev, &iommu, &domain, &devid)) return; spin_lock_irqsave(&domain->lock, flags); @@ -975,12 +1165,10 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, for_each_sg(sglist, s, nelems, i) { __unmap_single(iommu, domain->priv, s->dma_address, s->dma_length, dir); - iommu_flush_pages(iommu, domain->id, s->dma_address, - s->dma_length); s->dma_address = s->dma_length = 0; } - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); spin_unlock_irqrestore(&domain->lock, flags); @@ -998,25 +1186,33 @@ static void *alloc_coherent(struct device *dev, size_t size, struct protection_domain *domain; u16 devid; phys_addr_t paddr; + u64 dma_mask = dev->coherent_dma_mask; + + if (!check_device(dev)) + return NULL; + if (!get_device_resources(dev, &iommu, &domain, &devid)) + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + + flag |= __GFP_ZERO; virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) return 0; - memset(virt_addr, 0, size); paddr = virt_to_phys(virt_addr); - get_device_resources(dev, &iommu, &domain, &devid); - if (!iommu || !domain) { *dma_addr = (dma_addr_t)paddr; return virt_addr; } + if (!dma_mask) + dma_mask = *dev->dma_mask; + spin_lock_irqsave(&domain->lock, flags); *dma_addr = __map_single(dev, iommu, domain->priv, paddr, - size, DMA_BIDIRECTIONAL); + size, DMA_BIDIRECTIONAL, true, dma_mask); if (*dma_addr == bad_dma_address) { free_pages((unsigned long)virt_addr, get_order(size)); @@ -1024,10 +1220,7 @@ static void *alloc_coherent(struct device *dev, size_t size, goto out; } - if (iommu_has_npcache(iommu)) - iommu_flush_pages(iommu, domain->id, *dma_addr, size); - - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); out: @@ -1038,8 +1231,6 @@ out: /* * The exported free_coherent function for dma_ops. - * FIXME: fix the generic x86 DMA layer so that it actually calls that - * function. */ static void free_coherent(struct device *dev, size_t size, void *virt_addr, dma_addr_t dma_addr) @@ -1049,6 +1240,9 @@ static void free_coherent(struct device *dev, size_t size, struct protection_domain *domain; u16 devid; + if (!check_device(dev)) + return; + get_device_resources(dev, &iommu, &domain, &devid); if (!iommu || !domain) @@ -1057,9 +1251,8 @@ static void free_coherent(struct device *dev, size_t size, spin_lock_irqsave(&domain->lock, flags); __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - iommu_flush_pages(iommu, domain->id, dma_addr, size); - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); spin_unlock_irqrestore(&domain->lock, flags); @@ -1069,6 +1262,30 @@ free_mem: } /* + * This function is called by the DMA layer to find out if we can handle a + * particular device. It is part of the dma_ops. + */ +static int amd_iommu_dma_supported(struct device *dev, u64 mask) +{ + u16 bdf; + struct pci_dev *pcidev; + + /* No device or no PCI device */ + if (!dev || dev->bus != &pci_bus_type) + return 0; + + pcidev = to_pci_dev(dev); + + bdf = calc_devid(pcidev->bus->number, pcidev->devfn); + + /* Out of our scope? */ + if (bdf > amd_iommu_last_bdf) + return 0; + + return 1; +} + +/* * The function for pre-allocating protection domains. * * If the driver core informs the DMA layer if a driver grabs a device @@ -1097,10 +1314,9 @@ void prealloc_protection_domains(void) if (!dma_dom) continue; init_unity_mappings_for_device(dma_dom, devid); - set_device_domain(iommu, &dma_dom->domain, devid); - printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ", - dma_dom->domain.id); - print_devid(devid, 1); + dma_dom->target_dev = devid; + + list_add_tail(&dma_dom->list, &iommu_pd_list); } } @@ -1111,6 +1327,7 @@ static struct dma_mapping_ops amd_iommu_dma_ops = { .unmap_single = unmap_single, .map_sg = map_sg, .unmap_sg = unmap_sg, + .dma_supported = amd_iommu_dma_supported, }; /* diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index a69cc0f5204..4cd8083c58b 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -22,6 +22,8 @@ #include <linux/gfp.h> #include <linux/list.h> #include <linux/sysdev.h> +#include <linux/interrupt.h> +#include <linux/msi.h> #include <asm/pci-direct.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> @@ -30,7 +32,6 @@ /* * definitions for the ACPI scanning code */ -#define PCI_BUS(x) (((x) >> 8) & 0xff) #define IVRS_HEADER_LENGTH 48 #define ACPI_IVHD_TYPE 0x10 @@ -121,6 +122,7 @@ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ int amd_iommu_isolate; /* if 1, device isolation is enabled */ +bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ @@ -234,7 +236,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; - ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); + ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); ctrl &= ~(1 << bit); writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } @@ -242,13 +244,23 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) /* Function to enable the hardware */ void __init iommu_enable(struct amd_iommu *iommu) { - printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); - print_devid(iommu->devid, 0); - printk(" cap 0x%hx\n", iommu->cap_ptr); + printk(KERN_INFO "AMD IOMMU: Enabling IOMMU " + "at %02x:%02x.%x cap 0x%hx\n", + iommu->dev->bus->number, + PCI_SLOT(iommu->dev->devfn), + PCI_FUNC(iommu->dev->devfn), + iommu->cap_ptr); iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } +/* Function to enable IOMMU event logging and event interrupts */ +void __init iommu_enable_event_logging(struct amd_iommu *iommu) +{ + iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); + iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); +} + /* * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in * the system has one. @@ -286,6 +298,14 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu) ****************************************************************************/ /* + * This function calculates the length of a given IVHD entry + */ +static inline int ivhd_entry_length(u8 *ivhd) +{ + return 0x04 << (*ivhd >> 6); +} + +/* * This function reads the last device id the IOMMU has to handle from the PCI * capability header for this IOMMU */ @@ -329,7 +349,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) default: break; } - p += 0x04 << (*p >> 6); + p += ivhd_entry_length(p); } WARN_ON(p != end); @@ -414,7 +434,32 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) static void __init free_command_buffer(struct amd_iommu *iommu) { - free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); + free_pages((unsigned long)iommu->cmd_buf, + get_order(iommu->cmd_buf_size)); +} + +/* allocates the memory where the IOMMU will log its events to */ +static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) +{ + u64 entry; + iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(EVT_BUFFER_SIZE)); + + if (iommu->evt_buf == NULL) + return NULL; + + entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; + memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, + &entry, sizeof(entry)); + + iommu->evt_buf_size = EVT_BUFFER_SIZE; + + return iommu->evt_buf; +} + +static void __init free_event_buffer(struct amd_iommu *iommu) +{ + free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE)); } /* sets a specific bit in the device table entry. */ @@ -487,19 +532,21 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) */ static void __init init_iommu_from_pci(struct amd_iommu *iommu) { - int bus = PCI_BUS(iommu->devid); - int dev = PCI_SLOT(iommu->devid); - int fn = PCI_FUNC(iommu->devid); int cap_ptr = iommu->cap_ptr; - u32 range; + u32 range, misc; - iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); + pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, + &iommu->cap); + pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET, + &range); + pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET, + &misc); - range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); iommu->first_device = calc_devid(MMIO_GET_BUS(range), MMIO_GET_FD(range)); iommu->last_device = calc_devid(MMIO_GET_BUS(range), MMIO_GET_LD(range)); + iommu->evt_msi_num = MMIO_MSI_NUM(misc); } /* @@ -604,7 +651,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, break; } - p += 0x04 << (e->type >> 6); + p += ivhd_entry_length(p); } } @@ -622,6 +669,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu) static void __init free_iommu_one(struct amd_iommu *iommu) { free_command_buffer(iommu); + free_event_buffer(iommu); iommu_unmap_mmio_space(iommu); } @@ -649,8 +697,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) /* * Copy data from ACPI table entry to the iommu struct */ - iommu->devid = h->devid; + iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff); + if (!iommu->dev) + return 1; + iommu->cap_ptr = h->cap_ptr; + iommu->pci_seg = h->pci_seg; iommu->mmio_phys = h->mmio_phys; iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); if (!iommu->mmio_base) @@ -661,11 +713,17 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (!iommu->cmd_buf) return -ENOMEM; + iommu->evt_buf = alloc_event_buffer(iommu); + if (!iommu->evt_buf) + return -ENOMEM; + + iommu->int_enabled = false; + init_iommu_from_pci(iommu); init_iommu_from_acpi(iommu, h); init_iommu_devices(iommu); - return 0; + return pci_enable_device(iommu->dev); } /* @@ -706,6 +764,95 @@ static int __init init_iommu_all(struct acpi_table_header *table) /**************************************************************************** * + * The following functions initialize the MSI interrupts for all IOMMUs + * in the system. Its a bit challenging because there could be multiple + * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per + * pci_dev. + * + ****************************************************************************/ + +static int __init iommu_setup_msix(struct amd_iommu *iommu) +{ + struct amd_iommu *curr; + struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */ + int nvec = 0, i; + + list_for_each_entry(curr, &amd_iommu_list, list) { + if (curr->dev == iommu->dev) { + entries[nvec].entry = curr->evt_msi_num; + entries[nvec].vector = 0; + curr->int_enabled = true; + nvec++; + } + } + + if (pci_enable_msix(iommu->dev, entries, nvec)) { + pci_disable_msix(iommu->dev); + return 1; + } + + for (i = 0; i < nvec; ++i) { + int r = request_irq(entries->vector, amd_iommu_int_handler, + IRQF_SAMPLE_RANDOM, + "AMD IOMMU", + NULL); + if (r) + goto out_free; + } + + return 0; + +out_free: + for (i -= 1; i >= 0; --i) + free_irq(entries->vector, NULL); + + pci_disable_msix(iommu->dev); + + return 1; +} + +static int __init iommu_setup_msi(struct amd_iommu *iommu) +{ + int r; + struct amd_iommu *curr; + + list_for_each_entry(curr, &amd_iommu_list, list) { + if (curr->dev == iommu->dev) + curr->int_enabled = true; + } + + + if (pci_enable_msi(iommu->dev)) + return 1; + + r = request_irq(iommu->dev->irq, amd_iommu_int_handler, + IRQF_SAMPLE_RANDOM, + "AMD IOMMU", + NULL); + + if (r) { + pci_disable_msi(iommu->dev); + return 1; + } + + return 0; +} + +static int __init iommu_init_msi(struct amd_iommu *iommu) +{ + if (iommu->int_enabled) + return 0; + + if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) + return iommu_setup_msix(iommu); + else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) + return iommu_setup_msi(iommu); + + return 1; +} + +/**************************************************************************** + * * The next functions belong to the third pass of parsing the ACPI * table. In this last pass the memory mapping requirements are * gathered (like exclusion and unity mapping reanges). @@ -811,7 +958,6 @@ static void init_device_table(void) for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { set_dev_entry_bit(devid, DEV_ENTRY_VALID); set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION); - set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT); } } @@ -825,6 +971,8 @@ static void __init enable_iommus(void) list_for_each_entry(iommu, &amd_iommu_list, list) { iommu_set_exclusion_range(iommu); + iommu_init_msi(iommu); + iommu_enable_event_logging(iommu); iommu_enable(iommu); } } @@ -995,11 +1143,17 @@ int __init amd_iommu_init(void) else printk("disabled\n"); + if (amd_iommu_unmap_flush) + printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n"); + else + printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n"); + out: return ret; free: - free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); + free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, + get_order(MAX_DOMAIN_ID/8)); free_pages((unsigned long)amd_iommu_pd_table, get_order(rlookup_table_size)); @@ -1057,8 +1211,10 @@ void __init amd_iommu_detect(void) static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { - if (strcmp(str, "isolate") == 0) + if (strncmp(str, "isolate", 7) == 0) amd_iommu_isolate = 1; + if (strncmp(str, "fullflush", 11) == 0) + amd_iommu_unmap_flush = true; } return 1; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 44e21826db1..9a32b37ee2e 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -455,11 +455,11 @@ out: force_iommu || valid_agp || fallback_aper_force) { - printk(KERN_ERR + printk(KERN_INFO "Your BIOS doesn't leave a aperture memory hole\n"); - printk(KERN_ERR + printk(KERN_INFO "Please enable the IOMMU option in the BIOS setup\n"); - printk(KERN_ERR + printk(KERN_INFO "This costs you %d MB of RAM\n", 32 << fallback_aper_order); diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index f88bd0d982b..21c831d96af 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -60,10 +60,8 @@ unsigned long mp_lapic_addr; static int force_enable_local_apic; int disable_apic; -/* Local APIC timer verification ok */ -static int local_apic_timer_verify_ok; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ -static int local_apic_timer_disabled; +static int disable_apic_timer __cpuinitdata; /* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); @@ -130,7 +128,11 @@ static inline int lapic_get_version(void) */ static inline int lapic_is_integrated(void) { +#ifdef CONFIG_X86_64 + return 1; +#else return APIC_INTEGRATED(lapic_get_version()); +#endif } /* @@ -145,13 +147,18 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } -void apic_wait_icr_idle(void) +/* + * Paravirt kernels also might be using these below ops. So we still + * use generic apic_read()/apic_write(), which might be pointing to different + * ops in PARAVIRT case. + */ +void xapic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) cpu_relax(); } -u32 safe_apic_wait_icr_idle(void) +u32 safe_xapic_wait_icr_idle(void) { u32 send_status; int timeout; @@ -167,16 +174,48 @@ u32 safe_apic_wait_icr_idle(void) return send_status; } +void xapic_icr_write(u32 low, u32 id) +{ + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write(APIC_ICR, low); +} + +u64 xapic_icr_read(void) +{ + u32 icr1, icr2; + + icr2 = apic_read(APIC_ICR2); + icr1 = apic_read(APIC_ICR); + + return icr1 | ((u64)icr2 << 32); +} + +static struct apic_ops xapic_ops = { + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .icr_read = xapic_icr_read, + .icr_write = xapic_icr_write, + .wait_icr_idle = xapic_wait_icr_idle, + .safe_wait_icr_idle = safe_xapic_wait_icr_idle, +}; + +struct apic_ops __read_mostly *apic_ops = &xapic_ops; +EXPORT_SYMBOL_GPL(apic_ops); + /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 */ void __cpuinit enable_NMI_through_LVT0(void) { - unsigned int v = APIC_DM_NMI; + unsigned int v; - /* Level triggered for 82489DX */ + /* unmask and set to NMI */ + v = APIC_DM_NMI; + + /* Level triggered for 82489DX (32bit mode) */ if (!lapic_is_integrated()) v |= APIC_LVT_LEVEL_TRIGGER; + apic_write(APIC_LVT0, v); } @@ -193,9 +232,13 @@ int get_physical_broadcast(void) */ int lapic_get_maxlvt(void) { - unsigned int v = apic_read(APIC_LVR); + unsigned int v; - /* 82489DXs do not report # of LVT entries. */ + v = apic_read(APIC_LVR); + /* + * - we always have APIC integrated on 64bit mode + * - 82489DXs do not report # of LVT entries + */ return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; } @@ -203,8 +246,12 @@ int lapic_get_maxlvt(void) * Local APIC timer */ -/* Clock divisor is set to 16 */ +/* Clock divisor */ +#ifdef CONFG_X86_64 +#define APIC_DIVISOR 1 +#else #define APIC_DIVISOR 16 +#endif /* * This function sets up the local APIC timer, with a timeout of @@ -212,6 +259,9 @@ int lapic_get_maxlvt(void) * this function twice on the boot CPU, once with a bogus timeout * value, second time for real. The other (noncalibrating) CPUs * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. */ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) { @@ -233,14 +283,48 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) */ tmp_value = apic_read(APIC_TDCR); apic_write(APIC_TDCR, - (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | - APIC_TDR_DIV_16); + (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | + APIC_TDR_DIV_16); if (!oneshot) apic_write(APIC_TMICT, clocks / APIC_DIVISOR); } /* + * Setup extended LVT, AMD specific (K8, family 10h) + * + * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and + * MCE interrupts are supported. Thus MCE offset must be set to 0. + * + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. + */ + +#define APIC_EILVT_LVTOFF_MCE 0 +#define APIC_EILVT_LVTOFF_IBS 1 + +static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) +{ + unsigned long reg = (lvt_off << 4) + APIC_EILVT0; + unsigned int v = (mask << 16) | (msg_type << 8) | vector; + + apic_write(reg, v); +} + +u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_MCE; +} + +u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_IBS; +} +EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); + +/* * Program the next event, relative to now */ static int lapic_next_event(unsigned long delta, @@ -259,8 +343,8 @@ static void lapic_timer_setup(enum clock_event_mode mode, unsigned long flags; unsigned int v; - /* Lapic used for broadcast ? */ - if (!local_apic_timer_verify_ok) + /* Lapic used as dummy for broadcast ? */ + if (evt->features & CLOCK_EVT_FEAT_DUMMY) return; local_irq_save(flags); @@ -473,7 +557,7 @@ static int __init calibrate_APIC_clock(void) return -1; } - local_apic_timer_verify_ok = 1; + levt->features &= ~CLOCK_EVT_FEAT_DUMMY; /* We trust the pm timer based calibration */ if (!pm_referenced) { @@ -507,11 +591,11 @@ static int __init calibrate_APIC_clock(void) if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); else - local_apic_timer_verify_ok = 0; + levt->features |= CLOCK_EVT_FEAT_DUMMY; } else local_irq_enable(); - if (!local_apic_timer_verify_ok) { + if (levt->features & CLOCK_EVT_FEAT_DUMMY) { printk(KERN_WARNING "APIC timer disabled due to verification failure.\n"); return -1; @@ -533,7 +617,8 @@ void __init setup_boot_APIC_clock(void) * timer as a dummy clock event source on SMP systems, so the * broadcast mechanism is used. On UP systems simply ignore it. */ - if (local_apic_timer_disabled) { + if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); /* No broadcast on UP ! */ if (num_possible_cpus() > 1) { lapic_clockevent.mult = 1; @@ -602,7 +687,11 @@ static void local_apic_timer_interrupt(void) /* * the NMI deadlock-detector uses this. */ +#ifdef CONFIG_X86_64 + add_pda(apic_timer_irqs, 1); +#else per_cpu(irq_stat, cpu).apic_timer_irqs++; +#endif evt->event_handler(evt); } @@ -642,35 +731,6 @@ int setup_profiling_timer(unsigned int multiplier) } /* - * Setup extended LVT, AMD specific (K8, family 10h) - * - * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and - * MCE interrupts are supported. Thus MCE offset must be set to 0. - */ - -#define APIC_EILVT_LVTOFF_MCE 0 -#define APIC_EILVT_LVTOFF_IBS 1 - -static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) -{ - unsigned long reg = (lvt_off << 4) + APIC_EILVT0; - unsigned int v = (mask << 16) | (msg_type << 8) | vector; - apic_write(reg, v); -} - -u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_MCE; -} - -u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_IBS; -} - -/* * Local APIC start and shutdown */ @@ -715,7 +775,7 @@ void clear_local_APIC(void) } /* lets not touch this if we didn't frob it */ -#ifdef CONFIG_X86_MCE_P4THERMAL +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL) if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); @@ -732,10 +792,6 @@ void clear_local_APIC(void) if (maxlvt >= 4) apic_write(APIC_LVTPC, APIC_LVT_MASKED); -#ifdef CONFIG_X86_MCE_P4THERMAL - if (maxlvt >= 5) - apic_write(APIC_LVTTHMR, APIC_LVT_MASKED); -#endif /* Integrated APIC (!82489DX) ? */ if (lapic_is_integrated()) { if (maxlvt > 3) @@ -750,7 +806,7 @@ void clear_local_APIC(void) */ void disable_local_APIC(void) { - unsigned long value; + unsigned int value; clear_local_APIC(); @@ -762,6 +818,7 @@ void disable_local_APIC(void) value &= ~APIC_SPIV_APIC_ENABLED; apic_write(APIC_SPIV, value); +#ifdef CONFIG_X86_32 /* * When LAPIC was disabled by the BIOS and enabled by the kernel, * restore the disabled state. @@ -773,6 +830,7 @@ void disable_local_APIC(void) l &= ~MSR_IA32_APICBASE_ENABLE; wrmsr(MSR_IA32_APICBASE, l, h); } +#endif } /* @@ -789,11 +847,15 @@ void lapic_shutdown(void) return; local_irq_save(flags); - clear_local_APIC(); - if (enabled_via_apicbase) +#ifdef CONFIG_X86_32 + if (!enabled_via_apicbase) + clear_local_APIC(); + else +#endif disable_local_APIC(); + local_irq_restore(flags); } @@ -838,6 +900,12 @@ int __init verify_local_APIC(void) */ reg0 = apic_read(APIC_ID); apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); + apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); + reg1 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); + apic_write(APIC_ID, reg0); + if (reg1 != (reg0 ^ APIC_ID_MASK)) + return 0; /* * The next two are just to see if we have sane values. @@ -863,14 +931,15 @@ void __init sync_Arb_IDs(void) */ if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return; + /* * Wait for idle. */ apic_wait_icr_idle(); apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, - APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_write(APIC_ICR, APIC_DEST_ALLINC | + APIC_INT_LEVELTRIG | APIC_DM_INIT); } /* @@ -878,7 +947,7 @@ void __init sync_Arb_IDs(void) */ void __init init_bsp_APIC(void) { - unsigned long value; + unsigned int value; /* * Don't do the setup now if we have a SMP BIOS as the @@ -899,11 +968,13 @@ void __init init_bsp_APIC(void) value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; +#ifdef CONFIG_X86_32 /* This bit is reserved on P4/Xeon and should be cleared */ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15)) value &= ~APIC_SPIV_FOCUS_DISABLED; else +#endif value |= APIC_SPIV_FOCUS_DISABLED; value |= SPURIOUS_APIC_VECTOR; apic_write(APIC_SPIV, value); @@ -922,6 +993,16 @@ static void __cpuinit lapic_setup_esr(void) { unsigned long oldvalue, value, maxlvt; if (lapic_is_integrated() && !esr_disable) { + if (esr_disable) { + /* + * Something untraceable is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh + */ + printk(KERN_INFO "Leaving ESR disabled.\n"); + return; + } /* !82489DX */ maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ @@ -942,16 +1023,7 @@ static void __cpuinit lapic_setup_esr(void) "vector: 0x%08lx after: 0x%08lx\n", oldvalue, value); } else { - if (esr_disable) - /* - * Something untraceable is creating bad interrupts on - * secondary quads ... for the moment, just leave the - * ESR disabled - we can't do anything useful with the - * errors anyway - mbligh - */ - printk(KERN_INFO "Leaving ESR disabled.\n"); - else - printk(KERN_INFO "No ESR for 82489DX.\n"); + printk(KERN_INFO "No ESR for 82489DX.\n"); } } @@ -1089,13 +1161,17 @@ void __cpuinit setup_local_APIC(void) void __cpuinit end_local_APIC_setup(void) { - unsigned long value; - lapic_setup_esr(); - /* Disable the local apic timer */ - value = apic_read(APIC_LVTT); - value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, value); + +#ifdef CONFIG_X86_32 + { + unsigned int value; + /* Disable the local apic timer */ + value = apic_read(APIC_LVTT); + value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, value); + } +#endif setup_apic_nmi_watchdog(NULL); apic_pm_activate(); @@ -1205,7 +1281,7 @@ void __init init_apic_mappings(void) * default configuration (or the MP table is broken). */ if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); + boot_cpu_physical_apicid = read_apic_id(); } @@ -1242,7 +1318,7 @@ int __init APIC_init_uniprocessor(void) * might be zero if read from MP tables. Get it from LAPIC. */ #ifdef CONFIG_CRASH_DUMP - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); + boot_cpu_physical_apicid = read_apic_id(); #endif physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); @@ -1321,59 +1397,12 @@ void smp_error_interrupt(struct pt_regs *regs) irq_exit(); } -#ifdef CONFIG_SMP -void __init smp_intr_init(void) -{ - /* - * IRQ0 must be given a fixed assignment and initialized, - * because it's used before the IO-APIC is set up. - */ - set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); - - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPI for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for single call function */ - set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); -} -#endif - -/* - * Initialize APIC interrupts - */ -void __init apic_intr_init(void) -{ -#ifdef CONFIG_SMP - smp_intr_init(); -#endif - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - /* thermal monitor LVT interrupt */ -#ifdef CONFIG_X86_MCE_P4THERMAL - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif -} - /** * connect_bsp_APIC - attach the APIC to the interrupt system */ void __init connect_bsp_APIC(void) { +#ifdef CONFIG_X86_32 if (pic_mode) { /* * Do not trust the local APIC being empty at bootup. @@ -1388,6 +1417,7 @@ void __init connect_bsp_APIC(void) outb(0x70, 0x22); outb(0x01, 0x23); } +#endif enable_apic_mode(); } @@ -1400,6 +1430,9 @@ void __init connect_bsp_APIC(void) */ void disconnect_bsp_APIC(int virt_wire_setup) { + unsigned int value; + +#ifdef CONFIG_X86_32 if (pic_mode) { /* * Put the board back into PIC mode (has an effect only on @@ -1411,54 +1444,53 @@ void disconnect_bsp_APIC(int virt_wire_setup) "entering PIC mode.\n"); outb(0x70, 0x22); outb(0x00, 0x23); - } else { - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; + return; + } +#endif - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); + /* Go back to Virtual Wire compatibility mode */ - if (!virt_wire_setup) { - /* - * For LVT0 make it edge triggered, active high, - * external and enabled - */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + if (!virt_wire_setup) { /* - * For LVT1 make it edge triggered, active high, nmi and - * enabled + * For LVT0 make it edge triggered, active high, + * external and enabled */ - value = apic_read(APIC_LVT1); - value &= ~( - APIC_MODE_MASK | APIC_SEND_PENDING | + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); } + + /* + * For LVT1 make it edge triggered, active high, + * nmi and enabled + */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write(APIC_LVT1, value); } void __cpuinit generic_processor_info(int apicid, int version) { int cpu; cpumask_t tmp_map; - physid_mask_t phys_cpu; /* * Validate version @@ -1471,9 +1503,6 @@ void __cpuinit generic_processor_info(int apicid, int version) } apic_version[apicid] = version; - phys_cpu = apicid_to_cpu_present(apicid); - physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu); - if (num_processors >= NR_CPUS) { printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." " Processor ignored.\n", NR_CPUS); @@ -1484,17 +1513,19 @@ void __cpuinit generic_processor_info(int apicid, int version) cpus_complement(tmp_map, cpu_present_map); cpu = first_cpu(tmp_map); - if (apicid == boot_cpu_physical_apicid) + physid_set(apicid, phys_cpu_present_map); + if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed * in same order as logical cpu numbers. Hence the first * entry is BSP, and so on. */ cpu = 0; - + } if (apicid > max_physical_apicid) max_physical_apicid = apicid; +#ifdef CONFIG_X86_32 /* * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y * but we need to work other dependencies like SMP_SUSPEND etc @@ -1514,7 +1545,9 @@ void __cpuinit generic_processor_info(int apicid, int version) def_to_bigsmp = 1; } } -#ifdef CONFIG_SMP +#endif + +#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) /* are we being called early in kernel startup? */ if (early_per_cpu_ptr(x86_cpu_to_apicid)) { u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); @@ -1527,6 +1560,7 @@ void __cpuinit generic_processor_info(int apicid, int version) per_cpu(x86_bios_cpu_apicid, cpu) = apicid; } #endif + cpu_set(cpu, cpu_possible_map); cpu_set(cpu, cpu_present_map); } @@ -1537,6 +1571,11 @@ void __cpuinit generic_processor_info(int apicid, int version) #ifdef CONFIG_PM static struct { + /* + * 'active' is true if the local APIC was enabled by us and + * not the BIOS; this signifies that we are also responsible + * for disabling it before entering apm/acpi suspend + */ int active; /* r/w apic fields */ unsigned int apic_id; @@ -1577,7 +1616,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#ifdef CONFIG_X86_MCE_P4THERMAL +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) if (maxlvt >= 5) apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif @@ -1601,16 +1640,23 @@ static int lapic_resume(struct sys_device *dev) local_irq_save(flags); - /* - * Make sure the APICBASE points to the right address - * - * FIXME! This will be wrong if we ever support suspend on - * SMP! We'll need to do this as part of the CPU restore! - */ - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); +#ifdef CONFIG_X86_64 + if (x2apic) + enable_x2apic(); + else +#endif + { + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! + */ + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + } apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); apic_write(APIC_ID, apic_pm_state.apic_id); @@ -1620,7 +1666,7 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_SPIV, apic_pm_state.apic_spiv); apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#ifdef CONFIG_X86_MCE_P4THERMAL +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) if (maxlvt >= 5) apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); #endif @@ -1634,7 +1680,9 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); + local_irq_restore(flags); + return 0; } @@ -1690,20 +1738,20 @@ static int __init parse_lapic(char *arg) } early_param("lapic", parse_lapic); -static int __init parse_nolapic(char *arg) +static int __init setup_disableapic(char *arg) { disable_apic = 1; setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } -early_param("nolapic", parse_nolapic); +early_param("disableapic", setup_disableapic); -static int __init parse_disable_lapic_timer(char *arg) +/* same as disableapic, for compatibility */ +static int __init setup_nolapic(char *arg) { - local_apic_timer_disabled = 1; - return 0; + return setup_disableapic(arg); } -early_param("nolapic_timer", parse_disable_lapic_timer); +early_param("nolapic", setup_nolapic); static int __init parse_lapic_timer_c2_ok(char *arg) { @@ -1712,15 +1760,40 @@ static int __init parse_lapic_timer_c2_ok(char *arg) } early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); +static int __init parse_disable_apic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; +} +early_param("noapictimer", parse_disable_apic_timer); + +static int __init parse_nolapic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; +} +early_param("nolapic_timer", parse_nolapic_timer); + static int __init apic_set_verbosity(char *arg) { - if (!arg) + if (!arg) { +#ifdef CONFIG_X86_64 + skip_ioapic_setup = 0; + ioapic_force = 1; + return 0; +#endif return -EINVAL; + } - if (strcmp(arg, "debug") == 0) + if (strcmp("debug", arg) == 0) apic_verbosity = APIC_DEBUG; - else if (strcmp(arg, "verbose") == 0) + else if (strcmp("verbose", arg) == 0) apic_verbosity = APIC_VERBOSE; + else { + printk(KERN_WARNING "APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", arg); + return -EINVAL; + } return 0; } diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 446c062e831..94ddb69ae15 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -27,6 +27,7 @@ #include <linux/clockchips.h> #include <linux/acpi_pmtmr.h> #include <linux/module.h> +#include <linux/dmar.h> #include <asm/atomic.h> #include <asm/smp.h> @@ -39,13 +40,20 @@ #include <asm/proto.h> #include <asm/timex.h> #include <asm/apic.h> +#include <asm/i8259.h> #include <mach_ipi.h> #include <mach_apic.h> +/* Disable local APIC timer from the kernel commandline or via dmi quirk */ static int disable_apic_timer __cpuinitdata; static int apic_calibrate_pmtmr __initdata; int disable_apic; +int disable_x2apic; +int x2apic; + +/* x2apic enabled before OS handover */ +int x2apic_preenabled; /* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; @@ -73,6 +81,9 @@ static void lapic_timer_setup(enum clock_event_mode mode, static void lapic_timer_broadcast(cpumask_t mask); static void apic_pm_activate(void); +/* + * The local apic timer can be used for any function which is CPU local. + */ static struct clock_event_device lapic_clockevent = { .name = "lapic", .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT @@ -99,11 +110,15 @@ static inline int lapic_get_version(void) } /* - * Check, if the APIC is integrated or a seperate chip + * Check, if the APIC is integrated or a separate chip */ static inline int lapic_is_integrated(void) { +#ifdef CONFIG_X86_64 return 1; +#else + return APIC_INTEGRATED(lapic_get_version()); +#endif } /* @@ -118,13 +133,18 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } -void apic_wait_icr_idle(void) +/* + * Paravirt kernels also might be using these below ops. So we still + * use generic apic_read()/apic_write(), which might be pointing to different + * ops in PARAVIRT case. + */ +void xapic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) cpu_relax(); } -u32 safe_apic_wait_icr_idle(void) +u32 safe_xapic_wait_icr_idle(void) { u32 send_status; int timeout; @@ -140,6 +160,68 @@ u32 safe_apic_wait_icr_idle(void) return send_status; } +void xapic_icr_write(u32 low, u32 id) +{ + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write(APIC_ICR, low); +} + +u64 xapic_icr_read(void) +{ + u32 icr1, icr2; + + icr2 = apic_read(APIC_ICR2); + icr1 = apic_read(APIC_ICR); + + return icr1 | ((u64)icr2 << 32); +} + +static struct apic_ops xapic_ops = { + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .icr_read = xapic_icr_read, + .icr_write = xapic_icr_write, + .wait_icr_idle = xapic_wait_icr_idle, + .safe_wait_icr_idle = safe_xapic_wait_icr_idle, +}; + +struct apic_ops __read_mostly *apic_ops = &xapic_ops; +EXPORT_SYMBOL_GPL(apic_ops); + +static void x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return; +} + +static u32 safe_x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return 0; +} + +void x2apic_icr_write(u32 low, u32 id) +{ + wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); +} + +u64 x2apic_icr_read(void) +{ + unsigned long val; + + rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); + return val; +} + +static struct apic_ops x2apic_ops = { + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .icr_read = x2apic_icr_read, + .icr_write = x2apic_icr_write, + .wait_icr_idle = x2apic_wait_icr_idle, + .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, +}; + /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 */ @@ -149,6 +231,11 @@ void __cpuinit enable_NMI_through_LVT0(void) /* unmask and set to NMI */ v = APIC_DM_NMI; + + /* Level triggered for 82489DX (32bit mode) */ + if (!lapic_is_integrated()) + v |= APIC_LVT_LEVEL_TRIGGER; + apic_write(APIC_LVT0, v); } @@ -157,14 +244,28 @@ void __cpuinit enable_NMI_through_LVT0(void) */ int lapic_get_maxlvt(void) { - unsigned int v, maxlvt; + unsigned int v; v = apic_read(APIC_LVR); - maxlvt = GET_APIC_MAXLVT(v); - return maxlvt; + /* + * - we always have APIC integrated on 64bit mode + * - 82489DXs do not report # of LVT entries + */ + return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; } /* + * Local APIC timer + */ + +/* Clock divisor */ +#ifdef CONFG_X86_64 +#define APIC_DIVISOR 1 +#else +#define APIC_DIVISOR 16 +#endif + +/* * This function sets up the local APIC timer, with a timeout of * 'clocks' APIC bus clock. During calibration we actually call * this function twice on the boot CPU, once with a bogus timeout @@ -174,7 +275,6 @@ int lapic_get_maxlvt(void) * We do reads before writes even if unnecessary, to get around the * P5 APIC double write bug. */ - static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) { unsigned int lvtt_value, tmp_value; @@ -182,6 +282,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) lvtt_value = LOCAL_TIMER_VECTOR; if (!oneshot) lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!lapic_is_integrated()) + lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + if (!irqen) lvtt_value |= APIC_LVT_MASKED; @@ -191,12 +294,12 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) * Divide PICLK by 16 */ tmp_value = apic_read(APIC_TDCR); - apic_write(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); + apic_write(APIC_TDCR, + (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | + APIC_TDR_DIV_16); if (!oneshot) - apic_write(APIC_TMICT, clocks); + apic_write(APIC_TMICT, clocks / APIC_DIVISOR); } /* @@ -204,6 +307,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) * * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and * MCE interrupts are supported. Thus MCE offset must be set to 0. + * + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. */ #define APIC_EILVT_LVTOFF_MCE 0 @@ -228,6 +334,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); return APIC_EILVT_LVTOFF_IBS; } +EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); /* * Program the next event, relative to now @@ -366,7 +473,7 @@ static int __init calibrate_APIC_clock(void) lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); - calibration_result = result / HZ; + calibration_result = (result * APIC_DIVISOR) / HZ; /* * Do a sanity check on the APIC calibration result @@ -388,10 +495,10 @@ static int __init calibrate_APIC_clock(void) void __init setup_boot_APIC_clock(void) { /* - * The local apic timer can be disabled via the kernel commandline. - * Register the lapic timer as a dummy clock event source on SMP - * systems, so the broadcast mechanism is used. On UP systems simply - * ignore it. + * The local apic timer can be disabled via the kernel + * commandline or from the CPU detection code. Register the lapic + * timer as a dummy clock event source on SMP systems, so the + * broadcast mechanism is used. On UP systems simply ignore it. */ if (disable_apic_timer) { printk(KERN_INFO "Disabling APIC timer\n"); @@ -403,7 +510,9 @@ void __init setup_boot_APIC_clock(void) return; } - printk(KERN_INFO "Using local APIC timer interrupts.\n"); + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + if (calibrate_APIC_clock()) { /* No broadcast on UP ! */ if (num_possible_cpus() > 1) @@ -422,6 +531,7 @@ void __init setup_boot_APIC_clock(void) printk(KERN_WARNING "APIC timer registered as dummy," " due to nmi_watchdog=%d!\n", nmi_watchdog); + /* Setup the lapic or request the broadcast */ setup_APIC_timer(); } @@ -460,7 +570,11 @@ static void local_apic_timer_interrupt(void) /* * the NMI deadlock-detector uses this. */ +#ifdef CONFIG_X86_64 add_pda(apic_timer_irqs, 1); +#else + per_cpu(irq_stat, cpu).apic_timer_irqs++; +#endif evt->event_handler(evt); } @@ -491,6 +605,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) irq_enter(); local_apic_timer_interrupt(); irq_exit(); + set_irq_regs(old_regs); } @@ -544,6 +659,13 @@ void clear_local_APIC(void) apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); } + /* lets not touch this if we didn't frob it */ +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL) + if (maxlvt >= 5) { + v = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); + } +#endif /* * Clean APIC state for other OSs: */ @@ -554,8 +676,14 @@ void clear_local_APIC(void) apic_write(APIC_LVTERR, APIC_LVT_MASKED); if (maxlvt >= 4) apic_write(APIC_LVTPC, APIC_LVT_MASKED); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); + + /* Integrated APIC (!82489DX) ? */ + if (lapic_is_integrated()) { + if (maxlvt > 3) + /* Clear ESR due to Pentium errata 3AP and 11AP */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } } /** @@ -574,8 +702,28 @@ void disable_local_APIC(void) value = apic_read(APIC_SPIV); value &= ~APIC_SPIV_APIC_ENABLED; apic_write(APIC_SPIV, value); + +#ifdef CONFIG_X86_32 + /* + * When LAPIC was disabled by the BIOS and enabled by the kernel, + * restore the disabled state. + */ + if (enabled_via_apicbase) { + unsigned int l, h; + + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_ENABLE; + wrmsr(MSR_IA32_APICBASE, l, h); + } +#endif } +/* + * If Linux enabled the LAPIC against the BIOS default disable it down before + * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and + * not power-off. Additionally clear all LVT entries before disable_local_APIC + * for the case where Linux didn't enable the LAPIC. + */ void lapic_shutdown(void) { unsigned long flags; @@ -585,7 +733,13 @@ void lapic_shutdown(void) local_irq_save(flags); - disable_local_APIC(); +#ifdef CONFIG_X86_32 + if (!enabled_via_apicbase) + clear_local_APIC(); + else +#endif + disable_local_APIC(); + local_irq_restore(flags); } @@ -629,10 +783,10 @@ int __init verify_local_APIC(void) /* * The ID register is read/write in a real APIC. */ - reg0 = read_apic_id(); + reg0 = apic_read(APIC_ID); apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); - reg1 = read_apic_id(); + reg1 = apic_read(APIC_ID); apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); apic_write(APIC_ID, reg0); if (reg1 != (reg0 ^ APIC_ID_MASK)) @@ -656,8 +810,11 @@ int __init verify_local_APIC(void) */ void __init sync_Arb_IDs(void) { - /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ - if (modern_apic()) + /* + * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not + * needed on AMD. + */ + if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return; /* @@ -666,8 +823,8 @@ void __init sync_Arb_IDs(void) apic_wait_icr_idle(); apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG - | APIC_DM_INIT); + apic_write(APIC_ICR, APIC_DEST_ALLINC | + APIC_INT_LEVELTRIG | APIC_DM_INIT); } /* @@ -684,8 +841,6 @@ void __init init_bsp_APIC(void) if (smp_found_config || !cpu_has_apic) return; - value = apic_read(APIC_LVR); - /* * Do not trust the local APIC being empty at bootup. */ @@ -697,7 +852,15 @@ void __init init_bsp_APIC(void) value = apic_read(APIC_SPIV); value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; - value |= APIC_SPIV_FOCUS_DISABLED; + +#ifdef CONFIG_X86_32 + /* This bit is reserved on P4/Xeon and should be cleared */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + (boot_cpu_data.x86 == 15)) + value &= ~APIC_SPIV_FOCUS_DISABLED; + else +#endif + value |= APIC_SPIV_FOCUS_DISABLED; value |= SPURIOUS_APIC_VECTOR; apic_write(APIC_SPIV, value); @@ -706,9 +869,50 @@ void __init init_bsp_APIC(void) */ apic_write(APIC_LVT0, APIC_DM_EXTINT); value = APIC_DM_NMI; + if (!lapic_is_integrated()) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; apic_write(APIC_LVT1, value); } +static void __cpuinit lapic_setup_esr(void) +{ + unsigned long oldvalue, value, maxlvt; + if (lapic_is_integrated() && !esr_disable) { + if (esr_disable) { + /* + * Something untraceable is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh + */ + printk(KERN_INFO "Leaving ESR disabled.\n"); + return; + } + /* !82489DX */ + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + + /* enables sending errors */ + value = ERROR_APIC_VECTOR; + apic_write(APIC_LVTERR, value); + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, "ESR value before enabling " + "vector: 0x%08lx after: 0x%08lx\n", + oldvalue, value); + } else { + printk(KERN_INFO "No ESR for 82489DX.\n"); + } +} + + /** * setup_local_APIC - setup the local APIC */ @@ -814,25 +1018,143 @@ void __cpuinit setup_local_APIC(void) preempt_enable(); } -static void __cpuinit lapic_setup_esr(void) -{ - unsigned maxlvt = lapic_get_maxlvt(); - - apic_write(APIC_LVTERR, ERROR_APIC_VECTOR); - /* - * spec says clear errors after enabling vector. - */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); -} - void __cpuinit end_local_APIC_setup(void) { lapic_setup_esr(); + +#ifdef CONFIG_X86_32 + { + unsigned int value; + /* Disable the local apic timer */ + value = apic_read(APIC_LVTT); + value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, value); + } +#endif + setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } +void check_x2apic(void) +{ + int msr, msr2; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + + if (msr & X2APIC_ENABLE) { + printk("x2apic enabled by BIOS, switching to x2apic ops\n"); + x2apic_preenabled = x2apic = 1; + apic_ops = &x2apic_ops; + } +} + +void enable_x2apic(void) +{ + int msr, msr2; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + if (!(msr & X2APIC_ENABLE)) { + printk("Enabling x2apic\n"); + wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); + } +} + +void enable_IR_x2apic(void) +{ +#ifdef CONFIG_INTR_REMAP + int ret; + unsigned long flags; + + if (!cpu_has_x2apic) + return; + + if (!x2apic_preenabled && disable_x2apic) { + printk(KERN_INFO + "Skipped enabling x2apic and Interrupt-remapping " + "because of nox2apic\n"); + return; + } + + if (x2apic_preenabled && disable_x2apic) + panic("Bios already enabled x2apic, can't enforce nox2apic"); + + if (!x2apic_preenabled && skip_ioapic_setup) { + printk(KERN_INFO + "Skipped enabling x2apic and Interrupt-remapping " + "because of skipping io-apic setup\n"); + return; + } + + ret = dmar_table_init(); + if (ret) { + printk(KERN_INFO + "dmar_table_init() failed with %d:\n", ret); + + if (x2apic_preenabled) + panic("x2apic enabled by bios. But IR enabling failed"); + else + printk(KERN_INFO + "Not enabling x2apic,Intr-remapping\n"); + return; + } + + local_irq_save(flags); + mask_8259A(); + save_mask_IO_APIC_setup(); + + ret = enable_intr_remapping(1); + + if (ret && x2apic_preenabled) { + local_irq_restore(flags); + panic("x2apic enabled by bios. But IR enabling failed"); + } + + if (ret) + goto end; + + if (!x2apic) { + x2apic = 1; + apic_ops = &x2apic_ops; + enable_x2apic(); + } +end: + if (ret) + /* + * IR enabling failed + */ + restore_IO_APIC_setup(); + else + reinit_intr_remapped_IO_APIC(x2apic_preenabled); + + unmask_8259A(); + local_irq_restore(flags); + + if (!ret) { + if (!x2apic_preenabled) + printk(KERN_INFO + "Enabled x2apic and interrupt-remapping\n"); + else + printk(KERN_INFO + "Enabled Interrupt-remapping\n"); + } else + printk(KERN_ERR + "Failed to enable Interrupt-remapping and x2apic\n"); +#else + if (!cpu_has_x2apic) + return; + + if (x2apic_preenabled) + panic("x2apic enabled prior OS handover," + " enable CONFIG_INTR_REMAP"); + + printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping " + " and x2apic\n"); +#endif + + return; +} + /* * Detect and enable local APICs on non-SMP boards. * Original code written by Keir Fraser. @@ -872,7 +1194,7 @@ void __init early_init_lapic_mapping(void) * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). */ - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); + boot_cpu_physical_apicid = read_apic_id(); } /** @@ -880,6 +1202,11 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { + if (x2apic) { + boot_cpu_physical_apicid = read_apic_id(); + return; + } + /* * If no local APIC can be found then set up a fake all * zeroes page to simulate the local APIC and another @@ -899,13 +1226,15 @@ void __init init_apic_mappings(void) * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). */ - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); + boot_cpu_physical_apicid = read_apic_id(); } /* * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ +int apic_version[MAX_APICS]; + int __init APIC_init_uniprocessor(void) { if (disable_apic) { @@ -918,6 +1247,9 @@ int __init APIC_init_uniprocessor(void) return -1; } + enable_IR_x2apic(); + setup_apic_routing(); + verify_local_APIC(); connect_bsp_APIC(); @@ -1004,17 +1336,57 @@ asmlinkage void smp_error_interrupt(void) } /** - * * connect_bsp_APIC - attach the APIC to the interrupt system - * */ + * connect_bsp_APIC - attach the APIC to the interrupt system + */ void __init connect_bsp_APIC(void) { +#ifdef CONFIG_X86_32 + if (pic_mode) { + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + /* + * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's + * local APIC to INT and NMI lines. + */ + apic_printk(APIC_VERBOSE, "leaving PIC mode, " + "enabling APIC mode.\n"); + outb(0x70, 0x22); + outb(0x01, 0x23); + } +#endif enable_apic_mode(); } +/** + * disconnect_bsp_APIC - detach the APIC from the interrupt system + * @virt_wire_setup: indicates, whether virtual wire mode is selected + * + * Virtual wire mode is necessary to deliver legacy interrupts even when the + * APIC is disabled. + */ void disconnect_bsp_APIC(int virt_wire_setup) { + unsigned int value; + +#ifdef CONFIG_X86_32 + if (pic_mode) { + /* + * Put the board back into PIC mode (has an effect only on + * certain older boards). Note that APIC interrupts, including + * IPIs, won't work beyond this point! The only exception are + * INIT IPIs. + */ + apic_printk(APIC_VERBOSE, "disabling APIC mode, " + "entering PIC mode.\n"); + outb(0x70, 0x22); + outb(0x00, 0x23); + return; + } +#endif + /* Go back to Virtual Wire compatibility mode */ - unsigned long value; /* For the spurious interrupt use vector F, and enable it */ value = apic_read(APIC_SPIV); @@ -1040,7 +1412,10 @@ void disconnect_bsp_APIC(int virt_wire_setup) apic_write(APIC_LVT0, APIC_LVT_MASKED); } - /* For LVT1 make it edge triggered, active high, nmi and enabled */ + /* + * For LVT1 make it edge triggered, active high, + * nmi and enabled + */ value = apic_read(APIC_LVT1); value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | @@ -1055,9 +1430,20 @@ void __cpuinit generic_processor_info(int apicid, int version) int cpu; cpumask_t tmp_map; + /* + * Validate version + */ + if (version == 0x0) { + printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " + "fixing up to 0x10. (tell your hw vendor)\n", + version); + version = 0x10; + } + apic_version[apicid] = version; + if (num_processors >= NR_CPUS) { printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); + " Processor ignored.\n", NR_CPUS); return; } @@ -1077,6 +1463,29 @@ void __cpuinit generic_processor_info(int apicid, int version) if (apicid > max_physical_apicid) max_physical_apicid = apicid; +#ifdef CONFIG_X86_32 + /* + * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y + * but we need to work other dependencies like SMP_SUSPEND etc + * before this can be done without some confusion. + * if (CPU_HOTPLUG_ENABLED || num_processors > 8) + * - Ashok Raj <ashok.raj@intel.com> + */ + if (max_physical_apicid >= 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } +#endif + +#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) /* are we being called early in kernel startup? */ if (early_per_cpu_ptr(x86_cpu_to_apicid)) { u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); @@ -1088,20 +1497,28 @@ void __cpuinit generic_processor_info(int apicid, int version) per_cpu(x86_cpu_to_apicid, cpu) = apicid; per_cpu(x86_bios_cpu_apicid, cpu) = apicid; } +#endif cpu_set(cpu, cpu_possible_map); cpu_set(cpu, cpu_present_map); } +int hard_smp_processor_id(void) +{ + return read_apic_id(); +} + /* * Power management */ #ifdef CONFIG_PM static struct { - /* 'active' is true if the local APIC was enabled by us and - not the BIOS; this signifies that we are also responsible - for disabling it before entering apm/acpi suspend */ + /* + * 'active' is true if the local APIC was enabled by us and + * not the BIOS; this signifies that we are also responsible + * for disabling it before entering apm/acpi suspend + */ int active; /* r/w apic fields */ unsigned int apic_id; @@ -1129,7 +1546,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) maxlvt = lapic_get_maxlvt(); - apic_pm_state.apic_id = read_apic_id(); + apic_pm_state.apic_id = apic_read(APIC_ID); apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); apic_pm_state.apic_ldr = apic_read(APIC_LDR); apic_pm_state.apic_dfr = apic_read(APIC_DFR); @@ -1142,10 +1559,11 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#ifdef CONFIG_X86_MCE_INTEL +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) if (maxlvt >= 5) apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif + local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); @@ -1164,10 +1582,25 @@ static int lapic_resume(struct sys_device *dev) maxlvt = lapic_get_maxlvt(); local_irq_save(flags); - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); + +#ifdef CONFIG_X86_64 + if (x2apic) + enable_x2apic(); + else +#endif + { + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! + */ + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + } + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); apic_write(APIC_ID, apic_pm_state.apic_id); apic_write(APIC_DFR, apic_pm_state.apic_dfr); @@ -1176,7 +1609,7 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_SPIV, apic_pm_state.apic_spiv); apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#ifdef CONFIG_X86_MCE_INTEL +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) if (maxlvt >= 5) apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); #endif @@ -1190,10 +1623,17 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); + local_irq_restore(flags); + return 0; } +/* + * This device has no shutdown method - fully functioning local APICs + * are needed on every CPU up until machine_halt/restart/poweroff. + */ + static struct sysdev_class lapic_sysclass = { .name = "lapic", .resume = lapic_resume, @@ -1307,31 +1747,19 @@ __cpuinit int apic_is_clustered_box(void) return (clusters > 2); } -/* - * APIC command line parameters - */ -static int __init apic_set_verbosity(char *str) +static __init int setup_nox2apic(char *str) { - if (str == NULL) { - skip_ioapic_setup = 0; - ioapic_force = 1; - return 0; - } - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug\n", str); - return -EINVAL; - } - + disable_x2apic = 1; + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_X2APIC); return 0; } -early_param("apic", apic_set_verbosity); +early_param("nox2apic", setup_nox2apic); + -static __init int setup_disableapic(char *str) +/* + * APIC command line parameters + */ +static int __init setup_disableapic(char *arg) { disable_apic = 1; setup_clear_cpu_cap(X86_FEATURE_APIC); @@ -1340,9 +1768,9 @@ static __init int setup_disableapic(char *str) early_param("disableapic", setup_disableapic); /* same as disableapic, for compatibility */ -static __init int setup_nolapic(char *str) +static int __init setup_nolapic(char *arg) { - return setup_disableapic(str); + return setup_disableapic(arg); } early_param("nolapic", setup_nolapic); @@ -1353,14 +1781,19 @@ static int __init parse_lapic_timer_c2_ok(char *arg) } early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); -static __init int setup_noapictimer(char *str) +static int __init parse_disable_apic_timer(char *arg) { - if (str[0] != ' ' && str[0] != 0) - return 0; disable_apic_timer = 1; - return 1; + return 0; } -__setup("noapictimer", setup_noapictimer); +early_param("noapictimer", parse_disable_apic_timer); + +static int __init parse_nolapic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; +} +early_param("nolapic_timer", parse_nolapic_timer); static __init int setup_apicpmtimer(char *s) { @@ -1370,6 +1803,31 @@ static __init int setup_apicpmtimer(char *s) } __setup("apicpmtimer", setup_apicpmtimer); +static int __init apic_set_verbosity(char *arg) +{ + if (!arg) { +#ifdef CONFIG_X86_64 + skip_ioapic_setup = 0; + ioapic_force = 1; + return 0; +#endif + return -EINVAL; + } + + if (strcmp("debug", arg) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", arg) == 0) + apic_verbosity = APIC_VERBOSE; + else { + printk(KERN_WARNING "APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", arg); + return -EINVAL; + } + + return 0; +} +early_param("apic", apic_set_verbosity); + static int __init lapic_insert_resource(void) { if (!apic_phys) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 9ee24e6bc4b..5145a6e72bb 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -228,12 +228,12 @@ #include <linux/suspend.h> #include <linux/kthread.h> #include <linux/jiffies.h> -#include <linux/smp_lock.h> #include <asm/system.h> #include <asm/uaccess.h> #include <asm/desc.h> #include <asm/i8253.h> +#include <asm/olpc.h> #include <asm/paravirt.h> #include <asm/reboot.h> @@ -2217,7 +2217,7 @@ static int __init apm_init(void) dmi_check_system(apm_dmi_table); - if (apm_info.bios.version == 0 || paravirt_enabled()) { + if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) { printk(KERN_INFO "apm: BIOS not found.\n"); return -ENODEV; } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index aa89387006f..505543a75a5 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -22,7 +22,7 @@ #define __NO_STUBS 1 #undef __SYSCALL -#undef _ASM_X86_64_UNISTD_H_ +#undef ASM_X86__UNISTD_64_H #define __SYSCALL(nr, sym) [nr] = 1, static char syscalls[] = { #include <asm/unistd.h> diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index c639bd55391..fdd585f9c53 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -25,11 +25,11 @@ x86_bios_strerror(long status) { const char *str; switch (status) { - case 0: str = "Call completed without error"; break; - case -1: str = "Not implemented"; break; - case -2: str = "Invalid argument"; break; - case -3: str = "Call completed with error"; break; - default: str = "Unknown BIOS status code"; break; + case 0: str = "Call completed without error"; break; + case -1: str = "Not implemented"; break; + case -2: str = "Invalid argument"; break; + case -3: str = "Call completed with error"; break; + default: str = "Unknown BIOS status code"; break; } return str; } diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index ee76eaad300..7f0b45a5d78 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -3,22 +3,30 @@ # obj-y := intel_cacheinfo.o addon_cpuid_features.o -obj-y += proc.o feature_names.o - -obj-$(CONFIG_X86_32) += common.o bugs.o -obj-$(CONFIG_X86_64) += common_64.o bugs_64.o -obj-$(CONFIG_X86_32) += amd.o -obj-$(CONFIG_X86_64) += amd_64.o -obj-$(CONFIG_X86_32) += cyrix.o -obj-$(CONFIG_X86_32) += centaur.o -obj-$(CONFIG_X86_64) += centaur_64.o -obj-$(CONFIG_X86_32) += transmeta.o -obj-$(CONFIG_X86_32) += intel.o -obj-$(CONFIG_X86_64) += intel_64.o -obj-$(CONFIG_X86_32) += umc.o +obj-y += proc.o capflags.o powerflags.o common.o + +obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o +obj-$(CONFIG_X86_64) += bugs_64.o + +obj-$(CONFIG_CPU_SUP_INTEL) += intel.o +obj-$(CONFIG_CPU_SUP_AMD) += amd.o +obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o +obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o +obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o +obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o +obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o + +quiet_cmd_mkcapflags = MKCAP $@ + cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ + +cpufeature = $(src)/../../../../include/asm-x86/cpufeature.h + +targets += capflags.c +$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE + $(call if_changed,mkcapflags) diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 84a8220a607..0d9c993aa93 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -7,6 +7,8 @@ #include <asm/pat.h> #include <asm/processor.h> +#include <mach_apic.h> + struct cpuid_bit { u16 feature; u8 reg; @@ -48,6 +50,92 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) } } +/* leaf 0xb SMT level */ +#define SMT_LEVEL 0 + +/* leaf 0xb sub-leaf types */ +#define INVALID_TYPE 0 +#define SMT_TYPE 1 +#define CORE_TYPE 2 + +#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) +#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) +#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) + +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ +void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int ht_mask_width, core_plus_mask_width; + unsigned int core_select_mask, core_level_siblings; + + if (c->cpuid_level < 0xb) + return; + + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + + /* + * check if the cpuid leaf 0xb is actually implemented. + */ + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) + return; + + set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); + + /* + * initial apic id, which also represents 32-bit extended x2apic id. + */ + c->initial_apicid = edx; + + /* + * Populate HT related information from sub-leaf level 0. + */ + core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + + sub_index = 1; + do { + cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); + + /* + * Check for the Core type in the implemented sub leaves. + */ + if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + break; + } + + sub_index++; + } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + + core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; + +#ifdef CONFIG_X86_32 + c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width) + & core_select_mask; + c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width); +#else + c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask; + c->phys_proc_id = phys_pkg_id(core_plus_mask_width); +#endif + c->x86_max_cores = (core_level_siblings / smp_num_siblings); + + + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + if (c->x86_max_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + return; +#endif +} + #ifdef CONFIG_X86_PAT void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) { @@ -56,9 +144,22 @@ void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: - if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) + /* + * There is a known erratum on Pentium III and Core Solo + * and Core Duo CPUs. + * " Page with PAT set to WC while associated MTRR is UC + * may consolidate to UC " + * Because of this erratum, it is better to stick with + * setting WC in MTRR rather than using PAT on these CPUs. + * + * Enable PAT WC only on P4, Core 2 or later CPUs. + */ + if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15)) return; - break; + + pat_disable("PAT WC disabled due to known CPU erratum."); + return; + case X86_VENDOR_AMD: case X86_VENDOR_CENTAUR: case X86_VENDOR_TRANSMETA: diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index cae9cabc303..32e73520adf 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1,13 +1,22 @@ #include <linux/init.h> #include <linux/bitops.h> #include <linux/mm.h> + #include <asm/io.h> #include <asm/processor.h> #include <asm/apic.h> +#ifdef CONFIG_X86_64 +# include <asm/numa_64.h> +# include <asm/mmconfig.h> +# include <asm/cacheflush.h> +#endif + #include <mach_apic.h> + #include "cpu.h" +#ifdef CONFIG_X86_32 /* * B step AMD K6 before B 9730xxxx have hardware bugs that can cause * misexecution of code under Linux. Owners of such processors should @@ -24,21 +33,273 @@ extern void vide(void); __asm__(".align 4\nvide: ret"); -static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) { - if (cpuid_eax(0x80000000) >= 0x80000007) { - c->x86_power = cpuid_edx(0x80000007); - if (c->x86_power & (1<<8)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +/* + * General Systems BIOSen alias the cpu frequency registers + * of the Elan at 0x000df000. Unfortuantly, one of the Linux + * drivers subsequently pokes it, and changes the CPU speed. + * Workaround : Remove the unneeded alias. + */ +#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */ +#define CBAR_ENB (0x80000000) +#define CBAR_KEY (0X000000CB) + if (c->x86_model == 9 || c->x86_model == 10) { + if (inl (CBAR) & CBAR_ENB) + outl (0 | CBAR_KEY, CBAR); } } -static void __cpuinit init_amd(struct cpuinfo_x86 *c) + +static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) { u32 l, h; int mbytes = num_physpages >> (20-PAGE_SHIFT); - int r; + if (c->x86_model < 6) { + /* Based on AMD doc 20734R - June 2000 */ + if (c->x86_model == 0) { + clear_cpu_cap(c, X86_FEATURE_APIC); + set_cpu_cap(c, X86_FEATURE_PGE); + } + return; + } + + if (c->x86_model == 6 && c->x86_mask == 1) { + const int K6_BUG_LOOP = 1000000; + int n; + void (*f_vide)(void); + unsigned long d, d2; + + printk(KERN_INFO "AMD K6 stepping B detected - "); + + /* + * It looks like AMD fixed the 2.6.2 bug and improved indirect + * calls at the same time. + */ + + n = K6_BUG_LOOP; + f_vide = vide; + rdtscl(d); + while (n--) + f_vide(); + rdtscl(d2); + d = d2-d; + + if (d > 20*K6_BUG_LOOP) + printk("system stability may be impaired when more than 32 MB are used.\n"); + else + printk("probably OK (after B9730xxxx).\n"); + printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); + } + + /* K6 with old style WHCR */ + if (c->x86_model < 8 || + (c->x86_model == 8 && c->x86_mask < 8)) { + /* We can only write allocate on the low 508Mb */ + if (mbytes > 508) + mbytes = 508; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0x0000FFFF) == 0) { + unsigned long flags; + l = (1<<0)|((mbytes/4)<<1); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", + mbytes); + } + return; + } + + if ((c->x86_model == 8 && c->x86_mask > 7) || + c->x86_model == 9 || c->x86_model == 13) { + /* The more serious chips .. */ + + if (mbytes > 4092) + mbytes = 4092; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0xFFFF0000) == 0) { + unsigned long flags; + l = ((mbytes>>2)<<22)|(1<<16); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", + mbytes); + } + + return; + } + + if (c->x86_model == 10) { + /* AMD Geode LX is model 10 */ + /* placeholder for any needed mods */ + return; + } +} + +static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) +{ + u32 l, h; + + /* + * Bit 15 of Athlon specific MSR 15, needs to be 0 + * to enable SSE on Palomino/Morgan/Barton CPU's. + * If the BIOS didn't enable it already, enable it here. + */ + if (c->x86_model >= 6 && c->x86_model <= 10) { + if (!cpu_has(c, X86_FEATURE_XMM)) { + printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); + rdmsr(MSR_K7_HWCR, l, h); + l &= ~0x00008000; + wrmsr(MSR_K7_HWCR, l, h); + set_cpu_cap(c, X86_FEATURE_XMM); + } + } + + /* + * It's been determined by AMD that Athlons since model 8 stepping 1 + * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx + * As per AMD technical note 27212 0.2 + */ + if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { + rdmsr(MSR_K7_CLK_CTL, l, h); + if ((l & 0xfff00000) != 0x20000000) { + printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, + ((l & 0x000fffff)|0x20000000)); + wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); + } + } + + set_cpu_cap(c, X86_FEATURE_K7); +} +#endif + +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +static int __cpuinit nearby_node(int apicid) +{ + int i, node; + + for (i = apicid - 1; i >= 0; i--) { + node = apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { + node = apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + return first_node(node_online_map); /* Shouldn't happen */ +} +#endif + +/* + * On a AMD dual core setup the lower bits of the APIC id distingush the cores. + * Assumes number of cores is a power of two. + */ +static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_HT + unsigned bits; + + bits = c->x86_coreid_bits; + + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + /* Convert the initial APIC ID into the socket ID */ + c->phys_proc_id = c->initial_apicid >> bits; +#endif +} + +static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) +{ +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) + int cpu = smp_processor_id(); + int node; + unsigned apicid = hard_smp_processor_id(); + + node = c->phys_proc_id; + if (apicid_to_node[apicid] != NUMA_NO_NODE) + node = apicid_to_node[apicid]; + if (!node_online(node)) { + /* Two possibilities here: + - The CPU is missing memory and no node was created. + In that case try picking one from a nearby CPU + - The APIC IDs differ from the HyperTransport node IDs + which the K8 northbridge parsing fills in. + Assume they are all increased by a constant offset, + but in the same order as the HT nodeids. + If that doesn't result in a usable node fall back to the + path for the previous case. */ + + int ht_nodeid = c->initial_apicid; + + if (ht_nodeid >= 0 && + apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); + + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); +#endif +} + +static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_HT + unsigned bits, ecx; + + /* Multi core CPU? */ + if (c->extended_cpuid_level < 0x80000008) + return; + + ecx = cpuid_ecx(0x80000008); + + c->x86_max_cores = (ecx & 0xff) + 1; + + /* CPU telling us the core id bits shift? */ + bits = (ecx >> 12) & 0xF; + + /* Otherwise recompute */ + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } + + c->x86_coreid_bits = bits; +#endif +} + +static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +{ + early_init_amd_mc(c); + + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ + if (c->x86_power & (1<<8)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSCALL32); +#else + /* Set MTRR capability flag if appropriate */ + if (c->x86 == 5) + if (c->x86_model == 13 || c->x86_model == 9 || + (c->x86_model == 8 && c->x86_mask >= 8)) + set_cpu_cap(c, X86_FEATURE_K6_MTRR); +#endif +} + +static void __cpuinit init_amd(struct cpuinfo_x86 *c) +{ #ifdef CONFIG_SMP unsigned long long value; @@ -49,7 +310,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * Errata 63 for SH-B3 steppings * Errata 122 for all steppings (F+ have it disabled by default) */ - if (c->x86 == 15) { + if (c->x86 == 0xf) { rdmsrl(MSR_K7_HWCR, value); value |= 1 << 6; wrmsrl(MSR_K7_HWCR, value); @@ -59,213 +320,119 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) early_init_amd(c); /* - * FIXME: We should handle the K5 here. Set up the write - * range and also turn on MSR 83 bits 4 and 31 (write alloc, - * no bus pipeline) - */ - - /* * Bit 31 in normal CPUID used for nonstandard 3DNow ID; * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ clear_cpu_cap(c, 0*32+31); - r = get_model_name(c); +#ifdef CONFIG_X86_64 + /* On C+ stepping K8 rep microcode works well for copy/memset */ + if (c->x86 == 0xf) { + u32 level; - switch (c->x86) { - case 4: - /* - * General Systems BIOSen alias the cpu frequency registers - * of the Elan at 0x000df000. Unfortuantly, one of the Linux - * drivers subsequently pokes it, and changes the CPU speed. - * Workaround : Remove the unneeded alias. - */ -#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */ -#define CBAR_ENB (0x80000000) -#define CBAR_KEY (0X000000CB) - if (c->x86_model == 9 || c->x86_model == 10) { - if (inl (CBAR) & CBAR_ENB) - outl (0 | CBAR_KEY, CBAR); - } - break; - case 5: - if (c->x86_model < 6) { - /* Based on AMD doc 20734R - June 2000 */ - if (c->x86_model == 0) { - clear_cpu_cap(c, X86_FEATURE_APIC); - set_cpu_cap(c, X86_FEATURE_PGE); - } - break; - } - - if (c->x86_model == 6 && c->x86_mask == 1) { - const int K6_BUG_LOOP = 1000000; - int n; - void (*f_vide)(void); - unsigned long d, d2; - - printk(KERN_INFO "AMD K6 stepping B detected - "); - - /* - * It looks like AMD fixed the 2.6.2 bug and improved indirect - * calls at the same time. - */ - - n = K6_BUG_LOOP; - f_vide = vide; - rdtscl(d); - while (n--) - f_vide(); - rdtscl(d2); - d = d2-d; - - if (d > 20*K6_BUG_LOOP) - printk("system stability may be impaired when more than 32 MB are used.\n"); - else - printk("probably OK (after B9730xxxx).\n"); - printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); - } - - /* K6 with old style WHCR */ - if (c->x86_model < 8 || - (c->x86_model == 8 && c->x86_mask < 8)) { - /* We can only write allocate on the low 508Mb */ - if (mbytes > 508) - mbytes = 508; - - rdmsr(MSR_K6_WHCR, l, h); - if ((l&0x0000FFFF) == 0) { - unsigned long flags; - l = (1<<0)|((mbytes/4)<<1); - local_irq_save(flags); - wbinvd(); - wrmsr(MSR_K6_WHCR, l, h); - local_irq_restore(flags); - printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", - mbytes); - } - break; - } - - if ((c->x86_model == 8 && c->x86_mask > 7) || - c->x86_model == 9 || c->x86_model == 13) { - /* The more serious chips .. */ - - if (mbytes > 4092) - mbytes = 4092; - - rdmsr(MSR_K6_WHCR, l, h); - if ((l&0xFFFF0000) == 0) { - unsigned long flags; - l = ((mbytes>>2)<<22)|(1<<16); - local_irq_save(flags); - wbinvd(); - wrmsr(MSR_K6_WHCR, l, h); - local_irq_restore(flags); - printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", - mbytes); - } - - /* Set MTRR capability flag if appropriate */ - if (c->x86_model == 13 || c->x86_model == 9 || - (c->x86_model == 8 && c->x86_mask >= 8)) - set_cpu_cap(c, X86_FEATURE_K6_MTRR); - break; - } - - if (c->x86_model == 10) { - /* AMD Geode LX is model 10 */ - /* placeholder for any needed mods */ - break; - } - break; - case 6: /* An Athlon/Duron */ - - /* - * Bit 15 of Athlon specific MSR 15, needs to be 0 - * to enable SSE on Palomino/Morgan/Barton CPU's. - * If the BIOS didn't enable it already, enable it here. - */ - if (c->x86_model >= 6 && c->x86_model <= 10) { - if (!cpu_has(c, X86_FEATURE_XMM)) { - printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); - rdmsr(MSR_K7_HWCR, l, h); - l &= ~0x00008000; - wrmsr(MSR_K7_HWCR, l, h); - set_cpu_cap(c, X86_FEATURE_XMM); - } - } - - /* - * It's been determined by AMD that Athlons since model 8 stepping 1 - * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx - * As per AMD technical note 27212 0.2 - */ - if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { - rdmsr(MSR_K7_CLK_CTL, l, h); - if ((l & 0xfff00000) != 0x20000000) { - printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, - ((l & 0x000fffff)|0x20000000)); - wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); - } - } - break; + level = cpuid_eax(1); + if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); } + if (c->x86 == 0x10 || c->x86 == 0x11) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); +#else + + /* + * FIXME: We should handle the K5 here. Set up the write + * range and also turn on MSR 83 bits 4 and 31 (write alloc, + * no bus pipeline) + */ switch (c->x86) { - case 15: - /* Use K8 tuning for Fam10h and Fam11h */ - case 0x10: - case 0x11: - set_cpu_cap(c, X86_FEATURE_K8); + case 4: + init_amd_k5(c); break; - case 6: - set_cpu_cap(c, X86_FEATURE_K7); + case 5: + init_amd_k6(c); + break; + case 6: /* An Athlon/Duron */ + init_amd_k7(c); break; } + + /* K6s reports MCEs but don't actually have all the MSRs */ + if (c->x86 < 6) + clear_cpu_cap(c, X86_FEATURE_MCE); +#endif + + /* Enable workaround for FXSAVE leak */ if (c->x86 >= 6) set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - display_cacheinfo(c); - - if (cpuid_eax(0x80000000) >= 0x80000008) - c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; + if (!c->x86_model_id[0]) { + switch (c->x86) { + case 0xf: + /* Should distinguish Models here, but this is only + a fallback anyways. */ + strcpy(c->x86_model_id, "Hammer"); + break; + } + } -#ifdef CONFIG_X86_HT - /* - * On a AMD multi core setup the lower bits of the APIC id - * distinguish the cores. - */ - if (c->x86_max_cores > 1) { - int cpu = smp_processor_id(); - unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf; + display_cacheinfo(c); - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; - } - c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1); - c->phys_proc_id >>= bits; - printk(KERN_INFO "CPU %d(%d) -> Core %d\n", - cpu, c->x86_max_cores, c->cpu_core_id); + /* Multi core CPU? */ + if (c->extended_cpuid_level >= 0x80000008) { + amd_detect_cmp(c); + srat_detect_node(c); } + +#ifdef CONFIG_X86_32 + detect_ht(c); #endif - if (cpuid_eax(0x80000000) >= 0x80000006) { - if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) + if (c->extended_cpuid_level >= 0x80000006) { + if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) num_cache_leaves = 4; else num_cache_leaves = 3; } - /* K6s reports MCEs but don't actually have all the MSRs */ - if (c->x86 < 6) - clear_cpu_cap(c, X86_FEATURE_MCE); + if (c->x86 >= 0xf && c->x86 <= 0x11) + set_cpu_cap(c, X86_FEATURE_K8); - if (cpu_has_xmm2) + if (cpu_has_xmm2) { + /* MFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + } + +#ifdef CONFIG_X86_64 + if (c->x86 == 0x10) { + /* do this for boot cpu */ + if (c == &boot_cpu_data) + check_enable_amd_mmconf_dmi(); + + fam10h_check_enable_mmcfg(); + } + + if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { + unsigned long long tseg; + + /* + * Split up direct mapping around the TSEG SMM area. + * Don't do it for gbpages because there seems very little + * benefit in doing so. + */ + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + printk(KERN_DEBUG "tseg: %010llx\n", tseg); + if ((tseg>>PMD_SHIFT) < + (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || + ((tseg>>PMD_SHIFT) < + (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && + (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) + set_memory_4k((unsigned long)__va(tseg), 1); + } + } +#endif } +#ifdef CONFIG_X86_32 static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) { /* AMD errata T13 (order #21922) */ @@ -278,10 +445,12 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int } return size; } +#endif static struct cpu_dev amd_cpu_dev __cpuinitdata = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, +#ifdef CONFIG_X86_32 .c_models = { { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = { @@ -294,9 +463,11 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = { } }, }, + .c_size_cache = amd_size_cache, +#endif .c_early_init = early_init_amd, .c_init = init_amd, - .c_size_cache = amd_size_cache, + .c_x86_vendor = X86_VENDOR_AMD, }; -cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); +cpu_dev_register(amd_cpu_dev); diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c deleted file mode 100644 index d1692b2a41f..00000000000 --- a/arch/x86/kernel/cpu/amd_64.c +++ /dev/null @@ -1,224 +0,0 @@ -#include <linux/init.h> -#include <linux/mm.h> - -#include <asm/numa_64.h> -#include <asm/mmconfig.h> -#include <asm/cacheflush.h> - -#include <mach_apic.h> - -#include "cpu.h" - -int force_mwait __cpuinitdata; - -#ifdef CONFIG_NUMA -static int __cpuinit nearby_node(int apicid) -{ - int i, node; - - for (i = apicid - 1; i >= 0; i--) { - node = apicid_to_node[i]; - if (node != NUMA_NO_NODE && node_online(node)) - return node; - } - for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { - node = apicid_to_node[i]; - if (node != NUMA_NO_NODE && node_online(node)) - return node; - } - return first_node(node_online_map); /* Shouldn't happen */ -} -#endif - -/* - * On a AMD dual core setup the lower bits of the APIC id distingush the cores. - * Assumes number of cores is a power of two. - */ -static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned bits; -#ifdef CONFIG_NUMA - int cpu = smp_processor_id(); - int node = 0; - unsigned apicid = hard_smp_processor_id(); -#endif - bits = c->x86_coreid_bits; - - /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); - /* Convert the initial APIC ID into the socket ID */ - c->phys_proc_id = c->initial_apicid >> bits; - -#ifdef CONFIG_NUMA - node = c->phys_proc_id; - if (apicid_to_node[apicid] != NUMA_NO_NODE) - node = apicid_to_node[apicid]; - if (!node_online(node)) { - /* Two possibilities here: - - The CPU is missing memory and no node was created. - In that case try picking one from a nearby CPU - - The APIC IDs differ from the HyperTransport node IDs - which the K8 northbridge parsing fills in. - Assume they are all increased by a constant offset, - but in the same order as the HT nodeids. - If that doesn't result in a usable node fall back to the - path for the previous case. */ - - int ht_nodeid = c->initial_apicid; - - if (ht_nodeid >= 0 && - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) - node = apicid_to_node[ht_nodeid]; - /* Pick a nearby node */ - if (!node_online(node)) - node = nearby_node(apicid); - } - numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif -#endif -} - -static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned bits, ecx; - - /* Multi core CPU? */ - if (c->extended_cpuid_level < 0x80000008) - return; - - ecx = cpuid_ecx(0x80000008); - - c->x86_max_cores = (ecx & 0xff) + 1; - - /* CPU telling us the core id bits shift? */ - bits = (ecx >> 12) & 0xF; - - /* Otherwise recompute */ - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; - } - - c->x86_coreid_bits = bits; - -#endif -} - -static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) -{ - early_init_amd_mc(c); - - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ - if (c->x86_power & (1<<8)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - - set_cpu_cap(c, X86_FEATURE_SYSCALL32); -} - -static void __cpuinit init_amd(struct cpuinfo_x86 *c) -{ - unsigned level; - -#ifdef CONFIG_SMP - unsigned long value; - - /* - * Disable TLB flush filter by setting HWCR.FFDIS on K8 - * bit 6 of msr C001_0015 - * - * Errata 63 for SH-B3 steppings - * Errata 122 for all steppings (F+ have it disabled by default) - */ - if (c->x86 == 0xf) { - rdmsrl(MSR_K8_HWCR, value); - value |= 1 << 6; - wrmsrl(MSR_K8_HWCR, value); - } -#endif - - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; - 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ - clear_cpu_cap(c, 0*32+31); - - /* On C+ stepping K8 rep microcode works well for copy/memset */ - if (c->x86 == 0xf) { - level = cpuid_eax(1); - if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - } - if (c->x86 == 0x10 || c->x86 == 0x11) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - - /* Enable workaround for FXSAVE leak */ - if (c->x86 >= 6) - set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - - level = get_model_name(c); - if (!level) { - switch (c->x86) { - case 0xf: - /* Should distinguish Models here, but this is only - a fallback anyways. */ - strcpy(c->x86_model_id, "Hammer"); - break; - } - } - display_cacheinfo(c); - - /* Multi core CPU? */ - if (c->extended_cpuid_level >= 0x80000008) - amd_detect_cmp(c); - - if (c->extended_cpuid_level >= 0x80000006 && - (cpuid_edx(0x80000006) & 0xf000)) - num_cache_leaves = 4; - else - num_cache_leaves = 3; - - if (c->x86 >= 0xf && c->x86 <= 0x11) - set_cpu_cap(c, X86_FEATURE_K8); - - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); - - if (c->x86 == 0x10) { - /* do this for boot cpu */ - if (c == &boot_cpu_data) - check_enable_amd_mmconf_dmi(); - - fam10h_check_enable_mmcfg(); - } - - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { - unsigned long long tseg; - - /* - * Split up direct mapping around the TSEG SMM area. - * Don't do it for gbpages because there seems very little - * benefit in doing so. - */ - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if ((tseg>>PMD_SHIFT) < - (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || - ((tseg>>PMD_SHIFT) < - (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && - (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) - set_memory_4k((unsigned long)__va(tseg), 1); - } - } -} - -static struct cpu_dev amd_cpu_dev __cpuinitdata = { - .c_vendor = "AMD", - .c_ident = { "AuthenticAMD" }, - .c_early_init = early_init_amd, - .c_init = init_amd, -}; - -cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); - diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e0f45edd6a5..89bfdd9cacc 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -289,7 +289,6 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) if (c->x86_model >= 6 && c->x86_model < 9) set_cpu_cap(c, X86_FEATURE_3DNOW); - get_model_name(c); display_cacheinfo(c); } @@ -314,6 +313,16 @@ enum { EAMD3D = 1<<20, }; +static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 5: + /* Emulate MTRRs using Centaur's MCR. */ + set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); + break; + } +} + static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { @@ -462,8 +471,10 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, + .c_early_init = early_init_centaur, .c_init = init_centaur, .c_size_cache = centaur_size_cache, + .c_x86_vendor = X86_VENDOR_CENTAUR, }; -cpu_vendor_dev_register(X86_VENDOR_CENTAUR, ¢aur_cpu_dev); +cpu_dev_register(centaur_cpu_dev); diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c index 1d181c40e2e..a1625f5a1e7 100644 --- a/arch/x86/kernel/cpu/centaur_64.c +++ b/arch/x86/kernel/cpu/centaur_64.c @@ -16,9 +16,10 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { + early_init_centaur(c); + if (c->x86 == 0x6 && c->x86_model >= 0xf) { c->x86_cache_alignment = c->x86_clflush_size * 2; - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_REP_GOOD); } set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); @@ -29,7 +30,8 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_ident = { "CentaurHauls" }, .c_early_init = early_init_centaur, .c_init = init_centaur, + .c_x86_vendor = X86_VENDOR_CENTAUR, }; -cpu_vendor_dev_register(X86_VENDOR_CENTAUR, ¢aur_cpu_dev); +cpu_dev_register(centaur_cpu_dev); diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c new file mode 100644 index 00000000000..2056ccf572c --- /dev/null +++ b/arch/x86/kernel/cpu/cmpxchg.c @@ -0,0 +1,72 @@ +/* + * cmpxchg*() fallbacks for CPU not supporting these instructions + */ + +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/module.h> + +#ifndef CONFIG_X86_CMPXCHG +unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) +{ + u8 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u8 *)ptr; + if (prev == old) + *(u8 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u8); + +unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) +{ + u16 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u16 *)ptr; + if (prev == old) + *(u16 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u16); + +unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) +{ + u32 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u32 *)ptr; + if (prev == old) + *(u32 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u32); +#endif + +#ifndef CONFIG_X86_CMPXCHG64 +unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) +{ + u64 prev; + unsigned long flags; + + /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u64 *)ptr; + if (prev == old) + *(u64 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_486_u64); +#endif + diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 80ab20d4fa3..25581dcb280 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1,27 +1,62 @@ #include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> #include <linux/string.h> +#include <linux/bootmem.h> +#include <linux/bitops.h> +#include <linux/module.h> +#include <linux/kgdb.h> +#include <linux/topology.h> #include <linux/delay.h> #include <linux/smp.h> -#include <linux/module.h> #include <linux/percpu.h> -#include <linux/bootmem.h> -#include <asm/processor.h> #include <asm/i387.h> #include <asm/msr.h> #include <asm/io.h> +#include <asm/linkage.h> #include <asm/mmu_context.h> #include <asm/mtrr.h> #include <asm/mce.h> #include <asm/pat.h> +#include <asm/asm.h> +#include <asm/numa.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/mpspec.h> #include <asm/apic.h> #include <mach_apic.h> +#include <asm/genapic.h> #endif +#include <asm/pda.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/desc.h> +#include <asm/atomic.h> +#include <asm/proto.h> +#include <asm/sections.h> +#include <asm/setup.h> + #include "cpu.h" +static struct cpu_dev *this_cpu __cpuinitdata; + +#ifdef CONFIG_X86_64 +/* We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + */ +/* The TLS descriptors are currently at a different place compared to i386. + Hopefully nobody expects them at a fixed place (Wine?) */ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { + [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, + [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, +} }; +#else +DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, @@ -55,17 +90,157 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, } }; +#endif EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; - +#ifdef CONFIG_X86_32 static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; -struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; +static int __init cachesize_setup(char *str) +{ + get_option(&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + +static int __init x86_fxsr_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_XMM); + return 1; +} +__setup("nofxsr", x86_fxsr_setup); + +static int __init x86_sep_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_SEP); + return 1; +} +__setup("nosep", x86_sep_setup); + +/* Standard macro to see if a specific flag is changeable */ +static inline int flag_is_changeable_p(u32 flag) +{ + u32 f1, f2; + + /* + * Cyrix and IDT cpus allow disabling of CPUID + * so the code below may return different results + * when it is executed before and after enabling + * the CPUID. Add "volatile" to not allow gcc to + * optimize the subsequent calls to this function. + */ + asm volatile ("pushfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "movl %0,%1\n\t" + "xorl %2,%0\n\t" + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "popfl\n\t" + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); + + return ((f1^f2) & flag) != 0; +} + +/* Probe for the CPUID instruction */ +static int __cpuinit have_cpuid_p(void) +{ + return flag_is_changeable_p(X86_EFLAGS_ID); +} + +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { + /* Disable processor serial number */ + unsigned long lo, hi; + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + printk(KERN_NOTICE "CPU serial number disabled.\n"); + clear_cpu_cap(c, X86_FEATURE_PN); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); + } +} + +static int __init x86_serial_nr_setup(char *s) +{ + disable_x86_serial_nr = 0; + return 1; +} +__setup("serialnumber", x86_serial_nr_setup); +#else +static inline int flag_is_changeable_p(u32 flag) +{ + return 1; +} +/* Probe for the CPUID instruction */ +static inline int have_cpuid_p(void) +{ + return 1; +} +static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ +} +#endif + +/* + * Naming convention should be: <Name> [(<Codename>)] + * This table only is used unless init_<vendor>() below doesn't set it; + * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used + * + */ + +/* Look up CPU names by table lookup. */ +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) +{ + struct cpu_model_info *info; + + if (c->x86_model >= 16) + return NULL; /* Range check */ + + if (!this_cpu) + return NULL; + + info = this_cpu->c_models; + + while (info && info->family) { + if (info->family == c->x86) + return info->model_names[c->x86_model]; + info++; + } + return NULL; /* Not found */ +} + +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; + +/* Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. */ +void switch_to_new_gdt(void) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +#ifdef CONFIG_X86_32 + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); +#endif +} + +static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; static void __cpuinit default_init(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_64 + display_cacheinfo(c); +#else /* Not much we can do here... */ /* Check if at least it has cpuid */ if (c->cpuid_level == -1) { @@ -75,28 +250,22 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c) else if (c->x86 == 3) strcpy(c->x86_model_id, "386"); } +#endif } static struct cpu_dev __cpuinitdata default_cpu = { .c_init = default_init, .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, }; -static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; - -static int __init cachesize_setup(char *str) -{ - get_option(&str, &cachesize_override); - return 1; -} -__setup("cachesize=", cachesize_setup); -int __cpuinit get_model_name(struct cpuinfo_x86 *c) +static void __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; char *p, *q; - if (cpuid_eax(0x80000000) < 0x80000004) - return 0; + if (c->extended_cpuid_level < 0x80000004) + return; v = (unsigned int *) c->x86_model_id; cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); @@ -115,30 +284,34 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c) while (q <= &c->x86_model_id[48]) *q++ = '\0'; /* Zero-pad the rest */ } - - return 1; } - void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { - unsigned int n, dummy, ecx, edx, l2size; + unsigned int n, dummy, ebx, ecx, edx, l2size; - n = cpuid_eax(0x80000000); + n = c->extended_cpuid_level; if (n >= 0x80000005) { - cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size = (ecx>>24)+(edx>>24); + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + c->x86_cache_size = (ecx>>24) + (edx>>24); +#ifdef CONFIG_X86_64 + /* On K8 L1 TLB is inclusive, so don't count it */ + c->x86_tlbsize = 0; +#endif } if (n < 0x80000006) /* Some chips just has a large L1. */ return; - ecx = cpuid_ecx(0x80000006); + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); l2size = ecx >> 16; +#ifdef CONFIG_X86_64 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); +#else /* do processor-specific cache resizing */ if (this_cpu->c_size_cache) l2size = this_cpu->c_size_cache(c, l2size); @@ -149,116 +322,106 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) if (l2size == 0) return; /* Again, no L2 cache is possible */ +#endif c->x86_cache_size = l2size; printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - l2size, ecx & 0xFF); + l2size, ecx & 0xFF); } -/* - * Naming convention should be: <Name> [(<Codename>)] - * This table only is used unless init_<vendor>() below doesn't set it; - * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used - * - */ - -/* Look up CPU names by table lookup. */ -static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) +void __cpuinit detect_ht(struct cpuinfo_x86 *c) { - struct cpu_model_info *info; +#ifdef CONFIG_X86_HT + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; - if (c->x86_model >= 16) - return NULL; /* Range check */ + if (!cpu_has(c, X86_FEATURE_HT)) + return; - if (!this_cpu) - return NULL; + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + goto out; - info = this_cpu->c_models; + if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) + return; - while (info && info->family) { - if (info->family == c->x86) - return info->model_names[c->x86_model]; - info++; + cpuid(1, &eax, &ebx, &ecx, &edx); + + smp_num_siblings = (ebx & 0xff0000) >> 16; + + if (smp_num_siblings == 1) { + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + } else if (smp_num_siblings > 1) { + + if (smp_num_siblings > NR_CPUS) { + printk(KERN_WARNING "CPU: Unsupported number of siblings %d", + smp_num_siblings); + smp_num_siblings = 1; + return; + } + + index_msb = get_count_order(smp_num_siblings); +#ifdef CONFIG_X86_64 + c->phys_proc_id = phys_pkg_id(index_msb); +#else + c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); +#endif + + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings); + + core_bits = get_count_order(c->x86_max_cores); + +#ifdef CONFIG_X86_64 + c->cpu_core_id = phys_pkg_id(index_msb) & + ((1 << core_bits) - 1); +#else + c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & + ((1 << core_bits) - 1); +#endif } - return NULL; /* Not found */ -} +out: + if ((c->x86_max_cores * smp_num_siblings) > 1) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + } +#endif +} -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; int i; static int printed; for (i = 0; i < X86_VENDOR_NUM; i++) { - if (cpu_devs[i]) { - if (!strcmp(v, cpu_devs[i]->c_ident[0]) || - (cpu_devs[i]->c_ident[1] && - !strcmp(v, cpu_devs[i]->c_ident[1]))) { - c->x86_vendor = i; - if (!early) - this_cpu = cpu_devs[i]; - return; - } + if (!cpu_devs[i]) + break; + + if (!strcmp(v, cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v, cpu_devs[i]->c_ident[1]))) { + this_cpu = cpu_devs[i]; + c->x86_vendor = this_cpu->c_x86_vendor; + return; } } + if (!printed) { printed++; - printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); + printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v); printk(KERN_ERR "CPU: Your system may be unstable.\n"); } + c->x86_vendor = X86_VENDOR_UNKNOWN; this_cpu = &default_cpu; } - -static int __init x86_fxsr_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FXSR); - setup_clear_cpu_cap(X86_FEATURE_XMM); - return 1; -} -__setup("nofxsr", x86_fxsr_setup); - - -static int __init x86_sep_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_SEP); - return 1; -} -__setup("nosep", x86_sep_setup); - - -/* Standard macro to see if a specific flag is changeable */ -static inline int flag_is_changeable_p(u32 flag) -{ - u32 f1, f2; - - asm("pushfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "movl %0,%1\n\t" - "xorl %2,%0\n\t" - "pushl %0\n\t" - "popfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "popfl\n\t" - : "=&r" (f1), "=&r" (f2) - : "ir" (flag)); - - return ((f1^f2) & flag) != 0; -} - - -/* Probe for the CPUID instruction */ -static int __cpuinit have_cpuid_p(void) -{ - return flag_is_changeable_p(X86_EFLAGS_ID); -} - -void __init cpu_detect(struct cpuinfo_x86 *c) +void __cpuinit cpu_detect(struct cpuinfo_x86 *c) { /* Get vendor name */ cpuid(0x00000000, (unsigned int *)&c->cpuid_level, @@ -267,50 +430,87 @@ void __init cpu_detect(struct cpuinfo_x86 *c) (unsigned int *)&c->x86_vendor_id[4]); c->x86 = 4; + /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { u32 junk, tfms, cap0, misc; cpuid(0x00000001, &tfms, &misc, &junk, &cap0); - c->x86 = (tfms >> 8) & 15; - c->x86_model = (tfms >> 4) & 15; + c->x86 = (tfms >> 8) & 0xf; + c->x86_model = (tfms >> 4) & 0xf; + c->x86_mask = tfms & 0xf; if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xF) << 4; - c->x86_mask = tfms & 15; + c->x86_model += ((tfms >> 16) & 0xf) << 4; if (cap0 & (1<<19)) { - c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + c->x86_cache_alignment = c->x86_clflush_size; } } } -static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) + +static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) { u32 tfms, xlvl; - unsigned int ebx; + u32 ebx; - memset(&c->x86_capability, 0, sizeof c->x86_capability); - if (have_cpuid_p()) { - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - u32 capability, excap; - cpuid(0x00000001, &tfms, &ebx, &excap, &capability); - c->x86_capability[0] = capability; - c->x86_capability[4] = excap; - } + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 capability, excap; + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + } - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + c->extended_cpuid_level = xlvl; + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); } + } + +#ifdef CONFIG_X86_64 + if (c->extended_cpuid_level >= 0x80000008) { + u32 eax = cpuid_eax(0x80000008); + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; } +#endif + + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); } +static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + int i; + + /* + * First of all, decide if this is a 486 or higher + * It's a 486 if we can modify the AC flag + */ + if (flag_is_changeable_p(X86_EFLAGS_AC)) + c->x86 = 4; + else + c->x86 = 3; + + for (i = 0; i < X86_VENDOR_NUM; i++) + if (cpu_devs[i] && cpu_devs[i]->c_identify) { + c->x86_vendor_id[0] = 0; + cpu_devs[i]->c_identify(c); + if (c->x86_vendor_id[0]) { + get_cpu_vendor(c); + break; + } + } +#endif +} + /* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, @@ -320,109 +520,113 @@ static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) * WARNING: this function is only called on the BP. Don't add code here * that is supposed to run on all CPUs. */ -static void __init early_cpu_detect(void) +static void __init early_identify_cpu(struct cpuinfo_x86 *c) { - struct cpuinfo_x86 *c = &boot_cpu_data; - - c->x86_cache_alignment = 32; +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; +#else c->x86_clflush_size = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; + + memset(&c->x86_capability, 0, sizeof c->x86_capability); + c->extended_cpuid_level = 0; + + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + /* cyrix could have cpuid enabled via c_identify()*/ if (!have_cpuid_p()) return; cpu_detect(c); - get_cpu_vendor(c, 1); + get_cpu_vendor(c); + + get_cpu_cap(c); - if (c->x86_vendor != X86_VENDOR_UNKNOWN && - cpu_devs[c->x86_vendor]->c_early_init) - cpu_devs[c->x86_vendor]->c_early_init(c); + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); - early_get_cap(c); + validate_pat_support(c); } -static void __cpuinit generic_identify(struct cpuinfo_x86 *c) +void __init early_cpu_init(void) { - u32 tfms, xlvl; - unsigned int ebx; - - if (have_cpuid_p()) { - /* Get vendor name */ - cpuid(0x00000000, (unsigned int *)&c->cpuid_level, - (unsigned int *)&c->x86_vendor_id[0], - (unsigned int *)&c->x86_vendor_id[8], - (unsigned int *)&c->x86_vendor_id[4]); - - get_cpu_vendor(c, 0); - /* Initialize the standard set of capabilities */ - /* Note that the vendor-specific code below might override */ - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - u32 capability, excap; - cpuid(0x00000001, &tfms, &ebx, &excap, &capability); - c->x86_capability[0] = capability; - c->x86_capability[4] = excap; - c->x86 = (tfms >> 8) & 15; - c->x86_model = (tfms >> 4) & 15; - if (c->x86 == 0xf) - c->x86 += (tfms >> 20) & 0xff; - if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xF) << 4; - c->x86_mask = tfms & 15; - c->initial_apicid = (ebx >> 24) & 0xFF; -#ifdef CONFIG_X86_HT - c->apicid = phys_pkg_id(c->initial_apicid, 0); - c->phys_proc_id = c->initial_apicid; -#else - c->apicid = c->initial_apicid; -#endif - if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) - c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8; - } else { - /* Have CPUID level 0 only - unheard of */ - c->x86 = 4; - } - - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } - if (xlvl >= 0x80000004) - get_model_name(c); /* Default name */ + struct cpu_dev **cdev; + int count = 0; + + printk("KERNEL supported cpus:\n"); + for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { + struct cpu_dev *cpudev = *cdev; + unsigned int j; + + if (count >= X86_VENDOR_NUM) + break; + cpu_devs[count] = cpudev; + count++; + + for (j = 0; j < 2; j++) { + if (!cpudev->c_ident[j]) + continue; + printk(" %s %s\n", cpudev->c_vendor, + cpudev->c_ident[j]); } - - init_scattered_cpuid_features(c); } + early_identify_cpu(&boot_cpu_data); } -static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +/* + * The NOPL instruction is supposed to exist on all CPUs with + * family >= 6; unfortunately, that's not true in practice because + * of early VIA chips and (more importantly) broken virtualizers that + * are not easy to detect. In the latter case it doesn't even *fail* + * reliably, so probing for it doesn't even work. Disable it completely + * unless we can find a reliable way to detect all the broken cases. + */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) { - if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { - /* Disable processor serial number */ - unsigned long lo, hi; - rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - lo |= 0x200000; - wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - printk(KERN_NOTICE "CPU serial number disabled.\n"); - clear_cpu_cap(c, X86_FEATURE_PN); - - /* Disabling the serial number may affect the cpuid level */ - c->cpuid_level = cpuid_eax(0); - } + clear_cpu_cap(c, X86_FEATURE_NOPL); } -static int __init x86_serial_nr_setup(char *s) +static void __cpuinit generic_identify(struct cpuinfo_x86 *c) { - disable_x86_serial_nr = 0; - return 1; -} -__setup("serialnumber", x86_serial_nr_setup); + c->extended_cpuid_level = 0; + + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + + /* cyrix could have cpuid enabled via c_identify()*/ + if (!have_cpuid_p()) + return; + + cpu_detect(c); + + get_cpu_vendor(c); + get_cpu_cap(c); + if (c->cpuid_level >= 0x00000001) { + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_HT + c->apicid = phys_pkg_id(c->initial_apicid, 0); +# else + c->apicid = c->initial_apicid; +# endif +#endif + +#ifdef CONFIG_X86_HT + c->phys_proc_id = c->initial_apicid; +#endif + } + + get_model_name(c); /* Default name */ + + init_scattered_cpuid_features(c); + detect_nopl(c); +} /* * This does the hard work of actually picking apart the CPU stuff... @@ -434,30 +638,29 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->loops_per_jiffy = loops_per_jiffy; c->x86_cache_size = -1; c->x86_vendor = X86_VENDOR_UNKNOWN; - c->cpuid_level = -1; /* CPUID not detected */ c->x86_model = c->x86_mask = 0; /* So far unknown... */ c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; + c->x86_coreid_bits = 0; +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; +#else + c->cpuid_level = -1; /* CPUID not detected */ c->x86_clflush_size = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; memset(&c->x86_capability, 0, sizeof c->x86_capability); - if (!have_cpuid_p()) { - /* - * First of all, decide if this is a 486 or higher - * It's a 486 if we can modify the AC flag - */ - if (flag_is_changeable_p(X86_EFLAGS_AC)) - c->x86 = 4; - else - c->x86 = 3; - } - generic_identify(c); if (this_cpu->c_identify) this_cpu->c_identify(c); +#ifdef CONFIG_X86_64 + c->apicid = phys_pkg_id(0); +#endif + /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are @@ -491,6 +694,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86, c->x86_model); } +#ifdef CONFIG_X86_64 + detect_ht(c); +#endif + /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -499,7 +706,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) */ if (c != &boot_cpu_data) { /* AND the already accumulated flags with these */ - for (i = 0 ; i < NCAPINTS ; i++) + for (i = 0; i < NCAPINTS; i++) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } @@ -507,72 +714,91 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) for (i = 0; i < NCAPINTS; i++) c->x86_capability[i] &= ~cleared_cpu_caps[i]; +#ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ mcheck_init(c); +#endif select_idle_routine(c); + +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) + numa_add_cpu(smp_processor_id()); +#endif } +#ifdef CONFIG_X86_64 +static void vgetcpu_set_mode(void) +{ + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; +} +#endif + void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); +#ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); +#else + vgetcpu_set_mode(); +#endif } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) { BUG_ON(c == &boot_cpu_data); identify_cpu(c); +#ifdef CONFIG_X86_32 enable_sep_cpu(); +#endif mtrr_ap_init(); } -#ifdef CONFIG_X86_HT -void __cpuinit detect_ht(struct cpuinfo_x86 *c) -{ - u32 eax, ebx, ecx, edx; - int index_msb, core_bits; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) - return; - - smp_num_siblings = (ebx & 0xff0000) >> 16; +struct msr_range { + unsigned min; + unsigned max; +}; - if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { +static struct msr_range msr_range_array[] __cpuinitdata = { + { 0x00000000, 0x00000418}, + { 0xc0000000, 0xc000040b}, + { 0xc0010000, 0xc0010142}, + { 0xc0011000, 0xc001103b}, +}; - if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of the " - "siblings %d", smp_num_siblings); - smp_num_siblings = 1; - return; +static void __cpuinit print_cpu_msr(void) +{ + unsigned index; + u64 val; + int i; + unsigned index_min, index_max; + + for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { + index_min = msr_range_array[i].min; + index_max = msr_range_array[i].max; + for (index = index_min; index < index_max; index++) { + if (rdmsrl_amd_safe(index, &val)) + continue; + printk(KERN_INFO " MSR%08x: %016llx\n", index, val); } + } +} - index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); - - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - - smp_num_siblings = smp_num_siblings / c->x86_max_cores; - - index_msb = get_count_order(smp_num_siblings) ; - - core_bits = get_count_order(c->x86_max_cores); +static int show_msr __cpuinitdata; +static __init int setup_show_msr(char *arg) +{ + int num; - c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & - ((1 << core_bits) - 1); + get_option(&arg, &num); - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); - } + if (num > 0) + show_msr = num; + return 1; } -#endif +__setup("show_msr=", setup_show_msr); static __init int setup_noclflush(char *arg) { @@ -590,18 +816,26 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) else if (c->cpuid_level >= 0) vendor = c->x86_vendor_id; - if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) - printk("%s ", vendor); + if (vendor && !strstr(c->x86_model_id, vendor)) + printk(KERN_CONT "%s ", vendor); - if (!c->x86_model_id[0]) - printk("%d86", c->x86); + if (c->x86_model_id[0]) + printk(KERN_CONT "%s", c->x86_model_id); else - printk("%s", c->x86_model_id); + printk(KERN_CONT "%d86", c->x86); if (c->x86_mask || c->cpuid_level >= 0) - printk(" stepping %02x\n", c->x86_mask); + printk(KERN_CONT " stepping %02x\n", c->x86_mask); else - printk("\n"); + printk(KERN_CONT "\n"); + +#ifdef CONFIG_SMP + if (c->cpu_index < show_msr) + print_cpu_msr(); +#else + if (show_msr) + print_cpu_msr(); +#endif } static __init int setup_disablecpuid(char *arg) @@ -617,19 +851,89 @@ __setup("clearcpuid=", setup_disablecpuid); cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; -void __init early_cpu_init(void) +#ifdef CONFIG_X86_64 +struct x8664_pda **_cpu_pda __read_mostly; +EXPORT_SYMBOL(_cpu_pda); + +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; + +char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; + +void __cpuinit pda_init(int cpu) { - struct cpu_vendor_dev *cvdev; + struct x8664_pda *pda = cpu_pda(cpu); + + /* Setup up data that may be needed in __get_free_pages early */ + loadsegment(fs, 0); + loadsegment(gs, 0); + /* Memory clobbers used to order PDA accessed */ + mb(); + wrmsrl(MSR_GS_BASE, pda); + mb(); + + pda->cpunumber = cpu; + pda->irqcount = -1; + pda->kernelstack = (unsigned long)stack_thread_info() - + PDA_STACKOFFSET + THREAD_SIZE; + pda->active_mm = &init_mm; + pda->mmu_state = 0; + + if (cpu == 0) { + /* others are initialized in smpboot.c */ + pda->pcurrent = &init_task; + pda->irqstackptr = boot_cpu_stack; + pda->irqstackptr += IRQSTACKSIZE - 64; + } else { + if (!pda->irqstackptr) { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", + cpu); + pda->irqstackptr += IRQSTACKSIZE - 64; + } + + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) + pda->nodenumber = cpu_to_node(cpu); + } +} + +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + + DEBUG_STKSZ] __page_aligned_bss; - for (cvdev = __x86cpuvendor_start ; - cvdev < __x86cpuvendor_end ; - cvdev++) - cpu_devs[cvdev->vendor] = cvdev->cpu_dev; +extern asmlinkage void ignore_sysret(void); - early_cpu_detect(); - validate_pat_support(&boot_cpu_data); +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + /* + * LSTAR and STAR live in a bit strange symbiosis. + * They both write to the same internal register. STAR allows to + * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. + */ + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_CSTAR, ignore_sysret); + +#ifdef CONFIG_IA32_EMULATION + syscall32_cpu_init(); +#endif + + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); } +unsigned long kernel_eflags; + +/* + * Copies of the original ist values from the tss are only accessed during + * debugging, no special alignment required. + */ +DEFINE_PER_CPU(struct orig_ist, orig_ist); + +#else + /* Make sure %fs is initialized properly in idle threads */ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) { @@ -637,25 +941,136 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) regs->fs = __KERNEL_PERCPU; return regs; } - -/* Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. */ -void switch_to_new_gdt(void) -{ - struct desc_ptr gdt_descr; - - gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); - asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); -} +#endif /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. + * A lot of state is already set up in PDA init for 64 bit */ +#ifdef CONFIG_X86_64 +void __cpuinit cpu_init(void) +{ + int cpu = stack_smp_processor_id(); + struct tss_struct *t = &per_cpu(init_tss, cpu); + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); + unsigned long v; + char *estacks = NULL; + struct task_struct *me; + int i; + + /* CPU 0 is initialised in head64.c */ + if (cpu != 0) + pda_init(cpu); + else + estacks = boot_exception_stacks; + + me = current; + + if (cpu_test_and_set(cpu, cpu_initialized)) + panic("CPU#%d already initialized!\n", cpu); + + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + + switch_to_new_gdt(); + load_idt((const struct desc_ptr *)&idt_descr); + + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); + + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + + check_efer(); + if (cpu != 0 && x2apic) + enable_x2apic(); + + /* + * set up and load the per-CPU TSS + */ + if (!orig_ist->ist[0]) { + static const unsigned int order[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + }; + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); + if (!estacks) + panic("Cannot allocate exception " + "stack %ld %d\n", v, cpu); + } + estacks += PAGE_SIZE << order[v]; + orig_ist->ist[v] = t->x86_tss.ist[v] = + (unsigned long)estacks; + } + } + + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + /* + * <= is required because the CPU will access up to + * 8 bits beyond the end of the IO permission bitmap. + */ + for (i = 0; i <= IO_BITMAP_LONGS; i++) + t->io_bitmap[i] = ~0UL; + + atomic_inc(&init_mm.mm_count); + me->active_mm = &init_mm; + if (me->mm) + BUG(); + enter_lazy_tlb(&init_mm, me); + + load_sp0(t, ¤t->thread); + set_tss_desc(cpu, t); + load_TR_desc(); + load_LDT(&init_mm.context); + +#ifdef CONFIG_KGDB + /* + * If the kgdb is connected no debug regs should be altered. This + * is only applicable when KGDB and a KGDB I/O module are built + * into the kernel and you are using early debugging with + * kgdbwait. KGDB will control the kernel HW breakpoint registers. + */ + if (kgdb_connected && arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + else { +#endif + /* + * Clear all 6 debug registers: + */ + + set_debugreg(0UL, 0); + set_debugreg(0UL, 1); + set_debugreg(0UL, 2); + set_debugreg(0UL, 3); + set_debugreg(0UL, 6); + set_debugreg(0UL, 7); +#ifdef CONFIG_KGDB + /* If the kgdb is connected no debug regs should be altered. */ + } +#endif + + fpu_init(); + + raw_local_save_flags(kernel_eflags); + + if (is_uv_system()) + uv_cpu_init(); +} + +#else + void __cpuinit cpu_init(void) { int cpu = smp_processor_id(); @@ -709,19 +1124,21 @@ void __cpuinit cpu_init(void) /* * Force FPU initialization: */ - current_thread_info()->status = 0; + if (cpu_has_xsave) + current_thread_info()->status = TS_XSAVE; + else + current_thread_info()->status = 0; clear_used_math(); mxcsr_feature_mask_init(); -} -#ifdef CONFIG_HOTPLUG_CPU -void __cpuinit cpu_uninit(void) -{ - int cpu = raw_smp_processor_id(); - cpu_clear(cpu, cpu_initialized); + /* + * Boot processor to setup the FP and extended state context info. + */ + if (!smp_processor_id()) + init_thread_xstate(); - /* lazy TLB state */ - per_cpu(cpu_tlbstate, cpu).state = 0; - per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; + xsave_init(); } + + #endif diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c deleted file mode 100644 index dd6e3f15017..00000000000 --- a/arch/x86/kernel/cpu/common_64.c +++ /dev/null @@ -1,670 +0,0 @@ -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/bootmem.h> -#include <linux/bitops.h> -#include <linux/module.h> -#include <linux/kgdb.h> -#include <linux/topology.h> -#include <linux/delay.h> -#include <linux/smp.h> -#include <linux/percpu.h> -#include <asm/i387.h> -#include <asm/msr.h> -#include <asm/io.h> -#include <asm/linkage.h> -#include <asm/mmu_context.h> -#include <asm/mtrr.h> -#include <asm/mce.h> -#include <asm/pat.h> -#include <asm/numa.h> -#ifdef CONFIG_X86_LOCAL_APIC -#include <asm/mpspec.h> -#include <asm/apic.h> -#include <mach_apic.h> -#endif -#include <asm/pda.h> -#include <asm/pgtable.h> -#include <asm/processor.h> -#include <asm/desc.h> -#include <asm/atomic.h> -#include <asm/proto.h> -#include <asm/sections.h> -#include <asm/setup.h> -#include <asm/genapic.h> - -#include "cpu.h" - -/* We need valid kernel segments for data and code in long mode too - * IRET will check the segment types kkeil 2000/10/28 - * Also sysret mandates a special GDT layout - */ -/* The TLS descriptors are currently at a different place compared to i386. - Hopefully nobody expects them at a fixed place (Wine?) */ -DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { - [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, - [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, -} }; -EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); - -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; - -/* Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. */ -void switch_to_new_gdt(void) -{ - struct desc_ptr gdt_descr; - - gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); -} - -struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; - -static void __cpuinit default_init(struct cpuinfo_x86 *c) -{ - display_cacheinfo(c); -} - -static struct cpu_dev __cpuinitdata default_cpu = { - .c_init = default_init, - .c_vendor = "Unknown", -}; -static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; - -int __cpuinit get_model_name(struct cpuinfo_x86 *c) -{ - unsigned int *v; - - if (c->extended_cpuid_level < 0x80000004) - return 0; - - v = (unsigned int *) c->x86_model_id; - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); - c->x86_model_id[48] = 0; - return 1; -} - - -void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) -{ - unsigned int n, dummy, ebx, ecx, edx; - - n = c->extended_cpuid_level; - - if (n >= 0x80000005) { - cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), " - "D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size = (ecx>>24) + (edx>>24); - /* On K8 L1 TLB is inclusive, so don't count it */ - c->x86_tlbsize = 0; - } - - if (n >= 0x80000006) { - cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); - ecx = cpuid_ecx(0x80000006); - c->x86_cache_size = ecx >> 16; - c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); - - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - c->x86_cache_size, ecx & 0xFF); - } -} - -void __cpuinit detect_ht(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - u32 eax, ebx, ecx, edx; - int index_msb, core_bits; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - - if (!cpu_has(c, X86_FEATURE_HT)) - return; - if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) - goto out; - - smp_num_siblings = (ebx & 0xff0000) >> 16; - - if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { - - if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of " - "siblings %d", smp_num_siblings); - smp_num_siblings = 1; - return; - } - - index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = phys_pkg_id(index_msb); - - smp_num_siblings = smp_num_siblings / c->x86_max_cores; - - index_msb = get_count_order(smp_num_siblings); - - core_bits = get_count_order(c->x86_max_cores); - - c->cpu_core_id = phys_pkg_id(index_msb) & - ((1 << core_bits) - 1); - } -out: - if ((c->x86_max_cores * smp_num_siblings) > 1) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); - } - -#endif -} - -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) -{ - char *v = c->x86_vendor_id; - int i; - static int printed; - - for (i = 0; i < X86_VENDOR_NUM; i++) { - if (cpu_devs[i]) { - if (!strcmp(v, cpu_devs[i]->c_ident[0]) || - (cpu_devs[i]->c_ident[1] && - !strcmp(v, cpu_devs[i]->c_ident[1]))) { - c->x86_vendor = i; - this_cpu = cpu_devs[i]; - return; - } - } - } - if (!printed) { - printed++; - printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); - printk(KERN_ERR "CPU: Your system may be unstable.\n"); - } - c->x86_vendor = X86_VENDOR_UNKNOWN; -} - -static void __init early_cpu_support_print(void) -{ - int i,j; - struct cpu_dev *cpu_devx; - - printk("KERNEL supported cpus:\n"); - for (i = 0; i < X86_VENDOR_NUM; i++) { - cpu_devx = cpu_devs[i]; - if (!cpu_devx) - continue; - for (j = 0; j < 2; j++) { - if (!cpu_devx->c_ident[j]) - continue; - printk(" %s %s\n", cpu_devx->c_vendor, - cpu_devx->c_ident[j]); - } - } -} - -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); - -void __init early_cpu_init(void) -{ - struct cpu_vendor_dev *cvdev; - - for (cvdev = __x86cpuvendor_start ; - cvdev < __x86cpuvendor_end ; - cvdev++) - cpu_devs[cvdev->vendor] = cvdev->cpu_dev; - early_cpu_support_print(); - early_identify_cpu(&boot_cpu_data); -} - -/* Do some early cpuid on the boot CPU to get some parameter that are - needed before check_bugs. Everything advanced is in identify_cpu - below. */ -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) -{ - u32 tfms, xlvl; - - c->loops_per_jiffy = loops_per_jiffy; - c->x86_cache_size = -1; - c->x86_vendor = X86_VENDOR_UNKNOWN; - c->x86_model = c->x86_mask = 0; /* So far unknown... */ - c->x86_vendor_id[0] = '\0'; /* Unset */ - c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_clflush_size = 64; - c->x86_cache_alignment = c->x86_clflush_size; - c->x86_max_cores = 1; - c->x86_coreid_bits = 0; - c->extended_cpuid_level = 0; - memset(&c->x86_capability, 0, sizeof c->x86_capability); - - /* Get vendor name */ - cpuid(0x00000000, (unsigned int *)&c->cpuid_level, - (unsigned int *)&c->x86_vendor_id[0], - (unsigned int *)&c->x86_vendor_id[8], - (unsigned int *)&c->x86_vendor_id[4]); - - get_cpu_vendor(c); - - /* Initialize the standard set of capabilities */ - /* Note that the vendor-specific code below might override */ - - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - __u32 misc; - cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], - &c->x86_capability[0]); - c->x86 = (tfms >> 8) & 0xf; - c->x86_model = (tfms >> 4) & 0xf; - c->x86_mask = tfms & 0xf; - if (c->x86 == 0xf) - c->x86 += (tfms >> 20) & 0xff; - if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xF) << 4; - if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) - c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; - } else { - /* Have CPUID level 0 only - unheard of */ - c->x86 = 4; - } - - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff; -#ifdef CONFIG_SMP - c->phys_proc_id = c->initial_apicid; -#endif - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - c->extended_cpuid_level = xlvl; - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } - if (xlvl >= 0x80000004) - get_model_name(c); /* Default name */ - } - - /* Transmeta-defined flags: level 0x80860001 */ - xlvl = cpuid_eax(0x80860000); - if ((xlvl & 0xffff0000) == 0x80860000) { - /* Don't set x86_cpuid_level here for now to not confuse. */ - if (xlvl >= 0x80860001) - c->x86_capability[2] = cpuid_edx(0x80860001); - } - - if (c->extended_cpuid_level >= 0x80000007) - c->x86_power = cpuid_edx(0x80000007); - - if (c->extended_cpuid_level >= 0x80000008) { - u32 eax = cpuid_eax(0x80000008); - - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } - - if (c->x86_vendor != X86_VENDOR_UNKNOWN && - cpu_devs[c->x86_vendor]->c_early_init) - cpu_devs[c->x86_vendor]->c_early_init(c); - - validate_pat_support(c); -} - -/* - * This does the hard work of actually picking apart the CPU stuff... - */ -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) -{ - int i; - - early_identify_cpu(c); - - init_scattered_cpuid_features(c); - - c->apicid = phys_pkg_id(0); - - /* - * Vendor-specific initialization. In this section we - * canonicalize the feature flags, meaning if there are - * features a certain CPU supports which CPUID doesn't - * tell us, CPUID claiming incorrect flags, or other bugs, - * we handle them here. - * - * At the end of this section, c->x86_capability better - * indicate the features this CPU genuinely supports! - */ - if (this_cpu->c_init) - this_cpu->c_init(c); - - detect_ht(c); - - /* - * On SMP, boot_cpu_data holds the common feature set between - * all CPUs; so make sure that we indicate which features are - * common between the CPUs. The first time this routine gets - * executed, c == &boot_cpu_data. - */ - if (c != &boot_cpu_data) { - /* AND the already accumulated flags with these */ - for (i = 0; i < NCAPINTS; i++) - boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; - } - - /* Clear all flags overriden by options */ - for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] &= ~cleared_cpu_caps[i]; - -#ifdef CONFIG_X86_MCE - mcheck_init(c); -#endif - select_idle_routine(c); - -#ifdef CONFIG_NUMA - numa_add_cpu(smp_processor_id()); -#endif - -} - -void __cpuinit identify_boot_cpu(void) -{ - identify_cpu(&boot_cpu_data); -} - -void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) -{ - BUG_ON(c == &boot_cpu_data); - identify_cpu(c); - mtrr_ap_init(); -} - -static __init int setup_noclflush(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_CLFLSH); - return 1; -} -__setup("noclflush", setup_noclflush); - -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) -{ - if (c->x86_model_id[0]) - printk(KERN_CONT "%s", c->x86_model_id); - - if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT " stepping %02x\n", c->x86_mask); - else - printk(KERN_CONT "\n"); -} - -static __init int setup_disablecpuid(char *arg) -{ - int bit; - if (get_option(&arg, &bit) && bit < NCAPINTS*32) - setup_clear_cpu_cap(bit); - else - return 0; - return 1; -} -__setup("clearcpuid=", setup_disablecpuid); - -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; - -struct x8664_pda **_cpu_pda __read_mostly; -EXPORT_SYMBOL(_cpu_pda); - -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; - -char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; - -unsigned long __supported_pte_mask __read_mostly = ~0UL; -EXPORT_SYMBOL_GPL(__supported_pte_mask); - -static int do_not_nx __cpuinitdata; - -/* noexec=on|off -Control non executable mappings for 64bit processes. - -on Enable(default) -off Disable -*/ -static int __init nonx_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - do_not_nx = 0; - } else if (!strncmp(str, "off", 3)) { - do_not_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", nonx_setup); - -int force_personality32; - -/* noexec32=on|off -Control non executable heap for 32bit processes. -To control the stack too use noexec=off - -on PROT_READ does not imply PROT_EXEC for 32bit processes (default) -off PROT_READ implies PROT_EXEC -*/ -static int __init nonx32_setup(char *str) -{ - if (!strcmp(str, "on")) - force_personality32 &= ~READ_IMPLIES_EXEC; - else if (!strcmp(str, "off")) - force_personality32 |= READ_IMPLIES_EXEC; - return 1; -} -__setup("noexec32=", nonx32_setup); - -void pda_init(int cpu) -{ - struct x8664_pda *pda = cpu_pda(cpu); - - /* Setup up data that may be needed in __get_free_pages early */ - loadsegment(fs, 0); - loadsegment(gs, 0); - /* Memory clobbers used to order PDA accessed */ - mb(); - wrmsrl(MSR_GS_BASE, pda); - mb(); - - pda->cpunumber = cpu; - pda->irqcount = -1; - pda->kernelstack = (unsigned long)stack_thread_info() - - PDA_STACKOFFSET + THREAD_SIZE; - pda->active_mm = &init_mm; - pda->mmu_state = 0; - - if (cpu == 0) { - /* others are initialized in smpboot.c */ - pda->pcurrent = &init_task; - pda->irqstackptr = boot_cpu_stack; - } else { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", cpu); - - if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) - pda->nodenumber = cpu_to_node(cpu); - } - - pda->irqstackptr += IRQSTACKSIZE-64; -} - -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] __page_aligned_bss; - -extern asmlinkage void ignore_sysret(void); - -/* May not be marked __init: used by software suspend */ -void syscall_init(void) -{ - /* - * LSTAR and STAR live in a bit strange symbiosis. - * They both write to the same internal register. STAR allows to - * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. - */ - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, system_call); - wrmsrl(MSR_CSTAR, ignore_sysret); - -#ifdef CONFIG_IA32_EMULATION - syscall32_cpu_init(); -#endif - - /* Flags to clear on syscall */ - wrmsrl(MSR_SYSCALL_MASK, - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); -} - -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || do_not_nx) - __supported_pte_mask &= ~_PAGE_NX; -} - -unsigned long kernel_eflags; - -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. - */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - * A lot of state is already set up in PDA init. - */ -void __cpuinit cpu_init(void) -{ - int cpu = stack_smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); - unsigned long v; - char *estacks = NULL; - struct task_struct *me; - int i; - - /* CPU 0 is initialised in head64.c */ - if (cpu != 0) - pda_init(cpu); - else - estacks = boot_exception_stacks; - - me = current; - - if (cpu_test_and_set(cpu, cpu_initialized)) - panic("CPU#%d already initialized!\n", cpu); - - printk(KERN_INFO "Initializing CPU#%d\n", cpu); - - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - - switch_to_new_gdt(); - load_idt((const struct desc_ptr *)&idt_descr); - - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); - syscall_init(); - - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); - barrier(); - - check_efer(); - - /* - * set up and load the per-CPU TSS - */ - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER - }; - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); - if (!estacks) - panic("Cannot allocate exception stack %ld %d\n", - v, cpu); - } - estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; - } - - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); - /* - * <= is required because the CPU will access up to - * 8 bits beyond the end of the IO permission bitmap. - */ - for (i = 0; i <= IO_BITMAP_LONGS; i++) - t->io_bitmap[i] = ~0UL; - - atomic_inc(&init_mm.mm_count); - me->active_mm = &init_mm; - if (me->mm) - BUG(); - enter_lazy_tlb(&init_mm, me); - - load_sp0(t, ¤t->thread); - set_tss_desc(cpu, t); - load_TR_desc(); - load_LDT(&init_mm.context); - -#ifdef CONFIG_KGDB - /* - * If the kgdb is connected no debug regs should be altered. This - * is only applicable when KGDB and a KGDB I/O module are built - * into the kernel and you are using early debugging with - * kgdbwait. KGDB will control the kernel HW breakpoint registers. - */ - if (kgdb_connected && arch_kgdb_ops.correct_hw_break) - arch_kgdb_ops.correct_hw_break(); - else { -#endif - /* - * Clear all 6 debug registers: - */ - - set_debugreg(0UL, 0); - set_debugreg(0UL, 1); - set_debugreg(0UL, 2); - set_debugreg(0UL, 3); - set_debugreg(0UL, 6); - set_debugreg(0UL, 7); -#ifdef CONFIG_KGDB - /* If the kgdb is connected no debug regs should be altered. */ - } -#endif - - fpu_init(); - - raw_local_save_flags(kernel_eflags); - - if (is_uv_system()) - uv_cpu_init(); -} diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 4d894e8565f..de4094a3921 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -21,23 +21,16 @@ struct cpu_dev { void (*c_init)(struct cpuinfo_x86 * c); void (*c_identify)(struct cpuinfo_x86 * c); unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); + int c_x86_vendor; }; -extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; +#define cpu_dev_register(cpu_devX) \ + static struct cpu_dev *__cpu_dev_##cpu_devX __used \ + __attribute__((__section__(".x86_cpu_dev.init"))) = \ + &cpu_devX; -struct cpu_vendor_dev { - int vendor; - struct cpu_dev *cpu_dev; -}; - -#define cpu_vendor_dev_register(cpu_vendor_id, cpu_dev) \ - static struct cpu_vendor_dev __cpu_vendor_dev_##cpu_vendor_id __used \ - __attribute__((__section__(".x86cpuvendor.init"))) = \ - { cpu_vendor_id, cpu_dev } - -extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[]; +extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[]; -extern int get_model_name(struct cpuinfo_x86 *c); extern void display_cacheinfo(struct cpuinfo_x86 *c); #endif diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index dd097b83583..c24c4a487b7 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -256,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask) * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and * no meaning should be associated with absolute values of these MSRs. */ -static unsigned int get_measured_perf(unsigned int cpu) +static unsigned int get_measured_perf(struct cpufreq_policy *policy, + unsigned int cpu) { union { struct { @@ -326,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu) #endif - retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; + retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100; put_cpu(); set_cpus_allowed_ptr(current, &saved_mask); @@ -785,7 +786,11 @@ static int __init acpi_cpufreq_init(void) if (ret) return ret; - return cpufreq_register_driver(&acpi_cpufreq_driver); + ret = cpufreq_register_driver(&acpi_cpufreq_driver); + if (ret) + free_percpu(acpi_perf_data); + + return ret; } static void __exit acpi_cpufreq_exit(void) @@ -795,8 +800,6 @@ static void __exit acpi_cpufreq_exit(void) cpufreq_unregister_driver(&acpi_cpufreq_driver); free_percpu(acpi_perf_data); - - return; } module_param(acpi_pstate_strict, uint, 0644); diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c index e4a4bf870e9..fe613c93b36 100644 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c @@ -25,8 +25,8 @@ #include <linux/cpufreq.h> #include <asm/msr.h> -#include <asm/timex.h> -#include <asm/io.h> +#include <linux/timex.h> +#include <linux/io.h> #define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ #define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ @@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) u8 clockspeed_reg; /* Clock Speed Register */ local_irq_disable(); - outb_p(0x80,REG_CSCIR); + outb_p(0x80, REG_CSCIR); clockspeed_reg = inb_p(REG_CSCDR); local_irq_enable(); @@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) } /* 33 MHz is not 32 MHz... */ - if ((clockspeed_reg & 0xE0)==0xA0) + if ((clockspeed_reg & 0xE0) == 0xA0) return 33000; - return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000); + return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000; } @@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) * There is no return value. */ -static void elanfreq_set_cpu_state (unsigned int state) +static void elanfreq_set_cpu_state(unsigned int state) { struct cpufreq_freqs freqs; @@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state) */ local_irq_disable(); - outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */ - outb_p(0x00,REG_CSCDR); + outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */ + outb_p(0x00, REG_CSCDR); local_irq_enable(); /* wait till internal pipelines and */ udelay(1000); /* buffers have cleaned up */ local_irq_disable(); /* now, set the CPU clock speed register (0x80) */ - outb_p(0x80,REG_CSCIR); - outb_p(elan_multiplier[state].val80h,REG_CSCDR); + outb_p(0x80, REG_CSCIR); + outb_p(elan_multiplier[state].val80h, REG_CSCDR); /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ - outb_p(0x40,REG_CSCIR); - outb_p(elan_multiplier[state].val40h,REG_CSCDR); + outb_p(0x40, REG_CSCIR); + outb_p(elan_multiplier[state].val40h, REG_CSCDR); udelay(10000); local_irq_enable(); @@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state) * for the hardware supported by the driver. */ -static int elanfreq_verify (struct cpufreq_policy *policy) +static int elanfreq_verify(struct cpufreq_policy *policy) { return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); } -static int elanfreq_target (struct cpufreq_policy *policy, +static int elanfreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { @@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) /* capability check */ if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model!=10)) + (c->x86 != 4) || (c->x86_model != 10)) return -ENODEV; /* max freq */ @@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) max_freq = elanfreq_get_cpu_frequency(0); /* table init */ - for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { + for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { if (elanfreq_table[i].frequency > max_freq) elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; } @@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); if (result) - return (result); + return result; cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); return 0; @@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup); #endif -static struct freq_attr* elanfreq_attr[] = { +static struct freq_attr *elanfreq_attr[] = { &cpufreq_freq_attr_scaling_available_freqs, NULL, }; @@ -284,9 +284,9 @@ static int __init elanfreq_init(void) /* Test if we have the right hardware */ if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model!=10)) { + (c->x86 != 4) || (c->x86_model != 10)) { printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); - return -ENODEV; + return -ENODEV; } return cpufreq_register_driver(&elanfreq_driver); } @@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void) } -module_param (max_freq, int, 0444); +module_param(max_freq, int, 0444); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>"); diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index f1685fb91fb..b8e05ee4f73 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) } if (c->x86 != 0xF) { - printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n"); + printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n"); return 0; } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index eb9b62b0830..b5ced806a31 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -15,12 +15,11 @@ #include <linux/slab.h> #include <asm/msr.h> -#include <asm/timex.h> -#include <asm/io.h> +#include <linux/timex.h> +#include <linux/io.h> - -#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long - as it is unused */ +#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long + as it is unused */ static unsigned int busfreq; /* FSB, in 10 kHz */ static unsigned int max_multiplier; @@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void) msrval = POWERNOW_IOPORT + 0x1; wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue=inl(POWERNOW_IOPORT + 0x8); + invalue = inl(POWERNOW_IOPORT + 0x8); msrval = POWERNOW_IOPORT + 0x0; wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ @@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void) * * Tries to change the PowerNow! multiplier */ -static void powernow_k6_set_state (unsigned int best_i) +static void powernow_k6_set_state(unsigned int best_i) { - unsigned long outvalue=0, invalue=0; + unsigned long outvalue = 0, invalue = 0; unsigned long msrval; struct cpufreq_freqs freqs; @@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i) msrval = POWERNOW_IOPORT + 0x1; wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue=inl(POWERNOW_IOPORT + 0x8); + invalue = inl(POWERNOW_IOPORT + 0x8); invalue = invalue & 0xf; outvalue = outvalue | invalue; - outl(outvalue ,(POWERNOW_IOPORT + 0x8)); + outl(outvalue , (POWERNOW_IOPORT + 0x8)); msrval = POWERNOW_IOPORT + 0x0; wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ @@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy) * * sets a new CPUFreq policy */ -static int powernow_k6_target (struct cpufreq_policy *policy, +static int powernow_k6_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { @@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) busfreq = cpu_khz / max_multiplier; /* table init */ - for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { + for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { if (clock_ratio[i].index > max_multiplier) clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; else @@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); if (result) - return (result); + return result; cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); @@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) { unsigned int i; - for (i=0; i<8; i++) { - if (i==max_multiplier) + for (i = 0; i < 8; i++) { + if (i == max_multiplier) powernow_k6_set_state(i); } cpufreq_frequency_table_put_attr(policy->cpu); @@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu) return busfreq * powernow_k6_get_cpu_multiplier(); } -static struct freq_attr* powernow_k6_attr[] = { +static struct freq_attr *powernow_k6_attr[] = { &cpufreq_freq_attr_scaling_available_freqs, NULL, }; @@ -227,7 +226,7 @@ static int __init powernow_k6_init(void) } if (cpufreq_register_driver(&powernow_k6_driver)) { - release_region (POWERNOW_IOPORT, 16); + release_region(POWERNOW_IOPORT, 16); return -EINVAL; } @@ -243,13 +242,13 @@ static int __init powernow_k6_init(void) static void __exit powernow_k6_exit(void) { cpufreq_unregister_driver(&powernow_k6_driver); - release_region (POWERNOW_IOPORT, 16); + release_region(POWERNOW_IOPORT, 16); } -MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); -MODULE_LICENSE ("GPL"); +MODULE_AUTHOR("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); +MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); +MODULE_LICENSE("GPL"); module_init(powernow_k6_init); module_exit(powernow_k6_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 15e13c01cc3..3b5f06423e7 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -26,7 +26,7 @@ #include <asm/cpufeature.h> #define PFX "speedstep-centrino: " -#define MAINTAINER "cpufreq@lists.linux.org.uk" +#define MAINTAINER "cpufreq@vger.kernel.org" #define dprintk(msg...) \ cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 3fd7a67bb06..ffd0f5ed071 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -15,13 +15,11 @@ /* * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU */ -static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned char ccr2, ccr3; - unsigned long flags; /* we test for DEVID by checking whether CCR3 is writable */ - local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, ccr3 ^ 0x80); getCx86(0xc0); /* dummy to change bus */ @@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) *dir0 = getCx86(CX86_DIR0); *dir1 = getCx86(CX86_DIR1); } - local_irq_restore(flags); } +static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +{ + unsigned long flags; + + local_irq_save(flags); + __do_cyrix_devid(dir0, dir1); + local_irq_restore(flags); +} /* * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in * order to identify the Cyrix CPU model after we're out of setup.c @@ -116,7 +121,7 @@ static void __cpuinit set_cx86_reorder(void) setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ /* Load/Store Serialize to mem access disable (=reorder it) */ - setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); + setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80); /* set load/store serialize from 1GB to 4GB */ ccr3 |= 0xe0; setCx86(CX86_CCR3, ccr3); @@ -127,28 +132,11 @@ static void __cpuinit set_cx86_memwb(void) printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); /* CCR2 bit 2: unlock NW bit */ - setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); + setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04); /* set 'Not Write-through' */ write_cr0(read_cr0() | X86_CR0_NW); /* CCR2 bit 2: lock NW bit and set WT1 */ - setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); -} - -static void __cpuinit set_cx86_inc(void) -{ - unsigned char ccr3; - - printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n"); - - ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - /* PCR1 -- Performance Control */ - /* Incrementor on, whatever that is */ - setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02); - /* PCR0 -- Performance Control */ - /* Incrementor Margin 10 */ - setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04); - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14); } /* @@ -162,23 +150,40 @@ static void __cpuinit geode_configure(void) local_irq_save(flags); /* Suspend on halt power saving and enable #SUSP pin */ - setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); + setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ /* FPU fast, DTE cache, Mem bypass */ - setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); + setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38); setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ set_cx86_memwb(); set_cx86_reorder(); - set_cx86_inc(); local_irq_restore(flags); } +static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c) +{ + unsigned char dir0, dir0_msn, dir1 = 0; + + __do_cyrix_devid(&dir0, &dir1); + dir0_msn = dir0 >> 4; /* identifies CPU "family" */ + + switch (dir0_msn) { + case 3: /* 6x86/6x86L */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + case 5: /* 6x86MX/M II */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + } +} static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) { @@ -286,7 +291,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) /* GXm supports extended cpuid levels 'ala' AMD */ if (c->cpuid_level == 2) { /* Enable cxMMX extensions (GX1 Datasheet 54) */ - setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); + setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1); /* * GXm : 0x30 ... 0x5f GXm datasheet 51 @@ -296,7 +301,6 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) */ if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) geode_configure(); - get_model_name(c); /* get CPU marketing name */ return; } else { /* MediaGX */ Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; @@ -309,7 +313,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) if (dir1 > 7) { dir0_msn++; /* M II */ /* Enable MMX extensions (App note 108) */ - setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); + setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1); } else { c->coma_bug = 1; /* 6x86MX, it has the bug. */ } @@ -424,7 +428,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */ + setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */ setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ local_irq_restore(flags); } @@ -434,16 +438,19 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, + .c_early_init = early_init_cyrix, .c_init = init_cyrix, .c_identify = cyrix_identify, + .c_x86_vendor = X86_VENDOR_CYRIX, }; -cpu_vendor_dev_register(X86_VENDOR_CYRIX, &cyrix_cpu_dev); +cpu_dev_register(cyrix_cpu_dev); static struct cpu_dev nsc_cpu_dev __cpuinitdata = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, .c_init = init_nsc, + .c_x86_vendor = X86_VENDOR_NSC, }; -cpu_vendor_dev_register(X86_VENDOR_NSC, &nsc_cpu_dev); +cpu_dev_register(nsc_cpu_dev); diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c deleted file mode 100644 index e43ad4ad4cb..00000000000 --- a/arch/x86/kernel/cpu/feature_names.c +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Strings for the various x86 capability flags. - * - * This file must not contain any executable code. - */ - -#include <asm/cpufeature.h> - -/* - * These flag bits must match the definitions in <asm/cpufeature.h>. - * NULL means this bit is undefined or reserved; either way it doesn't - * have meaning as far as Linux is concerned. Note that it's important - * to realize there is a difference between this table and CPUID -- if - * applications want to get the raw CPUID data, they should access - * /dev/cpu/<cpu_nr>/cpuid instead. - */ -const char * const x86_cap_flags[NCAPINTS*32] = { - /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", - - /* AMD-defined */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", - "3dnowext", "3dnow", - - /* Transmeta-defined */ - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Other (Linux-defined) */ - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", - NULL, NULL, NULL, NULL, - "constant_tsc", "up", NULL, "arch_perfmon", - "pebs", "bts", NULL, NULL, - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* VIA/Cyrix/Centaur-defined */ - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", - "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", "extapic", - "cr8_legacy", "abm", "sse4a", "misalignsse", - "3dnowprefetch", "osvw", "ibs", "sse5", - "skinit", "wdt", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Auxiliary (Linux-defined) */ - "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -}; - -const char *const x86_power_flags[32] = { - "ts", /* temperature sensor */ - "fid", /* frequency id control */ - "vid", /* voltage id control */ - "ttp", /* thermal trip */ - "tm", - "stc", - "100mhzsteps", - "hwpstate", - "", /* tsc invariant mapped to constant_tsc */ - /* nothing */ -}; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b75f2569b8f..99468dbd08d 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -15,6 +15,11 @@ #include <asm/ds.h> #include <asm/bugs.h> +#ifdef CONFIG_X86_64 +#include <asm/topology.h> +#include <asm/numa_64.h> +#endif + #include "cpu.h" #ifdef CONFIG_X86_LOCAL_APIC @@ -23,23 +28,22 @@ #include <mach_apic.h> #endif -#ifdef CONFIG_X86_INTEL_USERCOPY -/* - * Alignment at which movsl is preferred for bulk memory copies. - */ -struct movsl_mask movsl_mask __read_mostly; -#endif - static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) { - /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ - if (c->x86 == 15 && c->x86_cache_alignment == 64) - c->x86_cache_alignment = 128; if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#else + /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ + if (c->x86 == 15 && c->x86_cache_alignment == 64) + c->x86_cache_alignment = 128; +#endif } +#ifdef CONFIG_X86_32 /* * Early probe support logic for ppro memory erratum #50 * @@ -59,15 +63,54 @@ int __cpuinit ppro_with_ram_bug(void) return 0; } +#ifdef CONFIG_X86_F00F_BUG +static void __cpuinit trap_init_f00f_bug(void) +{ + __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); -/* - * P4 Xeon errata 037 workaround. - * Hardware prefetcher may cause stale data to be loaded into the cache. - */ -static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) + /* + * Update the IDT descriptor and reload the IDT so that + * it uses the read-only mapped virtual address. + */ + idt_descr.address = fix_to_virt(FIX_F00F_IDT); + load_idt(&idt_descr); +} +#endif + +static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) { unsigned long lo, hi; +#ifdef CONFIG_X86_F00F_BUG + /* + * All current models of Pentium and Pentium with MMX technology CPUs + * have the F0 0F bug, which lets nonprivileged users lock up the system. + * Note that the workaround only should be initialized once... + */ + c->f00f_bug = 0; + if (!paravirt_enabled() && c->x86 == 5) { + static int f00f_workaround_enabled; + + c->f00f_bug = 1; + if (!f00f_workaround_enabled) { + trap_init_f00f_bug(); + printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); + f00f_workaround_enabled = 1; + } + } +#endif + + /* + * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until + * model 3 mask 3 + */ + if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) + clear_cpu_cap(c, X86_FEATURE_SEP); + + /* + * P4 Xeon errata 037 workaround. + * Hardware prefetcher may cause stale data to be loaded into the cache. + */ if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); if ((lo & (1<<9)) == 0) { @@ -77,13 +120,68 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); } } + + /* + * See if we have a good local APIC by checking for buggy Pentia, + * i.e. all B steppings and the C2 stepping of P54C when using their + * integrated APIC (see 11AP erratum in "Pentium Processor + * Specification Update"). + */ + if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && + (c->x86_mask < 0x6 || c->x86_mask == 0xb)) + set_cpu_cap(c, X86_FEATURE_11AP); + + +#ifdef CONFIG_X86_INTEL_USERCOPY + /* + * Set up the preferred alignment for movsl bulk memory moves + */ + switch (c->x86) { + case 4: /* 486: untested */ + break; + case 5: /* Old Pentia: untested */ + break; + case 6: /* PII/PIII only like movsl with 8-byte alignment */ + movsl_mask.mask = 7; + break; + case 15: /* P4 is OK down to 8-byte alignment */ + movsl_mask.mask = 7; + break; + } +#endif + +#ifdef CONFIG_X86_NUMAQ + numaq_tsc_disable(); +#endif } +#else +static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) +{ +} +#endif +static void __cpuinit srat_detect_node(void) +{ +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) + unsigned node; + int cpu = smp_processor_id(); + int apicid = hard_smp_processor_id(); + + /* Don't do the funky fallback heuristics the AMD version employs + for now. */ + node = apicid_to_node[apicid]; + if (node == NUMA_NO_NODE || !node_online(node)) + node = first_node(node_online_map); + numa_set_node(cpu, node); + + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); +#endif +} /* * find out the number of processor cores on the die */ -static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) { unsigned int eax, ebx, ecx, edx; @@ -98,45 +196,51 @@ static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) return 1; } -#ifdef CONFIG_X86_F00F_BUG -static void __cpuinit trap_init_f00f_bug(void) +static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c) { - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); - - /* - * Update the IDT descriptor and reload the IDT so that - * it uses the read-only mapped virtual address. - */ - idt_descr.address = fix_to_virt(FIX_F00F_IDT); - load_idt(&idt_descr); + /* Intel VMX MSR indicated features */ +#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 +#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000 +#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000 +#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 +#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 +#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 + + u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; + + clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + clear_cpu_cap(c, X86_FEATURE_VNMI); + clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + clear_cpu_cap(c, X86_FEATURE_EPT); + clear_cpu_cap(c, X86_FEATURE_VPID); + + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); + msr_ctl = vmx_msr_high | vmx_msr_low; + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW) + set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI) + set_cpu_cap(c, X86_FEATURE_VNMI); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) { + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, + vmx_msr_low, vmx_msr_high); + msr_ctl2 = vmx_msr_high | vmx_msr_low; + if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && + (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) + set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) + set_cpu_cap(c, X86_FEATURE_EPT); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) + set_cpu_cap(c, X86_FEATURE_VPID); + } } -#endif static void __cpuinit init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; - char *p = NULL; early_init_intel(c); -#ifdef CONFIG_X86_F00F_BUG - /* - * All current models of Pentium and Pentium with MMX technology CPUs - * have the F0 0F bug, which lets nonprivileged users lock up the system. - * Note that the workaround only should be initialized once... - */ - c->f00f_bug = 0; - if (!paravirt_enabled() && c->x86 == 5) { - static int f00f_workaround_enabled; - - c->f00f_bug = 1; - if (!f00f_workaround_enabled) { - trap_init_f00f_bug(); - printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); - f00f_workaround_enabled = 1; - } - } -#endif + intel_workarounds(c); l2 = init_intel_cacheinfo(c); if (c->cpuid_level > 9) { @@ -146,16 +250,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } - /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ - if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) - clear_cpu_cap(c, X86_FEATURE_SEP); + if (cpu_has_xmm2) + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + if (cpu_has_ds) { + unsigned int l1; + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); + if (!(l1 & (1<<11))) + set_cpu_cap(c, X86_FEATURE_BTS); + if (!(l1 & (1<<12))) + set_cpu_cap(c, X86_FEATURE_PEBS); + ds_init_intel(c); + } +#ifdef CONFIG_X86_64 + if (c->x86 == 15) + c->x86_cache_alignment = c->x86_clflush_size * 2; + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); +#else /* * Names for the Pentium II/Celeron processors * detectable only by also checking the cache size. * Dixon is NOT a Celeron. */ if (c->x86 == 6) { + char *p = NULL; + switch (c->x86_model) { case 5: if (c->x86_mask == 0) { @@ -178,70 +298,41 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) p = "Celeron (Coppermine)"; break; } - } - - if (p) - strcpy(c->x86_model_id, p); - - c->x86_max_cores = num_cpu_cores(c); - - detect_ht(c); - /* Work around errata */ - Intel_errata_workarounds(c); - -#ifdef CONFIG_X86_INTEL_USERCOPY - /* - * Set up the preferred alignment for movsl bulk memory moves - */ - switch (c->x86) { - case 4: /* 486: untested */ - break; - case 5: /* Old Pentia: untested */ - break; - case 6: /* PII/PIII only like movsl with 8-byte alignment */ - movsl_mask.mask = 7; - break; - case 15: /* P4 is OK down to 8-byte alignment */ - movsl_mask.mask = 7; - break; + if (p) + strcpy(c->x86_model_id, p); } -#endif - if (cpu_has_xmm2) - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - if (c->x86 == 15) { + if (c->x86 == 15) set_cpu_cap(c, X86_FEATURE_P4); - } if (c->x86 == 6) set_cpu_cap(c, X86_FEATURE_P3); - if (cpu_has_ds) { - unsigned int l1; - rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); - if (!(l1 & (1<<11))) - set_cpu_cap(c, X86_FEATURE_BTS); - if (!(l1 & (1<<12))) - set_cpu_cap(c, X86_FEATURE_PEBS); - } if (cpu_has_bts) - ds_init_intel(c); + ptrace_bts_init_intel(c); - /* - * See if we have a good local APIC by checking for buggy Pentia, - * i.e. all B steppings and the C2 stepping of P54C when using their - * integrated APIC (see 11AP erratum in "Pentium Processor - * Specification Update"). - */ - if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && - (c->x86_mask < 0x6 || c->x86_mask == 0xb)) - set_cpu_cap(c, X86_FEATURE_11AP); +#endif -#ifdef CONFIG_X86_NUMAQ - numaq_tsc_disable(); + detect_extended_topology(c); + if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { + /* + * let's use the legacy cpuid vector 0x1 and 0x4 for topology + * detection. + */ + c->x86_max_cores = intel_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); #endif + } + + /* Work around errata */ + srat_detect_node(); + + if (cpu_has(c, X86_FEATURE_VMX)) + detect_vmx_virtcap(c); } +#ifdef CONFIG_X86_32 static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) { /* @@ -254,10 +345,12 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i size = 256; return size; } +#endif static struct cpu_dev intel_cpu_dev __cpuinitdata = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, +#ifdef CONFIG_X86_32 .c_models = { { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = { @@ -307,76 +400,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = { } }, }, + .c_size_cache = intel_size_cache, +#endif .c_early_init = early_init_intel, .c_init = init_intel, - .c_size_cache = intel_size_cache, + .c_x86_vendor = X86_VENDOR_INTEL, }; -cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); - -#ifndef CONFIG_X86_CMPXCHG -unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) -{ - u8 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u8 *)ptr; - if (prev == old) - *(u8 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u8); - -unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) -{ - u16 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u16 *)ptr; - if (prev == old) - *(u16 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u16); - -unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) -{ - u32 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u32 *)ptr; - if (prev == old) - *(u32 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u32); -#endif - -#ifndef CONFIG_X86_CMPXCHG64 -unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) -{ - u64 prev; - unsigned long flags; - - /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u64 *)ptr; - if (prev == old) - *(u64 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_486_u64); -#endif - -/* arch_initcall(intel_cpu_init); */ +cpu_dev_register(intel_cpu_dev); diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c deleted file mode 100644 index 1019c58d39f..00000000000 --- a/arch/x86/kernel/cpu/intel_64.c +++ /dev/null @@ -1,95 +0,0 @@ -#include <linux/init.h> -#include <linux/smp.h> -#include <asm/processor.h> -#include <asm/ptrace.h> -#include <asm/topology.h> -#include <asm/numa_64.h> - -#include "cpu.h" - -static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) -{ - if ((c->x86 == 0xf && c->x86_model >= 0x03) || - (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - - set_cpu_cap(c, X86_FEATURE_SYSENTER32); -} - -/* - * find out the number of processor cores on the die - */ -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) -{ - unsigned int eax, t; - - if (c->cpuid_level < 4) - return 1; - - cpuid_count(4, 0, &eax, &t, &t, &t); - - if (eax & 0x1f) - return ((eax >> 26) + 1); - else - return 1; -} - -static void __cpuinit srat_detect_node(void) -{ -#ifdef CONFIG_NUMA - unsigned node; - int cpu = smp_processor_id(); - int apicid = hard_smp_processor_id(); - - /* Don't do the funky fallback heuristics the AMD version employs - for now. */ - node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE || !node_online(node)) - node = first_node(node_online_map); - numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif -} - -static void __cpuinit init_intel(struct cpuinfo_x86 *c) -{ - init_intel_cacheinfo(c); - if (c->cpuid_level > 9) { - unsigned eax = cpuid_eax(10); - /* Check for version and the number of counters */ - if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); - } - - if (cpu_has_ds) { - unsigned int l1, l2; - rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); - if (!(l1 & (1<<11))) - set_cpu_cap(c, X86_FEATURE_BTS); - if (!(l1 & (1<<12))) - set_cpu_cap(c, X86_FEATURE_PEBS); - } - - - if (cpu_has_bts) - ds_init_intel(c); - - if (c->x86 == 15) - c->x86_cache_alignment = c->x86_clflush_size * 2; - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - c->x86_max_cores = intel_num_cpu_cores(c); - - srat_detect_node(); -} - -static struct cpu_dev intel_cpu_dev __cpuinitdata = { - .c_vendor = "Intel", - .c_ident = { "GenuineIntel" }, - .c_early_init = early_init_intel, - .c_init = init_intel, -}; -cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); - diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 6b0a10b002f..3f46afbb1cf 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -1,8 +1,8 @@ /* - * Routines to indentify caches on Intel CPU. + * Routines to indentify caches on Intel CPU. * - * Changes: - * Venkatesh Pallipadi : Adding cache identification through cpuid(4) + * Changes: + * Venkatesh Pallipadi : Adding cache identification through cpuid(4) * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ @@ -13,6 +13,7 @@ #include <linux/compiler.h> #include <linux/cpu.h> #include <linux/sched.h> +#include <linux/pci.h> #include <asm/processor.h> #include <asm/smp.h> @@ -130,9 +131,18 @@ struct _cpuid4_info { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; + unsigned long can_disable; cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ }; +#ifdef CONFIG_PCI +static struct pci_device_id k8_nb_id[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, + {} +}; +#endif + unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same @@ -182,9 +192,10 @@ static unsigned short assocs[] __cpuinitdata = { static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; -static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, - union _cpuid4_leaf_ebx *ebx, - union _cpuid4_leaf_ecx *ecx) +static void __cpuinit +amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, + union _cpuid4_leaf_ebx *ebx, + union _cpuid4_leaf_ecx *ecx) { unsigned dummy; unsigned line_size, lines_per_tag, assoc, size_in_kb; @@ -251,27 +262,40 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, (ebx->split.ways_of_associativity + 1) - 1; } -static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) +static void __cpuinit +amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf) +{ + if (index < 3) + return; + this_leaf->can_disable = 1; +} + +static int +__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned edx; - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { amd_cpuid4(index, &eax, &ebx, &ecx); - else - cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); + if (boot_cpu_data.x86 >= 0x10) + amd_check_l3_disable(index, this_leaf); + } else { + cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); + } + if (eax.split.type == CACHE_TYPE_NULL) return -EIO; /* better error ? */ this_leaf->eax = eax; this_leaf->ebx = ebx; this_leaf->ecx = ecx; - this_leaf->size = (ecx.split.number_of_sets + 1) * - (ebx.split.coherency_line_size + 1) * - (ebx.split.physical_line_partition + 1) * - (ebx.split.ways_of_associativity + 1); + this_leaf->size = (ecx.split.number_of_sets + 1) * + (ebx.split.coherency_line_size + 1) * + (ebx.split.physical_line_partition + 1) * + (ebx.split.ways_of_associativity + 1); return 0; } @@ -453,7 +477,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) /* pointer to _cpuid4_info array (for each cache leaf) */ static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); -#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) +#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) #ifdef CONFIG_SMP static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) @@ -490,7 +514,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) this_leaf = CPUID4_INFO_IDX(cpu, index); for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { - sibling_leaf = CPUID4_INFO_IDX(sibling, index); + sibling_leaf = CPUID4_INFO_IDX(sibling, index); cpu_clear(cpu, sibling_leaf->shared_cpu_map); } } @@ -572,7 +596,7 @@ struct _index_kobject { /* pointer to array of kobjects for cpuX/cache/indexY */ static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); -#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) +#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) #define show_one_plus(file_name, object, val) \ static ssize_t show_##file_name \ @@ -637,6 +661,99 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { } } +#define to_object(k) container_of(k, struct _index_kobject, kobj) +#define to_attr(a) container_of(a, struct _cache_attr, attr) + +#ifdef CONFIG_PCI +static struct pci_dev *get_k8_northbridge(int node) +{ + struct pci_dev *dev = NULL; + int i; + + for (i = 0; i <= node; i++) { + do { + dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); + if (!dev) + break; + } while (!pci_match_id(&k8_nb_id[0], dev)); + if (!dev) + break; + } + return dev; +} +#else +static struct pci_dev *get_k8_northbridge(int node) +{ + return NULL; +} +#endif + +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) +{ + int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); + struct pci_dev *dev = NULL; + ssize_t ret = 0; + int i; + + if (!this_leaf->can_disable) + return sprintf(buf, "Feature not enabled\n"); + + dev = get_k8_northbridge(node); + if (!dev) { + printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); + return -EINVAL; + } + + for (i = 0; i < 2; i++) { + unsigned int reg; + + pci_read_config_dword(dev, 0x1BC + i * 4, ®); + + ret += sprintf(buf, "%sEntry: %d\n", buf, i); + ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", + buf, + reg & 0x80000000 ? "Disabled" : "Allowed", + reg & 0x40000000 ? "Disabled" : "Allowed"); + ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", + buf, (reg & 0x30000) >> 16, reg & 0xfff); + } + return ret; +} + +static ssize_t +store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, + size_t count) +{ + int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); + struct pci_dev *dev = NULL; + unsigned int ret, index, val; + + if (!this_leaf->can_disable) + return 0; + + if (strlen(buf) > 15) + return -EINVAL; + + ret = sscanf(buf, "%x %x", &index, &val); + if (ret != 2) + return -EINVAL; + if (index > 1) + return -EINVAL; + + val |= 0xc0000000; + dev = get_k8_northbridge(node); + if (!dev) { + printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); + return -EINVAL; + } + + pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); + wbinvd(); + pci_write_config_dword(dev, 0x1BC + index * 4, val); + + return 1; +} + struct _cache_attr { struct attribute attr; ssize_t (*show)(struct _cpuid4_info *, char *); @@ -657,6 +774,8 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); +static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); + static struct attribute * default_attrs[] = { &type.attr, &level.attr, @@ -667,12 +786,10 @@ static struct attribute * default_attrs[] = { &size.attr, &shared_cpu_map.attr, &shared_cpu_list.attr, + &cache_disable.attr, NULL }; -#define to_object(k) container_of(k, struct _index_kobject, kobj) -#define to_attr(a) container_of(a, struct _cache_attr, attr) - static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) { struct _cache_attr *fattr = to_attr(attr); @@ -682,14 +799,22 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) ret = fattr->show ? fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), buf) : - 0; + 0; return ret; } static ssize_t store(struct kobject * kobj, struct attribute * attr, const char * buf, size_t count) { - return 0; + struct _cache_attr *fattr = to_attr(attr); + struct _index_kobject *this_leaf = to_object(kobj); + ssize_t ret; + + ret = fattr->store ? + fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), + buf, count) : + 0; + return ret; } static struct sysfs_ops sysfs_ops = { diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 65a339678ec..4b031a4ac85 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -759,6 +759,7 @@ static struct sysdev_class mce_sysclass = { }; DEFINE_PER_CPU(struct sys_device, device_mce); +void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata; /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ @@ -859,7 +860,7 @@ error: return err; } -static void mce_remove_device(unsigned int cpu) +static __cpuinit void mce_remove_device(unsigned int cpu) { int i; @@ -883,9 +884,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, case CPU_ONLINE: case CPU_ONLINE_FROZEN: mce_create_device(cpu); + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); mce_remove_device(cpu); break; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 88736cadbaa..5eb390a4b2e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -628,6 +628,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) deallocate_threshold_block(cpu, bank); free_out: + kobject_del(b->kobj); kobject_put(b->kobj); kfree(b); per_cpu(threshold_banks, cpu)[bank] = NULL; @@ -645,14 +646,11 @@ static void threshold_remove_device(unsigned int cpu) } /* get notified when a cpu comes on/off */ -static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, + unsigned int cpu) { - /* cpu was unsigned int to begin with */ - unsigned int cpu = (unsigned long)hcpu; - if (cpu >= NR_CPUS) - goto out; + return; switch (action) { case CPU_ONLINE: @@ -666,14 +664,8 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, default: break; } - out: - return NOTIFY_OK; } -static struct notifier_block threshold_cpu_notifier __cpuinitdata = { - .notifier_call = threshold_cpu_callback, -}; - static __init int threshold_init_device(void) { unsigned lcpu = 0; @@ -684,7 +676,7 @@ static __init int threshold_init_device(void) if (err) return err; } - register_hotcpu_notifier(&threshold_cpu_notifier); + threshold_cpu_callback = amd_64_threshold_cpu_callback; return 0; } diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl new file mode 100644 index 00000000000..dfea390e160 --- /dev/null +++ b/arch/x86/kernel/cpu/mkcapflags.pl @@ -0,0 +1,32 @@ +#!/usr/bin/perl +# +# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h +# + +($in, $out) = @ARGV; + +open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n"; +open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; + +print OUT "#include <asm/cpufeature.h>\n\n"; +print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; + +while (defined($line = <IN>)) { + if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) { + $macro = $1; + $feature = $2; + $tail = $3; + if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) { + $feature = $1; + } + + if ($feature ne '') { + printf OUT "\t%-32s = \"%s\",\n", + "[$macro]", "\L$feature"; + } + } +} +print OUT "};\n"; + +close(IN); +close(OUT); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 509bd3d9eac..4e8d77f01ee 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -379,6 +379,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type *type) { unsigned int mask_lo, mask_hi, base_lo, base_hi; + unsigned int tmp, hi; rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); if ((mask_lo & 0x800) == 0) { @@ -392,8 +393,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); /* Work out the shifted address mask. */ - mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) - | mask_lo >> PAGE_SHIFT; + tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; + mask_lo = size_or_mask | tmp; + /* Expand tmp with high bits to all 1s*/ + hi = fls(tmp); + if (hi > 0) { + tmp |= ~((1<<(hi - 1)) - 1); + + if (tmp != mask_lo) { + WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + mask_lo = tmp; + } + } /* This works correctly if size is a power of two, i.e. a contiguous range. */ diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 84c480bb371..4c4214690dd 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) } /* RED-PEN: base can be > 32bit */ len += seq_printf(seq, - "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", + "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", i, base, base >> (20 - PAGE_SHIFT), size, factor, - mtrr_attrib_to_str(type), mtrr_usage_table[i]); + mtrr_usage_table[i], mtrr_attrib_to_str(type)); } } return 0; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 6f23969c8fa..c78c04821ea 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -729,7 +729,7 @@ struct var_mtrr_range_state { mtrr_type type; }; -struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; +static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; static int __initdata debug_print; static int __init @@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, /* take out UC ranges */ for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; - if (type != MTRR_TYPE_UNCACHABLE) + if (type != MTRR_TYPE_UNCACHABLE && + type != MTRR_TYPE_WRPROT) continue; size = range_state[i].size_pfn; if (!size) @@ -834,7 +835,14 @@ static int __init enable_mtrr_cleanup_setup(char *str) enable_mtrr_cleanup = 1; return 0; } -early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); +early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); + +static int __init mtrr_cleanup_debug_setup(char *str) +{ + debug_print = 1; + return 0; +} +early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); struct var_mtrr_state { unsigned long range_startk; @@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits) } } +static unsigned long to_size_factor(unsigned long sizek, char *factorp) +{ + char factor; + unsigned long base = sizek; + + if (base & ((1<<10) - 1)) { + /* not MB alignment */ + factor = 'K'; + } else if (base & ((1<<20) - 1)){ + factor = 'M'; + base >>= 10; + } else { + factor = 'G'; + base >>= 20; + } + + *factorp = factor; + + return base; +} + static unsigned int __init range_to_mtrr(unsigned int reg, unsigned long range_startk, unsigned long range_sizek, unsigned char type) @@ -919,13 +948,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, align = max_align; sizek = 1 << align; - if (debug_print) + if (debug_print) { + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + + start_base = to_size_factor(range_startk, &start_factor), + size_base = to_size_factor(sizek, &size_factor), + printk(KERN_DEBUG "Setting variable MTRR %d, " - "base: %ldMB, range: %ldMB, type %s\n", - reg, range_startk >> 10, sizek >> 10, + "base: %ld%cB, range: %ld%cB, type %s\n", + reg, start_base, start_factor, + size_base, size_factor, (type == MTRR_TYPE_UNCACHABLE)?"UC": ((type == MTRR_TYPE_WRBACK)?"WB":"Other") ); + } save_var_mtrr(reg++, range_startk, sizek, type); range_startk += sizek; range_sizek -= sizek; @@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, /* try to append some small hole */ range0_basek = state->range_startk; range0_sizek = ALIGN(state->range_sizek, chunk_sizek); + + /* no increase */ if (range0_sizek == state->range_sizek) { if (debug_print) printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", @@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, return 0; } - range0_sizek -= chunk_sizek; - if (range0_sizek && sizek) { - while (range0_basek + range0_sizek > (basek + sizek)) { - range0_sizek -= chunk_sizek; - if (!range0_sizek) - break; - } + /* only cut back, when it is not the last */ + if (sizek) { + while (range0_basek + range0_sizek > (basek + sizek)) { + if (range0_sizek >= chunk_sizek) + range0_sizek -= chunk_sizek; + else + range0_sizek = 0; + + if (!range0_sizek) + break; + } + } + +second_try: + range_basek = range0_basek + range0_sizek; + + /* one hole in the middle */ + if (range_basek > basek && range_basek <= (basek + sizek)) + second_sizek = range_basek - basek; + + if (range0_sizek > state->range_sizek) { + + /* one hole in middle or at end */ + hole_sizek = range0_sizek - state->range_sizek - second_sizek; + + /* hole size should be less than half of range0 size */ + if (hole_sizek >= (range0_sizek >> 1) && + range0_sizek >= chunk_sizek) { + range0_sizek -= chunk_sizek; + second_sizek = 0; + hole_sizek = 0; + + goto second_try; + } } if (range0_sizek) { @@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, (range0_basek + range0_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, range0_sizek, MTRR_TYPE_WRBACK); - - } - - range_basek = range0_basek + range0_sizek; - range_sizek = chunk_sizek; - - if (range_basek + range_sizek > basek && - range_basek + range_sizek <= (basek + sizek)) { - /* one hole */ - second_basek = basek; - second_sizek = range_basek + range_sizek - basek; } - /* if last piece, only could one hole near end */ - if ((second_basek || !basek) && - range_sizek - (state->range_sizek - range0_sizek) - second_sizek < - (chunk_sizek >> 1)) { - /* - * one hole in middle (second_sizek is 0) or at end - * (second_sizek is 0 ) - */ - hole_sizek = range_sizek - (state->range_sizek - range0_sizek) - - second_sizek; - hole_basek = range_basek + range_sizek - hole_sizek - - second_sizek; - } else { - /* fallback for big hole, or several holes */ + if (range0_sizek < state->range_sizek) { + /* need to handle left over */ range_sizek = state->range_sizek - range0_sizek; - second_basek = 0; - second_sizek = 0; + + if (debug_print) + printk(KERN_DEBUG "range: %016lx - %016lx\n", + range_basek<<10, + (range_basek + range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range_basek, + range_sizek, MTRR_TYPE_WRBACK); } - if (debug_print) - printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10, - (range_basek + range_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range_basek, range_sizek, - MTRR_TYPE_WRBACK); if (hole_sizek) { + hole_basek = range_basek - hole_sizek - second_sizek; if (debug_print) printk(KERN_DEBUG "hole: %016lx - %016lx\n", - hole_basek<<10, (hole_basek + hole_sizek)<<10); - state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, - MTRR_TYPE_UNCACHABLE); - + hole_basek<<10, + (hole_basek + hole_sizek)<<10); + state->reg = range_to_mtrr(state->reg, hole_basek, + hole_sizek, MTRR_TYPE_UNCACHABLE); } return second_sizek; @@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result { }; /* - * gran_size: 1M, 2M, ..., 2G - * chunk size: gran_size, ..., 4G - * so we need (2+13)*6 + * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G + * chunk size: gran_size, ..., 2G + * so we need (1+16)*8 */ -#define NUM_RESULT 90 +#define NUM_RESULT 136 #define PSHIFT (PAGE_SHIFT - 10) static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; @@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM]; static int __init mtrr_cleanup(unsigned address_bits) { unsigned long extra_remove_base, extra_remove_size; - unsigned long i, base, size, def, dummy; + unsigned long base, size, def, dummy; mtrr_type type; int nr_range, nr_range_new; u64 chunk_size, gran_size; unsigned long range_sums, range_sums_new; int index_good; int num_reg_good; + int i; /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; @@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits) continue; if (!size) type = MTRR_NUM_TYPES; + if (type == MTRR_TYPE_WRPROT) + type = MTRR_TYPE_UNCACHABLE; num[type]++; } @@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits) num_var_ranges - num[MTRR_NUM_TYPES]) return 0; + /* print original var MTRRs at first, for debugging: */ + printk(KERN_DEBUG "original variable MTRRs\n"); + for (i = 0; i < num_var_ranges; i++) { + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + + size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); + if (!size_base) + continue; + + size_base = to_size_factor(size_base, &size_factor), + start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); + start_base = to_size_factor(start_base, &start_factor), + type = range_state[i].type; + + printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + i, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRPROT) ? "WP" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) + ); + } + memset(range, 0, sizeof(range)); extra_remove_size = 0; - if (mtrr_tom2) { - extra_remove_base = 1 << (32 - PAGE_SHIFT); + extra_remove_base = 1 << (32 - PAGE_SHIFT); + if (mtrr_tom2) extra_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; - } nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, extra_remove_size); + /* + * [0, 1M) should always be coverred by var mtrr with WB + * and fixed mtrrs should take effective before var mtrr for it + */ + nr_range = add_range_with_merge(range, nr_range, 0, + (1ULL<<(20 - PAGE_SHIFT)) - 1); + /* sort the ranges */ + sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + range_sums = sum_ranges(range, nr_range); printk(KERN_INFO "total RAM coverred: %ldM\n", range_sums >> (20 - PAGE_SHIFT)); if (mtrr_chunk_size && mtrr_gran_size) { int num_reg; + char gran_factor, chunk_factor, lose_factor; + unsigned long gran_base, chunk_base, lose_base; - debug_print = 1; + debug_print++; /* convert ranges to var ranges state */ num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, mtrr_gran_size); @@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits) result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT; - printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", - result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10, - result[i].chunk_sizek >> 10); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad?"*BAD*":" ", + gran_base, gran_factor, chunk_base, chunk_factor); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", result[i].num_reg, result[i].bad?"-":"", - result[i].lose_cover_sizek >> 10); + lose_base, lose_factor); if (!result[i].bad) { set_var_mtrr_all(address_bits); return 1; } printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " "will find optimal one\n"); - debug_print = 0; + debug_print--; memset(result, 0, sizeof(result[0])); } i = 0; memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); memset(result, 0, sizeof(result)); - for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { - for (chunk_size = gran_size; chunk_size < (1ULL<<33); + for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { + char gran_factor; + unsigned long gran_base; + + if (debug_print) + gran_base = to_size_factor(gran_size >> 10, &gran_factor); + + for (chunk_size = gran_size; chunk_size < (1ULL<<32); chunk_size <<= 1) { int num_reg; - if (debug_print) - printk(KERN_INFO - "\ngran_size: %lldM chunk_size_size: %lldM\n", - gran_size >> 20, chunk_size >> 20); + if (debug_print) { + char chunk_factor; + unsigned long chunk_base; + + chunk_base = to_size_factor(chunk_size>>10, &chunk_factor), + printk(KERN_INFO "\n"); + printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n", + gran_base, gran_factor, chunk_base, chunk_factor); + } if (i >= NUM_RESULT) continue; @@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits) /* print out all */ for (i = 0; i < NUM_RESULT; i++) { - printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", - result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, - result[i].chunk_sizek >> 10); - printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n", - result[i].num_reg, result[i].bad?"-":"", - result[i].lose_cover_sizek >> 10); + char gran_factor, chunk_factor, lose_factor; + unsigned long gran_base, chunk_base, lose_base; + + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad?"*BAD*":" ", + gran_base, gran_factor, chunk_base, chunk_factor); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad?"-":"", + lose_base, lose_factor); } /* try to find the optimal index */ @@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits) nr_mtrr_spare_reg = num_var_ranges - 1; num_reg_good = -1; for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { - if (!min_loss_pfn[i]) { + if (!min_loss_pfn[i]) num_reg_good = i; - break; - } } index_good = -1; @@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits) } if (index_good != -1) { + char gran_factor, chunk_factor, lose_factor; + unsigned long gran_base, chunk_base, lose_base; + printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); i = index_good; - printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", - result[i].gran_sizek >> 10, - result[i].chunk_sizek >> 10); - printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n", - result[i].num_reg, - result[i].lose_cover_sizek >> 10); + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t", + gran_base, gran_factor, chunk_base, chunk_factor); + printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n", + result[i].num_reg, lose_base, lose_factor); /* convert ranges to var ranges state */ chunk_size = result[i].chunk_sizek; chunk_size <<= 10; gran_size = result[i].gran_sizek; gran_size <<= 10; - debug_print = 1; + debug_print++; x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); + debug_print--; set_var_mtrr_all(address_bits); return 1; } @@ -1496,11 +1600,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* kvm/qemu doesn't have mtrr set right, don't trim them all */ if (!highest_pfn) { - if (!kvm_para_available()) { - printk(KERN_WARNING + WARN(!kvm_para_available(), KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); - WARN_ON(1); - } return 0; } diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 05cc22dbd4f..6bff382094f 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -295,13 +295,19 @@ static int setup_k7_watchdog(unsigned nmi_hz) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); + /* initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; wd->cccr_msr = 0; /* unused */ + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= K7_EVNTSEL_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + return 1; } @@ -379,13 +385,19 @@ static int setup_p6_watchdog(unsigned nmi_hz) wrmsr(evntsel_msr, evntsel, 0); nmi_hz = adjust_for_32bit_ctr(nmi_hz); write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= P6_EVNTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); + /* initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; wd->cccr_msr = 0; /* unused */ + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= P6_EVNTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + return 1; } @@ -432,6 +444,27 @@ static const struct wd_ops p6_wd_ops = { #define P4_CCCR_ENABLE (1 << 12) #define P4_CCCR_OVF (1 << 31) +#define P4_CONTROLS 18 +static unsigned int p4_controls[18] = { + MSR_P4_BPU_CCCR0, + MSR_P4_BPU_CCCR1, + MSR_P4_BPU_CCCR2, + MSR_P4_BPU_CCCR3, + MSR_P4_MS_CCCR0, + MSR_P4_MS_CCCR1, + MSR_P4_MS_CCCR2, + MSR_P4_MS_CCCR3, + MSR_P4_FLAME_CCCR0, + MSR_P4_FLAME_CCCR1, + MSR_P4_FLAME_CCCR2, + MSR_P4_FLAME_CCCR3, + MSR_P4_IQ_CCCR0, + MSR_P4_IQ_CCCR1, + MSR_P4_IQ_CCCR2, + MSR_P4_IQ_CCCR3, + MSR_P4_IQ_CCCR4, + MSR_P4_IQ_CCCR5, +}; /* * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter * CRU_ESCR0 (with any non-null event selector) through a complemented @@ -473,6 +506,26 @@ static int setup_p4_watchdog(unsigned nmi_hz) evntsel_msr = MSR_P4_CRU_ESCR0; cccr_msr = MSR_P4_IQ_CCCR0; cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); + + /* + * If we're on the kdump kernel or other situation, we may + * still have other performance counter registers set to + * interrupt and they'll keep interrupting forever because + * of the P4_CCCR_OVF quirk. So we need to ACK all the + * pending interrupts and disable all the registers here, + * before reenabling the NMI delivery. Refer to p4_rearm() + * about the P4_CCCR_OVF quirk. + */ + if (reset_devices) { + unsigned int low, high; + int i; + + for (i = 0; i < P4_CONTROLS; i++) { + rdmsr(p4_controls[i], low, high); + low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF); + wrmsr(p4_controls[i], low, high); + } + } } else { /* logical cpu 1 */ perfctr_msr = MSR_P4_IQ_PERFCTR1; @@ -499,12 +552,17 @@ static int setup_p4_watchdog(unsigned nmi_hz) wrmsr(evntsel_msr, evntsel, 0); wrmsr(cccr_msr, cccr_val, 0); write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); - apic_write(APIC_LVTPC, APIC_DM_NMI); - cccr_val |= P4_CCCR_ENABLE; - wrmsr(cccr_msr, cccr_val, 0); + wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; wd->cccr_msr = cccr_msr; + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + cccr_val |= P4_CCCR_ENABLE; + wrmsr(cccr_msr, cccr_val, 0); return 1; } @@ -620,13 +678,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) wrmsr(evntsel_msr, evntsel, 0); nmi_hz = adjust_for_32bit_ctr(nmi_hz); write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; wd->cccr_msr = 0; /* unused */ + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); return 1; } diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c new file mode 100644 index 00000000000..5abbea297e0 --- /dev/null +++ b/arch/x86/kernel/cpu/powerflags.c @@ -0,0 +1,20 @@ +/* + * Strings for the various x86 power flags + * + * This file must not contain any executable code. + */ + +#include <asm/cpufeature.h> + +const char *const x86_power_flags[32] = { + "ts", /* temperature sensor */ + "fid", /* frequency id control */ + "vid", /* voltage id control */ + "ttp", /* thermal trip */ + "tm", + "stc", + "100mhzsteps", + "hwpstate", + "", /* tsc invariant mapped to constant_tsc */ + /* nothing */ +}; diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index b911a2c61b8..52b3fefbd5a 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -5,6 +5,18 @@ #include <asm/msr.h> #include "cpu.h" +static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c) +{ + u32 xlvl; + + /* Transmeta-defined flags: level 0x80860001 */ + xlvl = cpuid_eax(0x80860000); + if ((xlvl & 0xffff0000) == 0x80860000) { + if (xlvl >= 0x80860001) + c->x86_capability[2] = cpuid_edx(0x80860001); + } +} + static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) { unsigned int cap_mask, uk, max, dummy; @@ -12,7 +24,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; char cpu_info[65]; - get_model_name(c); /* Same as AMD/Cyrix */ + early_init_transmeta(c); + display_cacheinfo(c); /* Print CMS and CPU revision */ @@ -85,23 +98,12 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) #endif } -static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c) -{ - u32 xlvl; - - /* Transmeta-defined flags: level 0x80860001 */ - xlvl = cpuid_eax(0x80860000); - if ((xlvl & 0xffff0000) == 0x80860000) { - if (xlvl >= 0x80860001) - c->x86_capability[2] = cpuid_edx(0x80860001); - } -} - static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { .c_vendor = "Transmeta", .c_ident = { "GenuineTMx86", "TransmetaCPU" }, + .c_early_init = early_init_transmeta, .c_init = init_transmeta, - .c_identify = transmeta_identify, + .c_x86_vendor = X86_VENDOR_TRANSMETA, }; -cpu_vendor_dev_register(X86_VENDOR_TRANSMETA, &transmeta_cpu_dev); +cpu_dev_register(transmeta_cpu_dev); diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index b1fc90989d7..e777f79e096 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c @@ -19,7 +19,8 @@ static struct cpu_dev umc_cpu_dev __cpuinitdata = { } }, }, + .c_x86_vendor = X86_VENDOR_UMC, }; -cpu_vendor_dev_register(X86_VENDOR_UMC, &umc_cpu_dev); +cpu_dev_register(umc_cpu_dev); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 14b11b3be31..72cefd1e649 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -36,7 +36,6 @@ #include <linux/smp_lock.h> #include <linux/major.h> #include <linux/fs.h> -#include <linux/smp_lock.h> #include <linux/device.h> #include <linux/cpu.h> #include <linux/notifier.h> @@ -89,6 +88,8 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, struct cpuid_regs cmd; int cpu = iminor(file->f_path.dentry->d_inode); u64 pos = *ppos; + ssize_t bytes = 0; + int err = 0; if (count % 16) return -EINVAL; /* Invalid chunk size */ @@ -96,14 +97,19 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, for (; count; count -= 16) { cmd.eax = pos; cmd.ecx = pos >> 32; - smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); - if (copy_to_user(tmp, &cmd, 16)) - return -EFAULT; + err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); + if (err) + break; + if (copy_to_user(tmp, &cmd, 16)) { + err = -EFAULT; + break; + } tmp += 16; + bytes += 16; *ppos = ++pos; } - return tmp - buf; + return bytes ? bytes : err; } static int cpuid_open(struct inode *inode, struct file *file) @@ -141,8 +147,8 @@ static __cpuinit int cpuid_device_create(int cpu) { struct device *dev; - dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), - NULL, "cpu%d", cpu); + dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, + "cpu%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 15e6c6bc4a4..e90a60ef10c 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -7,9 +7,8 @@ #include <linux/errno.h> #include <linux/crash_dump.h> - -#include <asm/uaccess.h> -#include <asm/io.h> +#include <linux/uaccess.h> +#include <linux/io.h> /** * copy_oldmem_page - copy one page from "oldmem" @@ -25,7 +24,7 @@ * in the current kernel. We stitch up a pte, similar to kmap_atomic. */ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) + size_t csize, unsigned long offset, int userbuf) { void *vaddr; @@ -33,14 +32,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, return 0; vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); + if (!vaddr) + return -ENOMEM; if (userbuf) { - if (copy_to_user(buf, (vaddr + offset), csize)) { + if (copy_to_user(buf, vaddr + offset, csize)) { iounmap(vaddr); return -EFAULT; } } else - memcpy(buf, (vaddr + offset), csize); + memcpy(buf, vaddr + offset, csize); iounmap(vaddr); return csize; diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index a47798b59f0..b4f14c6c09d 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = { .ds = __USER_DS, .fs = __KERNEL_PERCPU, - .__cr3 = __pa(swapper_pg_dir) + .__cr3 = __pa_nodebug(swapper_pg_dir), } }; diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 11c11b8ec48..2b69994fd3a 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -2,26 +2,49 @@ * Debug Store support * * This provides a low-level interface to the hardware's Debug Store - * feature that is used for last branch recording (LBR) and + * feature that is used for branch trace store (BTS) and * precise-event based sampling (PEBS). * - * Different architectures use a different DS layout/pointer size. - * The below functions therefore work on a void*. + * It manages: + * - per-thread and per-cpu allocation of BTS and PEBS + * - buffer memory allocation (optional) + * - buffer overflow handling + * - buffer access * + * It assumes: + * - get_task_struct on all parameter tasks + * - current is allowed to trace parameter tasks * - * Since there is no user for PEBS, yet, only LBR (or branch - * trace store, BTS) is supported. * - * - * Copyright (C) 2007 Intel Corporation. - * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007 + * Copyright (C) 2007-2008 Intel Corporation. + * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008 */ + +#ifdef CONFIG_X86_DS + #include <asm/ds.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/slab.h> +#include <linux/sched.h> +#include <linux/mm.h> + + +/* + * The configuration for a particular DS hardware implementation. + */ +struct ds_configuration { + /* the size of the DS structure in bytes */ + unsigned char sizeof_ds; + /* the size of one pointer-typed field in the DS structure in bytes; + this covers the first 8 fields related to buffer management. */ + unsigned char sizeof_field; + /* the size of a BTS/PEBS record in bytes */ + unsigned char sizeof_rec[2]; +}; +static struct ds_configuration ds_cfg; /* @@ -44,378 +67,747 @@ * (interrupt occurs when write pointer passes interrupt pointer) * - value to which counter is reset following counter overflow * - * On later architectures, the last branch recording hardware uses - * 64bit pointers even in 32bit mode. - * - * - * Branch Trace Store (BTS) records store information about control - * flow changes. They at least provide the following information: - * - source linear address - * - destination linear address + * Later architectures use 64bit pointers throughout, whereas earlier + * architectures use 32bit pointers in 32bit mode. * - * Netburst supported a predicated bit that had been dropped in later - * architectures. We do not suppor it. * + * We compute the base address for the first 8 fields based on: + * - the field size stored in the DS configuration + * - the relative field position + * - an offset giving the start of the respective region * - * In order to abstract from the actual DS and BTS layout, we describe - * the access to the relevant fields. - * Thanks to Andi Kleen for proposing this design. + * This offset is further used to index various arrays holding + * information for BTS and PEBS at the respective index. * - * The implementation, however, is not as general as it might seem. In - * order to stay somewhat simple and efficient, we assume an - * underlying unsigned type (mostly a pointer type) and we expect the - * field to be at least as big as that type. + * On later 32bit processors, we only access the lower 32bit of the + * 64bit pointer fields. The upper halves will be zeroed out. */ -/* - * A special from_ip address to indicate that the BTS record is an - * info record that needs to be interpreted or skipped. - */ -#define BTS_ESCAPE_ADDRESS (-1) +enum ds_field { + ds_buffer_base = 0, + ds_index, + ds_absolute_maximum, + ds_interrupt_threshold, +}; -/* - * A field access descriptor - */ -struct access_desc { - unsigned char offset; - unsigned char size; +enum ds_qualifier { + ds_bts = 0, + ds_pebs }; +static inline unsigned long ds_get(const unsigned char *base, + enum ds_qualifier qual, enum ds_field field) +{ + base += (ds_cfg.sizeof_field * (field + (4 * qual))); + return *(unsigned long *)base; +} + +static inline void ds_set(unsigned char *base, enum ds_qualifier qual, + enum ds_field field, unsigned long value) +{ + base += (ds_cfg.sizeof_field * (field + (4 * qual))); + (*(unsigned long *)base) = value; +} + + /* - * The configuration for a particular DS/BTS hardware implementation. + * Locking is done only for allocating BTS or PEBS resources and for + * guarding context and buffer memory allocation. + * + * Most functions require the current task to own the ds context part + * they are going to access. All the locking is done when validating + * access to the context. */ -struct ds_configuration { - /* the DS configuration */ - unsigned char sizeof_ds; - struct access_desc bts_buffer_base; - struct access_desc bts_index; - struct access_desc bts_absolute_maximum; - struct access_desc bts_interrupt_threshold; - /* the BTS configuration */ - unsigned char sizeof_bts; - struct access_desc from_ip; - struct access_desc to_ip; - /* BTS variants used to store additional information like - timestamps */ - struct access_desc info_type; - struct access_desc info_data; - unsigned long debugctl_mask; -}; +static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); /* - * The global configuration used by the below accessor functions + * Validate that the current task is allowed to access the BTS/PEBS + * buffer of the parameter task. + * + * Returns 0, if access is granted; -Eerrno, otherwise. */ -static struct ds_configuration ds_cfg; +static inline int ds_validate_access(struct ds_context *context, + enum ds_qualifier qual) +{ + if (!context) + return -EPERM; + + if (context->owner[qual] == current) + return 0; + + return -EPERM; +} + /* - * Accessor functions for some DS and BTS fields using the above - * global ptrace_bts_cfg. + * We either support (system-wide) per-cpu or per-thread allocation. + * We distinguish the two based on the task_struct pointer, where a + * NULL pointer indicates per-cpu allocation for the current cpu. + * + * Allocations are use-counted. As soon as resources are allocated, + * further allocations must be of the same type (per-cpu or + * per-thread). We model this by counting allocations (i.e. the number + * of tracers of a certain type) for one type negatively: + * =0 no tracers + * >0 number of per-thread tracers + * <0 number of per-cpu tracers + * + * The below functions to get and put tracers and to check the + * allocation type require the ds_lock to be held by the caller. + * + * Tracers essentially gives the number of ds contexts for a certain + * type of allocation. */ -static inline unsigned long get_bts_buffer_base(char *base) +static long tracers; + +static inline void get_tracer(struct task_struct *task) { - return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); + tracers += (task ? 1 : -1); } -static inline void set_bts_buffer_base(char *base, unsigned long value) + +static inline void put_tracer(struct task_struct *task) { - (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; + tracers -= (task ? 1 : -1); } -static inline unsigned long get_bts_index(char *base) + +static inline int check_tracer(struct task_struct *task) { - return *(unsigned long *)(base + ds_cfg.bts_index.offset); + return (task ? (tracers >= 0) : (tracers <= 0)); } -static inline void set_bts_index(char *base, unsigned long value) + + +/* + * The DS context is either attached to a thread or to a cpu: + * - in the former case, the thread_struct contains a pointer to the + * attached context. + * - in the latter case, we use a static array of per-cpu context + * pointers. + * + * Contexts are use-counted. They are allocated on first access and + * deallocated when the last user puts the context. + * + * We distinguish between an allocating and a non-allocating get of a + * context: + * - the allocating get is used for requesting BTS/PEBS resources. It + * requires the caller to hold the global ds_lock. + * - the non-allocating get is used for all other cases. A + * non-existing context indicates an error. It acquires and releases + * the ds_lock itself for obtaining the context. + * + * A context and its DS configuration are allocated and deallocated + * together. A context always has a DS configuration of the + * appropriate size. + */ +static DEFINE_PER_CPU(struct ds_context *, system_context); + +#define this_system_context per_cpu(system_context, smp_processor_id()) + +/* + * Returns the pointer to the parameter task's context or to the + * system-wide context, if task is NULL. + * + * Increases the use count of the returned context, if not NULL. + */ +static inline struct ds_context *ds_get_context(struct task_struct *task) { - (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; + struct ds_context *context; + + spin_lock(&ds_lock); + + context = (task ? task->thread.ds_ctx : this_system_context); + if (context) + context->count++; + + spin_unlock(&ds_lock); + + return context; } -static inline unsigned long get_bts_absolute_maximum(char *base) + +/* + * Same as ds_get_context, but allocates the context and it's DS + * structure, if necessary; returns NULL; if out of memory. + * + * pre: requires ds_lock to be held + */ +static inline struct ds_context *ds_alloc_context(struct task_struct *task) { - return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); + struct ds_context **p_context = + (task ? &task->thread.ds_ctx : &this_system_context); + struct ds_context *context = *p_context; + + if (!context) { + context = kzalloc(sizeof(*context), GFP_KERNEL); + + if (!context) + return NULL; + + context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); + if (!context->ds) { + kfree(context); + return NULL; + } + + *p_context = context; + + context->this = p_context; + context->task = task; + + if (task) + set_tsk_thread_flag(task, TIF_DS_AREA_MSR); + + if (!task || (task == current)) + wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0); + + get_tracer(task); + } + + context->count++; + + return context; } -static inline void set_bts_absolute_maximum(char *base, unsigned long value) + +/* + * Decreases the use count of the parameter context, if not NULL. + * Deallocates the context, if the use count reaches zero. + */ +static inline void ds_put_context(struct ds_context *context) { - (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; + if (!context) + return; + + spin_lock(&ds_lock); + + if (--context->count) + goto out; + + *(context->this) = NULL; + + if (context->task) + clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + + if (!context->task || (context->task == current)) + wrmsrl(MSR_IA32_DS_AREA, 0); + + put_tracer(context->task); + + /* free any leftover buffers from tracers that did not + * deallocate them properly. */ + kfree(context->buffer[ds_bts]); + kfree(context->buffer[ds_pebs]); + kfree(context->ds); + kfree(context); + out: + spin_unlock(&ds_lock); } -static inline unsigned long get_bts_interrupt_threshold(char *base) + + +/* + * Handle a buffer overflow + * + * task: the task whose buffers are overflowing; + * NULL for a buffer overflow on the current cpu + * context: the ds context + * qual: the buffer type + */ +static void ds_overflow(struct task_struct *task, struct ds_context *context, + enum ds_qualifier qual) { - return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); + if (!context) + return; + + if (context->callback[qual]) + (*context->callback[qual])(task); + + /* todo: do some more overflow handling */ } -static inline void set_bts_interrupt_threshold(char *base, unsigned long value) + + +/* + * Allocate a non-pageable buffer of the parameter size. + * Checks the memory and the locked memory rlimit. + * + * Returns the buffer, if successful; + * NULL, if out of memory or rlimit exceeded. + * + * size: the requested buffer size in bytes + * pages (out): if not NULL, contains the number of pages reserved + */ +static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) { - (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; + unsigned long rlim, vm, pgsz; + void *buffer; + + pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = current->mm->total_vm + pgsz; + if (rlim < vm) + return NULL; + + rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = current->mm->locked_vm + pgsz; + if (rlim < vm) + return NULL; + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + return NULL; + + current->mm->total_vm += pgsz; + current->mm->locked_vm += pgsz; + + if (pages) + *pages = pgsz; + + return buffer; } -static inline unsigned long get_from_ip(char *base) + +static int ds_request(struct task_struct *task, void *base, size_t size, + ds_ovfl_callback_t ovfl, enum ds_qualifier qual) { - return *(unsigned long *)(base + ds_cfg.from_ip.offset); + struct ds_context *context; + unsigned long buffer, adj; + const unsigned long alignment = (1 << 3); + int error = 0; + + if (!ds_cfg.sizeof_ds) + return -EOPNOTSUPP; + + /* we require some space to do alignment adjustments below */ + if (size < (alignment + ds_cfg.sizeof_rec[qual])) + return -EINVAL; + + /* buffer overflow notification is not yet implemented */ + if (ovfl) + return -EOPNOTSUPP; + + + spin_lock(&ds_lock); + + if (!check_tracer(task)) + return -EPERM; + + error = -ENOMEM; + context = ds_alloc_context(task); + if (!context) + goto out_unlock; + + error = -EALREADY; + if (context->owner[qual] == current) + goto out_unlock; + error = -EPERM; + if (context->owner[qual] != NULL) + goto out_unlock; + context->owner[qual] = current; + + spin_unlock(&ds_lock); + + + error = -ENOMEM; + if (!base) { + base = ds_allocate_buffer(size, &context->pages[qual]); + if (!base) + goto out_release; + + context->buffer[qual] = base; + } + error = 0; + + context->callback[qual] = ovfl; + + /* adjust the buffer address and size to meet alignment + * constraints: + * - buffer is double-word aligned + * - size is multiple of record size + * + * We checked the size at the very beginning; we have enough + * space to do the adjustment. + */ + buffer = (unsigned long)base; + + adj = ALIGN(buffer, alignment) - buffer; + buffer += adj; + size -= adj; + + size /= ds_cfg.sizeof_rec[qual]; + size *= ds_cfg.sizeof_rec[qual]; + + ds_set(context->ds, qual, ds_buffer_base, buffer); + ds_set(context->ds, qual, ds_index, buffer); + ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); + + if (ovfl) { + /* todo: select a suitable interrupt threshold */ + } else + ds_set(context->ds, qual, + ds_interrupt_threshold, buffer + size + 1); + + /* we keep the context until ds_release */ + return error; + + out_release: + context->owner[qual] = NULL; + ds_put_context(context); + return error; + + out_unlock: + spin_unlock(&ds_lock); + ds_put_context(context); + return error; } -static inline void set_from_ip(char *base, unsigned long value) + +int ds_request_bts(struct task_struct *task, void *base, size_t size, + ds_ovfl_callback_t ovfl) { - (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; + return ds_request(task, base, size, ovfl, ds_bts); } -static inline unsigned long get_to_ip(char *base) + +int ds_request_pebs(struct task_struct *task, void *base, size_t size, + ds_ovfl_callback_t ovfl) { - return *(unsigned long *)(base + ds_cfg.to_ip.offset); + return ds_request(task, base, size, ovfl, ds_pebs); } -static inline void set_to_ip(char *base, unsigned long value) + +static int ds_release(struct task_struct *task, enum ds_qualifier qual) { - (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; + struct ds_context *context; + int error; + + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + + kfree(context->buffer[qual]); + context->buffer[qual] = NULL; + + current->mm->total_vm -= context->pages[qual]; + current->mm->locked_vm -= context->pages[qual]; + context->pages[qual] = 0; + context->owner[qual] = NULL; + + /* + * we put the context twice: + * once for the ds_get_context + * once for the corresponding ds_request + */ + ds_put_context(context); + out: + ds_put_context(context); + return error; } -static inline unsigned char get_info_type(char *base) + +int ds_release_bts(struct task_struct *task) { - return *(unsigned char *)(base + ds_cfg.info_type.offset); + return ds_release(task, ds_bts); } -static inline void set_info_type(char *base, unsigned char value) + +int ds_release_pebs(struct task_struct *task) { - (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; + return ds_release(task, ds_pebs); } -static inline unsigned long get_info_data(char *base) + +static int ds_get_index(struct task_struct *task, size_t *pos, + enum ds_qualifier qual) { - return *(unsigned long *)(base + ds_cfg.info_data.offset); + struct ds_context *context; + unsigned long base, index; + int error; + + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + + base = ds_get(context->ds, qual, ds_buffer_base); + index = ds_get(context->ds, qual, ds_index); + + error = ((index - base) / ds_cfg.sizeof_rec[qual]); + if (pos) + *pos = error; + out: + ds_put_context(context); + return error; } -static inline void set_info_data(char *base, unsigned long value) + +int ds_get_bts_index(struct task_struct *task, size_t *pos) { - (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; + return ds_get_index(task, pos, ds_bts); } +int ds_get_pebs_index(struct task_struct *task, size_t *pos) +{ + return ds_get_index(task, pos, ds_pebs); +} -int ds_allocate(void **dsp, size_t bts_size_in_bytes) +static int ds_get_end(struct task_struct *task, size_t *pos, + enum ds_qualifier qual) { - size_t bts_size_in_records; - unsigned long bts; - void *ds; + struct ds_context *context; + unsigned long base, end; + int error; + + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + + base = ds_get(context->ds, qual, ds_buffer_base); + end = ds_get(context->ds, qual, ds_absolute_maximum); + + error = ((end - base) / ds_cfg.sizeof_rec[qual]); + if (pos) + *pos = error; + out: + ds_put_context(context); + return error; +} - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; +int ds_get_bts_end(struct task_struct *task, size_t *pos) +{ + return ds_get_end(task, pos, ds_bts); +} - if (bts_size_in_bytes < 0) - return -EINVAL; +int ds_get_pebs_end(struct task_struct *task, size_t *pos) +{ + return ds_get_end(task, pos, ds_pebs); +} - bts_size_in_records = - bts_size_in_bytes / ds_cfg.sizeof_bts; - bts_size_in_bytes = - bts_size_in_records * ds_cfg.sizeof_bts; +static int ds_access(struct task_struct *task, size_t index, + const void **record, enum ds_qualifier qual) +{ + struct ds_context *context; + unsigned long base, idx; + int error; - if (bts_size_in_bytes <= 0) + if (!record) return -EINVAL; - bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); - - if (!bts) - return -ENOMEM; + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; - ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); + base = ds_get(context->ds, qual, ds_buffer_base); + idx = base + (index * ds_cfg.sizeof_rec[qual]); - if (!ds) { - kfree((void *)bts); - return -ENOMEM; - } - - set_bts_buffer_base(ds, bts); - set_bts_index(ds, bts); - set_bts_absolute_maximum(ds, bts + bts_size_in_bytes); - set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1); + error = -EINVAL; + if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) + goto out; - *dsp = ds; - return 0; + *record = (const void *)idx; + error = ds_cfg.sizeof_rec[qual]; + out: + ds_put_context(context); + return error; } -int ds_free(void **dsp) +int ds_access_bts(struct task_struct *task, size_t index, const void **record) { - if (*dsp) { - kfree((void *)get_bts_buffer_base(*dsp)); - kfree(*dsp); - *dsp = NULL; - } - return 0; + return ds_access(task, index, record, ds_bts); } -int ds_get_bts_size(void *ds) +int ds_access_pebs(struct task_struct *task, size_t index, const void **record) { - int size_in_bytes; - - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; - - if (!ds) - return 0; - - size_in_bytes = - get_bts_absolute_maximum(ds) - - get_bts_buffer_base(ds); - return size_in_bytes; + return ds_access(task, index, record, ds_pebs); } -int ds_get_bts_end(void *ds) +static int ds_write(struct task_struct *task, const void *record, size_t size, + enum ds_qualifier qual, int force) { - int size_in_bytes = ds_get_bts_size(ds); - - if (size_in_bytes <= 0) - return size_in_bytes; + struct ds_context *context; + int error; - return size_in_bytes / ds_cfg.sizeof_bts; -} + if (!record) + return -EINVAL; -int ds_get_bts_index(void *ds) -{ - int index_offset_in_bytes; + error = -EPERM; + context = ds_get_context(task); + if (!context) + goto out; - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; + if (!force) { + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + } - index_offset_in_bytes = - get_bts_index(ds) - - get_bts_buffer_base(ds); + error = 0; + while (size) { + unsigned long base, index, end, write_end, int_th; + unsigned long write_size, adj_write_size; + + /* + * write as much as possible without producing an + * overflow interrupt. + * + * interrupt_threshold must either be + * - bigger than absolute_maximum or + * - point to a record between buffer_base and absolute_maximum + * + * index points to a valid record. + */ + base = ds_get(context->ds, qual, ds_buffer_base); + index = ds_get(context->ds, qual, ds_index); + end = ds_get(context->ds, qual, ds_absolute_maximum); + int_th = ds_get(context->ds, qual, ds_interrupt_threshold); + + write_end = min(end, int_th); + + /* if we are already beyond the interrupt threshold, + * we fill the entire buffer */ + if (write_end <= index) + write_end = end; + + if (write_end <= index) + goto out; + + write_size = min((unsigned long) size, write_end - index); + memcpy((void *)index, record, write_size); + + record = (const char *)record + write_size; + size -= write_size; + error += write_size; + + adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; + adj_write_size *= ds_cfg.sizeof_rec[qual]; + + /* zero out trailing bytes */ + memset((char *)index + write_size, 0, + adj_write_size - write_size); + index += adj_write_size; + + if (index >= end) + index = base; + ds_set(context->ds, qual, ds_index, index); + + if (index >= int_th) + ds_overflow(task, context, qual); + } - return index_offset_in_bytes / ds_cfg.sizeof_bts; + out: + ds_put_context(context); + return error; } -int ds_set_overflow(void *ds, int method) +int ds_write_bts(struct task_struct *task, const void *record, size_t size) { - switch (method) { - case DS_O_SIGNAL: - return -EOPNOTSUPP; - case DS_O_WRAP: - return 0; - default: - return -EINVAL; - } + return ds_write(task, record, size, ds_bts, /* force = */ 0); } -int ds_get_overflow(void *ds) +int ds_write_pebs(struct task_struct *task, const void *record, size_t size) { - return DS_O_WRAP; + return ds_write(task, record, size, ds_pebs, /* force = */ 0); } -int ds_clear(void *ds) +int ds_unchecked_write_bts(struct task_struct *task, + const void *record, size_t size) { - int bts_size = ds_get_bts_size(ds); - unsigned long bts_base; - - if (bts_size <= 0) - return bts_size; - - bts_base = get_bts_buffer_base(ds); - memset((void *)bts_base, 0, bts_size); - - set_bts_index(ds, bts_base); - return 0; + return ds_write(task, record, size, ds_bts, /* force = */ 1); } -int ds_read_bts(void *ds, int index, struct bts_struct *out) +int ds_unchecked_write_pebs(struct task_struct *task, + const void *record, size_t size) { - void *bts; + return ds_write(task, record, size, ds_pebs, /* force = */ 1); +} - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; +static int ds_reset_or_clear(struct task_struct *task, + enum ds_qualifier qual, int clear) +{ + struct ds_context *context; + unsigned long base, end; + int error; - if (index < 0) - return -EINVAL; + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; - if (index >= ds_get_bts_size(ds)) - return -EINVAL; + base = ds_get(context->ds, qual, ds_buffer_base); + end = ds_get(context->ds, qual, ds_absolute_maximum); - bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); + if (clear) + memset((void *)base, 0, end - base); - memset(out, 0, sizeof(*out)); - if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) { - out->qualifier = get_info_type(bts); - out->variant.jiffies = get_info_data(bts); - } else { - out->qualifier = BTS_BRANCH; - out->variant.lbr.from_ip = get_from_ip(bts); - out->variant.lbr.to_ip = get_to_ip(bts); - } + ds_set(context->ds, qual, ds_index, base); - return sizeof(*out);; + error = 0; + out: + ds_put_context(context); + return error; } -int ds_write_bts(void *ds, const struct bts_struct *in) +int ds_reset_bts(struct task_struct *task) { - unsigned long bts; - - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; - - if (ds_get_bts_size(ds) <= 0) - return -ENXIO; + return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); +} - bts = get_bts_index(ds); +int ds_reset_pebs(struct task_struct *task) +{ + return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); +} - memset((void *)bts, 0, ds_cfg.sizeof_bts); - switch (in->qualifier) { - case BTS_INVALID: - break; +int ds_clear_bts(struct task_struct *task) +{ + return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); +} - case BTS_BRANCH: - set_from_ip((void *)bts, in->variant.lbr.from_ip); - set_to_ip((void *)bts, in->variant.lbr.to_ip); - break; +int ds_clear_pebs(struct task_struct *task) +{ + return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); +} - case BTS_TASK_ARRIVES: - case BTS_TASK_DEPARTS: - set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); - set_info_type((void *)bts, in->qualifier); - set_info_data((void *)bts, in->variant.jiffies); - break; +int ds_get_pebs_reset(struct task_struct *task, u64 *value) +{ + struct ds_context *context; + int error; - default: + if (!value) return -EINVAL; - } - bts = bts + ds_cfg.sizeof_bts; - if (bts >= get_bts_absolute_maximum(ds)) - bts = get_bts_buffer_base(ds); - set_bts_index(ds, bts); + context = ds_get_context(task); + error = ds_validate_access(context, ds_pebs); + if (error < 0) + goto out; - return ds_cfg.sizeof_bts; + *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); + + error = 0; + out: + ds_put_context(context); + return error; } -unsigned long ds_debugctl_mask(void) +int ds_set_pebs_reset(struct task_struct *task, u64 value) { - return ds_cfg.debugctl_mask; -} + struct ds_context *context; + int error; -#ifdef __i386__ -static const struct ds_configuration ds_cfg_netburst = { - .sizeof_ds = 9 * 4, - .bts_buffer_base = { 0, 4 }, - .bts_index = { 4, 4 }, - .bts_absolute_maximum = { 8, 4 }, - .bts_interrupt_threshold = { 12, 4 }, - .sizeof_bts = 3 * 4, - .from_ip = { 0, 4 }, - .to_ip = { 4, 4 }, - .info_type = { 4, 1 }, - .info_data = { 8, 4 }, - .debugctl_mask = (1<<2)|(1<<3) -}; + context = ds_get_context(task); + error = ds_validate_access(context, ds_pebs); + if (error < 0) + goto out; -static const struct ds_configuration ds_cfg_pentium_m = { - .sizeof_ds = 9 * 4, - .bts_buffer_base = { 0, 4 }, - .bts_index = { 4, 4 }, - .bts_absolute_maximum = { 8, 4 }, - .bts_interrupt_threshold = { 12, 4 }, - .sizeof_bts = 3 * 4, - .from_ip = { 0, 4 }, - .to_ip = { 4, 4 }, - .info_type = { 4, 1 }, - .info_data = { 8, 4 }, - .debugctl_mask = (1<<6)|(1<<7) + *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; + + error = 0; + out: + ds_put_context(context); + return error; +} + +static const struct ds_configuration ds_cfg_var = { + .sizeof_ds = sizeof(long) * 12, + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, + .sizeof_rec[ds_pebs] = sizeof(long) * 10 }; -#endif /* _i386_ */ - -static const struct ds_configuration ds_cfg_core2 = { - .sizeof_ds = 9 * 8, - .bts_buffer_base = { 0, 8 }, - .bts_index = { 8, 8 }, - .bts_absolute_maximum = { 16, 8 }, - .bts_interrupt_threshold = { 24, 8 }, - .sizeof_bts = 3 * 8, - .from_ip = { 0, 8 }, - .to_ip = { 8, 8 }, - .info_type = { 8, 1 }, - .info_data = { 16, 8 }, - .debugctl_mask = (1<<6)|(1<<7)|(1<<9) +static const struct ds_configuration ds_cfg_64 = { + .sizeof_ds = 8 * 12, + .sizeof_field = 8, + .sizeof_rec[ds_bts] = 8 * 3, + .sizeof_rec[ds_pebs] = 8 * 10 }; static inline void @@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) switch (c->x86) { case 0x6: switch (c->x86_model) { -#ifdef __i386__ case 0xD: case 0xE: /* Pentium M */ - ds_configure(&ds_cfg_pentium_m); + ds_configure(&ds_cfg_var); break; -#endif /* _i386_ */ case 0xF: /* Core2 */ - ds_configure(&ds_cfg_core2); + case 0x1C: /* Atom */ + ds_configure(&ds_cfg_64); break; default: /* sorry, don't know about them */ @@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) break; case 0xF: switch (c->x86_model) { -#ifdef __i386__ case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_netburst); + ds_configure(&ds_cfg_var); break; -#endif /* _i386_ */ default: /* sorry, don't know about them */ break; @@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) break; } } + +void ds_free(struct ds_context *context) +{ + /* This is called when the task owning the parameter context + * is dying. There should not be any user of that context left + * to disturb us, anymore. */ + unsigned long leftovers = context->count; + while (leftovers--) + ds_put_context(context); +} +#endif /* CONFIG_X86_DS */ diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c new file mode 100644 index 00000000000..1a78180f08d --- /dev/null +++ b/arch/x86/kernel/dumpstack_32.c @@ -0,0 +1,449 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/utsname.h> +#include <linux/hardirq.h> +#include <linux/kdebug.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/kexec.h> +#include <linux/bug.h> +#include <linux/nmi.h> +#include <linux/sysfs.h> + +#include <asm/stacktrace.h> + +#define STACKSLOTS_PER_LINE 8 +#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) + +int panic_on_unrecovered_nmi; +int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; +static unsigned int code_bytes = 64; +static int die_counter; + +void printk_address(unsigned long address, int reliable) +{ + printk(" [<%p>] %s%pS\n", (void *) address, + reliable ? "" : "? ", (void *) address); +} + +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) +{ + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + +static inline unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + sizeof(long)) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; + } + return bp; +} + +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data) +{ + if (!task) + task = current; + + if (!stack) { + unsigned long dummy; + stack = &dummy; + if (task && task != current) + stack = (unsigned long *)task->thread.sp; + } + +#ifdef CONFIG_FRAME_POINTER + if (!bp) { + if (task == current) { + /* Grab bp right from our regs */ + get_bp(bp); + } else { + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) task->thread.sp; + } + } +#endif + + for (;;) { + struct thread_info *context; + + context = (struct thread_info *) + ((unsigned long)stack & (~(THREAD_SIZE - 1))); + bp = print_context_stack(context, stack, bp, ops, data, NULL); + + stack = (unsigned long *)context->previous_esp; + if (!stack) + break; + if (ops->stack(data, "IRQ") < 0) + break; + touch_nmi_watchdog(); + } +} +EXPORT_SYMBOL(dump_trace); + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk(data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk("%s <%s> ", (char *)data, name); + return 0; +} + +/* + * Print one address/symbol entries per line. + */ +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ + touch_nmi_watchdog(); + printk(data); + printk_address(addr, reliable); +} + +static const struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +static void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) +{ + printk("%sCall Trace:\n", log_lvl); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + +static void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl) +{ + unsigned long *stack; + int i; + + if (sp == NULL) { + if (task) + sp = (unsigned long *)task->thread.sp; + else + sp = (unsigned long *)&sp; + } + + stack = sp; + for (i = 0; i < kstack_depth_to_print; i++) { + if (kstack_end(stack)) + break; + if (i && ((i % STACKSLOTS_PER_LINE) == 0)) + printk("\n%s", log_lvl); + printk(" %08lx", *stack++); + touch_nmi_watchdog(); + } + printk("\n"); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_log_lvl(task, NULL, sp, 0, ""); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long bp = 0; + unsigned long stack; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + get_bp(bp); +#endif + + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + show_trace(NULL, NULL, &stack, bp); +} + +EXPORT_SYMBOL(dump_stack); + +void show_registers(struct pt_regs *regs) +{ + int i; + + print_modules(); + __show_regs(regs, 0); + + printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", + TASK_COMM_LEN, current->comm, task_pid_nr(current), + current_thread_info(), current, task_thread_info(current)); + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. + */ + if (!user_mode_vm(regs)) { + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; + unsigned char c; + u8 *ip; + + printk(KERN_EMERG "Stack:\n"); + show_stack_log_lvl(NULL, regs, ®s->sp, + 0, KERN_EMERG); + + printk(KERN_EMERG "Code: "); + + ip = (u8 *)regs->ip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at IP */ + ip = (u8 *)regs->ip; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { + printk(" Bad EIP value."); + break; + } + if (ip == (u8 *)regs->ip) + printk("<%02x> ", c); + else + printk("%02x ", c); + } + } + printk("\n"); +} + +int is_valid_bugaddr(unsigned long ip) +{ + unsigned short ud2; + + if (ip < PAGE_OFFSET) + return 0; + if (probe_kernel_address((unsigned short *)ip, ud2)) + return 0; + + return ud2 == 0x0b0f; +} + +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + unsigned long flags; + + oops_enter(); + + if (die_owner != raw_smp_processor_id()) { + console_verbose(); + raw_local_irq_save(flags); + __raw_spin_lock(&die_lock); + die_owner = smp_processor_id(); + die_nest_count = 0; + bust_spinlocks(1); + } else { + raw_local_irq_save(flags); + } + die_nest_count++; + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + + if (!regs) + return; + + if (kexec_should_crash(current)) + crash_kexec(regs); + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); + oops_exit(); + do_exit(signr); +} + +int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ + unsigned short ss; + unsigned long sp; + + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + sysfs_printk_last_file(); + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; + + show_registers(regs); + /* Executive summary in case the oops scrolled away */ + sp = (unsigned long) (®s->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); + return 0; +} + +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + + if (die_nest_count < 3) { + report_bug(regs->ip, regs); + + if (__die(str, regs, err)) + regs = NULL; + } else { + printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); + } + + oops_end(flags, regs, SIGSEGV); +} + +static DEFINE_SPINLOCK(nmi_print_lock); + +void notrace __kprobes +die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + spin_lock(&nmi_print_lock); + /* + * We are in trouble anyway, lets at least try + * to get a message out: + */ + bust_spinlocks(1); + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + if (do_panic) + panic("Non maskable interrupt"); + console_silent(); + spin_unlock(&nmi_print_lock); + bust_spinlocks(0); + + /* + * If we are in kernel we are probably nested up pretty bad + * and might aswell get out now while we still can: + */ + if (!user_mode_vm(regs)) { + current->thread.trap_no = 2; + crash_kexec(regs); + } + + do_exit(SIGSEGV); +} + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); + +static int __init kstack_setup(char *s) +{ + if (!s) + return -EINVAL; + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 0; +} +early_param("kstack", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c new file mode 100644 index 00000000000..96a5db7da8a --- /dev/null +++ b/arch/x86/kernel/dumpstack_64.c @@ -0,0 +1,575 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/utsname.h> +#include <linux/hardirq.h> +#include <linux/kdebug.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/kexec.h> +#include <linux/bug.h> +#include <linux/nmi.h> +#include <linux/sysfs.h> + +#include <asm/stacktrace.h> + +#define STACKSLOTS_PER_LINE 4 +#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) + +int panic_on_unrecovered_nmi; +int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; +static unsigned int code_bytes = 64; +static int die_counter; + +void printk_address(unsigned long address, int reliable) +{ + printk(" [<%p>] %s%pS\n", (void *) address, + reliable ? "" : "? ", (void *) address); +} + +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, + unsigned *usedp, char **idp) +{ + static char ids[][8] = { + [DEBUG_STACK - 1] = "#DB", + [NMI_STACK - 1] = "NMI", + [DOUBLEFAULT_STACK - 1] = "#DF", + [STACKFAULT_STACK - 1] = "#SS", + [MCE_STACK - 1] = "#MC", +#if DEBUG_STKSZ > EXCEPTION_STKSZ + [N_EXCEPTION_STACKS ... + N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" +#endif + }; + unsigned k; + + /* + * Iterate over all exception stacks, and figure out whether + * 'stack' is in one of them: + */ + for (k = 0; k < N_EXCEPTION_STACKS; k++) { + unsigned long end = per_cpu(orig_ist, cpu).ist[k]; + /* + * Is 'stack' above this exception frame's end? + * If yes then skip to the next frame. + */ + if (stack >= end) + continue; + /* + * Is 'stack' above this exception frame's start address? + * If yes then we found the right frame. + */ + if (stack >= end - EXCEPTION_STKSZ) { + /* + * Make sure we only iterate through an exception + * stack once. If it comes up for the second time + * then there's something wrong going on - just + * break out and return NULL: + */ + if (*usedp & (1U << k)) + break; + *usedp |= 1U << k; + *idp = ids[k]; + return (unsigned long *)end; + } + /* + * If this is a debug stack, and if it has a larger size than + * the usual exception stacks, then 'stack' might still + * be within the lower portion of the debug stack: + */ +#if DEBUG_STKSZ > EXCEPTION_STKSZ + if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { + unsigned j = N_EXCEPTION_STACKS - 1; + + /* + * Black magic. A large debug stack is composed of + * multiple exception stack entries, which we + * iterate through now. Dont look: + */ + do { + ++j; + end -= EXCEPTION_STKSZ; + ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); + } while (stack < end - EXCEPTION_STKSZ); + if (*usedp & (1U << j)) + break; + *usedp |= 1U << j; + *idp = ids[j]; + return (unsigned long *)end; + } +#endif + } + return NULL; +} + +/* + * x86-64 can have up to three kernel stacks: + * process stack + * interrupt stack + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack + */ + +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) +{ + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + +static inline unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + sizeof(long)) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; + } + return bp; +} + +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data) +{ + const unsigned cpu = get_cpu(); + unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; + unsigned used = 0; + struct thread_info *tinfo; + + if (!task) + task = current; + + if (!stack) { + unsigned long dummy; + stack = &dummy; + if (task && task != current) + stack = (unsigned long *)task->thread.sp; + } + +#ifdef CONFIG_FRAME_POINTER + if (!bp) { + if (task == current) { + /* Grab bp right from our regs */ + get_bp(bp); + } else { + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) task->thread.sp; + } + } +#endif + + /* + * Print function call entries in all stacks, starting at the + * current stack address. If the stacks consist of nested + * exceptions + */ + tinfo = task_thread_info(task); + for (;;) { + char *id; + unsigned long *estack_end; + estack_end = in_exception_stack(cpu, (unsigned long)stack, + &used, &id); + + if (estack_end) { + if (ops->stack(data, id) < 0) + break; + + bp = print_context_stack(tinfo, stack, bp, ops, + data, estack_end); + ops->stack(data, "<EOE>"); + /* + * We link to the next stack via the + * second-to-last pointer (index -2 to end) in the + * exception stack: + */ + stack = (unsigned long *) estack_end[-2]; + continue; + } + if (irqstack_end) { + unsigned long *irqstack; + irqstack = irqstack_end - + (IRQSTACKSIZE - 64) / sizeof(*irqstack); + + if (stack >= irqstack && stack < irqstack_end) { + if (ops->stack(data, "IRQ") < 0) + break; + bp = print_context_stack(tinfo, stack, bp, + ops, data, irqstack_end); + /* + * We link to the next stack (which would be + * the process stack normally) the last + * pointer (index -1 to end) in the IRQ stack: + */ + stack = (unsigned long *) (irqstack_end[-1]); + irqstack_end = NULL; + ops->stack(data, "EOI"); + continue; + } + } + break; + } + + /* + * This handles the process stack: + */ + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); + put_cpu(); +} +EXPORT_SYMBOL(dump_trace); + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk(data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk("%s <%s> ", (char *)data, name); + return 0; +} + +/* + * Print one address/symbol entries per line. + */ +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ + touch_nmi_watchdog(); + printk(data); + printk_address(addr, reliable); +} + +static const struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +static void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) +{ + printk("%sCall Trace:\n", log_lvl); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + +static void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl) +{ + unsigned long *stack; + int i; + const int cpu = smp_processor_id(); + unsigned long *irqstack_end = + (unsigned long *) (cpu_pda(cpu)->irqstackptr); + unsigned long *irqstack = + (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); + + /* + * debugging aid: "show_stack(NULL, NULL);" prints the + * back trace for this cpu. + */ + + if (sp == NULL) { + if (task) + sp = (unsigned long *)task->thread.sp; + else + sp = (unsigned long *)&sp; + } + + stack = sp; + for (i = 0; i < kstack_depth_to_print; i++) { + if (stack >= irqstack && stack <= irqstack_end) { + if (stack == irqstack_end) { + stack = (unsigned long *) (irqstack_end[-1]); + printk(" <EOI> "); + } + } else { + if (((long) stack & (THREAD_SIZE-1)) == 0) + break; + } + if (i && ((i % STACKSLOTS_PER_LINE) == 0)) + printk("\n%s", log_lvl); + printk(" %016lx", *stack++); + touch_nmi_watchdog(); + } + printk("\n"); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_log_lvl(task, NULL, sp, 0, ""); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long bp = 0; + unsigned long stack; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + get_bp(bp); +#endif + + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + show_trace(NULL, NULL, &stack, bp); +} +EXPORT_SYMBOL(dump_stack); + +void show_registers(struct pt_regs *regs) +{ + int i; + unsigned long sp; + const int cpu = smp_processor_id(); + struct task_struct *cur = cpu_pda(cpu)->pcurrent; + + sp = regs->sp; + printk("CPU %d ", cpu); + __show_regs(regs, 1); + printk("Process %s (pid: %d, threadinfo %p, task %p)\n", + cur->comm, cur->pid, task_thread_info(cur), cur); + + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. + */ + if (!user_mode(regs)) { + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; + unsigned char c; + u8 *ip; + + printk(KERN_EMERG "Stack:\n"); + show_stack_log_lvl(NULL, regs, (unsigned long *)sp, + regs->bp, KERN_EMERG); + + printk(KERN_EMERG "Code: "); + + ip = (u8 *)regs->ip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at IP */ + ip = (u8 *)regs->ip; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { + printk(" Bad RIP value."); + break; + } + if (ip == (u8 *)regs->ip) + printk("<%02x> ", c); + else + printk("%02x ", c); + } + } + printk("\n"); +} + +int is_valid_bugaddr(unsigned long ip) +{ + unsigned short ud2; + + if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) + return 0; + + return ud2 == 0x0b0f; +} + +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + int cpu; + unsigned long flags; + + oops_enter(); + + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!__raw_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + __raw_spin_lock(&die_lock); + } + die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + die_owner = -1; + bust_spinlocks(0); + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. */ + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + if (!regs) { + oops_exit(); + return; + } + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); + oops_exit(); + do_exit(signr); +} + +int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + sysfs_printk_last_file(); + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; + + show_registers(regs); + add_taint(TAINT_DIE); + /* Executive summary in case the oops scrolled away */ + printk(KERN_ALERT "RIP "); + printk_address(regs->ip, 1); + printk(" RSP <%016lx>\n", regs->sp); + if (kexec_should_crash(current)) + crash_kexec(regs); + return 0; +} + +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + + if (!user_mode(regs)) + report_bug(regs->ip, regs); + + if (__die(str, regs, err)) + regs = NULL; + oops_end(flags, regs, SIGSEGV); +} + +notrace __kprobes void +die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + unsigned long flags; + + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + flags = oops_begin(); + /* + * We are in trouble anyway, lets at least try + * to get a message out. + */ + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + if (kexec_should_crash(current)) + crash_kexec(regs); + if (do_panic || panic_on_oops) + panic("Non maskable interrupt"); + oops_end(flags, NULL, SIGBUS); + nmi_exit(); + local_irq_enable(); + do_exit(SIGBUS); +} + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); + +static int __init kstack_setup(char *s) +{ + if (!s) + return -EINVAL; + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 0; +} +early_param("kstack", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 9af89078f7b..ce97bf3bed1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -148,6 +148,9 @@ void __init e820_print_map(char *who) case E820_NVS: printk(KERN_CONT "(ACPI NVS)\n"); break; + case E820_UNUSABLE: + printk("(unusable)\n"); + break; default: printk(KERN_CONT "type %u\n", e820.map[i].type); break; @@ -1203,7 +1206,7 @@ static int __init parse_memmap_opt(char *p) if (!p) return -EINVAL; - if (!strcmp(p, "exactmap")) { + if (!strncmp(p, "exactmap", 8)) { #ifdef CONFIG_CRASH_DUMP /* * If we are doing a crash dump, we still need to know @@ -1260,6 +1263,7 @@ static inline const char *e820_type_to_string(int e820_type) case E820_RAM: return "System RAM"; case E820_ACPI: return "ACPI Tables"; case E820_NVS: return "ACPI Non-volatile Storage"; + case E820_UNUSABLE: return "Unusable memory"; default: return "reserved"; } } @@ -1267,6 +1271,7 @@ static inline const char *e820_type_to_string(int e820_type) /* * Mark e820 reserved areas as busy for the resource manager. */ +static struct resource __initdata *e820_res; void __init e820_reserve_resources(void) { int i; @@ -1274,20 +1279,26 @@ void __init e820_reserve_resources(void) u64 end; res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); + e820_res = res; for (i = 0; i < e820.nr_map; i++) { end = e820.map[i].addr + e820.map[i].size - 1; -#ifndef CONFIG_RESOURCES_64BIT - if (end > 0x100000000ULL) { + if (end != (resource_size_t)end) { res++; continue; } -#endif res->name = e820_type_to_string(e820.map[i].type); res->start = e820.map[i].addr; res->end = end; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - insert_resource(&iomem_resource, res); + + /* + * don't register the region that could be conflicted with + * pci device BAR resource and insert them later in + * pcibios_resource_survey() + */ + if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) + insert_resource(&iomem_resource, res); res++; } @@ -1299,6 +1310,19 @@ void __init e820_reserve_resources(void) } } +void __init e820_reserve_resources_late(void) +{ + int i; + struct resource *res; + + res = e820_res; + for (i = 0; i < e820.nr_map; i++) { + if (!res->parent && res->end) + reserve_region_with_split(&iomem_resource, res->start, res->end, res->name); + res++; + } +} + char *__init default_machine_specific_memory_setup(void) { char *who = "BIOS-e820"; diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 4353cf5e6fa..733c4f8d42e 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -95,6 +95,66 @@ static void __init nvidia_bugs(int num, int slot, int func) } +static u32 ati_ixp4x0_rev(int num, int slot, int func) +{ + u32 d; + u8 b; + + b = read_pci_config_byte(num, slot, func, 0xac); + b &= ~(1<<5); + write_pci_config_byte(num, slot, func, 0xac, b); + + d = read_pci_config(num, slot, func, 0x70); + d |= 1<<8; + write_pci_config(num, slot, func, 0x70, d); + + d = read_pci_config(num, slot, func, 0x8); + d &= 0xff; + return d; +} + +static void __init ati_bugs(int num, int slot, int func) +{ +#if defined(CONFIG_ACPI) && defined (CONFIG_X86_IO_APIC) + u32 d; + u8 b; + + if (acpi_use_timer_override) + return; + + d = ati_ixp4x0_rev(num, slot, func); + if (d < 0x82) + acpi_skip_timer_override = 1; + else { + /* check for IRQ0 interrupt swap */ + outb(0x72, 0xcd6); b = inb(0xcd7); + if (!(b & 0x2)) + acpi_skip_timer_override = 1; + } + + if (acpi_skip_timer_override) { + printk(KERN_INFO "SB4X0 revision 0x%x\n", d); + printk(KERN_INFO "Ignoring ACPI timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +#endif +} + +#ifdef CONFIG_DMAR +static void __init intel_g33_dmar(int num, int slot, int func) +{ + struct acpi_table_header *dmar_tbl; + acpi_status status; + + status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl); + if (ACPI_SUCCESS(status)) { + printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n"); + dmar_disabled = 1; + } +} +#endif + #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -114,6 +174,12 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, + { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, +#ifdef CONFIG_DMAR + { PCI_VENDOR_ID_INTEL, 0x29c0, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar }, +#endif {} }; diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index ff9e7350da5..34ad997d383 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -3,11 +3,19 @@ #include <linux/init.h> #include <linux/string.h> #include <linux/screen_info.h> +#include <linux/usb/ch9.h> +#include <linux/pci_regs.h> +#include <linux/pci_ids.h> +#include <linux/errno.h> #include <asm/io.h> #include <asm/processor.h> #include <asm/fcntl.h> #include <asm/setup.h> #include <xen/hvc-console.h> +#include <asm/pci-direct.h> +#include <asm/pgtable.h> +#include <asm/fixmap.h> +#include <linux/usb/ehci_def.h> /* Simple VGA output */ #define VGABASE (__ISA_IO_base + 0xb8000) @@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8; /* ttyS0 */ static int early_serial_putc(unsigned char ch) { unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) cpu_relax(); outb(ch, early_serial_base + TXR); @@ -111,7 +120,7 @@ static __init void early_serial_init(char *s) if (!strncmp(s, "0x", 2)) { early_serial_base = simple_strtoul(s, &e, 16); } else { - static int bases[] = { 0x3f8, 0x2f8 }; + static const int __initconst bases[] = { 0x3f8, 0x2f8 }; if (!strncmp(s, "ttyS", 4)) s += 4; @@ -151,6 +160,721 @@ static struct console early_serial_console = { .index = -1, }; +#ifdef CONFIG_EARLY_PRINTK_DBGP + +static struct ehci_caps __iomem *ehci_caps; +static struct ehci_regs __iomem *ehci_regs; +static struct ehci_dbg_port __iomem *ehci_debug; +static unsigned int dbgp_endpoint_out; + +struct ehci_dev { + u32 bus; + u32 slot; + u32 func; +}; + +static struct ehci_dev ehci_dev; + +#define USB_DEBUG_DEVNUM 127 + +#define DBGP_DATA_TOGGLE 0x8800 + +static inline u32 dbgp_pid_update(u32 x, u32 tok) +{ + return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff); +} + +static inline u32 dbgp_len_update(u32 x, u32 len) +{ + return (x & ~0x0f) | (len & 0x0f); +} + +/* + * USB Packet IDs (PIDs) + */ + +/* token */ +#define USB_PID_OUT 0xe1 +#define USB_PID_IN 0x69 +#define USB_PID_SOF 0xa5 +#define USB_PID_SETUP 0x2d +/* handshake */ +#define USB_PID_ACK 0xd2 +#define USB_PID_NAK 0x5a +#define USB_PID_STALL 0x1e +#define USB_PID_NYET 0x96 +/* data */ +#define USB_PID_DATA0 0xc3 +#define USB_PID_DATA1 0x4b +#define USB_PID_DATA2 0x87 +#define USB_PID_MDATA 0x0f +/* Special */ +#define USB_PID_PREAMBLE 0x3c +#define USB_PID_ERR 0x3c +#define USB_PID_SPLIT 0x78 +#define USB_PID_PING 0xb4 +#define USB_PID_UNDEF_0 0xf0 + +#define USB_PID_DATA_TOGGLE 0x88 +#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE) + +#define PCI_CAP_ID_EHCI_DEBUG 0xa + +#define HUB_ROOT_RESET_TIME 50 /* times are in msec */ +#define HUB_SHORT_RESET_TIME 10 +#define HUB_LONG_RESET_TIME 200 +#define HUB_RESET_TIMEOUT 500 + +#define DBGP_MAX_PACKET 8 + +static int dbgp_wait_until_complete(void) +{ + u32 ctrl; + int loop = 0x100000; + + do { + ctrl = readl(&ehci_debug->control); + /* Stop when the transaction is finished */ + if (ctrl & DBGP_DONE) + break; + } while (--loop > 0); + + if (!loop) + return -1; + + /* + * Now that we have observed the completed transaction, + * clear the done bit. + */ + writel(ctrl | DBGP_DONE, &ehci_debug->control); + return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); +} + +static void dbgp_mdelay(int ms) +{ + int i; + + while (ms--) { + for (i = 0; i < 1000; i++) + outb(0x1, 0x80); + } +} + +static void dbgp_breath(void) +{ + /* Sleep to give the debug port a chance to breathe */ +} + +static int dbgp_wait_until_done(unsigned ctrl) +{ + u32 pids, lpid; + int ret; + int loop = 3; + +retry: + writel(ctrl | DBGP_GO, &ehci_debug->control); + ret = dbgp_wait_until_complete(); + pids = readl(&ehci_debug->pids); + lpid = DBGP_PID_GET(pids); + + if (ret < 0) + return ret; + + /* + * If the port is getting full or it has dropped data + * start pacing ourselves, not necessary but it's friendly. + */ + if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) + dbgp_breath(); + + /* If I get a NACK reissue the transmission */ + if (lpid == USB_PID_NAK) { + if (--loop > 0) + goto retry; + } + + return ret; +} + +static void dbgp_set_data(const void *buf, int size) +{ + const unsigned char *bytes = buf; + u32 lo, hi; + int i; + + lo = hi = 0; + for (i = 0; i < 4 && i < size; i++) + lo |= bytes[i] << (8*i); + for (; i < 8 && i < size; i++) + hi |= bytes[i] << (8*(i - 4)); + writel(lo, &ehci_debug->data03); + writel(hi, &ehci_debug->data47); +} + +static void dbgp_get_data(void *buf, int size) +{ + unsigned char *bytes = buf; + u32 lo, hi; + int i; + + lo = readl(&ehci_debug->data03); + hi = readl(&ehci_debug->data47); + for (i = 0; i < 4 && i < size; i++) + bytes[i] = (lo >> (8*i)) & 0xff; + for (; i < 8 && i < size; i++) + bytes[i] = (hi >> (8*(i - 4))) & 0xff; +} + +static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, + const char *bytes, int size) +{ + u32 pids, addr, ctrl; + int ret; + + if (size > DBGP_MAX_PACKET) + return -1; + + addr = DBGP_EPADDR(devnum, endpoint); + + pids = readl(&ehci_debug->pids); + pids = dbgp_pid_update(pids, USB_PID_OUT); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, size); + ctrl |= DBGP_OUT; + ctrl |= DBGP_GO; + + dbgp_set_data(bytes, size); + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + return ret; +} + +static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, + int size) +{ + u32 pids, addr, ctrl; + int ret; + + if (size > DBGP_MAX_PACKET) + return -1; + + addr = DBGP_EPADDR(devnum, endpoint); + + pids = readl(&ehci_debug->pids); + pids = dbgp_pid_update(pids, USB_PID_IN); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, size); + ctrl &= ~DBGP_OUT; + ctrl |= DBGP_GO; + + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + if (size > ret) + size = ret; + dbgp_get_data(data, size); + return ret; +} + +static int dbgp_control_msg(unsigned devnum, int requesttype, int request, + int value, int index, void *data, int size) +{ + u32 pids, addr, ctrl; + struct usb_ctrlrequest req; + int read; + int ret; + + read = (requesttype & USB_DIR_IN) != 0; + if (size > (read ? DBGP_MAX_PACKET:0)) + return -1; + + /* Compute the control message */ + req.bRequestType = requesttype; + req.bRequest = request; + req.wValue = cpu_to_le16(value); + req.wIndex = cpu_to_le16(index); + req.wLength = cpu_to_le16(size); + + pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); + addr = DBGP_EPADDR(devnum, 0); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, sizeof(req)); + ctrl |= DBGP_OUT; + ctrl |= DBGP_GO; + + /* Send the setup message */ + dbgp_set_data(&req, sizeof(req)); + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + /* Read the result */ + return dbgp_bulk_read(devnum, 0, data, size); +} + + +/* Find a PCI capability */ +static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) +{ + u8 pos; + int bytes; + + if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & + PCI_STATUS_CAP_LIST)) + return 0; + + pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + u8 id; + + pos &= ~3; + id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); + if (id == 0xff) + break; + if (id == cap) + return pos; + + pos = read_pci_config_byte(num, slot, func, + pos+PCI_CAP_LIST_NEXT); + } + return 0; +} + +static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) +{ + u32 class; + + class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); + if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) + return 0; + + return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); +} + +static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) +{ + u32 bus, slot, func; + + for (bus = 0; bus < 256; bus++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + unsigned cap; + + cap = __find_dbgp(bus, slot, func); + + if (!cap) + continue; + if (ehci_num-- != 0) + continue; + *rbus = bus; + *rslot = slot; + *rfunc = func; + return cap; + } + } + } + return 0; +} + +static int ehci_reset_port(int port) +{ + u32 portsc; + u32 delay_time, delay; + int loop; + + /* Reset the usb debug port */ + portsc = readl(&ehci_regs->port_status[port - 1]); + portsc &= ~PORT_PE; + portsc |= PORT_RESET; + writel(portsc, &ehci_regs->port_status[port - 1]); + + delay = HUB_ROOT_RESET_TIME; + for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; + delay_time += delay) { + dbgp_mdelay(delay); + + portsc = readl(&ehci_regs->port_status[port - 1]); + if (portsc & PORT_RESET) { + /* force reset to complete */ + loop = 2; + writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), + &ehci_regs->port_status[port - 1]); + do { + portsc = readl(&ehci_regs->port_status[port-1]); + } while ((portsc & PORT_RESET) && (--loop > 0)); + } + + /* Device went away? */ + if (!(portsc & PORT_CONNECT)) + return -ENOTCONN; + + /* bomb out completely if something weird happend */ + if ((portsc & PORT_CSC)) + return -EINVAL; + + /* If we've finished resetting, then break out of the loop */ + if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) + return 0; + } + return -EBUSY; +} + +static int ehci_wait_for_port(int port) +{ + u32 status; + int ret, reps; + + for (reps = 0; reps < 3; reps++) { + dbgp_mdelay(100); + status = readl(&ehci_regs->status); + if (status & STS_PCD) { + ret = ehci_reset_port(port); + if (ret == 0) + return 0; + } + } + return -ENOTCONN; +} + +#ifdef DBGP_DEBUG +# define dbgp_printk early_printk +#else +static inline void dbgp_printk(const char *fmt, ...) { } +#endif + +typedef void (*set_debug_port_t)(int port); + +static void default_set_debug_port(int port) +{ +} + +static set_debug_port_t set_debug_port = default_set_debug_port; + +static void nvidia_set_debug_port(int port) +{ + u32 dword; + dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, + 0x74); + dword &= ~(0x0f<<12); + dword |= ((port & 0x0f)<<12); + write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74, + dword); + dbgp_printk("set debug port to %d\n", port); +} + +static void __init detect_set_debug_port(void) +{ + u32 vendorid; + + vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, + 0x00); + + if ((vendorid & 0xffff) == 0x10de) { + dbgp_printk("using nvidia set_debug_port\n"); + set_debug_port = nvidia_set_debug_port; + } +} + +static int __init ehci_setup(void) +{ + struct usb_debug_descriptor dbgp_desc; + u32 cmd, ctrl, status, portsc, hcs_params; + u32 debug_port, new_debug_port = 0, n_ports; + u32 devnum; + int ret, i; + int loop; + int port_map_tried; + int playtimes = 3; + +try_next_time: + port_map_tried = 0; + +try_next_port: + + hcs_params = readl(&ehci_caps->hcs_params); + debug_port = HCS_DEBUG_PORT(hcs_params); + n_ports = HCS_N_PORTS(hcs_params); + + dbgp_printk("debug_port: %d\n", debug_port); + dbgp_printk("n_ports: %d\n", n_ports); + + for (i = 1; i <= n_ports; i++) { + portsc = readl(&ehci_regs->port_status[i-1]); + dbgp_printk("portstatus%d: %08x\n", i, portsc); + } + + if (port_map_tried && (new_debug_port != debug_port)) { + if (--playtimes) { + set_debug_port(new_debug_port); + goto try_next_time; + } + return -1; + } + + loop = 10; + /* Reset the EHCI controller */ + cmd = readl(&ehci_regs->command); + cmd |= CMD_RESET; + writel(cmd, &ehci_regs->command); + do { + cmd = readl(&ehci_regs->command); + } while ((cmd & CMD_RESET) && (--loop > 0)); + + if (!loop) { + dbgp_printk("can not reset ehci\n"); + return -1; + } + dbgp_printk("ehci reset done\n"); + + /* Claim ownership, but do not enable yet */ + ctrl = readl(&ehci_debug->control); + ctrl |= DBGP_OWNER; + ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); + writel(ctrl, &ehci_debug->control); + + /* Start the ehci running */ + cmd = readl(&ehci_regs->command); + cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); + cmd |= CMD_RUN; + writel(cmd, &ehci_regs->command); + + /* Ensure everything is routed to the EHCI */ + writel(FLAG_CF, &ehci_regs->configured_flag); + + /* Wait until the controller is no longer halted */ + loop = 10; + do { + status = readl(&ehci_regs->status); + } while ((status & STS_HALT) && (--loop > 0)); + + if (!loop) { + dbgp_printk("ehci can be started\n"); + return -1; + } + dbgp_printk("ehci started\n"); + + /* Wait for a device to show up in the debug port */ + ret = ehci_wait_for_port(debug_port); + if (ret < 0) { + dbgp_printk("No device found in debug port\n"); + goto next_debug_port; + } + dbgp_printk("ehci wait for port done\n"); + + /* Enable the debug port */ + ctrl = readl(&ehci_debug->control); + ctrl |= DBGP_CLAIM; + writel(ctrl, &ehci_debug->control); + ctrl = readl(&ehci_debug->control); + if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) { + dbgp_printk("No device in debug port\n"); + writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control); + goto err; + } + dbgp_printk("debug ported enabled\n"); + + /* Completely transfer the debug device to the debug controller */ + portsc = readl(&ehci_regs->port_status[debug_port - 1]); + portsc &= ~PORT_PE; + writel(portsc, &ehci_regs->port_status[debug_port - 1]); + + dbgp_mdelay(100); + + /* Find the debug device and make it device number 127 */ + for (devnum = 0; devnum <= 127; devnum++) { + ret = dbgp_control_msg(devnum, + USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, + USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, + &dbgp_desc, sizeof(dbgp_desc)); + if (ret > 0) + break; + } + if (devnum > 127) { + dbgp_printk("Could not find attached debug device\n"); + goto err; + } + if (ret < 0) { + dbgp_printk("Attached device is not a debug device\n"); + goto err; + } + dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint; + + /* Move the device to 127 if it isn't already there */ + if (devnum != USB_DEBUG_DEVNUM) { + ret = dbgp_control_msg(devnum, + USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, + USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); + if (ret < 0) { + dbgp_printk("Could not move attached device to %d\n", + USB_DEBUG_DEVNUM); + goto err; + } + devnum = USB_DEBUG_DEVNUM; + dbgp_printk("debug device renamed to 127\n"); + } + + /* Enable the debug interface */ + ret = dbgp_control_msg(USB_DEBUG_DEVNUM, + USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, + USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); + if (ret < 0) { + dbgp_printk(" Could not enable the debug device\n"); + goto err; + } + dbgp_printk("debug interface enabled\n"); + + /* Perform a small write to get the even/odd data state in sync + */ + ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1); + if (ret < 0) { + dbgp_printk("dbgp_bulk_write failed: %d\n", ret); + goto err; + } + dbgp_printk("small write doned\n"); + + return 0; +err: + /* Things didn't work so remove my claim */ + ctrl = readl(&ehci_debug->control); + ctrl &= ~(DBGP_CLAIM | DBGP_OUT); + writel(ctrl, &ehci_debug->control); + return -1; + +next_debug_port: + port_map_tried |= (1<<(debug_port - 1)); + new_debug_port = ((debug_port-1+1)%n_ports) + 1; + if (port_map_tried != ((1<<n_ports) - 1)) { + set_debug_port(new_debug_port); + goto try_next_port; + } + if (--playtimes) { + set_debug_port(new_debug_port); + goto try_next_time; + } + + return -1; +} + +static int __init early_dbgp_init(char *s) +{ + u32 debug_port, bar, offset; + u32 bus, slot, func, cap; + void __iomem *ehci_bar; + u32 dbgp_num; + u32 bar_val; + char *e; + int ret; + u8 byte; + + if (!early_pci_allowed()) + return -1; + + dbgp_num = 0; + if (*s) + dbgp_num = simple_strtoul(s, &e, 10); + dbgp_printk("dbgp_num: %d\n", dbgp_num); + + cap = find_dbgp(dbgp_num, &bus, &slot, &func); + if (!cap) + return -1; + + dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot, + func); + + debug_port = read_pci_config(bus, slot, func, cap); + bar = (debug_port >> 29) & 0x7; + bar = (bar * 4) + 0xc; + offset = (debug_port >> 16) & 0xfff; + dbgp_printk("bar: %02x offset: %03x\n", bar, offset); + if (bar != PCI_BASE_ADDRESS_0) { + dbgp_printk("only debug ports on bar 1 handled.\n"); + + return -1; + } + + bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset); + if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) { + dbgp_printk("only simple 32bit mmio bars supported\n"); + + return -1; + } + + /* double check if the mem space is enabled */ + byte = read_pci_config_byte(bus, slot, func, 0x04); + if (!(byte & 0x2)) { + byte |= 0x02; + write_pci_config_byte(bus, slot, func, 0x04, byte); + dbgp_printk("mmio for ehci enabled\n"); + } + + /* + * FIXME I don't have the bar size so just guess PAGE_SIZE is more + * than enough. 1K is the biggest I have seen. + */ + set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); + ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); + ehci_bar += bar_val & ~PAGE_MASK; + dbgp_printk("ehci_bar: %p\n", ehci_bar); + + ehci_caps = ehci_bar; + ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase)); + ehci_debug = ehci_bar + offset; + ehci_dev.bus = bus; + ehci_dev.slot = slot; + ehci_dev.func = func; + + detect_set_debug_port(); + + ret = ehci_setup(); + if (ret < 0) { + dbgp_printk("ehci_setup failed\n"); + ehci_debug = NULL; + + return -1; + } + + return 0; +} + +static void early_dbgp_write(struct console *con, const char *str, u32 n) +{ + int chunk, ret; + + if (!ehci_debug) + return; + while (n > 0) { + chunk = n; + if (chunk > DBGP_MAX_PACKET) + chunk = DBGP_MAX_PACKET; + ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, + dbgp_endpoint_out, str, chunk); + str += chunk; + n -= chunk; + } +} + +static struct console early_dbgp_console = { + .name = "earlydbg", + .write = early_dbgp_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; +#endif + /* Console interface to a host file on AMD's SimNow! */ static int simnow_fd; @@ -165,6 +889,7 @@ enum { static noinline long simnow(long cmd, long a, long b, long c) { long ret; + asm volatile("cpuid" : "=a" (ret) : "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); @@ -174,6 +899,7 @@ static noinline long simnow(long cmd, long a, long b, long c) static void __init simnow_init(char *str) { char *fn = "klog"; + if (*str == '=') fn = ++str; /* error ignored */ @@ -194,7 +920,7 @@ static struct console simnow_console = { /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; -static int early_console_initialized; +static int __initdata early_console_initialized; asmlinkage void early_printk(const char *fmt, ...) { @@ -208,10 +934,11 @@ asmlinkage void early_printk(const char *fmt, ...) va_end(ap); } -static int __initdata keep_early; static int __init setup_early_printk(char *buf) { + int keep_early; + if (!buf) return 0; @@ -219,8 +946,7 @@ static int __init setup_early_printk(char *buf) return 0; early_console_initialized = 1; - if (strstr(buf, "keep")) - keep_early = 1; + keep_early = (strstr(buf, "keep") != NULL); if (!strncmp(buf, "serial", 6)) { early_serial_init(buf + 6); @@ -238,6 +964,17 @@ static int __init setup_early_printk(char *buf) simnow_init(buf + 6); early_console = &simnow_console; keep_early = 1; +#ifdef CONFIG_EARLY_PRINTK_DBGP + } else if (!strncmp(buf, "dbgp", 4)) { + if (early_dbgp_init(buf+4) < 0) + return 0; + early_console = &early_dbgp_console; + /* + * usb subsys will reset ehci controller, so don't keep + * that early console + */ + keep_early = 0; +#endif #ifdef CONFIG_HVC_XEN } else if (!strncmp(buf, "xen", 3)) { early_console = &xenboot_console; @@ -251,4 +988,5 @@ static int __init setup_early_printk(char *buf) register_console(early_console); return 0; } + early_param("earlyprintk", setup_early_printk); diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 06cc8d4254b..945a31cdd81 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -414,9 +414,11 @@ void __init efi_init(void) if (memmap.map == NULL) printk(KERN_ERR "Could not map the EFI memory map!\n"); memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); + if (memmap.desc_size != sizeof(efi_memory_desc_t)) - printk(KERN_WARNING "Kernel-defined memdesc" - "doesn't match the one from EFI!\n"); + printk(KERN_WARNING + "Kernel-defined memdesc doesn't match the one from EFI!\n"); + if (add_efi_memmap) do_add_efi_memmap(); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 109792bc7cf..b21fbfaffe3 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -730,6 +730,7 @@ error_code: movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es + TRACE_IRQS_OFF movl %esp,%eax # pt_regs pointer call *%edi jmp ret_from_exception @@ -760,20 +761,9 @@ ENTRY(device_not_available) RING0_INT_FRAME pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - GET_CR0_INTO_EAX - testl $0x4, %eax # EM (math emulation bit) - jne device_not_available_emulate - preempt_stop(CLBR_ANY) - call math_state_restore - jmp ret_from_exception -device_not_available_emulate: - pushl $0 # temporary storage for ORIG_EIP + pushl $do_device_not_available CFI_ADJUST_CFA_OFFSET 4 - call math_emulate - addl $4, %esp - CFI_ADJUST_CFA_OFFSET -4 - jmp ret_from_exception + jmp error_code CFI_ENDPROC END(device_not_available) @@ -814,6 +804,7 @@ debug_stack_correct: pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF xorl %edx,%edx # error code 0 movl %esp,%eax # pt_regs pointer call do_debug @@ -858,6 +849,7 @@ nmi_stack_correct: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi @@ -898,6 +890,7 @@ nmi_espfix_stack: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx,%edx # zero error code call do_nmi @@ -928,6 +921,7 @@ KPROBE_ENTRY(int3) pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_int3 @@ -1030,7 +1024,7 @@ ENTRY(machine_check) RING0_INT_FRAME pushl $0 CFI_ADJUST_CFA_OFFSET 4 - pushl machine_check_vector + pushl $do_machine_check CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 89434d43960..1db6ce4314e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -275,9 +275,9 @@ ENTRY(native_usergs_sysret64) ENTRY(ret_from_fork) CFI_DEFAULT_STACK push kernel_eflags(%rip) - CFI_ADJUST_CFA_OFFSET 4 + CFI_ADJUST_CFA_OFFSET 8 popf # reset kernel eflags - CFI_ADJUST_CFA_OFFSET -4 + CFI_ADJUST_CFA_OFFSET -8 call schedule_tail GET_THREAD_INFO(%rcx) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) @@ -667,6 +667,13 @@ END(stub_rt_sigreturn) SAVE_ARGS leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler pushq %rbp + /* + * Save rbp twice: One is for marking the stack frame, as usual, and the + * other, to fill pt_regs properly. This is because bx comes right + * before the last saved register in that structure, and not bp. If the + * base pointer were in the place bx is today, this would not be needed. + */ + movq %rbp, -8(%rsp) CFI_ADJUST_CFA_OFFSET 8 CFI_REL_OFFSET rbp, 0 movq %rsp,%rbp @@ -932,6 +939,9 @@ END(spurious_interrupt) .if \ist movq %gs:pda_data_offset, %rbp .endif + .if \irqtrace + TRACE_IRQS_OFF + .endif movq %rsp,%rdi movq ORIG_RAX(%rsp),%rsi movq $-1,ORIG_RAX(%rsp) @@ -1058,7 +1068,8 @@ KPROBE_ENTRY(error_entry) je error_kernelspace error_swapgs: SWAPGS -error_sti: +error_sti: + TRACE_IRQS_OFF movq %rdi,RDI(%rsp) CFI_REL_OFFSET rdi,RDI movq %rsp,%rdi @@ -1232,7 +1243,7 @@ ENTRY(simd_coprocessor_error) END(simd_coprocessor_error) ENTRY(device_not_available) - zeroentry math_state_restore + zeroentry do_device_not_available END(device_not_available) /* runs on exception stack */ diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c new file mode 100644 index 00000000000..f454c78fcef --- /dev/null +++ b/arch/x86/kernel/es7000_32.c @@ -0,0 +1,363 @@ +/* + * Written by: Garry Forsgren, Unisys Corporation + * Natalie Protasevich, Unisys Corporation + * This file contains the code to configure and interface + * with Unisys ES7000 series hardware system manager. + * + * Copyright (c) 2003 Unisys Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Unisys Corporation, Township Line & Union Meeting + * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: + * + * http://www.unisys.com + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/string.h> +#include <linux/spinlock.h> +#include <linux/errno.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/init.h> +#include <linux/acpi.h> +#include <asm/io.h> +#include <asm/nmi.h> +#include <asm/smp.h> +#include <asm/apicdef.h> +#include <mach_mpparse.h> + +/* + * ES7000 chipsets + */ + +#define NON_UNISYS 0 +#define ES7000_CLASSIC 1 +#define ES7000_ZORRO 2 + + +#define MIP_REG 1 +#define MIP_PSAI_REG 4 + +#define MIP_BUSY 1 +#define MIP_SPIN 0xf0000 +#define MIP_VALID 0x0100000000000000ULL +#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff) + +#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff) + +struct mip_reg_info { + unsigned long long mip_info; + unsigned long long delivery_info; + unsigned long long host_reg; + unsigned long long mip_reg; +}; + +struct part_info { + unsigned char type; + unsigned char length; + unsigned char part_id; + unsigned char apic_mode; + unsigned long snum; + char ptype[16]; + char sname[64]; + char pname[64]; +}; + +struct psai { + unsigned long long entry_type; + unsigned long long addr; + unsigned long long bep_addr; +}; + +struct es7000_mem_info { + unsigned char type; + unsigned char length; + unsigned char resv[6]; + unsigned long long start; + unsigned long long size; +}; + +struct es7000_oem_table { + unsigned long long hdr; + struct mip_reg_info mip; + struct part_info pif; + struct es7000_mem_info shm; + struct psai psai; +}; + +#ifdef CONFIG_ACPI + +struct oem_table { + struct acpi_table_header Header; + u32 OEMTableAddr; + u32 OEMTableSize; +}; + +extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); +extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr); +#endif + +struct mip_reg { + unsigned long long off_0; + unsigned long long off_8; + unsigned long long off_10; + unsigned long long off_18; + unsigned long long off_20; + unsigned long long off_28; + unsigned long long off_30; + unsigned long long off_38; +}; + +#define MIP_SW_APIC 0x1020b +#define MIP_FUNC(VALUE) (VALUE & 0xff) + +/* + * ES7000 Globals + */ + +static volatile unsigned long *psai = NULL; +static struct mip_reg *mip_reg; +static struct mip_reg *host_reg; +static int mip_port; +static unsigned long mip_addr, host_addr; + +int es7000_plat; + +/* + * GSI override for ES7000 platforms. + */ + +static unsigned int base; + +static int +es7000_rename_gsi(int ioapic, int gsi) +{ + if (es7000_plat == ES7000_ZORRO) + return gsi; + + if (!base) { + int i; + for (i = 0; i < nr_ioapics; i++) + base += nr_ioapic_registers[i]; + } + + if (!ioapic && (gsi < 16)) + gsi += base; + return gsi; +} + +void __init +setup_unisys(void) +{ + /* + * Determine the generation of the ES7000 currently running. + * + * es7000_plat = 1 if the machine is a 5xx ES7000 box + * es7000_plat = 2 if the machine is a x86_64 ES7000 box + * + */ + if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) + es7000_plat = ES7000_ZORRO; + else + es7000_plat = ES7000_CLASSIC; + ioapic_renumber_irq = es7000_rename_gsi; +} + +/* + * Parse the OEM Table + */ + +int __init +parse_unisys_oem (char *oemptr) +{ + int i; + int success = 0; + unsigned char type, size; + unsigned long val; + char *tp = NULL; + struct psai *psaip = NULL; + struct mip_reg_info *mi; + struct mip_reg *host, *mip; + + tp = oemptr; + + tp += 8; + + for (i=0; i <= 6; i++) { + type = *tp++; + size = *tp++; + tp -= 2; + switch (type) { + case MIP_REG: + mi = (struct mip_reg_info *)tp; + val = MIP_RD_LO(mi->host_reg); + host_addr = val; + host = (struct mip_reg *)val; + host_reg = __va(host); + val = MIP_RD_LO(mi->mip_reg); + mip_port = MIP_PORT(mi->mip_info); + mip_addr = val; + mip = (struct mip_reg *)val; + mip_reg = __va(mip); + pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", + (unsigned long)host_reg); + pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", + (unsigned long)mip_reg); + success++; + break; + case MIP_PSAI_REG: + psaip = (struct psai *)tp; + if (tp != NULL) { + if (psaip->addr) + psai = __va(psaip->addr); + else + psai = NULL; + success++; + } + break; + default: + break; + } + tp += size; + } + + if (success < 2) { + es7000_plat = NON_UNISYS; + } else + setup_unisys(); + return es7000_plat; +} + +#ifdef CONFIG_ACPI +static unsigned long oem_addrX; +static unsigned long oem_size; +int __init find_unisys_acpi_oem_table(unsigned long *oem_addr) +{ + struct acpi_table_header *header = NULL; + int i = 0; + acpi_size tbl_size; + + while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) { + if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { + struct oem_table *t = (struct oem_table *)header; + + oem_addrX = t->OEMTableAddr; + oem_size = t->OEMTableSize; + early_acpi_os_unmap_memory(header, tbl_size); + + *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, + oem_size); + return 0; + } + early_acpi_os_unmap_memory(header, tbl_size); + } + return -1; +} + +void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr) +{ + if (!oem_addr) + return; + + __acpi_unmap_table((char *)oem_addr, oem_size); +} +#endif + +static void +es7000_spin(int n) +{ + int i = 0; + + while (i++ < n) + rep_nop(); +} + +static int __init +es7000_mip_write(struct mip_reg *mip_reg) +{ + int status = 0; + int spin; + + spin = MIP_SPIN; + while (((unsigned long long)host_reg->off_38 & + (unsigned long long)MIP_VALID) != 0) { + if (--spin <= 0) { + printk("es7000_mip_write: Timeout waiting for Host Valid Flag"); + return -1; + } + es7000_spin(MIP_SPIN); + } + + memcpy(host_reg, mip_reg, sizeof(struct mip_reg)); + outb(1, mip_port); + + spin = MIP_SPIN; + + while (((unsigned long long)mip_reg->off_38 & + (unsigned long long)MIP_VALID) == 0) { + if (--spin <= 0) { + printk("es7000_mip_write: Timeout waiting for MIP Valid Flag"); + return -1; + } + es7000_spin(MIP_SPIN); + } + + status = ((unsigned long long)mip_reg->off_0 & + (unsigned long long)0xffff0000000000ULL) >> 48; + mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 & + (unsigned long long)~MIP_VALID); + return status; +} + +int +es7000_start_cpu(int cpu, unsigned long eip) +{ + unsigned long vect = 0, psaival = 0; + + if (psai == NULL) + return -1; + + vect = ((unsigned long)__pa(eip)/0x1000) << 16; + psaival = (0x1000000 | vect | cpu); + + while (*psai & 0x1000000) + ; + + *psai = psaival; + + return 0; + +} + +void __init +es7000_sw_apic(void) +{ + if (es7000_plat) { + int mip_status; + struct mip_reg es7000_mip_reg; + + printk("ES7000: Enabling APIC mode.\n"); + memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); + es7000_mip_reg.off_0 = MIP_SW_APIC; + es7000_mip_reg.off_38 = (MIP_VALID); + while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) + printk("es7000_sw_apic: command failed, status = %x\n", + mip_status); + return; + } +} diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index eaff0bbb144..6c9bfc9e1e9 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -16,87 +16,63 @@ #include <linux/ctype.h> #include <linux/init.h> #include <linux/hardirq.h> +#include <linux/dmar.h> #include <asm/smp.h> #include <asm/ipi.h> #include <asm/genapic.h> -#ifdef CONFIG_ACPI -#include <acpi/acpi_bus.h> -#endif - -DEFINE_PER_CPU(int, x2apic_extra_bits); +extern struct genapic apic_flat; +extern struct genapic apic_physflat; +extern struct genapic apic_x2xpic_uv_x; +extern struct genapic apic_x2apic_phys; +extern struct genapic apic_x2apic_cluster; struct genapic __read_mostly *genapic = &apic_flat; -static enum uv_system_type uv_system_type; +static struct genapic *apic_probe[] __initdata = { + &apic_x2apic_uv_x, + &apic_x2apic_phys, + &apic_x2apic_cluster, + &apic_physflat, + NULL, +}; /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ void __init setup_apic_routing(void) { - if (uv_system_type == UV_NON_UNIQUE_APIC) - genapic = &apic_x2apic_uv_x; - else -#ifdef CONFIG_ACPI - /* - * Quirk: some x86_64 machines can only use physical APIC mode - * regardless of how many processors are present (x86_64 ES7000 - * is an example). - */ - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && - (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) - genapic = &apic_physflat; - else -#endif - - if (max_physical_apicid < 8) - genapic = &apic_flat; - else - genapic = &apic_physflat; + if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) { + if (!intr_remapping_enabled) + genapic = &apic_flat; + } - printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); + if (genapic == &apic_flat) { + if (max_physical_apicid >= 8) + genapic = &apic_physflat; + printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); + } } /* Same for both flat and physical. */ -void send_IPI_self(int vector) +void apic_send_IPI_self(int vector) { __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); } int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - if (!strcmp(oem_id, "SGI")) { - if (!strcmp(oem_table_id, "UVL")) - uv_system_type = UV_LEGACY_APIC; - else if (!strcmp(oem_table_id, "UVX")) - uv_system_type = UV_X2APIC; - else if (!strcmp(oem_table_id, "UVH")) - uv_system_type = UV_NON_UNIQUE_APIC; + int i; + + for (i = 0; apic_probe[i]; ++i) { + if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { + genapic = apic_probe[i]; + printk(KERN_INFO "Setting APIC routing to %s.\n", + genapic->name); + return 1; + } } return 0; } - -unsigned int read_apic_id(void) -{ - unsigned int id; - - WARN_ON(preemptible() && num_online_cpus() > 1); - id = apic_read(APIC_ID); - if (uv_system_type >= UV_X2APIC) - id |= __get_cpu_var(x2apic_extra_bits); - return id; -} - -enum uv_system_type get_uv_system_type(void) -{ - return uv_system_type; -} - -int is_uv_system(void) -{ - return uv_system_type != UV_NONE; -} -EXPORT_SYMBOL_GPL(is_uv_system); diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 786548a62d3..9eca5ba7a6b 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -15,9 +15,20 @@ #include <linux/kernel.h> #include <linux/ctype.h> #include <linux/init.h> +#include <linux/hardirq.h> #include <asm/smp.h> #include <asm/ipi.h> #include <asm/genapic.h> +#include <mach_apicdef.h> + +#ifdef CONFIG_ACPI +#include <acpi/acpi_bus.h> +#endif + +static int __init flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return 1; +} static cpumask_t flat_target_cpus(void) { @@ -95,9 +106,33 @@ static void flat_send_IPI_all(int vector) __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); } +static unsigned int get_apic_id(unsigned long x) +{ + unsigned int id; + + id = (((x)>>24) & 0xFFu); + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + x = ((id & 0xFFu)<<24); + return x; +} + +static unsigned int read_xapic_id(void) +{ + unsigned int id; + + id = get_apic_id(apic_read(APIC_ID)); + return id; +} + static int flat_apic_id_registered(void) { - return physid_isset(GET_APIC_ID(read_apic_id()), phys_cpu_present_map); + return physid_isset(read_xapic_id(), phys_cpu_present_map); } static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) @@ -112,6 +147,7 @@ static unsigned int phys_pkg_id(int index_msb) struct genapic apic_flat = { .name = "flat", + .acpi_madt_oem_check = flat_acpi_madt_oem_check, .int_delivery_mode = dest_LowestPrio, .int_dest_mode = (APIC_DEST_LOGICAL != 0), .target_cpus = flat_target_cpus, @@ -121,8 +157,12 @@ struct genapic apic_flat = { .send_IPI_all = flat_send_IPI_all, .send_IPI_allbutself = flat_send_IPI_allbutself, .send_IPI_mask = flat_send_IPI_mask, + .send_IPI_self = apic_send_IPI_self, .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = (0xFFu<<24), }; /* @@ -130,6 +170,21 @@ struct genapic apic_flat = { * We cannot use logical delivery in this case because the mask * overflows, so use physical mode. */ +static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ +#ifdef CONFIG_ACPI + /* + * Quirk: some x86_64 machines can only use physical APIC mode + * regardless of how many processors are present (x86_64 ES7000 + * is an example). + */ + if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) + return 1; +#endif + + return 0; +} static cpumask_t physflat_target_cpus(void) { @@ -176,6 +231,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) struct genapic apic_physflat = { .name = "physical flat", + .acpi_madt_oem_check = physflat_acpi_madt_oem_check, .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), .target_cpus = physflat_target_cpus, @@ -185,6 +241,10 @@ struct genapic apic_physflat = { .send_IPI_all = physflat_send_IPI_all, .send_IPI_allbutself = physflat_send_IPI_allbutself, .send_IPI_mask = physflat_send_IPI_mask, + .send_IPI_self = apic_send_IPI_self, .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = (0xFFu<<24), }; diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c new file mode 100644 index 00000000000..e4bf2cc0d74 --- /dev/null +++ b/arch/x86/kernel/genx2apic_cluster.c @@ -0,0 +1,159 @@ +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/dmar.h> + +#include <asm/smp.h> +#include <asm/ipi.h> +#include <asm/genapic.h> + +DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); + +static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if (cpu_has_x2apic) + return 1; + + return 0; +} + +/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ + +static cpumask_t x2apic_target_cpus(void) +{ + return cpumask_of_cpu(0); +} + +/* + * for now each logical cpu is in its own vector allocation domain. + */ +static cpumask_t x2apic_vector_allocation_domain(int cpu) +{ + cpumask_t domain = CPU_MASK_NONE; + cpu_set(cpu, domain); + return domain; +} + +static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, + unsigned int dest) +{ + unsigned long cfg; + + cfg = __prepare_ICR(0, vector, dest); + + /* + * send the IPI. + */ + x2apic_icr_write(cfg, apicid); +} + +/* + * for now, we send the IPI's one by one in the cpumask. + * TBD: Based on the cpu mask, we can send the IPI's to the cluster group + * at once. We have 16 cpu's in a cluster. This will minimize IPI register + * writes. + */ +static void x2apic_send_IPI_mask(cpumask_t mask, int vector) +{ + unsigned long flags; + unsigned long query_cpu; + + local_irq_save(flags); + for_each_cpu_mask(query_cpu, mask) { + __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, APIC_DEST_LOGICAL); + } + local_irq_restore(flags); +} + +static void x2apic_send_IPI_allbutself(int vector) +{ + cpumask_t mask = cpu_online_map; + + cpu_clear(smp_processor_id(), mask); + + if (!cpus_empty(mask)) + x2apic_send_IPI_mask(mask, vector); +} + +static void x2apic_send_IPI_all(int vector) +{ + x2apic_send_IPI_mask(cpu_online_map, vector); +} + +static int x2apic_apic_id_registered(void) +{ + return 1; +} + +static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = first_cpu(cpumask); + if ((unsigned)cpu < NR_CPUS) + return per_cpu(x86_cpu_to_logical_apicid, cpu); + else + return BAD_APICID; +} + +static unsigned int get_apic_id(unsigned long x) +{ + unsigned int id; + + id = x; + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + x = id; + return x; +} + +static unsigned int phys_pkg_id(int index_msb) +{ + return current_cpu_data.initial_apicid >> index_msb; +} + +static void x2apic_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + +static void init_x2apic_ldr(void) +{ + int cpu = smp_processor_id(); + + per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); + return; +} + +struct genapic apic_x2apic_cluster = { + .name = "cluster x2apic", + .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .int_delivery_mode = dest_LowestPrio, + .int_dest_mode = (APIC_DEST_LOGICAL != 0), + .target_cpus = x2apic_target_cpus, + .vector_allocation_domain = x2apic_vector_allocation_domain, + .apic_id_registered = x2apic_apic_id_registered, + .init_apic_ldr = init_x2apic_ldr, + .send_IPI_all = x2apic_send_IPI_all, + .send_IPI_allbutself = x2apic_send_IPI_allbutself, + .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_self = x2apic_send_IPI_self, + .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = (0xFFFFFFFFu), +}; diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c new file mode 100644 index 00000000000..8f1343df262 --- /dev/null +++ b/arch/x86/kernel/genx2apic_phys.c @@ -0,0 +1,154 @@ +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/dmar.h> + +#include <asm/smp.h> +#include <asm/ipi.h> +#include <asm/genapic.h> + +static int x2apic_phys; + +static int set_x2apic_phys_mode(char *arg) +{ + x2apic_phys = 1; + return 0; +} +early_param("x2apic_phys", set_x2apic_phys_mode); + +static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if (cpu_has_x2apic && x2apic_phys) + return 1; + + return 0; +} + +/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ + +static cpumask_t x2apic_target_cpus(void) +{ + return cpumask_of_cpu(0); +} + +static cpumask_t x2apic_vector_allocation_domain(int cpu) +{ + cpumask_t domain = CPU_MASK_NONE; + cpu_set(cpu, domain); + return domain; +} + +static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, + unsigned int dest) +{ + unsigned long cfg; + + cfg = __prepare_ICR(0, vector, dest); + + /* + * send the IPI. + */ + x2apic_icr_write(cfg, apicid); +} + +static void x2apic_send_IPI_mask(cpumask_t mask, int vector) +{ + unsigned long flags; + unsigned long query_cpu; + + local_irq_save(flags); + for_each_cpu_mask(query_cpu, mask) { + __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} + +static void x2apic_send_IPI_allbutself(int vector) +{ + cpumask_t mask = cpu_online_map; + + cpu_clear(smp_processor_id(), mask); + + if (!cpus_empty(mask)) + x2apic_send_IPI_mask(mask, vector); +} + +static void x2apic_send_IPI_all(int vector) +{ + x2apic_send_IPI_mask(cpu_online_map, vector); +} + +static int x2apic_apic_id_registered(void) +{ + return 1; +} + +static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = first_cpu(cpumask); + if ((unsigned)cpu < NR_CPUS) + return per_cpu(x86_cpu_to_apicid, cpu); + else + return BAD_APICID; +} + +static unsigned int get_apic_id(unsigned long x) +{ + unsigned int id; + + id = x; + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + x = id; + return x; +} + +static unsigned int phys_pkg_id(int index_msb) +{ + return current_cpu_data.initial_apicid >> index_msb; +} + +void x2apic_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + +void init_x2apic_ldr(void) +{ + return; +} + +struct genapic apic_x2apic_phys = { + .name = "physical x2apic", + .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .int_delivery_mode = dest_Fixed, + .int_dest_mode = (APIC_DEST_PHYSICAL != 0), + .target_cpus = x2apic_target_cpus, + .vector_allocation_domain = x2apic_vector_allocation_domain, + .apic_id_registered = x2apic_apic_id_registered, + .init_apic_ldr = init_x2apic_ldr, + .send_IPI_all = x2apic_send_IPI_all, + .send_IPI_allbutself = x2apic_send_IPI_allbutself, + .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_self = x2apic_send_IPI_self, + .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = (0xFFFFFFFFu), +}; diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 2d7e307c777..33581d94a90 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -12,12 +12,12 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/string.h> -#include <linux/kernel.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/bootmem.h> #include <linux/module.h> +#include <linux/hardirq.h> #include <asm/smp.h> #include <asm/ipi.h> #include <asm/genapic.h> @@ -26,6 +26,36 @@ #include <asm/uv/uv_hub.h> #include <asm/uv/bios.h> +DEFINE_PER_CPU(int, x2apic_extra_bits); + +static enum uv_system_type uv_system_type; + +static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if (!strcmp(oem_id, "SGI")) { + if (!strcmp(oem_table_id, "UVL")) + uv_system_type = UV_LEGACY_APIC; + else if (!strcmp(oem_table_id, "UVX")) + uv_system_type = UV_X2APIC; + else if (!strcmp(oem_table_id, "UVH")) { + uv_system_type = UV_NON_UNIQUE_APIC; + return 1; + } + } + return 0; +} + +enum uv_system_type get_uv_system_type(void) +{ + return uv_system_type; +} + +int is_uv_system(void) +{ + return uv_system_type != UV_NONE; +} +EXPORT_SYMBOL_GPL(is_uv_system); + DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); @@ -84,7 +114,7 @@ static void uv_send_IPI_one(int cpu, int vector) unsigned long val, apicid, lapicid; int pnode; - apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ + apicid = per_cpu(x86_cpu_to_apicid, cpu); lapicid = apicid & 0x3f; /* ZZZ macro needed */ pnode = uv_apicid_to_pnode(apicid); val = @@ -123,6 +153,10 @@ static int uv_apic_id_registered(void) return 1; } +static void uv_init_apic_ldr(void) +{ +} + static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) { int cpu; @@ -138,31 +172,59 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) return BAD_APICID; } +static unsigned int get_apic_id(unsigned long x) +{ + unsigned int id; + + WARN_ON(preemptible() && num_online_cpus() > 1); + id = x | __get_cpu_var(x2apic_extra_bits); + + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + /* maskout x2apic_extra_bits ? */ + x = id; + return x; +} + +static unsigned int uv_read_apic_id(void) +{ + + return get_apic_id(apic_read(APIC_ID)); +} + static unsigned int phys_pkg_id(int index_msb) { - return GET_APIC_ID(read_apic_id()) >> index_msb; + return uv_read_apic_id() >> index_msb; } -#ifdef ZZZ /* Needs x2apic patch */ static void uv_send_IPI_self(int vector) { apic_write(APIC_SELF_IPI, vector); } -#endif struct genapic apic_x2apic_uv_x = { .name = "UV large system", + .acpi_madt_oem_check = uv_acpi_madt_oem_check, .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), .target_cpus = uv_target_cpus, - .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */ + .vector_allocation_domain = uv_vector_allocation_domain, .apic_id_registered = uv_apic_id_registered, + .init_apic_ldr = uv_init_apic_ldr, .send_IPI_all = uv_send_IPI_all, .send_IPI_allbutself = uv_send_IPI_allbutself, .send_IPI_mask = uv_send_IPI_mask, - /* ZZZ.send_IPI_self = uv_send_IPI_self, */ + .send_IPI_self = uv_send_IPI_self, .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, - .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ + .phys_pkg_id = phys_pkg_id, + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = (0xFFFFFFFFu), }; static __cpuinit void set_x2apic_extra_bits(int pnode) @@ -222,12 +284,13 @@ static __init void map_low_mmrs(void) enum map_type {map_wb, map_uc}; -static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type) +static __init void map_high(char *id, unsigned long base, int shift, + int max_pnode, enum map_type map_type) { unsigned long bytes, paddr; paddr = base << shift; - bytes = (1UL << shift); + bytes = (1UL << shift) * (max_pnode + 1); printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); if (map_type == map_uc) @@ -243,7 +306,7 @@ static __init void map_gru_high(int max_pnode) gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); if (gru.s.enable) - map_high("GRU", gru.s.base, shift, map_wb); + map_high("GRU", gru.s.base, shift, max_pnode, map_wb); } static __init void map_config_high(int max_pnode) @@ -253,7 +316,7 @@ static __init void map_config_high(int max_pnode) cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); if (cfg.s.enable) - map_high("CONFIG", cfg.s.base, shift, map_uc); + map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc); } static __init void map_mmr_high(int max_pnode) @@ -263,7 +326,7 @@ static __init void map_mmr_high(int max_pnode) mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, map_uc); + map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); } static __init void map_mmioh_high(int max_pnode) @@ -273,7 +336,7 @@ static __init void map_mmioh_high(int max_pnode) mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); if (mmioh.s.enable) - map_high("MMIOH", mmioh.s.base, shift, map_uc); + map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); } static __init void uv_rtc_init(void) @@ -293,7 +356,9 @@ static __init void uv_rtc_init(void) sn_rtc_cycles_per_second = ticks_per_sec; } -static __init void uv_system_init(void) +static bool uv_system_inited; + +void __init uv_system_init(void) { union uvh_si_addr_map_config_u m_n_config; union uvh_node_id_u node_id; @@ -383,6 +448,7 @@ static __init void uv_system_init(void) map_mmr_high(max_pnode); map_config_high(max_pnode); map_mmioh_high(max_pnode); + uv_system_inited = true; } /* @@ -391,11 +457,12 @@ static __init void uv_system_init(void) */ void __cpuinit uv_cpu_init(void) { - if (!uv_node_to_blade) - uv_system_init(); + BUG_ON(!uv_system_inited); uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; if (get_uv_system_type() == UV_NON_UNIQUE_APIC) set_x2apic_extra_bits(uv_hub_info->pnode); } + + diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 3e66bd364a9..1dcb0f13897 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -35,6 +35,7 @@ void __init reserve_ebda_region(void) /* start of EBDA area */ ebda_addr = get_bios_ebda(); + printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem); /* Fixup: bios puts an EBDA in the top 64K segment */ /* of conventional memory, but does not adjust lowmem. */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 9bfc4d72fb2..d16084f9064 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -108,12 +108,11 @@ void __init x86_64_start_kernel(char * real_mode_data) } load_idt((const struct desc_ptr *)&idt_descr); - early_printk("Kernel alive\n"); + if (console_loglevel == 10) + early_printk("Kernel alive\n"); x86_64_init_pda(); - early_printk("Kernel really alive\n"); - x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index a7010c3a377..e835b4eea70 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4 * * Note that the stack is not yet set up! */ -#define PTE_ATTR 0x007 /* PRESENT+RW+USER */ -#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ -#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */ - default_entry: #ifdef CONFIG_X86_PAE @@ -196,9 +192,9 @@ default_entry: movl $pa(pg0), %edi movl %edi, pa(init_pg_tables_start) movl $pa(swapper_pg_pmd), %edx - movl $PTE_ATTR, %eax + movl $PTE_IDENT_ATTR, %eax 10: - leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ + leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ movl %ecx,(%edx) /* Store PMD entry */ /* Upper half already zero */ addl $8,%edx @@ -215,7 +211,7 @@ default_entry: * End condition: we must map up to and including INIT_MAP_BEYOND_END * bytes beyond the end of our own page tables. */ - leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp + leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp cmpl %ebp,%eax jb 10b 1: @@ -224,7 +220,7 @@ default_entry: movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ - movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax + movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) #else /* Not PAE */ @@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl $pa(pg0), %edi movl %edi, pa(init_pg_tables_start) movl $pa(swapper_pg_dir), %edx - movl $PTE_ATTR, %eax + movl $PTE_IDENT_ATTR, %eax 10: - leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ + leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ movl %ecx,(%edx) /* Store identity PDE entry */ movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ addl $4,%edx @@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); * bytes beyond the end of our own page tables; the +0x007 is * the attribute bits */ - leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp + leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp cmpl %ebp,%eax jb 10b movl %edi,pa(init_pg_tables_end) @@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ - movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax + movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax movl %eax,pa(swapper_pg_dir+0xffc) #endif jmp 3f @@ -634,19 +630,19 @@ ENTRY(empty_zero_page) /* Page-aligned for the benefit of paravirt? */ .align PAGE_SIZE_asm ENTRY(swapper_pg_dir) - .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 - .long pa(swapper_pg_pmd+PGD_ATTR),0 - .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 - .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0 # elif KPMDS == 2 .long 0,0 - .long pa(swapper_pg_pmd+PGD_ATTR),0 - .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 # elif KPMDS == 1 .long 0,0 .long 0,0 - .long pa(swapper_pg_pmd+PGD_ATTR),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 # else # error "Kernel PMDs should be 1, 2 or 3" # endif diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index db3280afe88..26cfdc1d7c7 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -110,7 +110,7 @@ startup_64: movq %rdi, %rax shrq $PMD_SHIFT, %rax andq $(PTRS_PER_PMD - 1), %rax - leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx + leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx leaq level2_spare_pgt(%rip), %rbx movq %rdx, 0(%rbx, %rax, 8) ident_complete: @@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. * Don't set NX because code runs from these pages. */ - PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) NEXT_PAGE(level2_kernel_pgt) /* diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 59fd3b6b130..acf62fc233d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -115,13 +115,17 @@ static void hpet_reserve_platform_timers(unsigned long id) hd.hd_phys_address = hpet_address; hd.hd_address = hpet; hd.hd_nirqs = nrtimers; - hd.hd_flags = HPET_DATA_PLATFORM; hpet_reserve_timer(&hd, 0); #ifdef CONFIG_HPET_EMULATE_RTC hpet_reserve_timer(&hd, 1); #endif + /* + * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254 + * is wrong for i8259!) not the output IRQ. Many BIOS writers + * don't bother configuring *any* comparator interrupts. + */ hd.hd_irq[0] = HPET_LEGACY_8254; hd.hd_irq[1] = HPET_LEGACY_RTC; @@ -210,8 +214,8 @@ static void hpet_legacy_clockevent_register(void) /* Calculate the min / max delta */ hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &hpet_clockevent); - hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, - &hpet_clockevent); + /* 5 usec minimum reprogramming delta. */ + hpet_clockevent.min_delta_ns = 5000; /* * Start hpet with the boot cpu mask and make it @@ -270,15 +274,22 @@ static void hpet_legacy_set_mode(enum clock_event_mode mode, } static int hpet_legacy_next_event(unsigned long delta, - struct clock_event_device *evt) + struct clock_event_device *evt) { - unsigned long cnt; + u32 cnt; cnt = hpet_readl(HPET_COUNTER); - cnt += delta; + cnt += (u32) delta; hpet_writel(cnt, HPET_T0_CMP); - return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0; + /* + * We need to read back the CMP register to make sure that + * what we wrote hit the chip before we compare it to the + * counter. + */ + WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt); + + return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } /* diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index eb9ddd8efb8..1f20608d4ca 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -21,9 +21,12 @@ # include <asm/sigcontext32.h> # include <asm/user32.h> #else -# define save_i387_ia32 save_i387 -# define restore_i387_ia32 restore_i387 +# define save_i387_xstate_ia32 save_i387_xstate +# define restore_i387_xstate_ia32 restore_i387_xstate # define _fpstate_ia32 _fpstate +# define _xstate_ia32 _xstate +# define sig_xstate_ia32_size sig_xstate_size +# define fx_sw_reserved_ia32 fx_sw_reserved # define user_i387_ia32_struct user_i387_struct # define user32_fxsr_struct user_fxsr_struct #endif @@ -36,6 +39,7 @@ static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; unsigned int xstate_size; +unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); static struct i387_fxsave_struct fx_scratch __cpuinitdata; void __cpuinit mxcsr_feature_mask_init(void) @@ -61,6 +65,11 @@ void __init init_thread_xstate(void) return; } + if (cpu_has_xsave) { + xsave_cntxt_init(); + return; + } + if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); #ifdef CONFIG_X86_32 @@ -83,9 +92,19 @@ void __cpuinit fpu_init(void) write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ + /* + * Boot processor to setup the FP and extended state context info. + */ + if (!smp_processor_id()) + init_thread_xstate(); + xsave_init(); + mxcsr_feature_mask_init(); /* clean state in init */ - current_thread_info()->status = 0; + if (cpu_has_xsave) + current_thread_info()->status = TS_XSAVE; + else + current_thread_info()->status = 0; clear_used_math(); } #endif /* CONFIG_X86_64 */ @@ -195,6 +214,13 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, */ target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; + /* + * update the header bits in the xsave header, indicating the + * presence of FP and SSE state. + */ + if (cpu_has_xsave) + target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; + return ret; } @@ -395,6 +421,12 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (!ret) convert_to_fxsr(target, &env); + /* + * update the header bit in the xsave header, indicating the + * presence of FP. + */ + if (cpu_has_xsave) + target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; return ret; } @@ -407,7 +439,6 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) struct task_struct *tsk = current; struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; - unlazy_fpu(tsk); fp->status = fp->swd; if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) return -1; @@ -421,8 +452,6 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) struct user_i387_ia32_struct env; int err = 0; - unlazy_fpu(tsk); - convert_from_fxsr(&env, tsk); if (__copy_to_user(buf, &env, sizeof(env))) return -1; @@ -432,16 +461,54 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) if (err) return -1; - if (__copy_to_user(&buf->_fxsr_env[0], fx, - sizeof(struct i387_fxsave_struct))) + if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size)) + return -1; + return 1; +} + +static int save_i387_xsave(void __user *buf) +{ + struct task_struct *tsk = current; + struct _fpstate_ia32 __user *fx = buf; + int err = 0; + + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. + * This will enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xstate_bv in the xsave header. + * + * xsave aware applications can change the xstate_bv in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; + + if (save_i387_fxsave(fx) < 0) + return -1; + + err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32, + sizeof(struct _fpx_sw_bytes)); + err |= __put_user(FP_XSTATE_MAGIC2, + (__u32 __user *) (buf + sig_xstate_ia32_size + - FP_XSTATE_MAGIC2_SIZE)); + if (err) return -1; + return 1; } -int save_i387_ia32(struct _fpstate_ia32 __user *buf) +int save_i387_xstate_ia32(void __user *buf) { + struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf; + struct task_struct *tsk = current; + if (!used_math()) return 0; + + if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size)) + return -EACCES; /* * This will cause a "finit" to be triggered by the next * attempted FPU operation by the 'current' process. @@ -451,13 +518,17 @@ int save_i387_ia32(struct _fpstate_ia32 __user *buf) if (!HAVE_HWFP) { return fpregs_soft_get(current, NULL, 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) ? -1 : 1; + NULL, fp) ? -1 : 1; } + unlazy_fpu(tsk); + + if (cpu_has_xsave) + return save_i387_xsave(fp); if (cpu_has_fxsr) - return save_i387_fxsave(buf); + return save_i387_fxsave(fp); else - return save_i387_fsave(buf); + return save_i387_fsave(fp); } static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) @@ -468,14 +539,15 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) sizeof(struct i387_fsave_struct)); } -static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) +static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf, + unsigned int size) { struct task_struct *tsk = current; struct user_i387_ia32_struct env; int err; err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], - sizeof(struct i387_fxsave_struct)); + size); /* mxcsr reserved bits must be masked to zero for security reasons */ tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; if (err || __copy_from_user(&env, buf, sizeof(env))) @@ -485,14 +557,69 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) return 0; } -int restore_i387_ia32(struct _fpstate_ia32 __user *buf) +static int restore_i387_xsave(void __user *buf) +{ + struct _fpx_sw_bytes fx_sw_user; + struct _fpstate_ia32 __user *fx_user = + ((struct _fpstate_ia32 __user *) buf); + struct i387_fxsave_struct __user *fx = + (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0]; + struct xsave_hdr_struct *xsave_hdr = + ¤t->thread.xstate->xsave.xsave_hdr; + u64 mask; + int err; + + if (check_for_xstate(fx, buf, &fx_sw_user)) + goto fx_only; + + mask = fx_sw_user.xstate_bv; + + err = restore_i387_fxsave(buf, fx_sw_user.xstate_size); + + xsave_hdr->xstate_bv &= pcntxt_mask; + /* + * These bits must be zero. + */ + xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + + /* + * Init the state that is not present in the memory layout + * and enabled by the OS. + */ + mask = ~(pcntxt_mask & ~mask); + xsave_hdr->xstate_bv &= mask; + + return err; +fx_only: + /* + * Couldn't find the extended state information in the memory + * layout. Restore the FP/SSE and init the other extended state + * enabled by the OS. + */ + xsave_hdr->xstate_bv = XSTATE_FPSSE; + return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct)); +} + +int restore_i387_xstate_ia32(void __user *buf) { int err; struct task_struct *tsk = current; + struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf; if (HAVE_HWFP) clear_fpu(tsk); + if (!buf) { + if (used_math()) { + clear_fpu(tsk); + clear_used_math(); + } + + return 0; + } else + if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size)) + return -EACCES; + if (!used_math()) { err = init_fpu(tsk); if (err) @@ -500,14 +627,17 @@ int restore_i387_ia32(struct _fpstate_ia32 __user *buf) } if (HAVE_HWFP) { - if (cpu_has_fxsr) - err = restore_i387_fxsave(buf); + if (cpu_has_xsave) + err = restore_i387_xsave(buf); + else if (cpu_has_fxsr) + err = restore_i387_fxsave(fp, sizeof(struct + i387_fxsave_struct)); else - err = restore_i387_fsave(buf); + err = restore_i387_fsave(fp); } else { err = fpregs_soft_set(current, NULL, 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) != 0; + NULL, fp) != 0; } set_used_math(); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index dc92b49d920..4b8a53d841f 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -282,6 +282,30 @@ static int __init i8259A_init_sysfs(void) device_initcall(i8259A_init_sysfs); +void mask_8259A(void) +{ + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +void unmask_8259A(void) +{ + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} + void init_8259A(int auto_eoi) { unsigned long flags; diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 09cddb57bec..e710289f673 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -46,10 +46,13 @@ #include <asm/nmi.h> #include <asm/msidef.h> #include <asm/hypertransport.h> +#include <asm/setup.h> #include <mach_apic.h> #include <mach_apicdef.h> +#define __apicdebuginit(type) static type __init + int (*ioapic_renumber_irq)(int ioapic, int irq); atomic_t irq_mis_count; @@ -1341,7 +1344,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, ioapic_write_entry(apic, pin, entry); } -void __init print_IO_APIC(void) + +__apicdebuginit(void) print_IO_APIC(void) { int apic, i; union IO_APIC_reg_00 reg_00; @@ -1456,9 +1460,7 @@ void __init print_IO_APIC(void) return; } -#if 0 - -static void print_APIC_bitfield(int base) +__apicdebuginit(void) print_APIC_bitfield(int base) { unsigned int v; int i, j; @@ -1479,9 +1481,10 @@ static void print_APIC_bitfield(int base) } } -void /*__init*/ print_local_APIC(void *dummy) +__apicdebuginit(void) print_local_APIC(void *dummy) { unsigned int v, ver, maxlvt; + u64 icr; if (apic_verbosity == APIC_QUIET) return; @@ -1490,7 +1493,7 @@ void /*__init*/ print_local_APIC(void *dummy) smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, - GET_APIC_ID(read_apic_id())); + GET_APIC_ID(v)); v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); @@ -1532,10 +1535,9 @@ void /*__init*/ print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC ESR: %08x\n", v); } - v = apic_read(APIC_ICR); - printk(KERN_DEBUG "... APIC ICR: %08x\n", v); - v = apic_read(APIC_ICR2); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); + icr = apic_icr_read(); + printk(KERN_DEBUG "... APIC ICR: %08x\n", icr); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32); v = apic_read(APIC_LVTT); printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); @@ -1563,12 +1565,12 @@ void /*__init*/ print_local_APIC(void *dummy) printk("\n"); } -void print_all_local_APICs(void) +__apicdebuginit(void) print_all_local_APICs(void) { on_each_cpu(print_local_APIC, NULL, 1); } -void /*__init*/ print_PIC(void) +__apicdebuginit(void) print_PIC(void) { unsigned int v; unsigned long flags; @@ -1600,7 +1602,17 @@ void /*__init*/ print_PIC(void) printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); } -#endif /* 0 */ +__apicdebuginit(int) print_all_ICs(void) +{ + print_PIC(); + print_all_local_APICs(); + print_IO_APIC(); + + return 0; +} + +fs_initcall(print_all_ICs); + static void __init enable_IO_APIC(void) { @@ -1698,8 +1710,7 @@ void disable_IO_APIC(void) entry.dest_mode = 0; /* Physical */ entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; - entry.dest.physical.physical_dest = - GET_APIC_ID(read_apic_id()); + entry.dest.physical.physical_dest = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: @@ -1725,10 +1736,8 @@ static void __init setup_ioapic_ids_from_mpc(void) unsigned char old_id; unsigned long flags; -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) + if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) return; -#endif /* * Don't check I/O APIC IDs for xAPIC systems. They have @@ -2329,8 +2338,6 @@ void __init setup_IO_APIC(void) setup_IO_APIC_irqs(); init_IO_APIC_traps(); check_timer(); - if (!acpi_ioapic) - print_IO_APIC(); } /* diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 61a83b70c18..02063ae042f 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -37,6 +37,7 @@ #include <acpi/acpi_bus.h> #endif #include <linux/bootmem.h> +#include <linux/dmar.h> #include <asm/idle.h> #include <asm/io.h> @@ -49,10 +50,13 @@ #include <asm/nmi.h> #include <asm/msidef.h> #include <asm/hypertransport.h> +#include <asm/irq_remapping.h> #include <mach_ipi.h> #include <mach_apic.h> +#define __apicdebuginit(type) static type __init + struct irq_cfg { cpumask_t domain; cpumask_t old_domain; @@ -87,8 +91,6 @@ int first_system_vector = 0xfe; char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; -#define __apicdebuginit __init - int sis_apic_bug; /* not actually supported, dummy for compile */ static int no_timer_check; @@ -108,6 +110,9 @@ static DEFINE_SPINLOCK(vector_lock); */ int nr_ioapic_registers[MAX_IO_APICS]; +/* I/O APIC RTE contents at the OS boot up */ +struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; + /* I/O APIC entries */ struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; @@ -303,7 +308,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) pin = entry->pin; if (pin == -1) break; - io_apic_write(apic, 0x11 + pin*2, dest); + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(irq)) + io_apic_write(apic, 0x11 + pin*2, dest); reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; @@ -440,6 +450,69 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } +/* + * Saves and masks all the unmasked IO-APIC RTE's + */ +int save_mask_IO_APIC_setup(void) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + int apic, pin; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } + + for (apic = 0; apic < nr_ioapics; apic++) { + early_ioapic_entries[apic] = + kzalloc(sizeof(struct IO_APIC_route_entry) * + nr_ioapic_registers[apic], GFP_KERNEL); + if (!early_ioapic_entries[apic]) + return -ENOMEM; + } + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + + entry = early_ioapic_entries[apic][pin] = + ioapic_read_entry(apic, pin); + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + } + } + return 0; +} + +void restore_IO_APIC_setup(void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + ioapic_write_entry(apic, pin, + early_ioapic_entries[apic][pin]); +} + +void reinit_intr_remapped_IO_APIC(int intr_remapping) +{ + /* + * for now plain restore of previous settings. + * TBD: In the case of OS enabling interrupt-remapping, + * IO-APIC RTE's need to be setup to point to interrupt-remapping + * table entries. for now, do a plain restore, and wait for + * the setup_IO_APIC_irqs() to do proper initialization. + */ + restore_IO_APIC_setup(); +} + int skip_ioapic_setup; int ioapic_force; @@ -839,18 +912,98 @@ void __setup_vector_irq(int cpu) } static struct irq_chip ioapic_chip; +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip; +#endif static void ioapic_register_intr(int irq, unsigned long trigger) { - if (trigger) { + if (trigger) irq_desc[irq].status |= IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); - } else { + else irq_desc[irq].status &= ~IRQ_LEVEL; + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + irq_desc[irq].status |= IRQ_MOVE_PCNTXT; + if (trigger) + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_fasteoi_irq, + "fasteoi"); + else + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_edge_irq, "edge"); + return; + } +#endif + if (trigger) + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_fasteoi_irq, + "fasteoi"); + else set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); +} + +static int setup_ioapic_entry(int apic, int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int trigger, + int polarity, int vector) +{ + /* + * add it to the IO-APIC irq-routing table: + */ + memset(entry,0,sizeof(*entry)); + +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_ioapic_to_ir(apic); + struct irte irte; + struct IR_IO_APIC_route_entry *ir_entry = + (struct IR_IO_APIC_route_entry *) entry; + int index; + + if (!iommu) + panic("No mapping iommu for ioapic %d\n", apic); + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + panic("Failed to allocate IRTE for ioapic %d\n", apic); + + memset(&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = trigger; + irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = vector; + irte.dest_id = IRTE_DEST(destination); + + modify_irte(irq, &irte); + + ir_entry->index2 = (index >> 15) & 0x1; + ir_entry->zero = 0; + ir_entry->format = 1; + ir_entry->index = (index & 0x7fff); + } else +#endif + { + entry->delivery_mode = INT_DELIVERY_MODE; + entry->dest_mode = INT_DEST_MODE; + entry->dest = destination; } + + entry->mask = 0; /* enable IRQ */ + entry->trigger = trigger; + entry->polarity = polarity; + entry->vector = vector; + + /* Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + */ + if (trigger) + entry->mask = 1; + return 0; } static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, @@ -875,24 +1028,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, irq, trigger, polarity); - /* - * add it to the IO-APIC irq-routing table: - */ - memset(&entry,0,sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.dest = cpu_mask_to_apicid(mask); - entry.mask = 0; /* enable IRQ */ - entry.trigger = trigger; - entry.polarity = polarity; - entry.vector = cfg->vector; - /* Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (trigger) - entry.mask = 1; + if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, + cpu_mask_to_apicid(mask), trigger, polarity, + cfg->vector)) { + printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", + mp_ioapics[apic].mp_apicid, pin); + __clear_irq_vector(irq); + return; + } ioapic_register_intr(irq, trigger); if (irq < 16) @@ -944,6 +1088,9 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, { struct IO_APIC_route_entry entry; + if (intr_remapping_enabled) + return; + memset(&entry, 0, sizeof(entry)); /* @@ -970,7 +1117,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, ioapic_write_entry(apic, pin, entry); } -void __apicdebuginit print_IO_APIC(void) + +__apicdebuginit(void) print_IO_APIC(void) { int apic, i; union IO_APIC_reg_00 reg_00; @@ -1064,9 +1212,7 @@ void __apicdebuginit print_IO_APIC(void) return; } -#if 0 - -static __apicdebuginit void print_APIC_bitfield (int base) +__apicdebuginit(void) print_APIC_bitfield(int base) { unsigned int v; int i, j; @@ -1087,9 +1233,10 @@ static __apicdebuginit void print_APIC_bitfield (int base) } } -void __apicdebuginit print_local_APIC(void * dummy) +__apicdebuginit(void) print_local_APIC(void *dummy) { unsigned int v, ver, maxlvt; + unsigned long icr; if (apic_verbosity == APIC_QUIET) return; @@ -1097,7 +1244,7 @@ void __apicdebuginit print_local_APIC(void * dummy) printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); @@ -1133,10 +1280,9 @@ void __apicdebuginit print_local_APIC(void * dummy) v = apic_read(APIC_ESR); printk(KERN_DEBUG "... APIC ESR: %08x\n", v); - v = apic_read(APIC_ICR); - printk(KERN_DEBUG "... APIC ICR: %08x\n", v); - v = apic_read(APIC_ICR2); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); + icr = apic_icr_read(); + printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); v = apic_read(APIC_LVTT); printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); @@ -1164,12 +1310,12 @@ void __apicdebuginit print_local_APIC(void * dummy) printk("\n"); } -void print_all_local_APICs (void) +__apicdebuginit(void) print_all_local_APICs(void) { on_each_cpu(print_local_APIC, NULL, 1); } -void __apicdebuginit print_PIC(void) +__apicdebuginit(void) print_PIC(void) { unsigned int v; unsigned long flags; @@ -1201,7 +1347,17 @@ void __apicdebuginit print_PIC(void) printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); } -#endif /* 0 */ +__apicdebuginit(int) print_all_ICs(void) +{ + print_PIC(); + print_all_local_APICs(); + print_IO_APIC(); + + return 0; +} + +fs_initcall(print_all_ICs); + void __init enable_IO_APIC(void) { @@ -1291,7 +1447,7 @@ void disable_IO_APIC(void) entry.dest_mode = 0; /* Physical */ entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; - entry.dest = GET_APIC_ID(read_apic_id()); + entry.dest = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: @@ -1397,6 +1553,147 @@ static int ioapic_retrigger_irq(unsigned int irq) */ #ifdef CONFIG_SMP + +#ifdef CONFIG_INTR_REMAP +static void ir_irq_migration(struct work_struct *work); + +static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); + +/* + * Migrate the IO-APIC irq in the presence of intr-remapping. + * + * For edge triggered, irq migration is a simple atomic update(of vector + * and cpu destination) of IRTE and flush the hardware cache. + * + * For level triggered, we need to modify the io-apic RTE aswell with the update + * vector information, along with modifying IRTE with vector and destination. + * So irq migration for level triggered is little bit more complex compared to + * edge triggered migration. But the good news is, we use the same algorithm + * for level triggered migration as we have today, only difference being, + * we now initiate the irq migration from process context instead of the + * interrupt context. + * + * In future, when we do a directed EOI (combined with cpu EOI broadcast + * suppression) to the IO-APIC, level triggered irq migration will also be + * as simple as edge triggered migration and we can do the irq migration + * with a simple atomic update to IO-APIC RTE. + */ +static void migrate_ioapic_irq(int irq, cpumask_t mask) +{ + struct irq_cfg *cfg = irq_cfg + irq; + struct irq_desc *desc = irq_desc + irq; + cpumask_t tmp, cleanup_mask; + struct irte irte; + int modify_ioapic_rte = desc->status & IRQ_LEVEL; + unsigned int dest; + unsigned long flags; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (get_irte(irq, &irte)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + if (modify_ioapic_rte) { + spin_lock_irqsave(&ioapic_lock, flags); + __target_IO_APIC_irq(irq, dest, cfg->vector); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * Modified the IRTE and flushes the Interrupt entry cache. + */ + modify_irte(irq, &irte); + + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } + + irq_desc[irq].affinity = mask; +} + +static int migrate_irq_remapped_level(int irq) +{ + int ret = -1; + + mask_IO_APIC_irq(irq); + + if (io_apic_level_ack_pending(irq)) { + /* + * Interrupt in progress. Migrating irq now will change the + * vector information in the IO-APIC RTE and that will confuse + * the EOI broadcast performed by cpu. + * So, delay the irq migration to the next instance. + */ + schedule_delayed_work(&ir_migration_work, 1); + goto unmask; + } + + /* everthing is clear. we have right of way */ + migrate_ioapic_irq(irq, irq_desc[irq].pending_mask); + + ret = 0; + irq_desc[irq].status &= ~IRQ_MOVE_PENDING; + cpus_clear(irq_desc[irq].pending_mask); + +unmask: + unmask_IO_APIC_irq(irq); + return ret; +} + +static void ir_irq_migration(struct work_struct *work) +{ + int irq; + + for (irq = 0; irq < NR_IRQS; irq++) { + struct irq_desc *desc = irq_desc + irq; + if (desc->status & IRQ_MOVE_PENDING) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + if (!desc->chip->set_affinity || + !(desc->status & IRQ_MOVE_PENDING)) { + desc->status &= ~IRQ_MOVE_PENDING; + spin_unlock_irqrestore(&desc->lock, flags); + continue; + } + + desc->chip->set_affinity(irq, + irq_desc[irq].pending_mask); + spin_unlock_irqrestore(&desc->lock, flags); + } + } +} + +/* + * Migrates the IRQ destination in the process context. + */ +static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + if (irq_desc[irq].status & IRQ_LEVEL) { + irq_desc[irq].status |= IRQ_MOVE_PENDING; + irq_desc[irq].pending_mask = mask; + migrate_irq_remapped_level(irq); + return; + } + + migrate_ioapic_irq(irq, mask); +} +#endif + asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -1453,6 +1750,17 @@ static void irq_complete_move(unsigned int irq) #else static inline void irq_complete_move(unsigned int irq) {} #endif +#ifdef CONFIG_INTR_REMAP +static void ack_x2apic_level(unsigned int irq) +{ + ack_x2APIC_irq(); +} + +static void ack_x2apic_edge(unsigned int irq) +{ + ack_x2APIC_irq(); +} +#endif static void ack_apic_edge(unsigned int irq) { @@ -1527,6 +1835,21 @@ static struct irq_chip ioapic_chip __read_mostly = { .retrigger = ioapic_retrigger_irq, }; +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip __read_mostly = { + .name = "IR-IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_x2apic_edge, + .eoi = ack_x2apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ir_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; +#endif + static inline void init_IO_APIC_traps(void) { int irq; @@ -1712,6 +2035,8 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { + if (intr_remapping_enabled) + panic("BIOS bug: timer not connected to IO-APIC"); pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -1738,6 +2063,8 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } + if (intr_remapping_enabled) + panic("timer doesn't work through Interrupt-remapped IO-APIC"); clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " @@ -1854,8 +2181,6 @@ void __init setup_IO_APIC(void) setup_IO_APIC_irqs(); init_IO_APIC_traps(); check_timer(); - if (!acpi_ioapic) - print_IO_APIC(); } struct sysfs_ioapic_data { @@ -1977,6 +2302,9 @@ void destroy_irq(unsigned int irq) dynamic_irq_cleanup(irq); +#ifdef CONFIG_INTR_REMAP + free_irte(irq); +#endif spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq); spin_unlock_irqrestore(&vector_lock, flags); @@ -1995,11 +2323,42 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms tmp = TARGET_CPUS; err = assign_irq_vector(irq, tmp); - if (!err) { - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + if (err) + return err; + + cpus_and(tmp, cfg->domain, tmp); + dest = cpu_mask_to_apicid(tmp); + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irte irte; + int ir_index; + u16 sub_handle; + + ir_index = map_irq_to_irte_handle(irq, &sub_handle); + BUG_ON(ir_index == -1); + + memset (&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = 0; /* edge */ + irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + modify_irte(irq, &irte); msg->address_hi = MSI_ADDR_BASE_HI; + msg->data = sub_handle; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | + MSI_ADDR_IR_SHV | + MSI_ADDR_IR_INDEX1(ir_index) | + MSI_ADDR_IR_INDEX2(ir_index); + } else +#endif + { + msg->address_hi = MSI_ADDR_BASE_HI; msg->address_lo = MSI_ADDR_BASE_LO | ((INT_DEST_MODE == 0) ? @@ -2049,6 +2408,55 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) write_msi_msg(irq, &msg); irq_desc[irq].affinity = mask; } + +#ifdef CONFIG_INTR_REMAP +/* + * Migrate the MSI irq to another cpumask. This migration is + * done in the process context using interrupt-remapping hardware. + */ +static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg = irq_cfg + irq; + unsigned int dest; + cpumask_t tmp, cleanup_mask; + struct irte irte; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (get_irte(irq, &irte)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * atomically update the IRTE with the new destination and vector. + */ + modify_irte(irq, &irte); + + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. + */ + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } + + irq_desc[irq].affinity = mask; +} +#endif #endif /* CONFIG_SMP */ /* @@ -2066,26 +2474,157 @@ static struct irq_chip msi_chip = { .retrigger = ioapic_retrigger_irq, }; -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +#ifdef CONFIG_INTR_REMAP +static struct irq_chip msi_ir_chip = { + .name = "IR-PCI-MSI", + .unmask = unmask_msi_irq, + .mask = mask_msi_irq, + .ack = ack_x2apic_edge, +#ifdef CONFIG_SMP + .set_affinity = ir_set_msi_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +/* + * Map the PCI dev to the corresponding remapping hardware unit + * and allocate 'nvec' consecutive interrupt-remapping table entries + * in it. + */ +static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) { + struct intel_iommu *iommu; + int index; + + iommu = map_dev_to_ir(dev); + if (!iommu) { + printk(KERN_ERR + "Unable to map PCI %s to iommu\n", pci_name(dev)); + return -ENOENT; + } + + index = alloc_irte(iommu, irq, nvec); + if (index < 0) { + printk(KERN_ERR + "Unable to allocate %d IRTE for PCI %s\n", nvec, + pci_name(dev)); + return -ENOSPC; + } + return index; +} +#endif + +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) +{ + int ret; struct msi_msg msg; + + ret = msi_compose_msg(dev, irq, &msg); + if (ret < 0) + return ret; + + set_irq_msi(irq, desc); + write_msi_msg(irq, &msg); + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irq_desc *desc = irq_desc + irq; + /* + * irq migration in process context + */ + desc->status |= IRQ_MOVE_PCNTXT; + set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); + } else +#endif + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + + return 0; +} + +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +{ int irq, ret; + irq = create_irq(); if (irq < 0) return irq; - ret = msi_compose_msg(dev, irq, &msg); +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + + ret = msi_alloc_irte(dev, irq, 1); + if (ret < 0) + goto error; +no_ir: +#endif + ret = setup_msi_irq(dev, desc, irq); if (ret < 0) { destroy_irq(irq); return ret; } + return 0; - set_irq_msi(irq, desc); - write_msi_msg(irq, &msg); +#ifdef CONFIG_INTR_REMAP +error: + destroy_irq(irq); + return ret; +#endif +} - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); +int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + int irq, ret, sub_handle; + struct msi_desc *desc; +#ifdef CONFIG_INTR_REMAP + struct intel_iommu *iommu = 0; + int index = 0; +#endif + + sub_handle = 0; + list_for_each_entry(desc, &dev->msi_list, list) { + irq = create_irq(); + if (irq < 0) + return irq; +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + if (!sub_handle) { + /* + * allocate the consecutive block of IRTE's + * for 'nvec' + */ + index = msi_alloc_irte(dev, irq, nvec); + if (index < 0) { + ret = index; + goto error; + } + } else { + iommu = map_dev_to_ir(dev); + if (!iommu) { + ret = -ENOENT; + goto error; + } + /* + * setup the mapping between the irq and the IRTE + * base index, the sub_handle pointing to the + * appropriate interrupt remap table entry. + */ + set_irte_irq(irq, iommu, index, sub_handle); + } +no_ir: +#endif + ret = setup_msi_irq(dev, desc, irq); + if (ret < 0) + goto error; + sub_handle++; + } return 0; + +error: + destroy_irq(irq); + return ret; } void arch_teardown_msi_irq(unsigned int irq) @@ -2333,6 +2872,10 @@ void __init setup_ioapic_dest(void) setup_IO_APIC_irq(ioapic, pin, irq, irq_trigger(irq_entry), irq_polarity(irq_entry)); +#ifdef CONFIG_INTR_REMAP + else if (intr_remapping_enabled) + set_ir_ioapic_affinity_irq(irq, TARGET_CPUS); +#endif else set_ioapic_affinity_irq(irq, TARGET_CPUS); } diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index 1c3a66a67f8..720d2607aac 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -92,6 +92,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "30BF") } }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "Presario F700", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30D3") + } + }, { } }; diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 50e5e4a31c8..19191430274 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/thread_info.h> #include <linux/syscalls.h> +#include <asm/syscalls.h> /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ static void set_bitmap(unsigned long *bitmap, unsigned int base, diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index 3f7537b669d..f1c688e46f3 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -20,6 +20,8 @@ #ifdef CONFIG_X86_32 #include <mach_apic.h> +#include <mach_ipi.h> + /* * the following functions deal with sending IPIs between CPUs. * @@ -147,7 +149,6 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector) } /* must come after the send_IPI functions above for inlining */ -#include <mach_ipi.h> static int convert_apicid_to_cpu(int apic_id) { int i; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 1cf8c1fcc08..b71e02d42f4 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -325,7 +325,7 @@ skip: for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_call_count); - seq_printf(p, " function call interrupts\n"); + seq_printf(p, " Function call interrupts\n"); seq_printf(p, "TLB: "); for_each_online_cpu(j) seq_printf(p, "%10u ", diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 1f78b238d8d..f065fe9071b 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -129,7 +129,7 @@ skip: seq_printf(p, "CAL: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); - seq_printf(p, " function call interrupts\n"); + seq_printf(p, " Function call interrupts\n"); seq_printf(p, "TLB: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index d66914287ee..9200a1e2752 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -74,6 +74,15 @@ void __init init_ISA_irqs (void) } } +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ +static struct irqaction irq2 = { + .handler = no_action, + .mask = CPU_MASK_NONE, + .name = "cascade", +}; + /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); @@ -98,6 +107,46 @@ void __init native_init_IRQ(void) set_intr_gate(vector, interrupt[i]); } +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) + /* + * IRQ0 must be given a fixed assignment and initialized, + * because it's used before the IO-APIC is set up. + */ + set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); + + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPI for invalidation */ + alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); + + /* IPI for generic function call */ + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* IPI for single call function */ + set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + /* self generated IPI for local APIC timer */ + alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +#endif + +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) + /* thermal monitor LVT interrupt */ + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif + + if (!acpi_ioapic) + setup_irq(2, &irq2); + /* setup after call gates are initialised (usually add in * the architecture specific gates) */ diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 1f26fd9ec4f..5b5be9d43c2 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -135,7 +135,7 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 }; -static void __init init_ISA_irqs (void) +void __init init_ISA_irqs(void) { int i; @@ -164,22 +164,8 @@ static void __init init_ISA_irqs (void) void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); -void __init native_init_IRQ(void) +static void __init smp_intr_init(void) { - int i; - - init_ISA_irqs(); - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); - } - #ifdef CONFIG_SMP /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper @@ -207,6 +193,12 @@ void __init native_init_IRQ(void) /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); #endif +} + +static void __init apic_intr_init(void) +{ + smp_intr_init(); + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); @@ -216,6 +208,25 @@ void __init native_init_IRQ(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +} + +void __init native_init_IRQ(void) +{ + int i; + + init_ISA_irqs(); + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (vector != IA32_SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + + apic_intr_init(); if (!acpi_ioapic) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c index 7377ccb2133..304d8bad655 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/k8.c @@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges); static u32 *flush_words; struct pci_device_id k8_nb_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) }, {} }; EXPORT_SYMBOL(k8_nb_ids); diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index f2d43bc7551..ff7d3b0124f 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -139,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) if (PageHighMem(pg)) { data = ioremap_cache(pa_data, sizeof(*data)); if (!data) { + kfree(node); error = -ENXIO; goto err_dir; } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index f47f0eb886b..10435a120d2 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -69,6 +69,9 @@ static int gdb_x86vector = -1; */ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) { +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif gdb_regs[GDB_AX] = regs->ax; gdb_regs[GDB_BX] = regs->bx; gdb_regs[GDB_CX] = regs->cx; @@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_SI] = regs->si; gdb_regs[GDB_DI] = regs->di; gdb_regs[GDB_BP] = regs->bp; - gdb_regs[GDB_PS] = regs->flags; gdb_regs[GDB_PC] = regs->ip; #ifdef CONFIG_X86_32 + gdb_regs[GDB_PS] = regs->flags; gdb_regs[GDB_DS] = regs->ds; gdb_regs[GDB_ES] = regs->es; gdb_regs[GDB_CS] = regs->cs; @@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_R13] = regs->r13; gdb_regs[GDB_R14] = regs->r14; gdb_regs[GDB_R15] = regs->r15; + gdb_regs32[GDB_PS] = regs->flags; + gdb_regs32[GDB_CS] = regs->cs; + gdb_regs32[GDB_SS] = regs->ss; #endif gdb_regs[GDB_SP] = regs->sp; } @@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) */ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) { +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif gdb_regs[GDB_AX] = 0; gdb_regs[GDB_BX] = 0; gdb_regs[GDB_CX] = 0; @@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; #else - gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); - gdb_regs[GDB_PC] = 0; + gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); + gdb_regs32[GDB_CS] = __KERNEL_CS; + gdb_regs32[GDB_SS] = __KERNEL_DS; + gdb_regs[GDB_PC] = p->thread.ip; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; @@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) */ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) { +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif regs->ax = gdb_regs[GDB_AX]; regs->bx = gdb_regs[GDB_BX]; regs->cx = gdb_regs[GDB_CX]; @@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) regs->si = gdb_regs[GDB_SI]; regs->di = gdb_regs[GDB_DI]; regs->bp = gdb_regs[GDB_BP]; - regs->flags = gdb_regs[GDB_PS]; regs->ip = gdb_regs[GDB_PC]; #ifdef CONFIG_X86_32 + regs->flags = gdb_regs[GDB_PS]; regs->ds = gdb_regs[GDB_DS]; regs->es = gdb_regs[GDB_ES]; regs->cs = gdb_regs[GDB_CS]; @@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) regs->r13 = gdb_regs[GDB_R13]; regs->r14 = gdb_regs[GDB_R14]; regs->r15 = gdb_regs[GDB_R15]; + regs->flags = gdb_regs32[GDB_PS]; + regs->cs = gdb_regs32[GDB_CS]; + regs->ss = gdb_regs32[GDB_SS]; #endif } @@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, if (remcomInBuffer[0] == 's') { linux_regs->flags |= X86_EFLAGS_TF; kgdb_single_step = 1; - if (kgdb_contthread) { - atomic_set(&kgdb_cpu_doing_single_step, - raw_smp_processor_id()); - } + atomic_set(&kgdb_cpu_doing_single_step, + raw_smp_processor_id()); } get_debugreg(dr6, 6); @@ -440,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) return NOTIFY_DONE; case DIE_NMI_IPI: - if (atomic_read(&kgdb_active) != -1) { - /* KGDB CPU roundup */ - kgdb_nmicallback(raw_smp_processor_id(), regs); - was_in_debug_nmi[raw_smp_processor_id()] = 1; - touch_nmi_watchdog(); - } + /* Just ignore, we will handle the roundup on DIE_NMI. */ return NOTIFY_DONE; case DIE_NMIUNKNOWN: @@ -466,9 +476,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) case DIE_DEBUG: if (atomic_read(&kgdb_cpu_doing_single_step) == - raw_smp_processor_id() && - user_mode(regs)) - return single_step_cont(regs, args); + raw_smp_processor_id()) { + if (user_mode(regs)) + return single_step_cont(regs, args); + break; + } else if (test_thread_flag(TIF_SINGLESTEP)) + /* This means a user thread is single stepping + * a system call which should be ignored + */ + return NOTIFY_DONE; /* fall through */ default: if (user_mode(regs)) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8b7a3cf37d2..478bca986ec 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -178,7 +178,7 @@ static void kvm_flush_tlb(void) kvm_deferred_mmu_op(&ftlb, sizeof ftlb); } -static void kvm_release_pt(u32 pfn) +static void kvm_release_pt(unsigned long pfn) { struct kvm_mmu_op_release_pt rpt = { .header.op = KVM_MMU_OP_RELEASE_PT, diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d02def06ca9..774ac499156 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void) return ret; } +/* + * If we don't do that, there is the possibility that the guest + * will calibrate under heavy load - thus, getting a lower lpj - + * and execute the delays themselves without load. This is wrong, + * because no delay loop can finish beforehand. + * Any heuristics is subject to fail, because ultimately, a large + * poll of guests can be running and trouble each other. So we preset + * lpj here + */ +static unsigned long kvm_get_tsc_khz(void) +{ + return preset_lpj; +} + +static void kvm_get_preset_lpj(void) +{ + struct pvclock_vcpu_time_info *src; + unsigned long khz; + u64 lpj; + + src = &per_cpu(hv_clock, 0); + khz = pvclock_tsc_khz(src); + + lpj = ((u64)khz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; +} + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_read, @@ -153,6 +181,7 @@ void __init kvmclock_init(void) pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; + pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; #endif @@ -163,6 +192,7 @@ void __init kvmclock_init(void) #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; #endif + kvm_get_preset_lpj(); clocksource_register(&kvm_clock); } } diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index b68e21f06f4..eee32b43fee 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -18,6 +18,7 @@ #include <asm/ldt.h> #include <asm/desc.h> #include <asm/mmu_context.h> +#include <asm/syscalls.h> #ifdef CONFIG_SMP static void flush_ldt(void *current_mm) @@ -51,6 +52,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, (mincount - oldsize) * LDT_ENTRY_SIZE); + paravirt_alloc_ldt(newldt, mincount); + #ifdef CONFIG_X86_64 /* CHECKME: Do we really need this ? */ wmb(); @@ -73,6 +76,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) #endif } if (oldsize) { + paravirt_free_ldt(oldldt, oldsize); if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(oldldt); else @@ -84,10 +88,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { int err = alloc_ldt(new, old->size, 0); + int i; if (err < 0) return err; - memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); + + for(i = 0; i < old->size; i++) + write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE); return 0; } @@ -124,6 +131,7 @@ void destroy_context(struct mm_struct *mm) if (mm == current->active_mm) clear_LDT(); #endif + paravirt_free_ldt(mm->context.ldt, mm->context.size); if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(mm->context.ldt); else diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c deleted file mode 100644 index 652fa5c38eb..00000000000 --- a/arch/x86/kernel/microcode.c +++ /dev/null @@ -1,853 +0,0 @@ -/* - * Intel CPU Microcode Update Driver for Linux - * - * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> - * 2006 Shaohua Li <shaohua.li@intel.com> - * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/design/pentium4/manuals/253668.htm - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Initial release. - * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Added read() support + cleanups. - * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Added 'device trimming' support. open(O_WRONLY) zeroes - * and frees the saved copy of applied microcode. - * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Made to use devfs (/dev/cpu/microcode) + cleanups. - * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> - * Added misc device support (now uses both devfs and misc). - * Added MICROCODE_IOCFREE ioctl to clear memory. - * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> - * Messages for error cases (non Intel & no suitable microcode). - * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> - * Removed ->release(). Removed exclusive open and status bitmap. - * Added microcode_rwsem to serialize read()/write()/ioctl(). - * Removed global kernel lock usage. - * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> - * Write 0 to 0x8B msr and then cpuid before reading revision, - * so that it works even if there were no update done by the - * BIOS. Otherwise, reading from 0x8B gives junk (which happened - * to be 0 on my machine which is why it worked even when I - * disabled update by the BIOS) - * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. - * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and - * Tigran Aivazian <tigran@veritas.com> - * Intel Pentium 4 processor support and bugfixes. - * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> - * Bugfix for HT (Hyper-Threading) enabled processors - * whereby processor resources are shared by all logical processors - * in a single CPU package. - * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and - * Tigran Aivazian <tigran@veritas.com>, - * Serialize updates as required on HT processors due to speculative - * nature of implementation. - * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> - * Fix the panic when writing zero-length microcode chunk. - * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, - * Jun Nakajima <jun.nakajima@intel.com> - * Support for the microcode updates in the new format. - * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> - * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl - * because we no longer hold a copy of applied microcode - * in kernel memory. - * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> - * Fix sigmatch() macro to handle old CPUs with pf == 0. - * Thanks to Stuart Swales for pointing out this bug. - */ - -//#define DEBUG /* pr_debug */ -#include <linux/capability.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/smp_lock.h> -#include <linux/cpumask.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/miscdevice.h> -#include <linux/spinlock.h> -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/mutex.h> -#include <linux/cpu.h> -#include <linux/firmware.h> -#include <linux/platform_device.h> - -#include <asm/msr.h> -#include <asm/uaccess.h> -#include <asm/processor.h> - -MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); -MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); -MODULE_LICENSE("GPL"); - -#define MICROCODE_VERSION "1.14a" - -#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ -#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ -#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */ -#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */ -#define DWSIZE (sizeof (u32)) -#define get_totalsize(mc) \ - (((microcode_t *)mc)->hdr.totalsize ? \ - ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) -#define get_datasize(mc) \ - (((microcode_t *)mc)->hdr.datasize ? \ - ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) - -#define sigmatch(s1, s2, p1, p2) \ - (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) - -#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) - -/* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); - -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -static DEFINE_MUTEX(microcode_mutex); - -static struct ucode_cpu_info { - int valid; - unsigned int sig; - unsigned int pf; - unsigned int rev; - microcode_t *mc; -} ucode_cpu_info[NR_CPUS]; - -static void collect_cpu_info(int cpu_num) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu_num); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - unsigned int val[2]; - - /* We should bind the task to the CPU */ - BUG_ON(raw_smp_processor_id() != cpu_num); - uci->pf = uci->rev = 0; - uci->mc = NULL; - uci->valid = 1; - - if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64)) { - printk(KERN_ERR "microcode: CPU%d not a capable Intel " - "processor\n", cpu_num); - uci->valid = 0; - return; - } - - uci->sig = cpuid_eax(0x00000001); - - if ((c->x86_model >= 5) || (c->x86 > 6)) { - /* get processor flags from MSR 0x17 */ - rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - uci->pf = 1 << ((val[1] >> 18) & 7); - } - - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* see notes above for revision 1.07. Apparent chip bug */ - sync_core(); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev); - pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", - uci->sig, uci->pf, uci->rev); -} - -static inline int microcode_update_match(int cpu_num, - microcode_header_t *mc_header, int sig, int pf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - - if (!sigmatch(sig, uci->sig, pf, uci->pf) - || mc_header->rev <= uci->rev) - return 0; - return 1; -} - -static int microcode_sanity_check(void *mc) -{ - microcode_header_t *mc_header = mc; - struct extended_sigtable *ext_header = NULL; - struct extended_signature *ext_sig; - unsigned long total_size, data_size, ext_table_size; - int sum, orig_sum, ext_sigcount = 0, i; - - total_size = get_totalsize(mc_header); - data_size = get_datasize(mc_header); - if (data_size + MC_HEADER_SIZE > total_size) { - printk(KERN_ERR "microcode: error! " - "Bad data size in microcode data file\n"); - return -EINVAL; - } - - if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - printk(KERN_ERR "microcode: error! " - "Unknown microcode update format\n"); - return -EINVAL; - } - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - if ((ext_table_size < EXT_HEADER_SIZE) - || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - printk(KERN_ERR "microcode: error! " - "Small exttable size in microcode data file\n"); - return -EINVAL; - } - ext_header = mc + MC_HEADER_SIZE + data_size; - if (ext_table_size != exttable_size(ext_header)) { - printk(KERN_ERR "microcode: error! " - "Bad exttable size in microcode data file\n"); - return -EFAULT; - } - ext_sigcount = ext_header->count; - } - - /* check extended table checksum */ - if (ext_table_size) { - int ext_table_sum = 0; - int *ext_tablep = (int *)ext_header; - - i = ext_table_size / DWSIZE; - while (i--) - ext_table_sum += ext_tablep[i]; - if (ext_table_sum) { - printk(KERN_WARNING "microcode: aborting, " - "bad extended signature table checksum\n"); - return -EINVAL; - } - } - - /* calculate the checksum */ - orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / DWSIZE; - while (i--) - orig_sum += ((int *)mc)[i]; - if (orig_sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); - return -EINVAL; - } - if (!ext_table_size) - return 0; - /* check extended signature checksum */ - for (i = 0; i < ext_sigcount; i++) { - ext_sig = (void *)ext_header + EXT_HEADER_SIZE + - EXT_SIGNATURE_SIZE * i; - sum = orig_sum - - (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); - if (sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); - return -EINVAL; - } - } - return 0; -} - -/* - * return 0 - no update found - * return 1 - found update - * return < 0 - error - */ -static int get_maching_microcode(void *mc, int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - microcode_header_t *mc_header = mc; - struct extended_sigtable *ext_header; - unsigned long total_size = get_totalsize(mc_header); - int ext_sigcount, i; - struct extended_signature *ext_sig; - void *new_mc; - - if (microcode_update_match(cpu, mc_header, - mc_header->sig, mc_header->pf)) - goto find; - - if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) - return 0; - - ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - for (i = 0; i < ext_sigcount; i++) { - if (microcode_update_match(cpu, mc_header, - ext_sig->sig, ext_sig->pf)) - goto find; - ext_sig++; - } - return 0; -find: - pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev); - new_mc = vmalloc(total_size); - if (!new_mc) { - printk(KERN_ERR "microcode: error! Can not allocate memory\n"); - return -ENOMEM; - } - - /* free previous update file */ - vfree(uci->mc); - - memcpy(new_mc, mc, total_size); - uci->mc = new_mc; - return 1; -} - -static void apply_microcode(int cpu) -{ - unsigned long flags; - unsigned int val[2]; - int cpu_num = raw_smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - - /* We should bind the task to the CPU */ - BUG_ON(cpu_num != cpu); - - if (uci->mc == NULL) - return; - - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(µcode_update_lock, flags); - - /* write microcode via MSR 0x79 */ - wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) uci->mc->bits, - (unsigned long) uci->mc->bits >> 16 >> 16); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - - /* see notes above for revision 1.07. Apparent chip bug */ - sync_core(); - - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - - spin_unlock_irqrestore(µcode_update_lock, flags); - if (val[1] != uci->mc->hdr.rev) { - printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]); - return; - } - printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x, date = %08x \n", - cpu_num, uci->rev, val[1], uci->mc->hdr.date); - uci->rev = val[1]; -} - -#ifdef CONFIG_MICROCODE_OLD_INTERFACE -static void __user *user_buffer; /* user area microcode data buffer */ -static unsigned int user_buffer_size; /* it's size */ - -static long get_next_ucode(void **mc, long offset) -{ - microcode_header_t mc_header; - unsigned long total_size; - - /* No more data */ - if (offset >= user_buffer_size) - return 0; - if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - return -EFAULT; - } - total_size = get_totalsize(&mc_header); - if (offset + total_size > user_buffer_size) { - printk(KERN_ERR "microcode: error! Bad total size in microcode " - "data file\n"); - return -EINVAL; - } - *mc = vmalloc(total_size); - if (!*mc) - return -ENOMEM; - if (copy_from_user(*mc, user_buffer + offset, total_size)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - vfree(*mc); - return -EFAULT; - } - return offset + total_size; -} - -static int do_microcode_update (void) -{ - long cursor = 0; - int error = 0; - void *new_mc = NULL; - int cpu; - cpumask_t old; - - old = current->cpus_allowed; - - while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) { - error = microcode_sanity_check(new_mc); - if (error) - goto out; - /* - * It's possible the data file has multiple matching ucode, - * lets keep searching till the latest version - */ - for_each_online_cpu(cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (!uci->valid) - continue; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - error = get_maching_microcode(new_mc, cpu); - if (error < 0) - goto out; - if (error == 1) - apply_microcode(cpu); - } - vfree(new_mc); - } -out: - if (cursor > 0) - vfree(new_mc); - if (cursor < 0) - error = cursor; - set_cpus_allowed_ptr(current, &old); - return error; -} - -static int microcode_open (struct inode *unused1, struct file *unused2) -{ - cycle_kernel_lock(); - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; -} - -static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) -{ - ssize_t ret; - - if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); - return -EINVAL; - } - - get_online_cpus(); - mutex_lock(µcode_mutex); - - user_buffer = (void __user *) buf; - user_buffer_size = (int) len; - - ret = do_microcode_update(); - if (!ret) - ret = (ssize_t)len; - - mutex_unlock(µcode_mutex); - put_online_cpus(); - - return ret; -} - -static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, -}; - -static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .fops = µcode_fops, -}; - -static int __init microcode_dev_init (void) -{ - int error; - - error = misc_register(µcode_dev); - if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); - return error; - } - - return 0; -} - -static void microcode_dev_exit (void) -{ - misc_deregister(µcode_dev); -} - -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); -#else -#define microcode_dev_init() 0 -#define microcode_dev_exit() do { } while(0) -#endif - -static long get_next_ucode_from_buffer(void **mc, const u8 *buf, - unsigned long size, long offset) -{ - microcode_header_t *mc_header; - unsigned long total_size; - - /* No more data */ - if (offset >= size) - return 0; - mc_header = (microcode_header_t *)(buf + offset); - total_size = get_totalsize(mc_header); - - if (offset + total_size > size) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); - return -EINVAL; - } - - *mc = vmalloc(total_size); - if (!*mc) { - printk(KERN_ERR "microcode: error! Can not allocate memory\n"); - return -ENOMEM; - } - memcpy(*mc, buf + offset, total_size); - return offset + total_size; -} - -/* fake device for request_firmware */ -static struct platform_device *microcode_pdev; - -static int cpu_request_microcode(int cpu) -{ - char name[30]; - struct cpuinfo_x86 *c = &cpu_data(cpu); - const struct firmware *firmware; - const u8 *buf; - unsigned long size; - long offset = 0; - int error; - void *mc; - - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - sprintf(name,"intel-ucode/%02x-%02x-%02x", - c->x86, c->x86_model, c->x86_mask); - error = request_firmware(&firmware, name, µcode_pdev->dev); - if (error) { - pr_debug("microcode: data file %s load failed\n", name); - return error; - } - buf = firmware->data; - size = firmware->size; - while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) - > 0) { - error = microcode_sanity_check(mc); - if (error) - break; - error = get_maching_microcode(mc, cpu); - if (error < 0) - break; - /* - * It's possible the data file has multiple matching ucode, - * lets keep searching till the latest version - */ - if (error == 1) { - apply_microcode(cpu); - error = 0; - } - vfree(mc); - } - if (offset > 0) - vfree(mc); - if (offset < 0) - error = offset; - release_firmware(firmware); - - return error; -} - -static int apply_microcode_check_cpu(int cpu) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - cpumask_t old; - unsigned int val[2]; - int err = 0; - - /* Check if the microcode is available */ - if (!uci->mc) - return 0; - - old = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - - /* Check if the microcode we have in memory matches the CPU */ - if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001)) - err = -EINVAL; - - if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) { - /* get processor flags from MSR 0x17 */ - rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - if (uci->pf != (1 << ((val[1] >> 18) & 7))) - err = -EINVAL; - } - - if (!err) { - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* see notes above for revision 1.07. Apparent chip bug */ - sync_core(); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (uci->rev != val[1]) - err = -EINVAL; - } - - if (!err) - apply_microcode(cpu); - else - printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:" - " sig=0x%x, pf=0x%x, rev=0x%x\n", - cpu, uci->sig, uci->pf, uci->rev); - - set_cpus_allowed_ptr(current, &old); - return err; -} - -static void microcode_init_cpu(int cpu, int resume) -{ - cpumask_t old; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - old = current->cpus_allowed; - - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - mutex_lock(µcode_mutex); - collect_cpu_info(cpu); - if (uci->valid && system_state == SYSTEM_RUNNING && !resume) - cpu_request_microcode(cpu); - mutex_unlock(µcode_mutex); - set_cpus_allowed_ptr(current, &old); -} - -static void microcode_fini_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - mutex_lock(µcode_mutex); - uci->valid = 0; - vfree(uci->mc); - uci->mc = NULL; - mutex_unlock(µcode_mutex); -} - -static ssize_t reload_store(struct sys_device *dev, - struct sysdev_attribute *attr, - const char *buf, size_t sz) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - char *end; - unsigned long val = simple_strtoul(buf, &end, 0); - int err = 0; - int cpu = dev->id; - - if (end == buf) - return -EINVAL; - if (val == 1) { - cpumask_t old = current->cpus_allowed; - - get_online_cpus(); - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - - mutex_lock(µcode_mutex); - if (uci->valid) - err = cpu_request_microcode(cpu); - mutex_unlock(µcode_mutex); - put_online_cpus(); - set_cpus_allowed_ptr(current, &old); - } - if (err) - return err; - return sz; -} - -static ssize_t version_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - - return sprintf(buf, "0x%x\n", uci->rev); -} - -static ssize_t pf_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - - return sprintf(buf, "0x%x\n", uci->pf); -} - -static SYSDEV_ATTR(reload, 0200, NULL, reload_store); -static SYSDEV_ATTR(version, 0400, version_show, NULL); -static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); - -static struct attribute *mc_default_attrs[] = { - &attr_reload.attr, - &attr_version.attr, - &attr_processor_flags.attr, - NULL -}; - -static struct attribute_group mc_attr_group = { - .attrs = mc_default_attrs, - .name = "microcode", -}; - -static int __mc_sysdev_add(struct sys_device *sys_dev, int resume) -{ - int err, cpu = sys_dev->id; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("microcode: CPU%d added\n", cpu); - memset(uci, 0, sizeof(*uci)); - - err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); - if (err) - return err; - - microcode_init_cpu(cpu, resume); - - return 0; -} - -static int mc_sysdev_add(struct sys_device *sys_dev) -{ - return __mc_sysdev_add(sys_dev, 0); -} - -static int mc_sysdev_remove(struct sys_device *sys_dev) -{ - int cpu = sys_dev->id; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("microcode: CPU%d removed\n", cpu); - microcode_fini_cpu(cpu); - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - return 0; -} - -static int mc_sysdev_resume(struct sys_device *dev) -{ - int cpu = dev->id; - - if (!cpu_online(cpu)) - return 0; - pr_debug("microcode: CPU%d resumed\n", cpu); - /* only CPU 0 will apply ucode here */ - apply_microcode(0); - return 0; -} - -static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, -}; - -static __cpuinit int -mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; - - sys_dev = get_cpu_sysdev(cpu); - switch (action) { - case CPU_UP_CANCELED_FROZEN: - /* The CPU refused to come up during a system resume */ - microcode_fini_cpu(cpu); - break; - case CPU_ONLINE: - case CPU_DOWN_FAILED: - mc_sysdev_add(sys_dev); - break; - case CPU_ONLINE_FROZEN: - /* System-wide resume is in progress, try to apply microcode */ - if (apply_microcode_check_cpu(cpu)) { - /* The application of microcode failed */ - microcode_fini_cpu(cpu); - __mc_sysdev_add(sys_dev, 1); - break; - } - case CPU_DOWN_FAILED_FROZEN: - if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - printk(KERN_ERR "microcode: Failed to create the sysfs " - "group for CPU%d\n", cpu); - break; - case CPU_DOWN_PREPARE: - mc_sysdev_remove(sys_dev); - break; - case CPU_DOWN_PREPARE_FROZEN: - /* Suspend is in progress, only remove the interface */ - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __refdata mc_cpu_notifier = { - .notifier_call = mc_cpu_callback, -}; - -static int __init microcode_init (void) -{ - int error; - - printk(KERN_INFO - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n"); - - error = microcode_dev_init(); - if (error) - return error; - microcode_pdev = platform_device_register_simple("microcode", -1, - NULL, 0); - if (IS_ERR(microcode_pdev)) { - microcode_dev_exit(); - return PTR_ERR(microcode_pdev); - } - - get_online_cpus(); - error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); - put_online_cpus(); - if (error) { - microcode_dev_exit(); - platform_device_unregister(microcode_pdev); - return error; - } - - register_hotcpu_notifier(&mc_cpu_notifier); - return 0; -} - -static void __exit microcode_exit (void) -{ - microcode_dev_exit(); - - unregister_hotcpu_notifier(&mc_cpu_notifier); - - get_online_cpus(); - sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); - put_online_cpus(); - - platform_device_unregister(microcode_pdev); -} - -module_init(microcode_init) -module_exit(microcode_exit) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c new file mode 100644 index 00000000000..7a1f8eeac2c --- /dev/null +++ b/arch/x86/kernel/microcode_amd.c @@ -0,0 +1,435 @@ +/* + * AMD CPU Microcode Update Driver for Linux + * Copyright (C) 2008 Advanced Micro Devices Inc. + * + * Author: Peter Oruba <peter.oruba@amd.com> + * + * Based on work by: + * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * + * This driver allows to upgrade microcode on AMD + * family 0x10 and 0x11 processors. + * + * Licensed unter the terms of the GNU General Public + * License version 2. See file COPYING for details. +*/ + +#include <linux/capability.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/miscdevice.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/mutex.h> +#include <linux/cpu.h> +#include <linux/firmware.h> +#include <linux/platform_device.h> +#include <linux/pci.h> +#include <linux/pci_ids.h> + +#include <asm/msr.h> +#include <asm/uaccess.h> +#include <asm/processor.h> +#include <asm/microcode.h> + +MODULE_DESCRIPTION("AMD Microcode Update Driver"); +MODULE_AUTHOR("Peter Oruba <peter.oruba@amd.com>"); +MODULE_LICENSE("GPL v2"); + +#define UCODE_MAGIC 0x00414d44 +#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 +#define UCODE_UCODE_TYPE 0x00000001 + +struct equiv_cpu_entry { + unsigned int installed_cpu; + unsigned int fixed_errata_mask; + unsigned int fixed_errata_compare; + unsigned int equiv_cpu; +}; + +struct microcode_header_amd { + unsigned int data_code; + unsigned int patch_id; + unsigned char mc_patch_data_id[2]; + unsigned char mc_patch_data_len; + unsigned char init_flag; + unsigned int mc_patch_data_checksum; + unsigned int nb_dev_id; + unsigned int sb_dev_id; + unsigned char processor_rev_id[2]; + unsigned char nb_rev_id; + unsigned char sb_rev_id; + unsigned char bios_api_rev; + unsigned char reserved1[3]; + unsigned int match_reg[8]; +}; + +struct microcode_amd { + struct microcode_header_amd hdr; + unsigned int mpb[0]; +}; + +#define UCODE_MAX_SIZE (2048) +#define DEFAULT_UCODE_DATASIZE (896) +#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) +#define DWSIZE (sizeof(u32)) +/* For now we support a fixed ucode total size only */ +#define get_totalsize(mc) \ + ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \ + + MC_HEADER_SIZE) + +/* serialize access to the physical write */ +static DEFINE_SPINLOCK(microcode_update_lock); + +static struct equiv_cpu_entry *equiv_cpu_table; + +static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + + memset(csig, 0, sizeof(*csig)); + + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", + cpu); + return -1; + } + + asm volatile("movl %1, %%ecx; rdmsr" + : "=a" (csig->rev) + : "i" (0x0000008B) : "ecx"); + + printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n", + csig->rev); + + return 0; +} + +static int get_matching_microcode(int cpu, void *mc, int rev) +{ + struct microcode_header_amd *mc_header = mc; + struct pci_dev *nb_pci_dev, *sb_pci_dev; + unsigned int current_cpu_id; + unsigned int equiv_cpu_id = 0x00; + unsigned int i = 0; + + BUG_ON(equiv_cpu_table == NULL); + current_cpu_id = cpuid_eax(0x00000001); + + while (equiv_cpu_table[i].installed_cpu != 0) { + if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { + equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; + break; + } + i++; + } + + if (!equiv_cpu_id) { + printk(KERN_ERR "microcode: CPU%d cpu_id " + "not found in equivalent cpu table \n", cpu); + return 0; + } + + if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) { + printk(KERN_ERR + "microcode: CPU%d patch does not match " + "(patch is %x, cpu extended is %x) \n", + cpu, mc_header->processor_rev_id[0], + (equiv_cpu_id & 0xff)); + return 0; + } + + if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) { + printk(KERN_ERR "microcode: CPU%d patch does not match " + "(patch is %x, cpu base id is %x) \n", + cpu, mc_header->processor_rev_id[1], + ((equiv_cpu_id >> 16) & 0xff)); + + return 0; + } + + /* ucode may be northbridge specific */ + if (mc_header->nb_dev_id) { + nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, + (mc_header->nb_dev_id & 0xff), + NULL); + if ((!nb_pci_dev) || + (mc_header->nb_rev_id != nb_pci_dev->revision)) { + printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu); + pci_dev_put(nb_pci_dev); + return 0; + } + pci_dev_put(nb_pci_dev); + } + + /* ucode may be southbridge specific */ + if (mc_header->sb_dev_id) { + sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, + (mc_header->sb_dev_id & 0xff), + NULL); + if ((!sb_pci_dev) || + (mc_header->sb_rev_id != sb_pci_dev->revision)) { + printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu); + pci_dev_put(sb_pci_dev); + return 0; + } + pci_dev_put(sb_pci_dev); + } + + if (mc_header->patch_id <= rev) + return 0; + + return 1; +} + +static void apply_microcode_amd(int cpu) +{ + unsigned long flags; + unsigned int eax, edx; + unsigned int rev; + int cpu_num = raw_smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + struct microcode_amd *mc_amd = uci->mc; + unsigned long addr; + + /* We should bind the task to the CPU */ + BUG_ON(cpu_num != cpu); + + if (mc_amd == NULL) + return; + + spin_lock_irqsave(µcode_update_lock, flags); + + addr = (unsigned long)&mc_amd->hdr.data_code; + edx = (unsigned int)(((unsigned long)upper_32_bits(addr))); + eax = (unsigned int)(((unsigned long)lower_32_bits(addr))); + + asm volatile("movl %0, %%ecx; wrmsr" : + : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx"); + + /* get patch id after patching */ + asm volatile("movl %1, %%ecx; rdmsr" + : "=a" (rev) + : "i" (0x0000008B) : "ecx"); + + spin_unlock_irqrestore(µcode_update_lock, flags); + + /* check current patch id and patch's id for match */ + if (rev != mc_amd->hdr.patch_id) { + printk(KERN_ERR "microcode: CPU%d update from revision " + "0x%x to 0x%x failed\n", cpu_num, + mc_amd->hdr.patch_id, rev); + return; + } + + printk(KERN_INFO "microcode: CPU%d updated from revision " + "0x%x to 0x%x \n", + cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id); + + uci->cpu_sig.rev = rev; +} + +static void * get_next_ucode(u8 *buf, unsigned int size, + int (*get_ucode_data)(void *, const void *, size_t), + unsigned int *mc_size) +{ + unsigned int total_size; +#define UCODE_CONTAINER_SECTION_HDR 8 + u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; + void *mc; + + if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) + return NULL; + + if (section_hdr[0] != UCODE_UCODE_TYPE) { + printk(KERN_ERR "microcode: error! " + "Wrong microcode payload type field\n"); + return NULL; + } + + total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); + + printk(KERN_INFO "microcode: size %u, total_size %u\n", + size, total_size); + + if (total_size > size || total_size > UCODE_MAX_SIZE) { + printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); + return NULL; + } + + mc = vmalloc(UCODE_MAX_SIZE); + if (mc) { + memset(mc, 0, UCODE_MAX_SIZE); + if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) { + vfree(mc); + mc = NULL; + } else + *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; + } +#undef UCODE_CONTAINER_SECTION_HDR + return mc; +} + + +static int install_equiv_cpu_table(u8 *buf, + int (*get_ucode_data)(void *, const void *, size_t)) +{ +#define UCODE_CONTAINER_HEADER_SIZE 12 + u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; + unsigned int *buf_pos = (unsigned int *)container_hdr; + unsigned long size; + + if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) + return 0; + + size = buf_pos[2]; + + if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { + printk(KERN_ERR "microcode: error! " + "Wrong microcode equivalnet cpu table\n"); + return 0; + } + + equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); + if (!equiv_cpu_table) { + printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n"); + return 0; + } + + buf += UCODE_CONTAINER_HEADER_SIZE; + if (get_ucode_data(equiv_cpu_table, buf, size)) { + vfree(equiv_cpu_table); + return 0; + } + + return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ +#undef UCODE_CONTAINER_HEADER_SIZE +} + +static void free_equiv_cpu_table(void) +{ + if (equiv_cpu_table) { + vfree(equiv_cpu_table); + equiv_cpu_table = NULL; + } +} + +static int generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + u8 *ucode_ptr = data, *new_mc = NULL, *mc; + int new_rev = uci->cpu_sig.rev; + unsigned int leftover; + unsigned long offset; + + offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data); + if (!offset) { + printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); + return -EINVAL; + } + + ucode_ptr += offset; + leftover = size - offset; + + while (leftover) { + unsigned int uninitialized_var(mc_size); + struct microcode_header_amd *mc_header; + + mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size); + if (!mc) + break; + + mc_header = (struct microcode_header_amd *)mc; + if (get_matching_microcode(cpu, mc, new_rev)) { + if (new_mc) + vfree(new_mc); + new_rev = mc_header->patch_id; + new_mc = mc; + } else + vfree(mc); + + ucode_ptr += mc_size; + leftover -= mc_size; + } + + if (new_mc) { + if (!leftover) { + if (uci->mc) + vfree(uci->mc); + uci->mc = new_mc; + pr_debug("microcode: CPU%d found a matching microcode update with" + " version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); + } else + vfree(new_mc); + } + + free_equiv_cpu_table(); + + return (int)leftover; +} + +static int get_ucode_fw(void *to, const void *from, size_t n) +{ + memcpy(to, from, n); + return 0; +} + +static int request_microcode_fw(int cpu, struct device *device) +{ + const char *fw_name = "amd-ucode/microcode_amd.bin"; + const struct firmware *firmware; + int ret; + + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + + ret = request_firmware(&firmware, fw_name, device); + if (ret) { + printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name); + return ret; + } + + ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, + &get_ucode_fw); + + release_firmware(firmware); + + return ret; +} + +static int request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode" + "is not supported\n"); + return -1; +} + +static void microcode_fini_cpu_amd(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + vfree(uci->mc); + uci->mc = NULL; +} + +static struct microcode_ops microcode_amd_ops = { + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_fw, + .collect_cpu_info = collect_cpu_info_amd, + .apply_microcode = apply_microcode_amd, + .microcode_fini_cpu = microcode_fini_cpu_amd, +}; + +struct microcode_ops * __init init_amd_microcode(void) +{ + return µcode_amd_ops; +} diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c new file mode 100644 index 00000000000..936d8d55f23 --- /dev/null +++ b/arch/x86/kernel/microcode_core.c @@ -0,0 +1,508 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * 2006 Shaohua Li <shaohua.li@intel.com> + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/design/pentium4/manuals/253668.htm + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Made to use devfs (/dev/cpu/microcode) + cleanups. + * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS. Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. + * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and + * Tigran Aivazian <tigran@veritas.com> + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package. + * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and + * Tigran Aivazian <tigran@veritas.com>, + * Serialize updates as required on HT processors due to + * speculative nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> + * Fix the panic when writing zero-length microcode chunk. + * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, + * Jun Nakajima <jun.nakajima@intel.com> + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ +#include <linux/capability.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/smp_lock.h> +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/miscdevice.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/mutex.h> +#include <linux/cpu.h> +#include <linux/firmware.h> +#include <linux/platform_device.h> + +#include <asm/msr.h> +#include <asm/uaccess.h> +#include <asm/processor.h> +#include <asm/microcode.h> + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); +MODULE_LICENSE("GPL"); + +#define MICROCODE_VERSION "2.00" + +struct microcode_ops *microcode_ops; + +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +static DEFINE_MUTEX(microcode_mutex); + +struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; +EXPORT_SYMBOL_GPL(ucode_cpu_info); + +#ifdef CONFIG_MICROCODE_OLD_INTERFACE +static int do_microcode_update(const void __user *buf, size_t size) +{ + cpumask_t old; + int error = 0; + int cpu; + + old = current->cpus_allowed; + + for_each_online_cpu(cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!uci->valid) + continue; + + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + error = microcode_ops->request_microcode_user(cpu, buf, size); + if (error < 0) + goto out; + if (!error) + microcode_ops->apply_microcode(cpu); + } +out: + set_cpus_allowed_ptr(current, &old); + return error; +} + +static int microcode_open(struct inode *unused1, struct file *unused2) +{ + cycle_kernel_lock(); + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; +} + +static ssize_t microcode_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + ssize_t ret; + + if ((len >> PAGE_SHIFT) > num_physpages) { + printk(KERN_ERR "microcode: too much data (max %ld pages)\n", + num_physpages); + return -EINVAL; + } + + get_online_cpus(); + mutex_lock(µcode_mutex); + + ret = do_microcode_update(buf, len); + if (!ret) + ret = (ssize_t)len; + + mutex_unlock(µcode_mutex); + put_online_cpus(); + + return ret; +} + +static const struct file_operations microcode_fops = { + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, +}; + +static struct miscdevice microcode_dev = { + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = µcode_fops, +}; + +static int __init microcode_dev_init(void) +{ + int error; + + error = misc_register(µcode_dev); + if (error) { + printk(KERN_ERR + "microcode: can't misc_register on minor=%d\n", + MICROCODE_MINOR); + return error; + } + + return 0; +} + +static void microcode_dev_exit(void) +{ + misc_deregister(µcode_dev); +} + +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +#else +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while (0) +#endif + +/* fake device for request_firmware */ +struct platform_device *microcode_pdev; + +static ssize_t reload_store(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t sz) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + char *end; + unsigned long val = simple_strtoul(buf, &end, 0); + int err = 0; + int cpu = dev->id; + + if (end == buf) + return -EINVAL; + if (val == 1) { + cpumask_t old = current->cpus_allowed; + + get_online_cpus(); + if (cpu_online(cpu)) { + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + mutex_lock(µcode_mutex); + if (uci->valid) { + err = microcode_ops->request_microcode_fw(cpu, + µcode_pdev->dev); + if (!err) + microcode_ops->apply_microcode(cpu); + } + mutex_unlock(µcode_mutex); + set_cpus_allowed_ptr(current, &old); + } + put_online_cpus(); + } + if (err) + return err; + return sz; +} + +static ssize_t version_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); +} + +static ssize_t pf_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); +} + +static SYSDEV_ATTR(reload, 0200, NULL, reload_store); +static SYSDEV_ATTR(version, 0400, version_show, NULL); +static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); + +static struct attribute *mc_default_attrs[] = { + &attr_reload.attr, + &attr_version.attr, + &attr_processor_flags.attr, + NULL +}; + +static struct attribute_group mc_attr_group = { + .attrs = mc_default_attrs, + .name = "microcode", +}; + +static void microcode_fini_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + mutex_lock(µcode_mutex); + microcode_ops->microcode_fini_cpu(cpu); + uci->valid = 0; + mutex_unlock(µcode_mutex); +} + +static void collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + memset(uci, 0, sizeof(*uci)); + if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) + uci->valid = 1; +} + +static int microcode_resume_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct cpu_signature nsig; + + pr_debug("microcode: CPU%d resumed\n", cpu); + + if (!uci->mc) + return 1; + + /* + * Let's verify that the 'cached' ucode does belong + * to this cpu (a bit of paranoia): + */ + if (microcode_ops->collect_cpu_info(cpu, &nsig)) { + microcode_fini_cpu(cpu); + return -1; + } + + if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { + microcode_fini_cpu(cpu); + /* Should we look for a new ucode here? */ + return 1; + } + + return 0; +} + +void microcode_update_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int err = 0; + + /* + * Check if the system resume is in progress (uci->valid != NULL), + * otherwise just request a firmware: + */ + if (uci->valid) { + err = microcode_resume_cpu(cpu); + } else { + collect_cpu_info(cpu); + if (uci->valid && system_state == SYSTEM_RUNNING) + err = microcode_ops->request_microcode_fw(cpu, + µcode_pdev->dev); + } + if (!err) + microcode_ops->apply_microcode(cpu); +} + +static void microcode_init_cpu(int cpu) +{ + cpumask_t old = current->cpus_allowed; + + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + /* We should bind the task to the CPU */ + BUG_ON(raw_smp_processor_id() != cpu); + + mutex_lock(µcode_mutex); + microcode_update_cpu(cpu); + mutex_unlock(µcode_mutex); + + set_cpus_allowed_ptr(current, &old); +} + +static int mc_sysdev_add(struct sys_device *sys_dev) +{ + int err, cpu = sys_dev->id; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("microcode: CPU%d added\n", cpu); + memset(uci, 0, sizeof(*uci)); + + err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); + if (err) + return err; + + microcode_init_cpu(cpu); + return 0; +} + +static int mc_sysdev_remove(struct sys_device *sys_dev) +{ + int cpu = sys_dev->id; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("microcode: CPU%d removed\n", cpu); + microcode_fini_cpu(cpu); + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + return 0; +} + +static int mc_sysdev_resume(struct sys_device *dev) +{ + int cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + + /* only CPU 0 will apply ucode here */ + microcode_update_cpu(0); + return 0; +} + +static struct sysdev_driver mc_sysdev_driver = { + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, +}; + +static __cpuinit int +mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct sys_device *sys_dev; + + sys_dev = get_cpu_sysdev(cpu); + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + microcode_init_cpu(cpu); + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + pr_debug("microcode: CPU%d added\n", cpu); + if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) + printk(KERN_ERR "microcode: Failed to create the sysfs " + "group for CPU%d\n", cpu); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + /* Suspend is in progress, only remove the interface */ + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + pr_debug("microcode: CPU%d removed\n", cpu); + break; + case CPU_DEAD: + case CPU_UP_CANCELED_FROZEN: + /* The CPU refused to come up during a system resume */ + microcode_fini_cpu(cpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __refdata mc_cpu_notifier = { + .notifier_call = mc_cpu_callback, +}; + +static int __init microcode_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + int error; + + if (c->x86_vendor == X86_VENDOR_INTEL) + microcode_ops = init_intel_microcode(); + else if (c->x86_vendor == X86_VENDOR_AMD) + microcode_ops = init_amd_microcode(); + + if (!microcode_ops) { + printk(KERN_ERR "microcode: no support for this CPU vendor\n"); + return -ENODEV; + } + + error = microcode_dev_init(); + if (error) + return error; + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) { + microcode_dev_exit(); + return PTR_ERR(microcode_pdev); + } + + get_online_cpus(); + error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + put_online_cpus(); + if (error) { + microcode_dev_exit(); + platform_device_unregister(microcode_pdev); + return error; + } + + register_hotcpu_notifier(&mc_cpu_notifier); + + printk(KERN_INFO + "Microcode Update Driver: v" MICROCODE_VERSION + " <tigran@aivazian.fsnet.co.uk>" + " <peter.oruba@amd.com>\n"); + + return 0; +} + +static void __exit microcode_exit(void) +{ + microcode_dev_exit(); + + unregister_hotcpu_notifier(&mc_cpu_notifier); + + get_online_cpus(); + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + put_online_cpus(); + + platform_device_unregister(microcode_pdev); + + microcode_ops = NULL; + + printk(KERN_INFO + "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); +} + +module_init(microcode_init); +module_exit(microcode_exit); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c new file mode 100644 index 00000000000..622dc4a2178 --- /dev/null +++ b/arch/x86/kernel/microcode_intel.c @@ -0,0 +1,480 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * 2006 Shaohua Li <shaohua.li@intel.com> + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/design/pentium4/manuals/253668.htm + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Made to use devfs (/dev/cpu/microcode) + cleanups. + * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS. Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. + * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and + * Tigran Aivazian <tigran@veritas.com> + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package. + * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and + * Tigran Aivazian <tigran@veritas.com>, + * Serialize updates as required on HT processors due to + * speculative nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> + * Fix the panic when writing zero-length microcode chunk. + * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, + * Jun Nakajima <jun.nakajima@intel.com> + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ +#include <linux/capability.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/smp_lock.h> +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/miscdevice.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/mutex.h> +#include <linux/cpu.h> +#include <linux/firmware.h> +#include <linux/platform_device.h> + +#include <asm/msr.h> +#include <asm/uaccess.h> +#include <asm/processor.h> +#include <asm/microcode.h> + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); +MODULE_LICENSE("GPL"); + +struct microcode_header_intel { + unsigned int hdrver; + unsigned int rev; + unsigned int date; + unsigned int sig; + unsigned int cksum; + unsigned int ldrver; + unsigned int pf; + unsigned int datasize; + unsigned int totalsize; + unsigned int reserved[3]; +}; + +struct microcode_intel { + struct microcode_header_intel hdr; + unsigned int bits[0]; +}; + +/* microcode format is extended from prescott processors */ +struct extended_signature { + unsigned int sig; + unsigned int pf; + unsigned int cksum; +}; + +struct extended_sigtable { + unsigned int count; + unsigned int cksum; + unsigned int reserved[3]; + struct extended_signature sigs[0]; +}; + +#define DEFAULT_UCODE_DATASIZE (2000) +#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) +#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) +#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) +#define DWSIZE (sizeof(u32)) +#define get_totalsize(mc) \ + (((struct microcode_intel *)mc)->hdr.totalsize ? \ + ((struct microcode_intel *)mc)->hdr.totalsize : \ + DEFAULT_UCODE_TOTALSIZE) + +#define get_datasize(mc) \ + (((struct microcode_intel *)mc)->hdr.datasize ? \ + ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) + +#define sigmatch(s1, s2, p1, p2) \ + (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) + +#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) + +/* serialize access to the physical write to MSR 0x79 */ +static DEFINE_SPINLOCK(microcode_update_lock); + +static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu_num); + unsigned int val[2]; + + memset(csig, 0, sizeof(*csig)); + + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64)) { + printk(KERN_ERR "microcode: CPU%d not a capable Intel " + "processor\n", cpu_num); + return -1; + } + + csig->sig = cpuid_eax(0x00000001); + + if ((c->x86_model >= 5) || (c->x86 > 6)) { + /* get processor flags from MSR 0x17 */ + rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig->pf = 1 << ((val[1] >> 18) & 7); + } + + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + /* see notes above for revision 1.07. Apparent chip bug */ + sync_core(); + /* get the current revision from MSR 0x8B */ + rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); + pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", + csig->sig, csig->pf, csig->rev); + + return 0; +} + +static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf) +{ + return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1; +} + +static inline int +update_match_revision(struct microcode_header_intel *mc_header, int rev) +{ + return (mc_header->rev <= rev) ? 0 : 1; +} + +static int microcode_sanity_check(void *mc) +{ + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + struct extended_signature *ext_sig; + unsigned long total_size, data_size, ext_table_size; + int sum, orig_sum, ext_sigcount = 0, i; + + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); + if (data_size + MC_HEADER_SIZE > total_size) { + printk(KERN_ERR "microcode: error! " + "Bad data size in microcode data file\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + printk(KERN_ERR "microcode: error! " + "Unknown microcode update format\n"); + return -EINVAL; + } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + printk(KERN_ERR "microcode: error! " + "Small exttable size in microcode data file\n"); + return -EINVAL; + } + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + printk(KERN_ERR "microcode: error! " + "Bad exttable size in microcode data file\n"); + return -EFAULT; + } + ext_sigcount = ext_header->count; + } + + /* check extended table checksum */ + if (ext_table_size) { + int ext_table_sum = 0; + int *ext_tablep = (int *)ext_header; + + i = ext_table_size / DWSIZE; + while (i--) + ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { + printk(KERN_WARNING "microcode: aborting, " + "bad extended signature table checksum\n"); + return -EINVAL; + } + } + + /* calculate the checksum */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / DWSIZE; + while (i--) + orig_sum += ((int *)mc)[i]; + if (orig_sum) { + printk(KERN_ERR "microcode: aborting, bad checksum\n"); + return -EINVAL; + } + if (!ext_table_size) + return 0; + /* check extended signature checksum */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + sum = orig_sum + - (mc_header->sig + mc_header->pf + mc_header->cksum) + + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + printk(KERN_ERR "microcode: aborting, bad checksum\n"); + return -EINVAL; + } + } + return 0; +} + +/* + * return 0 - no update found + * return 1 - found update + */ +static int +get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) +{ + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + + if (!update_match_revision(mc_header, rev)) + return 0; + + if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf)) + return 1; + + /* Look for ext. headers: */ + if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) + return 0; + + ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (i = 0; i < ext_sigcount; i++) { + if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; + } + return 0; +} + +static void apply_microcode(int cpu) +{ + unsigned long flags; + unsigned int val[2]; + int cpu_num = raw_smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct microcode_intel *mc_intel = uci->mc; + + /* We should bind the task to the CPU */ + BUG_ON(cpu_num != cpu); + + if (mc_intel == NULL) + return; + + /* serialize access to the physical write to MSR 0x79 */ + spin_lock_irqsave(µcode_update_lock, flags); + + /* write microcode via MSR 0x79 */ + wrmsr(MSR_IA32_UCODE_WRITE, + (unsigned long) mc_intel->bits, + (unsigned long) mc_intel->bits >> 16 >> 16); + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* see notes above for revision 1.07. Apparent chip bug */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + spin_unlock_irqrestore(µcode_update_lock, flags); + if (val[1] != mc_intel->hdr.rev) { + printk(KERN_ERR "microcode: CPU%d update from revision " + "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]); + return; + } + printk(KERN_INFO "microcode: CPU%d updated from revision " + "0x%x to 0x%x, date = %04x-%02x-%02x \n", + cpu_num, uci->cpu_sig.rev, val[1], + mc_intel->hdr.date & 0xffff, + mc_intel->hdr.date >> 24, + (mc_intel->hdr.date >> 16) & 0xff); + uci->cpu_sig.rev = val[1]; +} + +static int generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + u8 *ucode_ptr = data, *new_mc = NULL, *mc; + int new_rev = uci->cpu_sig.rev; + unsigned int leftover = size; + + while (leftover) { + struct microcode_header_intel mc_header; + unsigned int mc_size; + + if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) + break; + + mc_size = get_totalsize(&mc_header); + if (!mc_size || mc_size > leftover) { + printk(KERN_ERR "microcode: error!" + "Bad data in microcode data file\n"); + break; + } + + mc = vmalloc(mc_size); + if (!mc) + break; + + if (get_ucode_data(mc, ucode_ptr, mc_size) || + microcode_sanity_check(mc) < 0) { + vfree(mc); + break; + } + + if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { + if (new_mc) + vfree(new_mc); + new_rev = mc_header.rev; + new_mc = mc; + } else + vfree(mc); + + ucode_ptr += mc_size; + leftover -= mc_size; + } + + if (new_mc) { + if (!leftover) { + if (uci->mc) + vfree(uci->mc); + uci->mc = (struct microcode_intel *)new_mc; + pr_debug("microcode: CPU%d found a matching microcode update with" + " version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); + } else + vfree(new_mc); + } + + return (int)leftover; +} + +static int get_ucode_fw(void *to, const void *from, size_t n) +{ + memcpy(to, from, n); + return 0; +} + +static int request_microcode_fw(int cpu, struct device *device) +{ + char name[30]; + struct cpuinfo_x86 *c = &cpu_data(cpu); + const struct firmware *firmware; + int ret; + + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + sprintf(name, "intel-ucode/%02x-%02x-%02x", + c->x86, c->x86_model, c->x86_mask); + ret = request_firmware(&firmware, name, device); + if (ret) { + pr_debug("microcode: data file %s load failed\n", name); + return ret; + } + + ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, + &get_ucode_fw); + + release_firmware(firmware); + + return ret; +} + +static int get_ucode_user(void *to, const void *from, size_t n) +{ + return copy_from_user(to, from, n); +} + +static int request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + + return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); +} + +static void microcode_fini_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + vfree(uci->mc); + uci->mc = NULL; +} + +struct microcode_ops microcode_intel_ops = { + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_fw, + .collect_cpu_info = collect_cpu_info, + .apply_microcode = apply_microcode, + .microcode_fini_cpu = microcode_fini_cpu, +}; + +struct microcode_ops * __init init_intel_microcode(void) +{ + return µcode_intel_ops; +} + diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b3fb430725c..f98f4e1dba0 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -397,7 +397,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) generic_bigsmp_probe(); #endif +#ifdef CONFIG_X86_32 setup_apic_routing(); +#endif if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index e4393808688..82a7c7ed6d4 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -72,21 +72,28 @@ static ssize_t msr_read(struct file *file, char __user *buf, u32 data[2]; u32 reg = *ppos; int cpu = iminor(file->f_path.dentry->d_inode); - int err; + int err = 0; + ssize_t bytes = 0; if (count % 8) return -EINVAL; /* Invalid chunk size */ for (; count; count -= 8) { err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); - if (err) - return -EIO; - if (copy_to_user(tmp, &data, 8)) - return -EFAULT; + if (err) { + if (err == -EFAULT) /* Fix idiotic error code */ + err = -EIO; + break; + } + if (copy_to_user(tmp, &data, 8)) { + err = -EFAULT; + break; + } tmp += 2; + bytes += 8; } - return ((char __user *)tmp) - buf; + return bytes ? bytes : err; } static ssize_t msr_write(struct file *file, const char __user *buf, @@ -96,21 +103,28 @@ static ssize_t msr_write(struct file *file, const char __user *buf, u32 data[2]; u32 reg = *ppos; int cpu = iminor(file->f_path.dentry->d_inode); - int err; + int err = 0; + ssize_t bytes = 0; if (count % 8) return -EINVAL; /* Invalid chunk size */ for (; count; count -= 8) { - if (copy_from_user(&data, tmp, 8)) - return -EFAULT; + if (copy_from_user(&data, tmp, 8)) { + err = -EFAULT; + break; + } err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); - if (err) - return -EIO; + if (err) { + if (err == -EFAULT) /* Fix idiotic error code */ + err = -EIO; + break; + } tmp += 2; + bytes += 8; } - return ((char __user *)tmp) - buf; + return bytes ? bytes : err; } static int msr_open(struct inode *inode, struct file *file) @@ -149,8 +163,8 @@ static int __cpuinit msr_device_create(int cpu) { struct device *dev; - dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), - NULL, "msr%d", cpu); + dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, + "msr%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index abb78a2cc4a..2c97f07f1c2 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -299,6 +299,15 @@ void acpi_nmi_disable(void) on_each_cpu(__acpi_nmi_disable, NULL, 1); } +/* + * This function is called as soon the LAPIC NMI watchdog driver has everything + * in place and it's ready to check if the NMIs belong to the NMI watchdog + */ +void cpu_nmi_set_wd_enabled(void) +{ + __get_cpu_var(wd_enabled) = 1; +} + void setup_apic_nmi_watchdog(void *unused) { if (__get_cpu_var(wd_enabled)) @@ -311,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused) switch (nmi_watchdog) { case NMI_LOCAL_APIC: - /* enable it before to avoid race with handler */ - __get_cpu_var(wd_enabled) = 1; if (lapic_watchdog_init(nmi_hz) < 0) { __get_cpu_var(wd_enabled) = 0; return; diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index b8c45610b20..4caff39078e 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -73,7 +73,7 @@ static void __init smp_dump_qct(void) } -void __init numaq_tsc_disable(void) +void __cpuinit numaq_tsc_disable(void) { if (!found_numaq) return; @@ -229,6 +229,12 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, } } +static int __init numaq_setup_ioapic_ids(void) +{ + /* so can skip it */ + return 1; +} + static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, @@ -243,6 +249,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mpc_oem_bus_info = mpc_oem_bus_info, .mpc_oem_pci_bus = mpc_oem_pci_bus, .smp_read_mpc_oem = smp_read_mpc_oem, + .setup_ioapic_ids = numaq_setup_ioapic_ids, }; void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 3e667227480..7a13fac63a1 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd); static void __init platform_detect(void) { size_t propsize; - u32 rev; + __be32 rev; if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, &propsize) || propsize != 4) { printk(KERN_ERR "ofw: getprop call failed!\n"); - rev = 0; + rev = cpu_to_be32(0); } olpc_platform_info.boardrev = be32_to_cpu(rev); } @@ -203,7 +203,7 @@ static void __init platform_detect(void) static void __init platform_detect(void) { /* stopgap until OFW support is added to the kernel */ - olpc_platform_info.boardrev = be32_to_cpu(0xc2); + olpc_platform_info.boardrev = 0xc2; } #endif diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c new file mode 100644 index 00000000000..0e9f1982b1d --- /dev/null +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -0,0 +1,37 @@ +/* + * Split spinlock implementation out into its own file, so it can be + * compiled in a FTRACE-compatible way. + */ +#include <linux/spinlock.h> +#include <linux/module.h> + +#include <asm/paravirt.h> + +static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) +{ + __raw_spin_lock(lock); +} + +struct pv_lock_ops pv_lock_ops = { +#ifdef CONFIG_SMP + .spin_is_locked = __ticket_spin_is_locked, + .spin_is_contended = __ticket_spin_is_contended, + + .spin_lock = __ticket_spin_lock, + .spin_lock_flags = default_spin_lock_flags, + .spin_trylock = __ticket_spin_trylock, + .spin_unlock = __ticket_spin_unlock, +#endif +}; +EXPORT_SYMBOL(pv_lock_ops); + +void __init paravirt_use_bytelocks(void) +{ +#ifdef CONFIG_SMP + pv_lock_ops.spin_is_locked = __byte_spin_is_locked; + pv_lock_ops.spin_is_contended = __byte_spin_is_contended; + pv_lock_ops.spin_lock = __byte_spin_lock; + pv_lock_ops.spin_trylock = __byte_spin_trylock; + pv_lock_ops.spin_unlock = __byte_spin_unlock; +#endif +} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 94da4d52d79..e4c8fb60887 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -268,17 +268,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) return __get_cpu_var(paravirt_lazy_mode); } -void __init paravirt_use_bytelocks(void) -{ -#ifdef CONFIG_SMP - pv_lock_ops.spin_is_locked = __byte_spin_is_locked; - pv_lock_ops.spin_is_contended = __byte_spin_is_contended; - pv_lock_ops.spin_lock = __byte_spin_lock; - pv_lock_ops.spin_trylock = __byte_spin_trylock; - pv_lock_ops.spin_unlock = __byte_spin_unlock; -#endif -} - struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, @@ -330,6 +319,7 @@ struct pv_cpu_ops pv_cpu_ops = { #endif .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, + .read_msr_amd = native_read_msr_amd_safe, .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, @@ -348,6 +338,10 @@ struct pv_cpu_ops pv_cpu_ops = { .write_ldt_entry = native_write_ldt_entry, .write_gdt_entry = native_write_gdt_entry, .write_idt_entry = native_write_idt_entry, + + .alloc_ldt = paravirt_nop, + .free_ldt = paravirt_nop, + .load_sp0 = native_load_sp0, #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) @@ -373,8 +367,6 @@ struct pv_cpu_ops pv_cpu_ops = { struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC - .apic_write = native_apic_write, - .apic_read = native_apic_read, .setup_boot_clock = setup_boot_APIC_clock, .setup_secondary_clock = setup_secondary_APIC_clock, .startup_ipi_hook = paravirt_nop, @@ -461,18 +453,6 @@ struct pv_mmu_ops pv_mmu_ops = { .set_fixmap = native_set_fixmap, }; -struct pv_lock_ops pv_lock_ops = { -#ifdef CONFIG_SMP - .spin_is_locked = __ticket_spin_is_locked, - .spin_is_contended = __ticket_spin_is_contended, - - .spin_lock = __ticket_spin_lock, - .spin_trylock = __ticket_spin_trylock, - .spin_unlock = __ticket_spin_unlock, -#endif -}; -EXPORT_SYMBOL_GPL(pv_lock_ops); - EXPORT_SYMBOL_GPL(pv_time_ops); EXPORT_SYMBOL (pv_cpu_ops); EXPORT_SYMBOL (pv_mmu_ops); diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 58262218781..9fe644f4861 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, start = start_##ops##_##x; \ end = end_##ops##_##x; \ goto patch_site - switch(type) { + switch (type) { PATCH_SITE(pv_irq_ops, irq_disable); PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, restore_fl); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 218d783ed7a..e1e731d78f3 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -217,16 +217,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, #endif /* CONFIG_IOMMU_DEBUG */ -static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) -{ - unsigned int npages; - - npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK); - npages >>= PAGE_SHIFT; - - return npages; -} - static inline int translation_enabled(struct iommu_table *tbl) { /* only PHBs with translation enabled have an IOMMU table */ @@ -261,7 +251,7 @@ static void iommu_range_reserve(struct iommu_table *tbl, badbit, tbl, start_addr, npages); } - set_bit_string(tbl->it_map, index, npages); + iommu_area_reserve(tbl->it_map, index, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); } @@ -343,9 +333,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, /* were we called with bad_dma_address? */ badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { - printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " + WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); - WARN_ON(1); return; } @@ -409,7 +398,7 @@ static void calgary_unmap_sg(struct device *dev, if (dmalen == 0) break; - npages = num_dma_pages(dma, dmalen); + npages = iommu_num_pages(dma, dmalen, PAGE_SIZE); iommu_free(tbl, dma, npages); } } @@ -428,7 +417,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, BUG_ON(!sg_page(s)); vaddr = (unsigned long) sg_virt(s); - npages = num_dma_pages(vaddr, s->length); + npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); entry = iommu_range_alloc(dev, tbl, npages); if (entry == bad_dma_address) { @@ -465,7 +454,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, struct iommu_table *tbl = find_iommu_table(dev); uaddr = (unsigned long)vaddr; - npages = num_dma_pages(uaddr, size); + npages = iommu_num_pages(uaddr, size, PAGE_SIZE); return iommu_alloc(dev, tbl, vaddr, npages, direction); } @@ -476,7 +465,7 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; - npages = num_dma_pages(dma_handle, size); + npages = iommu_num_pages(dma_handle, size, PAGE_SIZE); iommu_free(tbl, dma_handle, npages); } @@ -492,6 +481,8 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, npages = size >> PAGE_SHIFT; order = get_order(size); + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + /* alloc enough pages (and possibly more) */ ret = (void *)__get_free_pages(flag, order); if (!ret) @@ -511,8 +502,22 @@ error: return ret; } +static void calgary_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + unsigned int npages; + struct iommu_table *tbl = find_iommu_table(dev); + + size = PAGE_ALIGN(size); + npages = size >> PAGE_SHIFT; + + iommu_free(tbl, dma_handle, npages); + free_pages((unsigned long)vaddr, get_order(size)); +} + static struct dma_mapping_ops calgary_dma_ops = { .alloc_coherent = calgary_alloc_coherent, + .free_coherent = calgary_free_coherent, .map_single = calgary_map_single, .unmap_single = calgary_unmap_single, .map_sg = calgary_map_sg, @@ -1269,13 +1274,15 @@ static inline int __init determine_tce_table_size(u64 ram) static int __init build_detail_arrays(void) { unsigned long ptr; - int i, scal_detail_size, rio_detail_size; + unsigned numnodes, i; + int scal_detail_size, rio_detail_size; - if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ + numnodes = rio_table_hdr->num_scal_dev; + if (numnodes > MAX_NUMNODES){ printk(KERN_WARNING "Calgary: MAX_NUMNODES too low! Defined as %d, " "but system has %d nodes.\n", - MAX_NUMNODES, rio_table_hdr->num_scal_dev); + MAX_NUMNODES, numnodes); return -ENODEV; } @@ -1296,8 +1303,7 @@ static int __init build_detail_arrays(void) } ptr = ((unsigned long)rio_table_hdr) + 3; - for (i = 0; i < rio_table_hdr->num_scal_dev; - i++, ptr += scal_detail_size) + for (i = 0; i < numnodes; i++, ptr += scal_detail_size) scal_devs[i] = (struct scal_detail *)ptr; for (i = 0; i < rio_table_hdr->num_rio_dev; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 87d4d6964ec..19262482021 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -41,11 +41,12 @@ EXPORT_SYMBOL(bad_dma_address); /* Dummy device used for NULL arguments (normally ISA). Better would be probably a smaller DMA mask, but this is bug-to-bug compatible to older i386. */ -struct device fallback_dev = { +struct device x86_dma_fallback_dev = { .bus_id = "fallback device", .coherent_dma_mask = DMA_32BIT_MASK, - .dma_mask = &fallback_dev.coherent_dma_mask, + .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, }; +EXPORT_SYMBOL(x86_dma_fallback_dev); int dma_set_mask(struct device *dev, u64 mask) { @@ -82,7 +83,7 @@ void __init dma32_reserve_bootmem(void) * using 512M as goal */ align = 64ULL<<20; - size = round_up(dma32_bootmem_size, align); + size = roundup(dma32_bootmem_size, align); dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 512ULL<<20); if (dma32_bootmem_ptr) @@ -124,15 +125,46 @@ void __init pci_iommu_alloc(void) pci_swiotlb_init(); } -unsigned long iommu_num_pages(unsigned long addr, unsigned long len) +unsigned long iommu_nr_pages(unsigned long addr, unsigned long len) { unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); return size >> PAGE_SHIFT; } -EXPORT_SYMBOL(iommu_num_pages); +EXPORT_SYMBOL(iommu_nr_pages); #endif +void *dma_generic_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t flag) +{ + unsigned long dma_mask; + struct page *page; + dma_addr_t addr; + + dma_mask = dma_alloc_coherent_mask(dev, flag); + + flag |= __GFP_ZERO; +again: + page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); + if (!page) + return NULL; + + addr = page_to_phys(page); + if (!is_buffer_dma_capable(dma_mask, addr, size)) { + __free_pages(page, get_order(size)); + + if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) { + flag = (flag & ~GFP_DMA32) | GFP_DMA; + goto again; + } + + return NULL; + } + + *dma_addr = addr; + return page_address(page); +} + /* * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter * documentation. @@ -241,147 +273,6 @@ int dma_supported(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_supported); -/* Allocate DMA memory on node near device */ -static noinline struct page * -dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) -{ - int node; - - node = dev_to_node(dev); - - return alloc_pages_node(node, gfp, order); -} - -/* - * Allocate memory for a coherent mapping. - */ -void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp) -{ - struct dma_mapping_ops *ops = get_dma_ops(dev); - void *memory = NULL; - struct page *page; - unsigned long dma_mask = 0; - dma_addr_t bus; - int noretry = 0; - - /* ignore region specifiers */ - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - - if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) - return memory; - - if (!dev) { - dev = &fallback_dev; - gfp |= GFP_DMA; - } - dma_mask = dev->coherent_dma_mask; - if (dma_mask == 0) - dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK; - - /* Device not DMA able */ - if (dev->dma_mask == NULL) - return NULL; - - /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ - if (gfp & __GFP_DMA) - noretry = 1; - -#ifdef CONFIG_X86_64 - /* Why <=? Even when the mask is smaller than 4GB it is often - larger than 16MB and in this case we have a chance of - finding fitting memory in the next higher zone first. If - not retry with true GFP_DMA. -AK */ - if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) { - gfp |= GFP_DMA32; - if (dma_mask < DMA_32BIT_MASK) - noretry = 1; - } -#endif - - again: - page = dma_alloc_pages(dev, - noretry ? gfp | __GFP_NORETRY : gfp, get_order(size)); - if (page == NULL) - return NULL; - - { - int high, mmu; - bus = page_to_phys(page); - memory = page_address(page); - high = (bus + size) >= dma_mask; - mmu = high; - if (force_iommu && !(gfp & GFP_DMA)) - mmu = 1; - else if (high) { - free_pages((unsigned long)memory, - get_order(size)); - - /* Don't use the 16MB ZONE_DMA unless absolutely - needed. It's better to use remapping first. */ - if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) { - gfp = (gfp & ~GFP_DMA32) | GFP_DMA; - goto again; - } - - /* Let low level make its own zone decisions */ - gfp &= ~(GFP_DMA32|GFP_DMA); - - if (ops->alloc_coherent) - return ops->alloc_coherent(dev, size, - dma_handle, gfp); - return NULL; - } - - memset(memory, 0, size); - if (!mmu) { - *dma_handle = bus; - return memory; - } - } - - if (ops->alloc_coherent) { - free_pages((unsigned long)memory, get_order(size)); - gfp &= ~(GFP_DMA|GFP_DMA32); - return ops->alloc_coherent(dev, size, dma_handle, gfp); - } - - if (ops->map_simple) { - *dma_handle = ops->map_simple(dev, virt_to_phys(memory), - size, - PCI_DMA_BIDIRECTIONAL); - if (*dma_handle != bad_dma_address) - return memory; - } - - if (panic_on_overflow) - panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", - (unsigned long)size); - free_pages((unsigned long)memory, get_order(size)); - return NULL; -} -EXPORT_SYMBOL(dma_alloc_coherent); - -/* - * Unmap coherent memory. - * The caller must ensure that the device has finished accessing the mapping. - */ -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus) -{ - struct dma_mapping_ops *ops = get_dma_ops(dev); - - int order = get_order(size); - WARN_ON(irqs_disabled()); /* for portability */ - if (dma_release_from_coherent(dev, order, vaddr)) - return; - if (ops->unmap_single) - ops->unmap_single(dev, bus, size, 0); - free_pages((unsigned long)vaddr, order); -} -EXPORT_SYMBOL(dma_free_coherent); - static int __init pci_iommu_init(void) { calgary_iommu_init(); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 49285f8fd4d..e3f75bbcede 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -27,8 +27,8 @@ #include <linux/scatterlist.h> #include <linux/iommu-helper.h> #include <linux/sysdev.h> +#include <linux/io.h> #include <asm/atomic.h> -#include <asm/io.h> #include <asm/mtrr.h> #include <asm/pgtable.h> #include <asm/proto.h> @@ -80,9 +80,10 @@ AGPEXTERN int agp_memory_reserved; AGPEXTERN __u32 *agp_gatt_table; static unsigned long next_bit; /* protected by iommu_bitmap_lock */ -static int need_flush; /* global flush state. set for each gart wrap */ +static bool need_flush; /* global flush state. set for each gart wrap */ -static unsigned long alloc_iommu(struct device *dev, int size) +static unsigned long alloc_iommu(struct device *dev, int size, + unsigned long align_mask) { unsigned long offset, flags; unsigned long boundary_size; @@ -90,26 +91,27 @@ static unsigned long alloc_iommu(struct device *dev, int size) base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), PAGE_SIZE) >> PAGE_SHIFT; - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; spin_lock_irqsave(&iommu_bitmap_lock, flags); offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, - size, base_index, boundary_size, 0); + size, base_index, boundary_size, align_mask); if (offset == -1) { - need_flush = 1; + need_flush = true; offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, - size, base_index, boundary_size, 0); + size, base_index, boundary_size, + align_mask); } if (offset != -1) { next_bit = offset+size; if (next_bit >= iommu_pages) { next_bit = 0; - need_flush = 1; + need_flush = true; } } if (iommu_fullflush) - need_flush = 1; + need_flush = true; spin_unlock_irqrestore(&iommu_bitmap_lock, flags); return offset; @@ -134,7 +136,7 @@ static void flush_gart(void) spin_lock_irqsave(&iommu_bitmap_lock, flags); if (need_flush) { k8_flush_garts(); - need_flush = 0; + need_flush = false; } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); } @@ -173,7 +175,8 @@ static void dump_leak(void) iommu_leak_pages); for (i = 0; i < iommu_leak_pages; i += 2) { printk(KERN_DEBUG "%lu: ", iommu_pages-i); - printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); + printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], + 0); printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); } printk(KERN_DEBUG "\n"); @@ -212,34 +215,24 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { - u64 mask = *dev->dma_mask; - int high = addr + size > mask; - int mmu = high; - - if (force_iommu) - mmu = 1; - - return mmu; + return force_iommu || + !is_buffer_dma_capable(*dev->dma_mask, addr, size); } static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { - u64 mask = *dev->dma_mask; - int high = addr + size > mask; - int mmu = high; - - return mmu; + return !is_buffer_dma_capable(*dev->dma_mask, addr, size); } /* Map a single continuous physical area into the IOMMU. * Caller needs to check if the iommu is needed and flush. */ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, - size_t size, int dir) + size_t size, int dir, unsigned long align_mask) { - unsigned long npages = iommu_num_pages(phys_mem, size); - unsigned long iommu_page = alloc_iommu(dev, npages); + unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); + unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); int i; if (iommu_page == -1) { @@ -259,16 +252,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); } -static dma_addr_t -gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir) -{ - dma_addr_t map = dma_map_area(dev, paddr, size, dir); - - flush_gart(); - - return map; -} - /* Map a single area into the IOMMU */ static dma_addr_t gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) @@ -276,12 +259,13 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) unsigned long bus; if (!dev) - dev = &fallback_dev; + dev = &x86_dma_fallback_dev; if (!need_iommu(dev, paddr, size)) return paddr; - bus = gart_map_simple(dev, paddr, size, dir); + bus = dma_map_area(dev, paddr, size, dir, 0); + flush_gart(); return bus; } @@ -301,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, return; iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = iommu_num_pages(dma_addr, size); + npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; CLEAR_LEAK(iommu_page + i); @@ -340,7 +324,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, unsigned long addr = sg_phys(s); if (nonforced_iommu(dev, addr, s->length)) { - addr = dma_map_area(dev, addr, s->length, dir); + addr = dma_map_area(dev, addr, s->length, dir, 0); if (addr == bad_dma_address) { if (i > 0) gart_unmap_sg(dev, sg, i, dir); @@ -362,7 +346,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, struct scatterlist *sout, unsigned long pages) { - unsigned long iommu_start = alloc_iommu(dev, pages); + unsigned long iommu_start = alloc_iommu(dev, pages, 0); unsigned long iommu_page = iommu_start; struct scatterlist *s; int i; @@ -384,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, } addr = phys_addr; - pages = iommu_num_pages(s->offset, s->length); + pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); SET_LEAK(iommu_page); @@ -427,7 +411,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) return 0; if (!dev) - dev = &fallback_dev; + dev = &x86_dma_fallback_dev; out = 0; start = 0; @@ -467,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) seg_size += s->length; need = nextneed; - pages += iommu_num_pages(s->offset, s->length); + pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); ps = s; } if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) @@ -499,6 +483,46 @@ error: return 0; } +/* allocate and map a coherent mapping */ +static void * +gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, + gfp_t flag) +{ + dma_addr_t paddr; + unsigned long align_mask; + struct page *page; + + if (force_iommu && !(flag & GFP_DMA)) { + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + page = alloc_pages(flag | __GFP_ZERO, get_order(size)); + if (!page) + return NULL; + + align_mask = (1UL << get_order(size)) - 1; + paddr = dma_map_area(dev, page_to_phys(page), size, + DMA_BIDIRECTIONAL, align_mask); + + flush_gart(); + if (paddr != bad_dma_address) { + *dma_addr = paddr; + return page_address(page); + } + __free_pages(page, get_order(size)); + } else + return dma_generic_alloc_coherent(dev, size, dma_addr, flag); + + return NULL; +} + +/* free a coherent mapping */ +static void +gart_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr) +{ + gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL); + free_pages((unsigned long)vaddr, get_order(size)); +} + static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) @@ -626,7 +650,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info) struct pci_dev *dev; void *gatt; int i, error; - unsigned long start_pfn, end_pfn; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; @@ -650,13 +673,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info) info->aper_size = aper_size >> 20; gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); - gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); + gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(gatt_size)); if (!gatt) panic("Cannot allocate GATT table"); if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) panic("Could not set GART PTEs to uncacheable pages"); - memset(gatt, 0, gatt_size); agp_gatt_table = gatt; enable_gart_translations(); @@ -665,19 +688,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info) if (!error) error = sysdev_register(&device_gart); if (error) - panic("Could not register gart_sysdev -- would corrupt data on next suspend"); + panic("Could not register gart_sysdev -- " + "would corrupt data on next suspend"); flush_gart(); printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", aper_base, aper_size>>10); - /* need to map that range */ - end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); - if (end_pfn > max_low_pfn_mapped) { - start_pfn = (aper_base>>PAGE_SHIFT); - init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); - } return 0; nommu: @@ -687,20 +705,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info) return -1; } -extern int agp_amd64_init(void); - static struct dma_mapping_ops gart_dma_ops = { .map_single = gart_map_single, - .map_simple = gart_map_simple, .unmap_single = gart_unmap_single, - .sync_single_for_cpu = NULL, - .sync_single_for_device = NULL, - .sync_single_range_for_cpu = NULL, - .sync_single_range_for_device = NULL, - .sync_sg_for_cpu = NULL, - .sync_sg_for_device = NULL, .map_sg = gart_map_sg, .unmap_sg = gart_unmap_sg, + .alloc_coherent = gart_alloc_coherent, + .free_coherent = gart_free_coherent, }; void gart_iommu_shutdown(void) @@ -727,7 +738,8 @@ void __init gart_iommu_init(void) { struct agp_kern_info info; unsigned long iommu_start; - unsigned long aper_size; + unsigned long aper_base, aper_size; + unsigned long start_pfn, end_pfn; unsigned long scratch; long i; @@ -759,30 +771,35 @@ void __init gart_iommu_init(void) (no_agp && init_k8_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { printk(KERN_WARNING "More than 4GB of memory " - "but GART IOMMU not available.\n" - KERN_WARNING "falling back to iommu=soft.\n"); + "but GART IOMMU not available.\n"); + printk(KERN_WARNING "falling back to iommu=soft.\n"); } return; } + /* need to map that range */ + aper_size = info.aper_size << 20; + aper_base = info.aper_base; + end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + if (end_pfn > max_low_pfn_mapped) { + start_pfn = (aper_base>>PAGE_SHIFT); + init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); + } + printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); - aper_size = info.aper_size * 1024 * 1024; iommu_size = check_iommu_size(info.aper_base, aper_size); iommu_pages = iommu_size >> PAGE_SHIFT; - iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, + iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(iommu_pages/8)); if (!iommu_gart_bitmap) panic("Cannot allocate iommu bitmap\n"); - memset(iommu_gart_bitmap, 0, iommu_pages/8); #ifdef CONFIG_IOMMU_LEAK if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, + iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, get_order(iommu_pages*sizeof(void *))); - if (iommu_leak_tab) - memset(iommu_leak_tab, 0, iommu_pages * 8); - else + if (!iommu_leak_tab) printk(KERN_DEBUG "PCI-DMA: Cannot allocate leak trace area\n"); } @@ -792,7 +809,7 @@ void __init gart_iommu_init(void) * Out of IOMMU space handling. * Reserve some invalid pages at the beginning of the GART. */ - set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); agp_memory_reserved = iommu_size; printk(KERN_INFO @@ -850,7 +867,8 @@ void __init gart_parse_options(char *p) if (!strncmp(p, "leak", 4)) { leak_trace = 1; p += 4; - if (*p == '=') ++p; + if (*p == '=') + ++p; if (isdigit(*p) && get_option(&p, &arg)) iommu_leak_pages = arg; } diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 3f91f71cdc3..c70ab5a5d4c 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -14,7 +14,7 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { - if (hwdev && bus + size > *hwdev->dma_mask) { + if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) { if (*hwdev->dma_mask >= DMA_32BIT_MASK) printk(KERN_ERR "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", @@ -72,7 +72,15 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, return nents; } +static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr) +{ + free_pages((unsigned long)vaddr, get_order(size)); +} + struct dma_mapping_ops nommu_dma_ops = { + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = nommu_free_coherent, .map_single = nommu_map_single, .map_sg = nommu_map_sg, .is_phys = 1, diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c index bc1f2d3ea27..a311ffcaad1 100644 --- a/arch/x86/kernel/pcspeaker.c +++ b/arch/x86/kernel/pcspeaker.c @@ -1,20 +1,13 @@ #include <linux/platform_device.h> -#include <linux/errno.h> +#include <linux/err.h> #include <linux/init.h> static __init int add_pcspkr(void) { struct platform_device *pd; - int ret; - pd = platform_device_alloc("pcspkr", -1); - if (!pd) - return -ENOMEM; + pd = platform_device_register_simple("pcspkr", -1, NULL, 0); - ret = platform_device_add(pd); - if (ret) - platform_device_put(pd); - - return ret; + return IS_ERR(pd) ? PTR_ERR(pd) : 0; } device_initcall(add_pcspkr); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7fc4d5b0a6a..c622772744d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -15,7 +15,6 @@ unsigned long idle_nomwait; EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; -static int force_mwait __cpuinitdata; int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { @@ -185,7 +184,8 @@ static void mwait_idle(void) static void poll_idle(void) { local_irq_enable(); - cpu_relax(); + while (!need_resched()) + cpu_relax(); } /* @@ -246,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) return 1; } +static cpumask_t c1e_mask = CPU_MASK_NONE; +static int c1e_detected; + +void c1e_remove_cpu(int cpu) +{ + cpu_clear(cpu, c1e_mask); +} + /* * C1E aware idle routine. We check for C1E active in the interrupt * pending message MSR. If we detect C1E, then we handle it the same @@ -253,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) */ static void c1e_idle(void) { - static cpumask_t c1e_mask = CPU_MASK_NONE; - static int c1e_detected; - if (need_resched()) return; @@ -265,8 +270,10 @@ static void c1e_idle(void) rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (lo & K8_INTP_C1E_ACTIVE_MASK) { c1e_detected = 1; - mark_tsc_unstable("TSC halt in C1E"); - printk(KERN_INFO "System has C1E enabled\n"); + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + mark_tsc_unstable("TSC halt in AMD C1E"); + printk(KERN_INFO "System has AMD C1E enabled\n"); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); } } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 3b7a1ddcc0b..0a1302fe6d4 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -37,6 +37,7 @@ #include <linux/tick.h> #include <linux/percpu.h> #include <linux/prctl.h> +#include <linux/dmi.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -55,6 +56,9 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> #include <asm/kdebug.h> +#include <asm/idle.h> +#include <asm/syscalls.h> +#include <asm/smp.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -72,46 +76,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return ((unsigned long *)tsk->thread.sp)[3]; } -#ifdef CONFIG_HOTPLUG_CPU -#include <asm/nmi.h> - -static void cpu_exit_clear(void) -{ - int cpu = raw_smp_processor_id(); - - idle_task_exit(); - - cpu_uninit(); - irq_ctx_exit(cpu); - - cpu_clear(cpu, cpu_callout_map); - cpu_clear(cpu, cpu_callin_map); - - numa_remove_cpu(cpu); -} - -/* We don't actually take CPU down, just spin without interrupts. */ -static inline void play_dead(void) -{ - /* This must be done before dead CPU ack */ - cpu_exit_clear(); - mb(); - /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; - - /* - * With physical CPU hotplug, we should halt the cpu - */ - local_irq_disable(); - /* mask all interrupts, flush any and all caches, and halt */ - wbinvd_halt(); -} -#else +#ifndef CONFIG_SMP static inline void play_dead(void) { BUG(); } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* * The idle thread. There's no useful work to be @@ -153,12 +123,13 @@ void cpu_idle(void) } } -void __show_registers(struct pt_regs *regs, int all) +void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; unsigned long sp; unsigned short ss, gs; + const char *board; if (user_mode_vm(regs)) { sp = regs->sp; @@ -171,11 +142,15 @@ void __show_registers(struct pt_regs *regs, int all) } printk("\n"); - printk("Pid: %d, comm: %s %s (%s %.*s)\n", + + board = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!board) + board = ""; + printk("Pid: %d, comm: %s %s (%s %.*s) %s\n", task_pid_nr(current), current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + init_utsname()->version, board); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, @@ -214,7 +189,7 @@ void __show_registers(struct pt_regs *regs, int all) void show_regs(struct pt_regs *regs) { - __show_registers(regs, 1); + __show_regs(regs, 1); show_trace(NULL, regs, ®s->sp, regs->bp); } @@ -275,6 +250,14 @@ void exit_thread(void) tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } +#ifdef CONFIG_X86_DS + /* Free any DS contexts that have not been properly released. */ + if (unlikely(current->thread.ds_ctx)) { + /* we clear debugctl to make sure DS is not used. */ + update_debugctlmsr(0); + ds_free(current->thread.ds_ctx); + } +#endif /* CONFIG_X86_DS */ } void flush_thread(void) @@ -436,6 +419,35 @@ int set_tsc_mode(unsigned int val) return 0; } +#ifdef CONFIG_X86_DS +static int update_debugctl(struct thread_struct *prev, + struct thread_struct *next, unsigned long debugctl) +{ + unsigned long ds_prev = 0; + unsigned long ds_next = 0; + + if (prev->ds_ctx) + ds_prev = (unsigned long)prev->ds_ctx->ds; + if (next->ds_ctx) + ds_next = (unsigned long)next->ds_ctx->ds; + + if (ds_next != ds_prev) { + /* we clear debugctl to make sure DS + * is not in use when we change it */ + debugctl = 0; + update_debugctlmsr(0); + wrmsr(MSR_IA32_DS_AREA, ds_next, 0); + } + return debugctl; +} +#else +static int update_debugctl(struct thread_struct *prev, + struct thread_struct *next, unsigned long debugctl) +{ + return debugctl; +} +#endif /* CONFIG_X86_DS */ + static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss) @@ -446,14 +458,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, prev = &prev_p->thread; next = &next_p->thread; - debugctl = prev->debugctlmsr; - if (next->ds_area_msr != prev->ds_area_msr) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0); - } + debugctl = update_debugctl(prev, next, prev->debugctlmsr); if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); @@ -477,13 +482,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, hard_enable_TSC(); } -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif +#endif /* CONFIG_X86_PTRACE_BTS */ if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 71553b664e2..cd8c0ed02b7 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -37,11 +37,11 @@ #include <linux/kdebug.h> #include <linux/tick.h> #include <linux/prctl.h> +#include <linux/uaccess.h> +#include <linux/io.h> -#include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/system.h> -#include <asm/io.h> #include <asm/processor.h> #include <asm/i387.h> #include <asm/mmu_context.h> @@ -51,6 +51,7 @@ #include <asm/proto.h> #include <asm/ia32.h> #include <asm/idle.h> +#include <asm/syscalls.h> asmlinkage extern void ret_from_fork(void); @@ -85,28 +86,12 @@ void exit_idle(void) __exit_idle(); } -#ifdef CONFIG_HOTPLUG_CPU -DECLARE_PER_CPU(int, cpu_state); - -#include <asm/nmi.h> -/* We halt the CPU with physical CPU hotplug */ -static inline void play_dead(void) -{ - idle_task_exit(); - mb(); - /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; - - local_irq_disable(); - /* mask all interrupts, flush any and all caches, and halt */ - wbinvd_halt(); -} -#else +#ifndef CONFIG_SMP static inline void play_dead(void) { BUG(); } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* * The idle thread. There's no useful work to be @@ -151,7 +136,7 @@ void cpu_idle(void) } /* Prints also some state that isn't saved in the pt_regs */ -void __show_regs(struct pt_regs * regs) +void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; @@ -160,60 +145,65 @@ void __show_regs(struct pt_regs * regs) printk("\n"); print_modules(); - printk("Pid: %d, comm: %.20s %s %s %.*s\n", + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); - printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, - regs->flags); - printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", + printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + regs->sp, regs->flags); + printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); - printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", + printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", regs->dx, regs->si, regs->di); - printk("RBP: %016lx R08: %016lx R09: %016lx\n", + printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", regs->bp, regs->r8, regs->r9); - printk("R10: %016lx R11: %016lx R12: %016lx\n", - regs->r10, regs->r11, regs->r12); - printk("R13: %016lx R14: %016lx R15: %016lx\n", - regs->r13, regs->r14, regs->r15); - - asm("movl %%ds,%0" : "=r" (ds)); - asm("movl %%cs,%0" : "=r" (cs)); - asm("movl %%es,%0" : "=r" (es)); + printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", + regs->r10, regs->r11, regs->r12); + printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", + regs->r13, regs->r14, regs->r15); + + asm("movl %%ds,%0" : "=r" (ds)); + asm("movl %%cs,%0" : "=r" (cs)); + asm("movl %%es,%0" : "=r" (es)); asm("movl %%fs,%0" : "=r" (fsindex)); asm("movl %%gs,%0" : "=r" (gsindex)); rdmsrl(MSR_FS_BASE, fs); - rdmsrl(MSR_GS_BASE, gs); - rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + + if (!all) + return; cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); cr4 = read_cr4(); - printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", - fs,fsindex,gs,gsindex,shadowgs); - printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); - printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); + printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + fs, fsindex, gs, gsindex, shadowgs); + printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + es, cr0); + printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); - printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); get_debugreg(d3, 3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } void show_regs(struct pt_regs *regs) { - printk("CPU %d:", smp_processor_id()); - __show_regs(regs); + printk(KERN_INFO "CPU %d:", smp_processor_id()); + __show_regs(regs, 1); show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } @@ -238,6 +228,14 @@ void exit_thread(void) t->io_bitmap_max = 0; put_cpu(); } +#ifdef CONFIG_X86_DS + /* Free any DS contexts that have not been properly released. */ + if (unlikely(t->ds_ctx)) { + /* we clear debugctl to make sure DS is not used. */ + update_debugctlmsr(0); + ds_free(t->ds_ctx); + } +#endif /* CONFIG_X86_DS */ } void flush_thread(void) @@ -313,10 +311,10 @@ void prepare_to_copy(struct task_struct *tsk) int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, - struct task_struct * p, struct pt_regs * regs) + struct task_struct *p, struct pt_regs *regs) { int err; - struct pt_regs * childregs; + struct pt_regs *childregs; struct task_struct *me = current; childregs = ((struct pt_regs *) @@ -361,10 +359,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, if (test_thread_flag(TIF_IA32)) err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); - else -#endif - err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); - if (err) + else +#endif + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + if (err) goto out; } err = 0; @@ -471,13 +469,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, next = &next_p->thread; debugctl = prev->debugctlmsr; - if (next->ds_area_msr != prev->ds_area_msr) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); + +#ifdef CONFIG_X86_DS + { + unsigned long ds_prev = 0, ds_next = 0; + + if (prev->ds_ctx) + ds_prev = (unsigned long)prev->ds_ctx->ds; + if (next->ds_ctx) + ds_next = (unsigned long)next->ds_ctx->ds; + + if (ds_next != ds_prev) { + /* + * We clear debugctl to make sure DS + * is not in use when we change it: + */ + debugctl = 0; + update_debugctlmsr(0); + wrmsrl(MSR_IA32_DS_AREA, ds_next); + } } +#endif /* CONFIG_X86_DS */ if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); @@ -515,13 +527,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif +#endif /* CONFIG_X86_PTRACE_BTS */ } /* @@ -543,7 +555,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) unsigned fsindex, gsindex; /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter>5) + if (next_p->fpu_counter > 5) prefetch(next->xstate); /* @@ -551,13 +563,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ load_sp0(tss, next); - /* + /* * Switch DS and ES. * This won't pick up thread selector changes, but I guess that is ok. */ savesegment(es, prev->es); if (unlikely(next->es | prev->es)) - loadsegment(es, next->es); + loadsegment(es, next->es); savesegment(ds, prev->ds); if (unlikely(next->ds | prev->ds)) @@ -583,7 +595,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_leave_lazy_cpu_mode(); - /* + /* * Switch FS and GS. * * Segment register != 0 always requires a reload. Also @@ -592,13 +604,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ if (unlikely(fsindex | next->fsindex | prev->fs)) { loadsegment(fs, next->fsindex); - /* + /* * Check if the user used a selector != 0; if yes * clear 64bit base, since overloaded base is always * mapped to the Null selector */ if (fsindex) - prev->fs = 0; + prev->fs = 0; } /* when next process has a 64bit base use it */ if (next->fs) @@ -608,7 +620,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (unlikely(gsindex | next->gsindex | prev->gs)) { load_gs_index(next->gsindex); if (gsindex) - prev->gs = 0; + prev->gs = 0; } if (next->gs) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); @@ -617,12 +629,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* Must be after DS reload */ unlazy_fpu(prev_p); - /* + /* * Switch the PDA and FPU contexts. */ prev->usersp = read_pda(oldrsp); write_pda(oldrsp, next->usersp); - write_pda(pcurrent, next_p); + write_pda(pcurrent, next_p); write_pda(kernelstack, (unsigned long)task_stack_page(next_p) + @@ -663,7 +675,7 @@ long sys_execve(char __user *name, char __user * __user *argv, char __user * __user *envp, struct pt_regs *regs) { long error; - char * filename; + char *filename; filename = getname(name); error = PTR_ERR(filename); @@ -721,55 +733,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs) unsigned long get_wchan(struct task_struct *p) { unsigned long stack; - u64 fp,ip; + u64 fp, ip; int count = 0; - if (!p || p == current || p->state==TASK_RUNNING) - return 0; + if (!p || p == current || p->state == TASK_RUNNING) + return 0; stack = (unsigned long)task_stack_page(p); - if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) + if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) return 0; fp = *(u64 *)(p->thread.sp); - do { + do { if (fp < (unsigned long)stack || - fp > (unsigned long)stack+THREAD_SIZE) - return 0; + fp >= (unsigned long)stack+THREAD_SIZE) + return 0; ip = *(u64 *)(fp+8); if (!in_sched_functions(ip)) return ip; - fp = *(u64 *)fp; - } while (count++ < 16); + fp = *(u64 *)fp; + } while (count++ < 16); return 0; } long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) -{ - int ret = 0; +{ + int ret = 0; int doit = task == current; int cpu; - switch (code) { + switch (code) { case ARCH_SET_GS: if (addr >= TASK_SIZE_OF(task)) - return -EPERM; + return -EPERM; cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to + /* handle small bases via the GDT because that's faster to switch. */ - if (addr <= 0xffffffff) { - set_32bit_tls(task, GS_TLS, addr); - if (doit) { + if (addr <= 0xffffffff) { + set_32bit_tls(task, GS_TLS, addr); + if (doit) { load_TLS(&task->thread, cpu); - load_gs_index(GS_TLS_SEL); + load_gs_index(GS_TLS_SEL); } - task->thread.gsindex = GS_TLS_SEL; + task->thread.gsindex = GS_TLS_SEL; task->thread.gs = 0; - } else { + } else { task->thread.gsindex = 0; task->thread.gs = addr; if (doit) { load_gs_index(0); ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); - } + } } put_cpu(); break; @@ -823,8 +835,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) rdmsrl(MSR_KERNEL_GS_BASE, base); else base = task->thread.gs; - } - else + } else base = task->thread.gs; ret = put_user(base, (unsigned long __user *)addr); break; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e37dccce85d..0a6d8c12e10 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -14,6 +14,7 @@ #include <linux/errno.h> #include <linux/ptrace.h> #include <linux/regset.h> +#include <linux/tracehook.h> #include <linux/user.h> #include <linux/elf.h> #include <linux/security.h> @@ -39,7 +40,9 @@ enum x86_regset { REGSET_GENERAL, REGSET_FP, REGSET_XFP, + REGSET_IOPERM64 = REGSET_XFP, REGSET_TLS, + REGSET_IOPERM32, }; /* @@ -69,7 +72,7 @@ static inline bool invalid_selector(u16 value) #define FLAG_MASK FLAG_MASK_32 -static long *pt_regs_access(struct pt_regs *regs, unsigned long regno) +static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); regno >>= 2; @@ -554,45 +557,138 @@ static int ptrace_set_debugreg(struct task_struct *child, return 0; } -#ifdef X86_BTS +/* + * These access the current or another (stopped) task's io permission + * bitmap for debugging or core dump. + */ +static int ioperm_active(struct task_struct *target, + const struct user_regset *regset) +{ + return target->thread.io_bitmap_max / regset->size; +} -static int ptrace_bts_get_size(struct task_struct *child) +static int ioperm_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) { - if (!child->thread.ds_area_msr) + if (!target->thread.io_bitmap_ptr) return -ENXIO; - return ds_get_bts_index((void *)child->thread.ds_area_msr); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + target->thread.io_bitmap_ptr, + 0, IO_BITMAP_BYTES); +} + +#ifdef CONFIG_X86_PTRACE_BTS +/* + * The configuration for a particular BTS hardware implementation. + */ +struct bts_configuration { + /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */ + unsigned char sizeof_bts; + /* the size of a field in the BTS record in bytes */ + unsigned char sizeof_field; + /* a bitmask to enable/disable BTS in DEBUGCTL MSR */ + unsigned long debugctl_mask; +}; +static struct bts_configuration bts_cfg; + +#define BTS_MAX_RECORD_SIZE (8 * 3) + + +/* + * Branch Trace Store (BTS) uses the following format. Different + * architectures vary in the size of those fields. + * - source linear address + * - destination linear address + * - flags + * + * Later architectures use 64bit pointers throughout, whereas earlier + * architectures use 32bit pointers in 32bit mode. + * + * We compute the base address for the first 8 fields based on: + * - the field size stored in the DS configuration + * - the relative field position + * + * In order to store additional information in the BTS buffer, we use + * a special source address to indicate that the record requires + * special interpretation. + * + * Netburst indicated via a bit in the flags field whether the branch + * was predicted; this is ignored. + */ + +enum bts_field { + bts_from = 0, + bts_to, + bts_flags, + + bts_escape = (unsigned long)-1, + bts_qual = bts_to, + bts_jiffies = bts_flags +}; + +static inline unsigned long bts_get(const char *base, enum bts_field field) +{ + base += (bts_cfg.sizeof_field * field); + return *(unsigned long *)base; +} + +static inline void bts_set(char *base, enum bts_field field, unsigned long val) +{ + base += (bts_cfg.sizeof_field * field);; + (*(unsigned long *)base) = val; +} + +/* + * Translate a BTS record from the raw format into the bts_struct format + * + * out (out): bts_struct interpretation + * raw: raw BTS record + */ +static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw) +{ + memset(out, 0, sizeof(*out)); + if (bts_get(raw, bts_from) == bts_escape) { + out->qualifier = bts_get(raw, bts_qual); + out->variant.jiffies = bts_get(raw, bts_jiffies); + } else { + out->qualifier = BTS_BRANCH; + out->variant.lbr.from_ip = bts_get(raw, bts_from); + out->variant.lbr.to_ip = bts_get(raw, bts_to); + } } -static int ptrace_bts_read_record(struct task_struct *child, - long index, +static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { struct bts_struct ret; - int retval; - int bts_end; - int bts_index; - - if (!child->thread.ds_area_msr) - return -ENXIO; + const void *bts_record; + size_t bts_index, bts_end; + int error; - if (index < 0) - return -EINVAL; + error = ds_get_bts_end(child, &bts_end); + if (error < 0) + return error; - bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr); if (bts_end <= index) return -EINVAL; + error = ds_get_bts_index(child, &bts_index); + if (error < 0) + return error; + /* translate the ptrace bts index into the ds bts index */ - bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); - bts_index -= (index + 1); - if (bts_index < 0) - bts_index += bts_end; + bts_index += bts_end - (index + 1); + if (bts_end <= bts_index) + bts_index -= bts_end; + + error = ds_access_bts(child, bts_index, &bts_record); + if (error < 0) + return error; - retval = ds_read_bts((void *)child->thread.ds_area_msr, - bts_index, &ret); - if (retval < 0) - return retval; + ptrace_bts_translate_record(&ret, bts_record); if (copy_to_user(out, &ret, sizeof(ret))) return -EFAULT; @@ -600,101 +696,106 @@ static int ptrace_bts_read_record(struct task_struct *child, return sizeof(ret); } -static int ptrace_bts_clear(struct task_struct *child) -{ - if (!child->thread.ds_area_msr) - return -ENXIO; - - return ds_clear((void *)child->thread.ds_area_msr); -} - static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { - int end, i; - void *ds = (void *)child->thread.ds_area_msr; - - if (!ds) - return -ENXIO; + struct bts_struct ret; + const unsigned char *raw; + size_t end, i; + int error; - end = ds_get_bts_index(ds); - if (end <= 0) - return end; + error = ds_get_bts_index(child, &end); + if (error < 0) + return error; if (size < (end * sizeof(struct bts_struct))) return -EIO; - for (i = 0; i < end; i++, out++) { - struct bts_struct ret; - int retval; + error = ds_access_bts(child, 0, (const void **)&raw); + if (error < 0) + return error; - retval = ds_read_bts(ds, i, &ret); - if (retval < 0) - return retval; + for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { + ptrace_bts_translate_record(&ret, raw); if (copy_to_user(out, &ret, sizeof(ret))) return -EFAULT; } - ds_clear(ds); + error = ds_clear_bts(child); + if (error < 0) + return error; return end; } +static void ptrace_bts_ovfl(struct task_struct *child) +{ + send_sig(child->thread.bts_ovfl_signal, child, 0); +} + static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) { struct ptrace_bts_config cfg; - int bts_size, ret = 0; - void *ds; + int error = 0; + error = -EOPNOTSUPP; + if (!bts_cfg.sizeof_bts) + goto errout; + + error = -EIO; if (cfg_size < sizeof(cfg)) - return -EIO; + goto errout; + error = -EFAULT; if (copy_from_user(&cfg, ucfg, sizeof(cfg))) - return -EFAULT; + goto errout; - if ((int)cfg.size < 0) - return -EINVAL; + error = -EINVAL; + if ((cfg.flags & PTRACE_BTS_O_SIGNAL) && + !(cfg.flags & PTRACE_BTS_O_ALLOC)) + goto errout; - bts_size = 0; - ds = (void *)child->thread.ds_area_msr; - if (ds) { - bts_size = ds_get_bts_size(ds); - if (bts_size < 0) - return bts_size; - } - cfg.size = PAGE_ALIGN(cfg.size); + if (cfg.flags & PTRACE_BTS_O_ALLOC) { + ds_ovfl_callback_t ovfl = NULL; + unsigned int sig = 0; + + /* we ignore the error in case we were not tracing child */ + (void)ds_release_bts(child); + + if (cfg.flags & PTRACE_BTS_O_SIGNAL) { + if (!cfg.signal) + goto errout; + + sig = cfg.signal; + ovfl = ptrace_bts_ovfl; + } - if (bts_size != cfg.size) { - ret = ptrace_bts_realloc(child, cfg.size, - cfg.flags & PTRACE_BTS_O_CUT_SIZE); - if (ret < 0) + error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); + if (error < 0) goto errout; - ds = (void *)child->thread.ds_area_msr; + child->thread.bts_ovfl_signal = sig; } - if (cfg.flags & PTRACE_BTS_O_SIGNAL) - ret = ds_set_overflow(ds, DS_O_SIGNAL); - else - ret = ds_set_overflow(ds, DS_O_WRAP); - if (ret < 0) + error = -EINVAL; + if (!child->thread.ds_ctx && cfg.flags) goto errout; if (cfg.flags & PTRACE_BTS_O_TRACE) - child->thread.debugctlmsr |= ds_debugctl_mask(); + child->thread.debugctlmsr |= bts_cfg.debugctl_mask; else - child->thread.debugctlmsr &= ~ds_debugctl_mask(); + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; if (cfg.flags & PTRACE_BTS_O_SCHED) set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); else clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - ret = sizeof(cfg); + error = sizeof(cfg); out: if (child->thread.debugctlmsr) @@ -702,10 +803,10 @@ out: else clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - return ret; + return error; errout: - child->thread.debugctlmsr &= ~ds_debugctl_mask(); + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); goto out; } @@ -714,29 +815,40 @@ static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { - void *ds = (void *)child->thread.ds_area_msr; struct ptrace_bts_config cfg; + size_t end; + const void *base, *max; + int error; if (cfg_size < sizeof(cfg)) return -EIO; - memset(&cfg, 0, sizeof(cfg)); + error = ds_get_bts_end(child, &end); + if (error < 0) + return error; - if (ds) { - cfg.size = ds_get_bts_size(ds); + error = ds_access_bts(child, /* index = */ 0, &base); + if (error < 0) + return error; - if (ds_get_overflow(ds) == DS_O_SIGNAL) - cfg.flags |= PTRACE_BTS_O_SIGNAL; + error = ds_access_bts(child, /* index = */ end, &max); + if (error < 0) + return error; - if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && - child->thread.debugctlmsr & ds_debugctl_mask()) - cfg.flags |= PTRACE_BTS_O_TRACE; + memset(&cfg, 0, sizeof(cfg)); + cfg.size = (max - base); + cfg.signal = child->thread.bts_ovfl_signal; + cfg.bts_size = sizeof(struct bts_struct); - if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) - cfg.flags |= PTRACE_BTS_O_SCHED; - } + if (cfg.signal) + cfg.flags |= PTRACE_BTS_O_SIGNAL; - cfg.bts_size = sizeof(struct bts_struct); + if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && + child->thread.debugctlmsr & bts_cfg.debugctl_mask) + cfg.flags |= PTRACE_BTS_O_TRACE; + + if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) + cfg.flags |= PTRACE_BTS_O_SCHED; if (copy_to_user(ucfg, &cfg, sizeof(cfg))) return -EFAULT; @@ -744,89 +856,38 @@ static int ptrace_bts_status(struct task_struct *child, return sizeof(cfg); } - static int ptrace_bts_write_record(struct task_struct *child, const struct bts_struct *in) { - int retval; + unsigned char bts_record[BTS_MAX_RECORD_SIZE]; - if (!child->thread.ds_area_msr) - return -ENXIO; + BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); - retval = ds_write_bts((void *)child->thread.ds_area_msr, in); - if (retval) - return retval; + memset(bts_record, 0, bts_cfg.sizeof_bts); + switch (in->qualifier) { + case BTS_INVALID: + break; - return sizeof(*in); -} + case BTS_BRANCH: + bts_set(bts_record, bts_from, in->variant.lbr.from_ip); + bts_set(bts_record, bts_to, in->variant.lbr.to_ip); + break; -static int ptrace_bts_realloc(struct task_struct *child, - int size, int reduce_size) -{ - unsigned long rlim, vm; - int ret, old_size; + case BTS_TASK_ARRIVES: + case BTS_TASK_DEPARTS: + bts_set(bts_record, bts_from, bts_escape); + bts_set(bts_record, bts_qual, in->qualifier); + bts_set(bts_record, bts_jiffies, in->variant.jiffies); + break; - if (size < 0) + default: return -EINVAL; - - old_size = ds_get_bts_size((void *)child->thread.ds_area_msr); - if (old_size < 0) - return old_size; - - ret = ds_free((void **)&child->thread.ds_area_msr); - if (ret < 0) - goto out; - - size >>= PAGE_SHIFT; - old_size >>= PAGE_SHIFT; - - current->mm->total_vm -= old_size; - current->mm->locked_vm -= old_size; - - if (size == 0) - goto out; - - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + size; - if (rlim < vm) { - ret = -ENOMEM; - - if (!reduce_size) - goto out; - - size = rlim - current->mm->total_vm; - if (size <= 0) - goto out; - } - - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + size; - if (rlim < vm) { - ret = -ENOMEM; - - if (!reduce_size) - goto out; - - size = rlim - current->mm->locked_vm; - if (size <= 0) - goto out; } - ret = ds_allocate((void **)&child->thread.ds_area_msr, - size << PAGE_SHIFT); - if (ret < 0) - goto out; - - current->mm->total_vm += size; - current->mm->locked_vm += size; - -out: - if (child->thread.ds_area_msr) - set_tsk_thread_flag(child, TIF_DS_AREA_MSR); - else - clear_tsk_thread_flag(child, TIF_DS_AREA_MSR); - - return ret; + /* The writing task will be the switched-to task on a context + * switch. It needs to write into the switched-from task's BTS + * buffer. */ + return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts); } void ptrace_bts_take_timestamp(struct task_struct *tsk, @@ -839,7 +900,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk, ptrace_bts_write_record(tsk, &rec); } -#endif /* X86_BTS */ + +static const struct bts_configuration bts_cfg_netburst = { + .sizeof_bts = sizeof(long) * 3, + .sizeof_field = sizeof(long), + .debugctl_mask = (1<<2)|(1<<3)|(1<<5) +}; + +static const struct bts_configuration bts_cfg_pentium_m = { + .sizeof_bts = sizeof(long) * 3, + .sizeof_field = sizeof(long), + .debugctl_mask = (1<<6)|(1<<7) +}; + +static const struct bts_configuration bts_cfg_core2 = { + .sizeof_bts = 8 * 3, + .sizeof_field = 8, + .debugctl_mask = (1<<6)|(1<<7)|(1<<9) +}; + +static inline void bts_configure(const struct bts_configuration *cfg) +{ + bts_cfg = *cfg; +} + +void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 0x6: + switch (c->x86_model) { + case 0xD: + case 0xE: /* Pentium M */ + bts_configure(&bts_cfg_pentium_m); + break; + case 0xF: /* Core2 */ + case 0x1C: /* Atom */ + bts_configure(&bts_cfg_core2); + break; + default: + /* sorry, don't know about them */ + break; + } + break; + case 0xF: + switch (c->x86_model) { + case 0x0: + case 0x1: + case 0x2: /* Netburst */ + bts_configure(&bts_cfg_netburst); + break; + default: + /* sorry, don't know about them */ + break; + } + break; + default: + /* sorry, don't know about them */ + break; + } +} +#endif /* CONFIG_X86_PTRACE_BTS */ /* * Called by kernel/ptrace.c when detaching.. @@ -852,15 +972,15 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif - if (child->thread.ds_area_msr) { -#ifdef X86_BTS - ptrace_bts_realloc(child, 0, 0); -#endif - child->thread.debugctlmsr &= ~ds_debugctl_mask(); - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - } +#ifdef CONFIG_X86_PTRACE_BTS + (void)ds_release_bts(child); + + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); +#endif /* CONFIG_X86_PTRACE_BTS */ } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION @@ -980,7 +1100,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) /* * These bits need more cooking - not enabled yet: */ -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS case PTRACE_BTS_CONFIG: ret = ptrace_bts_config (child, data, (struct ptrace_bts_config __user *)addr); @@ -992,7 +1112,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_SIZE: - ret = ptrace_bts_get_size(child); + ret = ds_get_bts_index(child, /* pos = */ NULL); break; case PTRACE_BTS_GET: @@ -1001,14 +1121,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_CLEAR: - ret = ptrace_bts_clear(child); + ret = ds_clear_bts(child); break; case PTRACE_BTS_DRAIN: ret = ptrace_bts_drain (child, data, (struct bts_struct __user *) addr); break; -#endif +#endif /* CONFIG_X86_PTRACE_BTS */ default: ret = ptrace_request(child, request, addr, data); @@ -1290,6 +1410,12 @@ static const struct user_regset x86_64_regsets[] = { .size = sizeof(long), .align = sizeof(long), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_IOPERM64] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_LONGS, + .size = sizeof(long), .align = sizeof(long), + .active = ioperm_active, .get = ioperm_get + }, }; static const struct user_regset_view user_x86_64_view = { @@ -1336,6 +1462,12 @@ static const struct user_regset x86_32_regsets[] = { .active = regset_tls_active, .get = regset_tls_get, .set = regset_tls_set }, + [REGSET_IOPERM32] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_BYTES / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = ioperm_active, .get = ioperm_get + }, }; static const struct user_regset_view user_x86_32_view = { @@ -1357,7 +1489,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif } -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, + int error_code, int si_code) { struct siginfo info; @@ -1366,7 +1499,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) memset(&info, 0, sizeof(info)); info.si_signo = SIGTRAP; - info.si_code = TRAP_BRKPT; + info.si_code = si_code; /* User-mode ip? */ info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; @@ -1375,30 +1508,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) force_sig_info(SIGTRAP, &info, tsk); } -static void syscall_trace(struct pt_regs *regs) -{ - if (!(current->ptrace & PT_PTRACED)) - return; - -#if 0 - printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", - current->comm, - regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0), - current_thread_info()->flags, current->ptrace); -#endif - - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) - ? 0x80 : 0)); - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } -} #ifdef CONFIG_X86_32 # define IS_IA32 1 @@ -1432,8 +1541,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) ret = -1L; - if (ret || test_thread_flag(TIF_SYSCALL_TRACE)) - syscall_trace(regs); + if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && + tracehook_report_syscall_entry(regs)) + ret = -1L; if (unlikely(current->audit_context)) { if (IS_IA32) @@ -1459,7 +1569,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); if (test_thread_flag(TIF_SYSCALL_TRACE)) - syscall_trace(regs); + tracehook_report_syscall_exit(regs, 0); /* * If TIF_SYSCALL_EMU is set, we only get here because of @@ -1475,6 +1585,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) * system call instruction. */ if (test_thread_flag(TIF_SINGLESTEP) && - (current->ptrace & PT_PTRACED)) - send_sigtrap(current, regs, 0); + tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL)) + send_sigtrap(current, regs, 0, TRAP_BRKPT); } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 05fbe9a0325..4f9c55f3a7c 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, return dst->version; } +unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) +{ + u64 pv_tsc_khz = 1000000ULL << 32; + + do_div(pv_tsc_khz, src->tsc_to_system_mul); + if (src->tsc_shift < 0) + pv_tsc_khz <<= -src->tsc_shift; + else + pv_tsc_khz >>= src->tsc_shift; + return pv_tsc_khz; +} + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { struct pvclock_shadow_time shadow; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index d1385881810..f6a11b9b1f9 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -354,9 +354,27 @@ static void ati_force_hpet_resume(void) printk(KERN_DEBUG "Force enabled HPET at resume\n"); } +static u32 ati_ixp4x0_rev(struct pci_dev *dev) +{ + u32 d; + u8 b; + + pci_read_config_byte(dev, 0xac, &b); + b &= ~(1<<5); + pci_write_config_byte(dev, 0xac, b); + pci_read_config_dword(dev, 0x70, &d); + d |= 1<<8; + pci_write_config_dword(dev, 0x70, d); + pci_read_config_dword(dev, 0x8, &d); + d &= 0xff; + dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d); + return d; +} + static void ati_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 d, val; + u8 b; if (hpet_address || force_hpet_address) return; @@ -366,14 +384,33 @@ static void ati_force_enable_hpet(struct pci_dev *dev) return; } + d = ati_ixp4x0_rev(dev); + if (d < 0x82) + return; + + /* base address */ pci_write_config_dword(dev, 0x14, 0xfed00000); pci_read_config_dword(dev, 0x14, &val); + + /* enable interrupt */ + outb(0x72, 0xcd6); b = inb(0xcd7); + b |= 0x1; + outb(0x72, 0xcd6); outb(b, 0xcd7); + outb(0x72, 0xcd6); b = inb(0xcd7); + if (!(b & 0x1)) + return; + pci_read_config_dword(dev, 0x64, &d); + d |= (1<<10); + pci_write_config_dword(dev, 0x64, d); + pci_read_config_dword(dev, 0x64, &d); + if (!(d & (1<<10))) + return; + force_hpet_address = val; force_hpet_resume_type = ATI_FORCE_HPET_RESUME; dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", force_hpet_address); cached_dev = dev; - return; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, ati_force_enable_hpet); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 724adfc63cb..f4c93f1cfc1 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; static int reboot_mode; -enum reboot_type reboot_type = BOOT_KBD; +/* + * Keyboard reset and triple fault may result in INIT, not RESET, which + * doesn't work when we're in vmx root mode. Try ACPI first. + */ +enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 05191bbc68b..0a23b5795b2 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -223,11 +223,25 @@ static struct platform_device rtc_device = { static __init int add_rtc_cmos(void) { #ifdef CONFIG_PNP - if (!pnp_platform_devices) - platform_device_register(&rtc_device); -#else + static const char *ids[] __initconst = + { "PNP0b00", "PNP0b01", "PNP0b02", }; + struct pnp_dev *dev; + struct pnp_id *id; + int i; + + pnp_for_each_dev(dev) { + for (id = dev->id; id; id = id->next) { + for (i = 0; i < ARRAY_SIZE(ids); i++) { + if (compare_pnp_id(id, ids[i]) != 0) + return 0; + } + } + } +#endif + platform_device_register(&rtc_device); -#endif /* CONFIG_PNP */ + dev_info(&rtc_device.dev, + "registered platform RTC device (no PNP device found)\n"); return 0; } device_initcall(add_rtc_cmos); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a4656adab53..2255782e8d4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -223,6 +223,9 @@ unsigned long saved_video_mode; #define RAMDISK_LOAD_FLAG 0x4000 static char __initdata command_line[COMMAND_LINE_SIZE]; +#ifdef CONFIG_CMDLINE_BOOL +static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; +#endif #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; @@ -299,7 +302,7 @@ static void __init relocate_initrd(void) if (clen > MAX_MAP_CHUNK-slop) clen = MAX_MAP_CHUNK-slop; mapaddr = ramdisk_image & PAGE_MASK; - p = early_ioremap(mapaddr, clen+slop); + p = early_memremap(mapaddr, clen+slop); memcpy(q, p+slop, clen); early_iounmap(p, clen+slop); q += clen; @@ -376,7 +379,7 @@ static void __init parse_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_ioremap(pa_data, PAGE_SIZE); + data = early_memremap(pa_data, PAGE_SIZE); switch (data->type) { case SETUP_E820_EXT: parse_e820_ext(data, pa_data); @@ -399,7 +402,7 @@ static void __init e820_reserve_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_ioremap(pa_data, sizeof(*data)); + data = early_memremap(pa_data, sizeof(*data)); e820_update_range(pa_data, sizeof(*data)+data->len, E820_RAM, E820_RESERVED_KERN); found = 1; @@ -425,7 +428,7 @@ static void __init reserve_early_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_ioremap(pa_data, sizeof(*data)); + data = early_memremap(pa_data, sizeof(*data)); sprintf(buf, "setup data %x", data->type); reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); pa_data = data->next; @@ -579,6 +582,190 @@ static struct x86_quirks default_x86_quirks __initdata; struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; /* + * Some BIOSes seem to corrupt the low 64k of memory during events + * like suspend/resume and unplugging an HDMI cable. Reserve all + * remaining free memory in that area and fill it with a distinct + * pattern. + */ +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION +#define MAX_SCAN_AREAS 8 + +static int __read_mostly memory_corruption_check = -1; + +static unsigned __read_mostly corruption_check_size = 64*1024; +static unsigned __read_mostly corruption_check_period = 60; /* seconds */ + +static struct e820entry scan_areas[MAX_SCAN_AREAS]; +static int num_scan_areas; + + +static int set_corruption_check(char *arg) +{ + char *end; + + memory_corruption_check = simple_strtol(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check", set_corruption_check); + +static int set_corruption_check_period(char *arg) +{ + char *end; + + corruption_check_period = simple_strtoul(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_period", set_corruption_check_period); + +static int set_corruption_check_size(char *arg) +{ + char *end; + unsigned size; + + size = memparse(arg, &end); + + if (*end == '\0') + corruption_check_size = size; + + return (size == corruption_check_size) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_size", set_corruption_check_size); + + +static void __init setup_bios_corruption_check(void) +{ + u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ + + if (memory_corruption_check == -1) { + memory_corruption_check = +#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + 1 +#else + 0 +#endif + ; + } + + if (corruption_check_size == 0) + memory_corruption_check = 0; + + if (!memory_corruption_check) + return; + + corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); + + while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { + u64 size; + addr = find_e820_area_size(addr, &size, PAGE_SIZE); + + if (addr == 0) + break; + + if ((addr + size) > corruption_check_size) + size = corruption_check_size - addr; + + if (size == 0) + break; + + e820_update_range(addr, size, E820_RAM, E820_RESERVED); + scan_areas[num_scan_areas].addr = addr; + scan_areas[num_scan_areas].size = size; + num_scan_areas++; + + /* Assume we've already mapped this early memory */ + memset(__va(addr), 0, size); + + addr += size; + } + + printk(KERN_INFO "Scanning %d areas for low memory corruption\n", + num_scan_areas); + update_e820(); +} + +static struct timer_list periodic_check_timer; + +void check_for_bios_corruption(void) +{ + int i; + int corruption = 0; + + if (!memory_corruption_check) + return; + + for(i = 0; i < num_scan_areas; i++) { + unsigned long *addr = __va(scan_areas[i].addr); + unsigned long size = scan_areas[i].size; + + for(; size; addr++, size -= sizeof(unsigned long)) { + if (!*addr) + continue; + printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", + addr, __pa(addr), *addr); + corruption = 1; + *addr = 0; + } + } + + WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); +} + +static void periodic_check_for_corruption(unsigned long data) +{ + check_for_bios_corruption(); + mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ)); +} + +void start_periodic_check_for_corruption(void) +{ + if (!memory_corruption_check || corruption_check_period == 0) + return; + + printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", + corruption_check_period); + + init_timer(&periodic_check_timer); + periodic_check_timer.function = &periodic_check_for_corruption; + periodic_check_for_corruption(0); +} +#endif + +static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) +{ + printk(KERN_NOTICE + "%s detected: BIOS may corrupt low RAM, working it around.\n", + d->ident); + + e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + + return 0; +} + +/* List of systems that have known low memory corruption BIOS problems */ +static struct dmi_system_id __initdata bad_bios_dmi_table[] = { +#ifdef CONFIG_X86_RESERVE_LOW_64K + { + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + }, + }, + { + .callback = dmi_low_memory_corruption, + .ident = "Phoenix BIOS", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"), + }, + }, +#endif + {} +}; + +/* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures * for initialization. Note, the efi init code path is determined by the @@ -604,14 +791,6 @@ void __init setup_arch(char **cmdline_p) early_cpu_init(); early_ioremap_init(); -#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) - /* - * Must be before kernel pagetables are setup - * or fixmap area is touched. - */ - vmi_init(); -#endif - ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); screen_info = boot_params.screen_info; edid_info = boot_params.edid_info; @@ -673,11 +852,36 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = virt_to_phys(&__bss_start); bss_resource.end = virt_to_phys(&__bss_stop)-1; +#ifdef CONFIG_CMDLINE_BOOL +#ifdef CONFIG_CMDLINE_OVERRIDE + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +#else + if (builtin_cmdline[0]) { + /* append boot loader cmdline to builtin */ + strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); + strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + } +#endif +#endif + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; parse_early_param(); +#ifdef CONFIG_X86_64 + check_efer(); +#endif + +#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) + /* + * Must be before kernel pagetables are setup + * or fixmap area is touched. + */ + vmi_init(); +#endif + /* after early param, so could get panic from serial */ reserve_early_setup_data(); @@ -695,6 +899,10 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); + dmi_scan_machine(); + + dmi_check_system(bad_bios_dmi_table); + #ifdef CONFIG_X86_32 probe_roms(); #endif @@ -738,7 +946,8 @@ void __init setup_arch(char **cmdline_p) #else num_physpages = max_pfn; - check_efer(); + if (cpu_has_x2apic) + check_x2apic(); /* How many end-of-memory variables you have, grandma! */ /* need this before calling reserve_initrd */ @@ -750,6 +959,10 @@ void __init setup_arch(char **cmdline_p) high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; #endif +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION + setup_bios_corruption_check(); +#endif + /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); max_pfn_mapped = max_low_pfn_mapped; @@ -778,8 +991,6 @@ void __init setup_arch(char **cmdline_p) vsmp_init(); #endif - dmi_scan_machine(); - io_delay_init(); /* @@ -787,6 +998,8 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_table_init(); + early_acpi_boot_init(); + #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. @@ -882,3 +1095,5 @@ void __init setup_arch(char **cmdline_p) #endif #endif } + + diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 76e305e064f..0e67f72d931 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -162,9 +162,16 @@ void __init setup_per_cpu_areas(void) printk(KERN_INFO "cpu %d has no node %d or node-local memory\n", cpu, node); + if (ptr) + printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n", + cpu, __pa(ptr)); } - else + else { ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); + if (ptr) + printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n", + cpu, node, __pa(ptr)); + } #endif per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h index 72bbb519d2d..cc673aa55ce 100644 --- a/arch/x86/kernel/sigframe.h +++ b/arch/x86/kernel/sigframe.h @@ -3,9 +3,18 @@ struct sigframe { char __user *pretcode; int sig; struct sigcontext sc; - struct _fpstate fpstate; + /* + * fpstate is unused. fpstate is moved/allocated after + * retcode[] below. This movement allows to have the FP state and the + * future state extensions (xsave) stay together. + * And at the same time retaining the unused fpstate, prevents changing + * the offset of extramask[] in the sigframe and thus prevent any + * legacy application accessing/modifying it. + */ + struct _fpstate fpstate_unused; unsigned long extramask[_NSIG_WORDS-1]; char retcode[8]; + /* fp state follows here */ }; struct rt_sigframe { @@ -15,13 +24,19 @@ struct rt_sigframe { void __user *puc; struct siginfo info; struct ucontext uc; - struct _fpstate fpstate; char retcode[8]; + /* fp state follows here */ }; #else struct rt_sigframe { char __user *pretcode; struct ucontext uc; struct siginfo info; + /* fp state follows here */ }; + +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct k_sigaction *ka, + sigset_t *set, struct pt_regs *regs); #endif diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 6fb5bcdd893..d6dd057d0f2 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -17,6 +17,7 @@ #include <linux/errno.h> #include <linux/sched.h> #include <linux/wait.h> +#include <linux/tracehook.h> #include <linux/elf.h> #include <linux/smp.h> #include <linux/mm.h> @@ -26,6 +27,8 @@ #include <asm/uaccess.h> #include <asm/i387.h> #include <asm/vdso.h> +#include <asm/syscall.h> +#include <asm/syscalls.h> #include "sigframe.h" @@ -110,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx) return do_sigaltstack(uss, uoss, regs->sp); } +#define COPY(x) { \ + err |= __get_user(regs->x, &sc->x); \ +} + +#define COPY_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp; \ +} + +#define COPY_SEG_STRICT(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp | 3; \ +} + +#define GET_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + loadsegment(seg, tmp); \ +} /* * Do a signal return; undo the signal stack. @@ -118,28 +142,13 @@ static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *pax) { + void __user *buf; + unsigned int tmpflags; unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#define COPY(x) err |= __get_user(regs->x, &sc->x) - -#define COPY_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp; } - -#define COPY_SEG_STRICT(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp|3; } - -#define GET_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - loadsegment(seg, tmp); } - GET_SEG(gs); COPY_SEG(fs); COPY_SEG(es); @@ -149,38 +158,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, COPY_SEG_STRICT(cs); COPY_SEG_STRICT(ss); - { - unsigned int tmpflags; - - err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | - (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ - } + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ - { - struct _fpstate __user *buf; - - err |= __get_user(buf, &sc->fpstate); - if (buf) { - if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387(buf); - } else { - struct task_struct *me = current; - - if (used_math()) { - clear_fpu(me); - clear_used_math(); - } - } - } + err |= __get_user(buf, &sc->fpstate); + err |= restore_i387_xstate(buf); err |= __get_user(*pax, &sc->ax); return err; - -badframe: - return 1; } asmlinkage unsigned long sys_sigreturn(unsigned long __unused) @@ -226,9 +212,8 @@ badframe: return 0; } -asmlinkage int sys_rt_sigreturn(unsigned long __unused) +static long do_rt_sigreturn(struct pt_regs *regs) { - struct pt_regs *regs = (struct pt_regs *)&__unused; struct rt_sigframe __user *frame; unsigned long ax; sigset_t set; @@ -254,15 +239,22 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused) return ax; badframe: - force_sig(SIGSEGV, current); + signal_fault(regs, frame, "rt_sigreturn"); return 0; } +asmlinkage int sys_rt_sigreturn(unsigned long __unused) +{ + struct pt_regs *regs = (struct pt_regs *)&__unused; + + return do_rt_sigreturn(regs); +} + /* * Set up a signal frame. */ static int -setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, +setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { int tmp, err = 0; @@ -289,7 +281,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, err |= __put_user(regs->sp, &sc->sp_at_signal); err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); - tmp = save_i387(fpstate); + tmp = save_i387_xstate(fpstate); if (tmp < 0) err = 1; else @@ -306,7 +298,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, * Determine which stack to use.. */ static inline void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) +get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, + void **fpstate) { unsigned long sp; @@ -332,6 +325,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) sp = (unsigned long) ka->sa.sa_restorer; } + if (used_math()) { + sp = sp - sig_xstate_size; + *fpstate = (struct _fpstate *) sp; + } + sp -= frame_size; /* * Align the stack pointer according to the i386 ABI, @@ -343,38 +341,29 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) } static int -setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, - struct pt_regs *regs) +__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, + struct pt_regs *regs) { struct sigframe __user *frame; void __user *restorer; int err = 0; - int usig; + void __user *fpstate = NULL; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; - usig = current_thread_info()->exec_domain - && current_thread_info()->exec_domain->signal_invmap - && sig < 32 - ? current_thread_info()->exec_domain->signal_invmap[sig] - : sig; + if (__put_user(sig, &frame->sig)) + return -EFAULT; - err = __put_user(usig, &frame->sig); - if (err) - goto give_sigsegv; - - err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]); - if (err) - goto give_sigsegv; + if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) + return -EFAULT; if (_NSIG_WORDS > 1) { - err = __copy_to_user(&frame->extramask, &set->sig[1], - sizeof(frame->extramask)); - if (err) - goto give_sigsegv; + if (__copy_to_user(&frame->extramask, &set->sig[1], + sizeof(frame->extramask))) + return -EFAULT; } if (current->mm->context.vdso) @@ -399,7 +388,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; @@ -414,50 +403,43 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, regs->cs = __USER_CS; return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } -static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) +static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; void __user *restorer; int err = 0; - int usig; + void __user *fpstate = NULL; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; - usig = current_thread_info()->exec_domain - && current_thread_info()->exec_domain->signal_invmap - && sig < 32 - ? current_thread_info()->exec_domain->signal_invmap[sig] - : sig; - - err |= __put_user(usig, &frame->sig); + err |= __put_user(sig, &frame->sig); err |= __put_user(&frame->info, &frame->pinfo); err |= __put_user(&frame->uc, &frame->puc); err |= copy_siginfo_to_user(&frame->info, info); if (err) - goto give_sigsegv; + return -EFAULT; /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); + if (cpu_has_xsave) + err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); + else + err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, + err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up to return from userspace. */ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); @@ -477,12 +459,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; regs->ip = (unsigned long)ka->sa.sa_handler; - regs->ax = (unsigned long)usig; + regs->ax = (unsigned long)sig; regs->dx = (unsigned long)&frame->info; regs->cx = (unsigned long)&frame->uc; @@ -492,15 +474,48 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->cs = __USER_CS; return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } /* * OK, we're invoking a handler: */ +static int signr_convert(int sig) +{ + struct thread_info *info = current_thread_info(); + + if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) + return info->exec_domain->signal_invmap[sig]; + return sig; +} + +#define is_ia32 1 +#define ia32_setup_frame __setup_frame +#define ia32_setup_rt_frame __setup_rt_frame + +static int +setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + int usig = signr_convert(sig); + int ret; + + /* Set up the stack frame */ + if (is_ia32) { + if (ka->sa.sa_flags & SA_SIGINFO) + ret = ia32_setup_rt_frame(usig, ka, info, set, regs); + else + ret = ia32_setup_frame(usig, ka, set, regs); + } else + ret = __setup_rt_frame(sig, ka, info, set, regs); + + if (ret) { + force_sigsegv(sig, current); + return -EFAULT; + } + + return ret; +} + static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs) @@ -508,9 +523,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; /* Are we from a system call? */ - if ((long)regs->orig_ax >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ - switch (regs->ax) { + switch (syscall_get_error(current, regs)) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: regs->ax = -EINTR; @@ -537,15 +552,20 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; - /* Set up the stack frame */ - if (ka->sa.sa_flags & SA_SIGINFO) - ret = setup_rt_frame(sig, ka, info, oldset, regs); - else - ret = setup_frame(sig, ka, oldset, regs); + ret = setup_rt_frame(sig, ka, info, oldset, regs); if (ret) return ret; +#ifdef CONFIG_X86_64 + /* + * This has nothing to do with segment registers, + * despite the name. This magic affects uaccess.h + * macros' behavior. Reset it to the normal setting. + */ + set_fs(USER_DS); +#endif + /* * Clear the direction flag as per the ABI for function entry. */ @@ -558,8 +578,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, * handler too. */ regs->flags &= ~X86_EFLAGS_TF; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); @@ -568,9 +586,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); + return 0; } +#define NR_restart_syscall __NR_restart_syscall /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by @@ -623,9 +645,9 @@ static void do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if ((long)regs->orig_ax >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* Restart the system call - no handlers present */ - switch (regs->ax) { + switch (syscall_get_error(current, regs)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: @@ -634,7 +656,7 @@ static void do_signal(struct pt_regs *regs) break; case -ERESTART_RESTARTBLOCK: - regs->ax = __NR_restart_syscall; + regs->ax = NR_restart_syscall; regs->ip -= 2; break; } @@ -657,9 +679,38 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) + /* notify userspace of pending MCEs */ + if (thread_info_flags & _TIF_MCE_NOTIFY) + mce_notify_user(); +#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ + /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } + +#ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); +#endif /* CONFIG_X86_32 */ +} + +void signal_fault(struct pt_regs *regs, void __user *frame, char *where) +{ + struct task_struct *me = current; + + if (show_unhandled_signals && printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + me->comm, me->pid, where, frame, + regs->ip, regs->sp, regs->orig_ax); + print_vma_addr(" in ", regs->ip); + printk(KERN_CONT "\n"); + } + + force_sig(SIGSEGV, me); } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ca316b5b742..a5c9627f4db 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -15,17 +15,21 @@ #include <linux/errno.h> #include <linux/wait.h> #include <linux/ptrace.h> +#include <linux/tracehook.h> #include <linux/unistd.h> #include <linux/stddef.h> #include <linux/personality.h> #include <linux/compiler.h> +#include <linux/uaccess.h> + #include <asm/processor.h> #include <asm/ucontext.h> -#include <asm/uaccess.h> #include <asm/i387.h> #include <asm/proto.h> #include <asm/ia32_unistd.h> #include <asm/mce.h> +#include <asm/syscall.h> +#include <asm/syscalls.h> #include "sigframe.h" #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) @@ -41,11 +45,6 @@ # define FIX_EFLAGS __FIX_EFLAGS #endif -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs * regs); -int ia32_setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs * regs); - asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) @@ -53,67 +52,14 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, return do_sigaltstack(uss, uoss, regs->sp); } -/* - * Signal frame handlers. - */ - -static inline int save_i387(struct _fpstate __user *buf) -{ - struct task_struct *tsk = current; - int err = 0; - - BUILD_BUG_ON(sizeof(struct user_i387_struct) != - sizeof(tsk->thread.xstate->fxsave)); - - if ((unsigned long)buf % 16) - printk("save_i387: bad fpstate %p\n", buf); - - if (!used_math()) - return 0; - clear_used_math(); /* trigger finit */ - if (task_thread_info(tsk)->status & TS_USEDFPU) { - err = save_i387_checking((struct i387_fxsave_struct __user *) - buf); - if (err) - return err; - task_thread_info(tsk)->status &= ~TS_USEDFPU; - stts(); - } else { - if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, - sizeof(struct i387_fxsave_struct))) - return -1; - } - return 1; +#define COPY(x) { \ + err |= __get_user(regs->x, &sc->x); \ } -/* - * This restores directly out of user space. Exceptions are handled. - */ -static inline int restore_i387(struct _fpstate __user *buf) -{ - struct task_struct *tsk = current; - int err; - - if (!used_math()) { - err = init_fpu(tsk); - if (err) - return err; - } - - if (!(task_thread_info(current)->status & TS_USEDFPU)) { - clts(); - task_thread_info(current)->status |= TS_USEDFPU; - } - err = restore_fpu_checking((__force struct i387_fxsave_struct *)buf); - if (unlikely(err)) { - /* - * Encountered an error while doing the restore from the - * user buffer, clear the fpu state. - */ - clear_fpu(tsk); - clear_used_math(); - } - return err; +#define COPY_SEG_STRICT(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp | 3; \ } /* @@ -123,13 +69,13 @@ static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *pax) { + void __user *buf; + unsigned int tmpflags; unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#define COPY(x) err |= __get_user(regs->x, &sc->x) - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); COPY(r8); @@ -144,48 +90,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, /* Kernel saves and restores only the CS segment register on signals, * which is the bare minimum needed to allow mixed 32/64-bit code. * App's signal handler can save/restore other segments if needed. */ - { - unsigned cs; - err |= __get_user(cs, &sc->cs); - regs->cs = cs | 3; /* Force into user mode */ - } + COPY_SEG_STRICT(cs); - { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ - } + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ - { - struct _fpstate __user * buf; - err |= __get_user(buf, &sc->fpstate); - - if (buf) { - if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387(buf); - } else { - struct task_struct *me = current; - if (used_math()) { - clear_fpu(me); - clear_used_math(); - } - } - } + err |= __get_user(buf, &sc->fpstate); + err |= restore_i387_xstate(buf); err |= __get_user(*pax, &sc->ax); return err; - -badframe: - return 1; } -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +static long do_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe __user *frame; - sigset_t set; unsigned long ax; + sigset_t set; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) @@ -198,7 +120,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) current->blocked = set; recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; @@ -208,16 +130,22 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) return ax; badframe: - signal_fault(regs,frame,"sigreturn"); + signal_fault(regs, frame, "rt_sigreturn"); return 0; -} +} + +asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +{ + return do_rt_sigreturn(regs); +} /* * Set up a signal frame. */ static inline int -setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) +setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, + unsigned long mask, struct task_struct *me) { int err = 0; @@ -269,41 +197,40 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) sp = current->sas_ss_sp + current->sas_ss_size; } - return (void __user *)round_down(sp - size, 16); + return (void __user *)round_down(sp - size, 64); } -static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs * regs) +static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; - struct _fpstate __user *fp = NULL; + void __user *fp = NULL; int err = 0; struct task_struct *me = current; if (used_math()) { - fp = get_stack(ka, regs, sizeof(struct _fpstate)); + fp = get_stack(ka, regs, sig_xstate_size); frame = (void __user *)round_down( (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; - if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) - goto give_sigsegv; - - if (save_i387(fp) < 0) - err |= -1; + if (save_i387_xstate(fp) < 0) + return -EFAULT; } else frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; - if (ka->sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, info); - if (err) - goto give_sigsegv; + if (ka->sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user(&frame->info, info)) + return -EFAULT; } - + /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); + if (cpu_has_xsave) + err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); + else + err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); err |= __put_user(sas_ss_flags(regs->sp), @@ -311,9 +238,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); - if (sizeof(*set) == 16) { + if (sizeof(*set) == 16) { __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); - __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); + __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); } else err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); @@ -324,15 +251,15 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); } else { /* could use a vstub here */ - goto give_sigsegv; + return -EFAULT; } if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->di = sig; - /* In case the signal handler was declared without prototypes */ + /* In case the signal handler was declared without prototypes */ regs->ax = 0; /* This also works for non SA_SIGINFO handlers because they expect the @@ -348,44 +275,45 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->cs = __USER_CS; return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } /* - * Return -1L or the syscall number that @regs is executing. + * OK, we're invoking a handler */ -static long current_syscall(struct pt_regs *regs) +static int signr_convert(int sig) { - /* - * We always sign-extend a -1 value being set here, - * so this is always either -1L or a syscall number. - */ - return regs->orig_ax; + return sig; } -/* - * Return a value that is -EFOO if the system call in @regs->orig_ax - * returned an error. This only works for @regs from @current. - */ -static long current_syscall_ret(struct pt_regs *regs) -{ #ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) - /* - * Sign-extend the value so (int)-EFOO becomes (long)-EFOO - * and will match correctly in comparisons. - */ - return (int) regs->ax; +#define is_ia32 test_thread_flag(TIF_IA32) +#else +#define is_ia32 0 #endif - return regs->ax; -} -/* - * OK, we're invoking a handler - */ +static int +setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + int usig = signr_convert(sig); + int ret; + + /* Set up the stack frame */ + if (is_ia32) { + if (ka->sa.sa_flags & SA_SIGINFO) + ret = ia32_setup_rt_frame(usig, ka, info, set, regs); + else + ret = ia32_setup_frame(usig, ka, set, regs); + } else + ret = __setup_rt_frame(sig, ka, info, set, regs); + + if (ret) { + force_sigsegv(sig, current); + return -EFAULT; + } + + return ret; +} static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, @@ -394,9 +322,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; /* Are we from a system call? */ - if (current_syscall(regs) >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ - switch (current_syscall_ret(regs)) { + switch (syscall_get_error(current, regs)) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: regs->ax = -EINTR; @@ -423,50 +351,48 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) { - if (ka->sa.sa_flags & SA_SIGINFO) - ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); - else - ret = ia32_setup_frame(sig, ka, oldset, regs); - } else -#endif ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (ret == 0) { - /* - * This has nothing to do with segment registers, - * despite the name. This magic affects uaccess.h - * macros' behavior. Reset it to the normal setting. - */ - set_fs(USER_DS); + if (ret) + return ret; - /* - * Clear the direction flag as per the ABI for function entry. - */ - regs->flags &= ~X86_EFLAGS_DF; +#ifdef CONFIG_X86_64 + /* + * This has nothing to do with segment registers, + * despite the name. This magic affects uaccess.h + * macros' behavior. Reset it to the normal setting. + */ + set_fs(USER_DS); +#endif - /* - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - regs->flags &= ~X86_EFLAGS_TF; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); - - spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(¤t->blocked,sig); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - } + /* + * Clear the direction flag as per the ABI for function entry. + */ + regs->flags &= ~X86_EFLAGS_DF; - return ret; + /* + * Clear TF when entering the signal handler, but + * notify any tracer that was single-stepping it. + * The tracer may want to single-step inside the + * handler too. + */ + regs->flags &= ~X86_EFLAGS_TF; + + spin_lock_irq(¤t->sighand->siglock); + sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); + if (!(ka->sa.sa_flags & SA_NODEFER)) + sigaddset(¤t->blocked, sig); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); + + return 0; } +#define NR_restart_syscall \ + test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by @@ -496,7 +422,8 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - /* Re-enable any watchpoints before delivering the + /* + * Re-enable any watchpoints before delivering the * signal to user space. The processor register will * have been cleared if the watchpoint triggered * inside the kernel. @@ -504,7 +431,7 @@ static void do_signal(struct pt_regs *regs) if (current->thread.debugreg7) set_debugreg(current->thread.debugreg7, 7); - /* Whee! Actually deliver the signal. */ + /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { /* * A signal was successfully delivered; the saved @@ -518,19 +445,18 @@ static void do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if (current_syscall(regs) >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* Restart the system call - no handlers present */ - switch (current_syscall_ret(regs)) { + switch (syscall_get_error(current, regs)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: regs->ax = regs->orig_ax; regs->ip -= 2; break; + case -ERESTART_RESTARTBLOCK: - regs->ax = test_thread_flag(TIF_IA32) ? - __NR_ia32_restart_syscall : - __NR_restart_syscall; + regs->ax = NR_restart_syscall; regs->ip -= 2; break; } @@ -546,29 +472,45 @@ static void do_signal(struct pt_regs *regs) } } -void do_notify_resume(struct pt_regs *regs, void *unused, - __u32 thread_info_flags) +/* + * notification of userspace execution resumption + * - triggered by the TIF_WORK_MASK flags + */ +void +do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#ifdef CONFIG_X86_MCE +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_user(); -#endif /* CONFIG_X86_MCE */ +#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); + + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } + +#ifdef CONFIG_X86_32 + clear_thread_flag(TIF_IRET); +#endif /* CONFIG_X86_32 */ } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -{ - struct task_struct *me = current; +{ + struct task_struct *me = current; + if (show_unhandled_signals && printk_ratelimit()) { - printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", - me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax); + printk(KERN_INFO + "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + me->comm, me->pid, where, frame, + regs->ip, regs->sp, regs->orig_ax); print_vma_addr(" in ", regs->ip); - printk("\n"); + printk(KERN_CONT "\n"); } - force_sig(SIGSEGV, me); -} + force_sig(SIGSEGV, me); +} diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 361b7a4c640..18f9b19f5f8 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = native_smp_prepare_cpus, - .cpu_up = native_cpu_up, .smp_cpus_done = native_smp_cpus_done, .smp_send_stop = native_smp_send_stop, .smp_send_reschedule = native_smp_send_reschedule, + .cpu_up = native_cpu_up, + .cpu_die = native_cpu_die, + .cpu_disable = native_cpu_disable, + .play_dead = native_play_dead, + .send_call_func_ipi = native_send_call_func_ipi, .send_call_func_single_ipi = native_send_call_func_single_ipi, }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e139e617f42..7ed9e070a6e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -52,6 +52,7 @@ #include <asm/desc.h> #include <asm/nmi.h> #include <asm/irq.h> +#include <asm/idle.h> #include <asm/smp.h> #include <asm/trampoline.h> #include <asm/cpu.h> @@ -88,7 +89,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); #define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) #define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) #else -struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; +static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; #define get_idle_for_cpu(x) (idle_thread_array[(x)]) #define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) #endif @@ -123,13 +124,12 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); static atomic_t init_deasserted; -static int boot_cpu_logical_apicid; /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; /* Set if we find a B stepping CPU */ -int __cpuinitdata smp_b_stepping; +static int __cpuinitdata smp_b_stepping; #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) @@ -165,6 +165,8 @@ static void unmap_cpu_to_node(int cpu) #endif #ifdef CONFIG_X86_32 +static int boot_cpu_logical_apicid; + u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; @@ -210,7 +212,7 @@ static void __cpuinit smp_callin(void) /* * (This works even if the APIC is not enabled.) */ - phys_id = GET_APIC_ID(read_apic_id()); + phys_id = read_apic_id(); cpuid = smp_processor_id(); if (cpu_isset(cpuid, cpu_callin_map)) { panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, @@ -257,6 +259,7 @@ static void __cpuinit smp_callin(void) end_local_APIC_setup(); map_cpu_to_logical_apicid(); + notify_cpu_starting(cpuid); /* * Get our bogomips. * @@ -279,6 +282,8 @@ static void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +static int __cpuinitdata unsafe_smp; + /* * Activate a secondary processor. */ @@ -331,14 +336,17 @@ static void __cpuinit start_secondary(void *unused) * does not change while we are assigning vectors to cpus. Holding * this lock ensures we don't half assign or remove an irq from a cpu. */ - ipi_call_lock_irq(); + ipi_call_lock(); lock_vector_lock(); __setup_vector_irq(smp_processor_id()); cpu_set(smp_processor_id(), cpu_online_map); unlock_vector_lock(); - ipi_call_unlock_irq(); + ipi_call_unlock(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + /* enable local interrupts */ + local_irq_enable(); + setup_secondary_clock(); wmb(); @@ -391,7 +399,7 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) goto valid_k7; /* If we get here, not a certified SMP capable AMD system. */ - add_taint(TAINT_UNSAFE_SMP); + unsafe_smp = 1; } valid_k7: @@ -408,12 +416,10 @@ static void __cpuinit smp_checks(void) * Don't taint if we are running SMP kernel on a single non-MP * approved Athlon */ - if (tainted & TAINT_UNSAFE_SMP) { - if (num_online_cpus()) - printk(KERN_INFO "WARNING: This combination of AMD" - "processors is not suitable for SMP.\n"); - else - tainted &= ~TAINT_UNSAFE_SMP; + if (unsafe_smp && num_online_cpus() > 1) { + printk(KERN_INFO "WARNING: This combination of AMD" + "processors is not suitable for SMP.\n"); + add_taint(TAINT_UNSAFE_SMP); } } @@ -550,8 +556,7 @@ static inline void __inquire_remote_apic(int apicid) printk(KERN_CONT "a previous APIC delivery may have failed\n"); - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); - apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); + apic_icr_write(APIC_DM_REMRD | regs[i], apicid); timeout = 0; do { @@ -583,11 +588,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) int maxlvt; /* Target chip */ - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); - /* Boot on the stack */ /* Kick the second */ - apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); + apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -596,10 +599,12 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - accept_status = (apic_read(APIC_ESR) & 0xEF); + if (APIC_INTEGRATED(apic_version[phys_apicid])) { + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + accept_status = (apic_read(APIC_ESR) & 0xEF); + } pr_debug("NMI sent.\n"); if (send_status) @@ -640,13 +645,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) /* * Turn INIT on target chip */ - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - /* * Send IPI */ - apic_write(APIC_ICR, - APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); + apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, + phys_apicid); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -656,10 +659,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) pr_debug("Deasserting INIT.\n"); /* Target chip */ - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - /* Send IPI */ - apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -702,11 +703,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) */ /* Target chip */ - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - /* Boot on the stack */ /* Kick the second */ - apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12)); + apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), + phys_apicid); /* * Give the other CPU some time to accept the IPI. @@ -1175,10 +1175,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) * Setup boot CPU information */ smp_store_cpu_info(0); /* Final full version of the data */ +#ifdef CONFIG_X86_32 boot_cpu_logical_apicid = logical_smp_processor_id(); +#endif current_thread_info()->cpu = 0; /* needed? */ set_cpu_sibling_map(0); +#ifdef CONFIG_X86_64 + enable_IR_x2apic(); + setup_apic_routing(); +#endif + if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); disable_smp(); @@ -1186,9 +1193,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) } preempt_disable(); - if (GET_APIC_ID(read_apic_id()) != boot_cpu_physical_apicid) { + if (read_apic_id() != boot_cpu_physical_apicid) { panic("Boot APIC ID in local APIC unexpected (%d vs %d)", - GET_APIC_ID(read_apic_id()), boot_cpu_physical_apicid); + read_apic_id(), boot_cpu_physical_apicid); /* Or can we switch back to PIC here? */ } preempt_enable(); @@ -1221,6 +1228,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) printk(KERN_INFO "CPU%d: ", 0); print_cpu_info(&cpu_data(0)); setup_boot_clock(); + + if (is_uv_system()) + uv_system_init(); out: preempt_enable(); } @@ -1251,39 +1261,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) check_nmi_watchdog(); } -#ifdef CONFIG_HOTPLUG_CPU - -static void remove_siblinginfo(int cpu) -{ - int sibling; - struct cpuinfo_x86 *c = &cpu_data(cpu); - - for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { - cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); - /*/ - * last thread sibling in this cpu core going down - */ - if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) - cpu_data(sibling).booted_cores--; - } - - for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) - cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); - cpus_clear(per_cpu(cpu_sibling_map, cpu)); - cpus_clear(per_cpu(cpu_core_map, cpu)); - c->phys_proc_id = 0; - c->cpu_core_id = 0; - cpu_clear(cpu, cpu_sibling_setup_map); -} - -static int additional_cpus __initdata = -1; - -static __init int setup_additional_cpus(char *s) -{ - return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; -} -early_param("additional_cpus", setup_additional_cpus); - /* * cpu_possible_map should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures @@ -1303,24 +1280,13 @@ early_param("additional_cpus", setup_additional_cpus); */ __init void prefill_possible_map(void) { - int i; - int possible; + int i, possible; /* no processor from mptable or madt */ if (!num_processors) num_processors = 1; -#ifdef CONFIG_HOTPLUG_CPU - if (additional_cpus == -1) { - if (disabled_cpus > 0) - additional_cpus = disabled_cpus; - else - additional_cpus = 0; - } -#else - additional_cpus = 0; -#endif - possible = num_processors + additional_cpus; + possible = num_processors + disabled_cpus; if (possible > NR_CPUS) possible = NR_CPUS; @@ -1333,6 +1299,31 @@ __init void prefill_possible_map(void) nr_cpu_ids = possible; } +#ifdef CONFIG_HOTPLUG_CPU + +static void remove_siblinginfo(int cpu) +{ + int sibling; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { + cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); + /*/ + * last thread sibling in this cpu core going down + */ + if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) + cpu_data(sibling).booted_cores--; + } + + for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) + cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); + cpus_clear(per_cpu(cpu_sibling_map, cpu)); + cpus_clear(per_cpu(cpu_core_map, cpu)); + c->phys_proc_id = 0; + c->cpu_core_id = 0; + cpu_clear(cpu, cpu_sibling_setup_map); +} + static void __ref remove_cpu_from_maps(int cpu) { cpu_clear(cpu, cpu_online_map); @@ -1343,25 +1334,9 @@ static void __ref remove_cpu_from_maps(int cpu) numa_remove_cpu(cpu); } -int __cpu_disable(void) +void cpu_disable_common(void) { int cpu = smp_processor_id(); - - /* - * Perhaps use cpufreq to drop frequency, but that could go - * into generic code. - * - * We won't take down the boot processor on i386 due to some - * interrupts only being able to be serviced by the BSP. - * Especially so if we're not using an IOAPIC -zwane - */ - if (cpu == 0) - return -EBUSY; - - if (nmi_watchdog == NMI_LOCAL_APIC) - stop_apic_nmi_watchdog(NULL); - clear_local_APIC(); - /* * HACK: * Allow any queued timer interrupts to get serviced @@ -1379,10 +1354,32 @@ int __cpu_disable(void) remove_cpu_from_maps(cpu); unlock_vector_lock(); fixup_irqs(cpu_online_map); +} + +int native_cpu_disable(void) +{ + int cpu = smp_processor_id(); + + /* + * Perhaps use cpufreq to drop frequency, but that could go + * into generic code. + * + * We won't take down the boot processor on i386 due to some + * interrupts only being able to be serviced by the BSP. + * Especially so if we're not using an IOAPIC -zwane + */ + if (cpu == 0) + return -EBUSY; + + if (nmi_watchdog == NMI_LOCAL_APIC) + stop_apic_nmi_watchdog(NULL); + clear_local_APIC(); + + cpu_disable_common(); return 0; } -void __cpu_die(unsigned int cpu) +void native_cpu_die(unsigned int cpu) { /* We don't do anything here: idle task is faking death itself. */ unsigned int i; @@ -1399,15 +1396,45 @@ void __cpu_die(unsigned int cpu) } printk(KERN_ERR "CPU %u didn't die...\n", cpu); } + +void play_dead_common(void) +{ + idle_task_exit(); + reset_lazy_tlbstate(); + irq_ctx_exit(raw_smp_processor_id()); + c1e_remove_cpu(raw_smp_processor_id()); + + mb(); + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + /* + * With physical CPU hotplug, we should halt the cpu + */ + local_irq_disable(); +} + +void native_play_dead(void) +{ + play_dead_common(); + wbinvd_halt(); +} + #else /* ... !CONFIG_HOTPLUG_CPU */ -int __cpu_disable(void) +int native_cpu_disable(void) { return -ENOSYS; } -void __cpu_die(unsigned int cpu) +void native_cpu_die(unsigned int cpu) { /* We said "no" in __cpu_disable */ BUG(); } + +void native_play_dead(void) +{ + BUG(); +} + #endif diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c index d67ce5f044b..7b987852e87 100644 --- a/arch/x86/kernel/summit_32.c +++ b/arch/x86/kernel/summit_32.c @@ -30,7 +30,7 @@ #include <linux/init.h> #include <asm/io.h> #include <asm/bios_ebda.h> -#include <asm/mach-summit/mach_mpparse.h> +#include <asm/summit/mpparse.h> static struct rio_table_hdr *rio_table_hdr __initdata; static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 7066cb855a6..1884a8d12bf 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -22,6 +22,8 @@ #include <linux/uaccess.h> #include <linux/unistd.h> +#include <asm/syscalls.h> + asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 3b360ef3381..6bc211accf0 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -13,15 +13,17 @@ #include <linux/utsname.h> #include <linux/personality.h> #include <linux/random.h> +#include <linux/uaccess.h> -#include <asm/uaccess.h> #include <asm/ia32.h> +#include <asm/syscalls.h> -asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long off) +asmlinkage long sys_mmap(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long off) { long error; - struct file * file; + struct file *file; error = -EINVAL; if (off & ~PAGE_MASK) @@ -56,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin, unmapped base down for this case. This can give conflicts with the heap, but we assume that glibc malloc knows how to fall back to mmap. Give it 1GB - of playground for now. -AK */ - *begin = 0x40000000; - *end = 0x80000000; + of playground for now. -AK */ + *begin = 0x40000000; + *end = 0x80000000; if (current->flags & PF_RANDOMIZE) { new_begin = randomize_range(*begin, *begin + 0x02000000, 0); if (new_begin) @@ -66,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin, } } else { *begin = TASK_UNMAPPED_BASE; - *end = TASK_SIZE; + *end = TASK_SIZE; } -} +} unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, @@ -78,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, struct vm_area_struct *vma; unsigned long start_addr; unsigned long begin, end; - + if (flags & MAP_FIXED) return addr; - find_start_end(flags, &begin, &end); + find_start_end(flags, &begin, &end); if (len > end) return -ENOMEM; @@ -96,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, } if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) && len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; + mm->cached_hole_size = 0; mm->free_area_cache = begin; } addr = mm->free_area_cache; - if (addr < begin) - addr = begin; + if (addr < begin) + addr = begin; start_addr = addr; full_search: @@ -127,7 +129,7 @@ full_search: return addr; } if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; + mm->cached_hole_size = vma->vm_start - addr; addr = vma->vm_end; } @@ -177,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, vma = find_vma(mm, addr-len); if (!vma || addr <= vma->vm_start) /* remember the address as a hint for next time */ - return (mm->free_area_cache = addr-len); + return mm->free_area_cache = addr-len; } if (mm->mmap_base < len) @@ -194,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, vma = find_vma(mm, addr); if (!vma || addr+len <= vma->vm_start) /* remember the address as a hint for next time */ - return (mm->free_area_cache = addr); + return mm->free_area_cache = addr; /* remember the largest hole we saw so far */ if (addr + mm->cached_hole_size < vma->vm_start) @@ -224,13 +226,13 @@ bottomup: } -asmlinkage long sys_uname(struct new_utsname __user * name) +asmlinkage long sys_uname(struct new_utsname __user *name) { int err; down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof(*name)); up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); + if (personality(current->personality) == PER_LINUX32) + err |= copy_to_user(&name->machine, "i686", 5); return err ? -EFAULT : 0; } diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 170d43c1748..3d1be4f0fac 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -8,12 +8,12 @@ #define __NO_STUBS #define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; -#undef _ASM_X86_64_UNISTD_H_ +#undef ASM_X86__UNISTD_64_H #include <asm/unistd_64.h> #undef __SYSCALL #define __SYSCALL(nr, sym) [nr] = sym, -#undef _ASM_X86_64_UNISTD_H_ +#undef ASM_X86__UNISTD_64_H typedef void (*sys_call_ptr_t)(void); diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index ffe3c664afc..77b400f06ea 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -36,6 +36,7 @@ #include <asm/arch_hooks.h> #include <asm/hpet.h> #include <asm/time.h> +#include <asm/timer.h> #include "do_timer.h" @@ -46,10 +47,9 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); #ifdef CONFIG_SMP - if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && - in_lock_functions(pc)) { + if (!user_mode_vm(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + 4); + return *(unsigned long *)(regs->bp + sizeof(long)); #else unsigned long *sp = (unsigned long *)®s->sp; @@ -94,6 +94,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) do_timer_interrupt_hook(); +#ifdef CONFIG_MCA if (MCA_bus) { /* The PS/2 uses level-triggered interrupts. You can't turn them off, nor would you want to (any attempt to @@ -107,6 +108,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) u8 irq_v = inb_p( 0x61 ); /* read the current state */ outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ } +#endif return IRQ_HANDLED; } diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index e3d49c553af..cb19d650c21 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -16,6 +16,7 @@ #include <linux/interrupt.h> #include <linux/module.h> #include <linux/time.h> +#include <linux/mca.h> #include <asm/i8253.h> #include <asm/hpet.h> @@ -33,23 +34,34 @@ unsigned long profile_pc(struct pt_regs *regs) /* Assume the lock function has either no stack frame or a copy of flags from PUSHF Eflags always has bits 22 and up cleared unlike kernel addresses. */ - if (!user_mode(regs) && in_lock_functions(pc)) { + if (!user_mode_vm(regs) && in_lock_functions(pc)) { +#ifdef CONFIG_FRAME_POINTER + return *(unsigned long *)(regs->bp + sizeof(long)); +#else unsigned long *sp = (unsigned long *)regs->sp; if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) return sp[1]; +#endif } return pc; } EXPORT_SYMBOL(profile_pc); -static irqreturn_t timer_event_interrupt(int irq, void *dev_id) +irqreturn_t timer_interrupt(int irq, void *dev_id) { add_pda(irq0_irqs, 1); global_clock_event->event_handler(global_clock_event); +#ifdef CONFIG_MCA + if (MCA_bus) { + u8 irq_v = inb_p(0x61); /* read the current state */ + outb_p(irq_v|0x80, 0x61); /* reset the IRQ */ + } +#endif + return IRQ_HANDLED; } @@ -100,7 +112,7 @@ unsigned long __init calibrate_cpu(void) } static struct irqaction irq0 = { - .handler = timer_event_interrupt, + .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, .mask = CPU_MASK_NONE, .name = "timer" @@ -111,16 +123,13 @@ void __init hpet_time_init(void) if (!hpet_enable()) setup_pit_timer(); + irq0.mask = cpumask_of_cpu(0); setup_irq(0, &irq0); } void __init time_init(void) { tsc_init(); - if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) - vgetcpu_mode = VGETCPU_RDTSCP; - else - vgetcpu_mode = VGETCPU_LSL; late_time_init = choose_time_init(); } diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index fec1ecedc9b..e00534b3353 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -241,3 +241,11 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, NULL, 1); } +void reset_lazy_tlbstate(void) +{ + int cpu = raw_smp_processor_id(); + + per_cpu(cpu_tlbstate, cpu).state = 0; + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; +} + diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index d0fbb7712ab..8b8c0d6640f 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -17,6 +17,7 @@ #include <asm/genapic.h> #include <asm/idle.h> #include <asm/tsc.h> +#include <asm/irq_vectors.h> #include <mach_apic.h> @@ -783,7 +784,7 @@ static int __init uv_bau_init(void) uv_init_blade(blade, node, cur_cpu); cur_cpu += uv_blade_nr_possible_cpus(blade); } - set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); + alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); uv_enable_timeouts(); return 0; diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index ab6bf375a30..6bb7b8579e7 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -10,6 +10,7 @@ #include <asm/ldt.h> #include <asm/processor.h> #include <asm/proto.h> +#include <asm/syscalls.h> #include "tls.h" diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps.c index 03df8e45e5a..e062974cce3 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps.c @@ -7,13 +7,11 @@ */ /* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'asm.s'. + * Handle hardware traps and faults. */ #include <linux/interrupt.h> #include <linux/kallsyms.h> #include <linux/spinlock.h> -#include <linux/highmem.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/utsname.h> @@ -32,6 +30,8 @@ #include <linux/bug.h> #include <linux/nmi.h> #include <linux/mm.h> +#include <linux/smp.h> +#include <linux/io.h> #ifdef CONFIG_EISA #include <linux/ioport.h> @@ -46,21 +46,31 @@ #include <linux/edac.h> #endif -#include <asm/arch_hooks.h> #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> #include <asm/atomic.h> #include <asm/system.h> #include <asm/unwind.h> +#include <asm/traps.h> #include <asm/desc.h> #include <asm/i387.h> + +#include <mach_traps.h> + +#ifdef CONFIG_X86_64 +#include <asm/pgalloc.h> +#include <asm/proto.h> +#include <asm/pda.h> +#else +#include <asm/processor-flags.h> +#include <asm/arch_hooks.h> #include <asm/nmi.h> #include <asm/smp.h> #include <asm/io.h> #include <asm/traps.h> -#include "mach_traps.h" +#include "cpu/mcheck/mce.h" DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); @@ -77,418 +87,104 @@ char ignore_fpu_irq; */ gate_desc idt_table[256] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 24; -static unsigned int code_bytes = 64; -static int ignore_nmis; -static int die_counter; - -void printk_address(unsigned long address, int reliable) -{ -#ifdef CONFIG_KALLSYMS - unsigned long offset = 0; - unsigned long symsize; - const char *symname; - char *modname; - char *delim = ":"; - char namebuf[KSYM_NAME_LEN]; - char reliab[4] = ""; - - symname = kallsyms_lookup(address, &symsize, &offset, - &modname, namebuf); - if (!symname) { - printk(" [<%08lx>]\n", address); - return; - } - if (!reliable) - strcpy(reliab, "? "); - - if (!modname) - modname = delim = ""; - printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", - address, reliab, delim, modname, delim, symname, offset, symsize); -#else - printk(" [<%08lx>]\n", address); #endif -} - -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size) -{ - void *t = tinfo; - return p > t && p <= t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + 4) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} - -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - if (!task) - task = current; - - if (!stack) { - unsigned long dummy; - stack = &dummy; - if (task != current) - stack = (unsigned long *)task->thread.sp; - } - -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - asm("movl %%ebp, %0" : "=r" (bp) :); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - - for (;;) { - struct thread_info *context; - - context = (struct thread_info *) - ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data); - /* - * Should be after the line below, but somewhere - * in early boot context comes out corrupted and we - * can't reference it: - */ - if (ops->stack(data, "IRQ") < 0) - break; - stack = (unsigned long *)context->previous_esp; - if (!stack) - break; - touch_nmi_watchdog(); - } -} -EXPORT_SYMBOL(dump_trace); - -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} -static int print_trace_stack(void *data, char *name) -{ - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - printk("%s [<%08lx>] ", (char *)data, addr); - if (!reliable) - printk("? "); - print_symbol("%s\n", addr); - touch_nmi_watchdog(); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; +static int ignore_nmis; -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) +static inline void conditional_sti(struct pt_regs *regs) { - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); - printk("%s =======================\n", log_lvl); + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); } -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) +static inline void preempt_conditional_sti(struct pt_regs *regs) { - show_trace_log_lvl(task, regs, stack, bp, ""); + inc_preempt_count(); + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); } -static void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) +static inline void preempt_conditional_cli(struct pt_regs *regs) { - unsigned long *stack; - int i; - - if (sp == NULL) { - if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (kstack_end(stack)) - break; - if (i && ((i % 8) == 0)) - printk("\n%s ", log_lvl); - printk("%08lx ", *stack++); - } - printk("\n%sCall Trace:\n", log_lvl); - - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); + dec_preempt_count(); } -void show_stack(struct task_struct *task, unsigned long *sp) +#ifdef CONFIG_X86_32 +static inline void +die_if_kernel(const char *str, struct pt_regs *regs, long err) { - printk(" "); - show_stack_log_lvl(task, NULL, sp, 0, ""); + if (!user_mode_vm(regs)) + die(str, regs, err); } /* - * The architecture-independent dump_stack generator + * Perform the lazy TSS's I/O bitmap copy. If the TSS has an + * invalid offset set (the LAZY one) and the faulting thread has + * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, + * we set the offset field correctly and return 1. */ -void dump_stack(void) +static int lazy_iobitmap_copy(void) { - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - asm("movl %%ebp, %0" : "=r" (bp):); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - - show_trace(current, NULL, &stack, bp); -} - -EXPORT_SYMBOL(dump_stack); - -void show_registers(struct pt_regs *regs) -{ - int i; + struct thread_struct *thread; + struct tss_struct *tss; + int cpu; - print_modules(); - __show_registers(regs, 0); + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); + thread = ¤t->thread; - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", - TASK_COMM_LEN, current->comm, task_pid_nr(current), - current_thread_info(), current, task_thread_info(current)); - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. - */ - if (!user_mode_vm(regs)) { - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; - unsigned char c; - u8 *ip; - - printk("\n" KERN_EMERG "Stack: "); - show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); - - printk(KERN_EMERG "Code: "); - - ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at EIP */ - ip = (u8 *)regs->ip; - code_len = code_len - code_prologue + 1; - } - for (i = 0; i < code_len; i++, ip++) { - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { - printk(" Bad EIP value."); - break; - } - if (ip == (u8 *)regs->ip) - printk("<%02x> ", c); - else - printk("%02x ", c); + if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && + thread->io_bitmap_ptr) { + memcpy(tss->io_bitmap, thread->io_bitmap_ptr, + thread->io_bitmap_max); + /* + * If the previously set map was extending to higher ports + * than the current one, pad extra space with 0xff (no access). + */ + if (thread->io_bitmap_max < tss->io_bitmap_max) { + memset((char *) tss->io_bitmap + + thread->io_bitmap_max, 0xff, + tss->io_bitmap_max - thread->io_bitmap_max); } - } - printk("\n"); -} - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (ip < PAGE_OFFSET) - return 0; - if (probe_kernel_address((unsigned short *)ip, ud2)) - return 0; - - return ud2 == 0x0b0f; -} - -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - unsigned long flags; - - oops_enter(); - - if (die_owner != raw_smp_processor_id()) { - console_verbose(); - raw_local_irq_save(flags); - __raw_spin_lock(&die_lock); - die_owner = smp_processor_id(); - die_nest_count = 0; - bust_spinlocks(1); - } else { - raw_local_irq_save(flags); - } - die_nest_count++; - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - bust_spinlocks(0); - die_owner = -1; - add_taint(TAINT_DIE); - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - - if (!regs) - return; - - if (kexec_should_crash(current)) - crash_kexec(regs); - - if (in_interrupt()) - panic("Fatal exception in interrupt"); - - if (panic_on_oops) - panic("Fatal exception"); - - oops_exit(); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - unsigned short ss; - unsigned long sp; + tss->io_bitmap_max = thread->io_bitmap_max; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; + tss->io_bitmap_owner = thread; + put_cpu(); - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) return 1; - - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss & 0xffff; } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); - return 0; -} - -/* - * This is gone through when something in the kernel has done something bad - * and is about to be terminated: - */ -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - - if (die_nest_count < 3) { - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - } else { - printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); - } - - oops_end(flags, regs, SIGSEGV); -} + put_cpu(); -static inline void -die_if_kernel(const char *str, struct pt_regs *regs, long err) -{ - if (!user_mode_vm(regs)) - die(str, regs, err); + return 0; } +#endif static void __kprobes -do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs, +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, siginfo_t *info) { struct task_struct *tsk = current; +#ifdef CONFIG_X86_32 if (regs->flags & X86_VM_MASK) { - if (vm86) + /* + * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. + * On nmi (interrupt 2), do_trap should not be called. + */ + if (trapnr < 6) goto vm86_trap; goto trap_signal; } +#endif if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_X86_32 trap_signal: +#endif /* * We want error_code and trap_no set for userspace faults and * kernelspace faults which result in die(), but not @@ -501,6 +197,18 @@ trap_signal: tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; +#ifdef CONFIG_X86_64 + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } +#endif + if (info) force_sig_info(signr, info, tsk); else @@ -515,29 +223,29 @@ kernel_trap: } return; +#ifdef CONFIG_X86_32 vm86_trap: if (handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr)) goto trap_signal; return; +#endif } #define DO_ERROR(trapnr, signr, str, name) \ -void do_##name(struct pt_regs *regs, long error_code) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ - trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ - do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ + conditional_sti(regs); \ + do_trap(trapnr, signr, str, regs, error_code, NULL); \ } -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ -void do_##name(struct pt_regs *regs, long error_code) \ +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ siginfo_t info; \ - if (irq) \ - local_irq_enable(); \ info.si_signo = signr; \ info.si_errno = 0; \ info.si_code = sicode; \ @@ -545,90 +253,68 @@ void do_##name(struct pt_regs *regs, long error_code) \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ - do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ + conditional_sti(regs); \ + do_trap(trapnr, signr, str, regs, error_code, &info); \ } -#define DO_VM86_ERROR(trapnr, signr, str, name) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ -} - -#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ -} - -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -#ifndef CONFIG_KPROBES -DO_VM86_ERROR(3, SIGTRAP, "int3", int3) -#endif -DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) -DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) +DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_ERROR(4, SIGSEGV, "overflow", overflow) +DO_ERROR(5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +#ifdef CONFIG_X86_32 DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) -DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) +#endif +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) + +#ifdef CONFIG_X86_64 +/* Runs on IST stack */ +dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) +{ + if (notify_die(DIE_TRAP, "stack segment", regs, error_code, + 12, SIGBUS) == NOTIFY_STOP) + return; + preempt_conditional_sti(regs); + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); + preempt_conditional_cli(regs); +} + +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) +{ + static const char str[] = "double fault"; + struct task_struct *tsk = current; + + /* Return not checked because double check cannot be ignored */ + notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); -void __kprobes + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 8; + + /* This is always a kernel trap and never fixable (and thus must + never return). */ + for (;;) + die(str, regs, error_code); +} +#endif + +dotraplinkage void __kprobes do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; - struct thread_struct *thread; - struct tss_struct *tss; - int cpu; - cpu = get_cpu(); - tss = &per_cpu(init_tss, cpu); - thread = ¤t->thread; - - /* - * Perform the lazy TSS's I/O bitmap copy. If the TSS has an - * invalid offset set (the LAZY one) and the faulting thread has - * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS - * and we set the offset field correctly. Then we let the CPU to - * restart the faulting instruction. - */ - if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && - thread->io_bitmap_ptr) { - memcpy(tss->io_bitmap, thread->io_bitmap_ptr, - thread->io_bitmap_max); - /* - * If the previously set map was extending to higher ports - * than the current one, pad extra space with 0xff (no access). - */ - if (thread->io_bitmap_max < tss->io_bitmap_max) { - memset((char *) tss->io_bitmap + - thread->io_bitmap_max, 0xff, - tss->io_bitmap_max - thread->io_bitmap_max); - } - tss->io_bitmap_max = thread->io_bitmap_max; - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - tss->io_bitmap_owner = thread; - put_cpu(); + conditional_sti(regs); +#ifdef CONFIG_X86_32 + if (lazy_iobitmap_copy()) { + /* restart the faulting instruction */ return; } - put_cpu(); if (regs->flags & X86_VM_MASK) goto gp_in_vm86; +#endif tsk = current; if (!user_mode(regs)) @@ -650,10 +336,12 @@ do_general_protection(struct pt_regs *regs, long error_code) force_sig(SIGSEGV, tsk); return; +#ifdef CONFIG_X86_32 gp_in_vm86: local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); return; +#endif gp_in_kernel: if (fixup_exception(regs)) @@ -690,7 +378,8 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); /* Clear and disable the memory parity error line. */ - clear_mem_error(reason); + reason = (reason & 0xf) | 4; + outb(reason, 0x61); } static notrace __kprobes void @@ -716,7 +405,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs) static notrace __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == + NOTIFY_STOP) return; #ifdef CONFIG_MCA /* @@ -739,41 +429,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } -static DEFINE_SPINLOCK(nmi_print_lock); - -void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - spin_lock(&nmi_print_lock); - /* - * We are in trouble anyway, lets at least try - * to get a message out: - */ - bust_spinlocks(1); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (do_panic) - panic("Non maskable interrupt"); - console_silent(); - spin_unlock(&nmi_print_lock); - bust_spinlocks(0); - - /* - * If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can: - */ - if (!user_mode_vm(regs)) { - current->thread.trap_no = 2; - crash_kexec(regs); - } - - do_exit(SIGSEGV); -} - static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; @@ -812,22 +467,25 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) mem_parity_error(reason, regs); if (reason & 0x40) io_check_error(reason, regs); +#ifdef CONFIG_X86_32 /* * Reassert NMI in case it became active meanwhile * as it's edge-triggered: */ reassert_nmi(); +#endif } -notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) +dotraplinkage notrace __kprobes void +do_nmi(struct pt_regs *regs, long error_code) { - int cpu; - nmi_enter(); - cpu = smp_processor_id(); - - ++nmi_count(cpu); +#ifdef CONFIG_X86_32 + { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } +#else + add_pda(__nmi_count, 1); +#endif if (!ignore_nmis) default_do_nmi(regs); @@ -847,21 +505,44 @@ void restart_nmi(void) acpi_nmi_enable(); } -#ifdef CONFIG_KPROBES -void __kprobes do_int3(struct pt_regs *regs, long error_code) +/* May run on IST stack. */ +dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { - trace_hardirqs_fixup(); - +#ifdef CONFIG_KPROBES if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; - /* - * This is an interrupt gate, because kprobes wants interrupts - * disabled. Normal trap handlers don't. - */ - restore_interrupts(regs); +#else + if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#endif + + preempt_conditional_sti(regs); + do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); + preempt_conditional_cli(regs); +} - do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); +#ifdef CONFIG_X86_64 +/* Help handler running on IST stack to switch back to user stack + for scheduling or signal handling. The actual stack switch is done in + entry.S */ +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) +{ + struct pt_regs *regs = eregs; + /* Did already sync */ + if (eregs == (struct pt_regs *)eregs->sp) + ; + /* Exception from user space */ + else if (user_mode(eregs)) + regs = task_pt_regs(current); + /* Exception from kernel and interrupts are enabled. Move to + kernel process stack. */ + else if (eregs->flags & X86_EFLAGS_IF) + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); + if (eregs != regs) + *regs = *eregs; + return regs; } #endif @@ -886,13 +567,14 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code) * about restoring all the debug state, and ptrace doesn't have to * find every occurrence of the TF bit that could be saved away even * by user code) + * + * May run on IST stack. */ -void __kprobes do_debug(struct pt_regs *regs, long error_code) +dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - unsigned int condition; - - trace_hardirqs_fixup(); + unsigned long condition; + int si_code; get_debugreg(condition, 6); @@ -905,9 +587,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) return; + /* It's safe to allow irq's after DR6 has been saved */ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + preempt_conditional_sti(regs); /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { @@ -915,8 +597,10 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) goto clear_dr7; } +#ifdef CONFIG_X86_32 if (regs->flags & X86_VM_MASK) goto debug_vm86; +#endif /* Save debug status register where ptrace can see it */ tsk->thread.debugreg6 = condition; @@ -926,17 +610,13 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) * kernel space (but re-enable TF when returning to user mode). */ if (condition & DR_STEP) { - /* - * We already checked v86 mode above, so we can - * check for kernel mode by just checking the CPL - * of CS. - */ if (!user_mode(regs)) goto clear_TF_reenable; } + si_code = get_si_code(condition); /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code); + send_sigtrap(tsk, regs, error_code, si_code); /* * Disable additional traps. They'll be re-enabled when @@ -944,18 +624,37 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) */ clear_dr7: set_debugreg(0, 7); + preempt_conditional_cli(regs); return; +#ifdef CONFIG_X86_32 debug_vm86: handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + preempt_conditional_cli(regs); return; +#endif clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; + preempt_conditional_cli(regs); return; } +#ifdef CONFIG_X86_64 +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) +{ + if (fixup_exception(regs)) + return 1; + + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); + /* Illegal floating point operation in the kernel */ + current->thread.trap_no = trapnr; + die(str, regs, 0); + return 0; +} +#endif + /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous @@ -992,7 +691,9 @@ void math_error(void __user *ip) swd = get_fpu_swd(task); switch (swd & ~cwd & 0x3f) { case 0x000: /* No unmasked exception */ +#ifdef CONFIG_X86_32 return; +#endif default: /* Multiple exceptions */ break; case 0x001: /* Invalid Op */ @@ -1020,9 +721,18 @@ void math_error(void __user *ip) force_sig_info(SIGFPE, &info, task); } -void do_coprocessor_error(struct pt_regs *regs, long error_code) +dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { + conditional_sti(regs); + +#ifdef CONFIG_X86_32 ignore_fpu_irq = 1; +#else + if (!user_mode(regs) && + kernel_math_error(regs, "kernel x87 math error", 16)) + return; +#endif + math_error((void __user *)regs->ip); } @@ -1074,8 +784,12 @@ static void simd_math_error(void __user *ip) force_sig_info(SIGFPE, &info, task); } -void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +dotraplinkage void +do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { + conditional_sti(regs); + +#ifdef CONFIG_X86_32 if (cpu_has_xmm) { /* Handle SIMD FPU exceptions on PIII+ processors. */ ignore_fpu_irq = 1; @@ -1094,16 +808,25 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) current->thread.error_code = error_code; die_if_kernel("cache flush denied", regs, error_code); force_sig(SIGSEGV, current); +#else + if (!user_mode(regs) && + kernel_math_error(regs, "kernel simd math error", 19)) + return; + simd_math_error((void __user *)regs->ip); +#endif } -void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) +dotraplinkage void +do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { + conditional_sti(regs); #if 0 /* No need to warn about this any longer. */ printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); #endif } +#ifdef CONFIG_X86_32 unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); @@ -1122,6 +845,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) return new_kesp; } +#else +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +{ +} + +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +{ +} +#endif /* * 'math_state_restore()' saves the current math information in the @@ -1154,14 +886,24 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ +#ifdef CONFIG_X86_32 restore_fpu(tsk); +#else + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + stts(); + force_sig(SIGSEGV, tsk); + return; + } +#endif thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } EXPORT_SYMBOL_GPL(math_state_restore); #ifndef CONFIG_MATH_EMULATION - asmlinkage void math_emulate(long arg) { printk(KERN_EMERG @@ -1170,12 +912,54 @@ asmlinkage void math_emulate(long arg) force_sig(SIGFPE, current); schedule(); } - #endif /* CONFIG_MATH_EMULATION */ +dotraplinkage void __kprobes +do_device_not_available(struct pt_regs *regs, long error) +{ +#ifdef CONFIG_X86_32 + if (read_cr0() & X86_CR0_EM) { + conditional_sti(regs); + math_emulate(0); + } else { + math_state_restore(); /* interrupts still off */ + conditional_sti(regs); + } +#else + math_state_restore(); +#endif +} + +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_MCE +dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) +{ + conditional_sti(regs); + machine_check_vector(regs, error); +} +#endif + +dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) +{ + siginfo_t info; + local_irq_enable(); + + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_BADSTK; + info.si_addr = 0; + if (notify_die(DIE_TRAP, "iret exception", + regs, error_code, 32, SIGILL) == NOTIFY_STOP) + return; + do_trap(32, SIGILL, "iret exception", regs, error_code, &info); +} +#endif + void __init trap_init(void) { +#ifdef CONFIG_X86_32 int i; +#endif #ifdef CONFIG_EISA void __iomem *p = early_ioremap(0x0FFFD9, 4); @@ -1185,29 +969,40 @@ void __init trap_init(void) early_iounmap(p, 4); #endif - set_trap_gate(0, ÷_error); - set_intr_gate(1, &debug); - set_intr_gate(2, &nmi); - set_system_intr_gate(3, &int3); /* int3 can be called from all */ - set_system_gate(4, &overflow); /* int4 can be called from all */ - set_trap_gate(5, &bounds); - set_trap_gate(6, &invalid_op); - set_trap_gate(7, &device_not_available); + set_intr_gate(0, ÷_error); + set_intr_gate_ist(1, &debug, DEBUG_STACK); + set_intr_gate_ist(2, &nmi, NMI_STACK); + /* int3 can be called from all */ + set_system_intr_gate_ist(3, &int3, DEBUG_STACK); + /* int4 can be called from all */ + set_system_intr_gate(4, &overflow); + set_intr_gate(5, &bounds); + set_intr_gate(6, &invalid_op); + set_intr_gate(7, &device_not_available); +#ifdef CONFIG_X86_32 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); - set_trap_gate(9, &coprocessor_segment_overrun); - set_trap_gate(10, &invalid_TSS); - set_trap_gate(11, &segment_not_present); - set_trap_gate(12, &stack_segment); - set_trap_gate(13, &general_protection); +#else + set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); +#endif + set_intr_gate(9, &coprocessor_segment_overrun); + set_intr_gate(10, &invalid_TSS); + set_intr_gate(11, &segment_not_present); + set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); + set_intr_gate(13, &general_protection); set_intr_gate(14, &page_fault); - set_trap_gate(15, &spurious_interrupt_bug); - set_trap_gate(16, &coprocessor_error); - set_trap_gate(17, &alignment_check); + set_intr_gate(15, &spurious_interrupt_bug); + set_intr_gate(16, &coprocessor_error); + set_intr_gate(17, &alignment_check); #ifdef CONFIG_X86_MCE - set_trap_gate(18, &machine_check); + set_intr_gate_ist(18, &machine_check, MCE_STACK); #endif - set_trap_gate(19, &simd_coprocessor_error); + set_intr_gate(19, &simd_coprocessor_error); +#ifdef CONFIG_IA32_EMULATION + set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); +#endif + +#ifdef CONFIG_X86_32 if (cpu_has_fxsr) { printk(KERN_INFO "Enabling fast FPU save and restore... "); set_in_cr4(X86_CR4_OSFXSR); @@ -1220,37 +1015,20 @@ void __init trap_init(void) printk("done.\n"); } - set_system_gate(SYSCALL_VECTOR, &system_call); + set_system_trap_gate(SYSCALL_VECTOR, &system_call); /* Reserve all the builtin and the syscall vector: */ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) set_bit(i, used_vectors); set_bit(SYSCALL_VECTOR, used_vectors); - - init_thread_xstate(); +#endif /* * Should be a barrier for any external CPU state: */ cpu_init(); +#ifdef CONFIG_X86_32 trap_init_hook(); +#endif } - -static int __init kstack_setup(char *s) -{ - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - - return 1; -} -__setup("kstack=", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c deleted file mode 100644 index 513caaca711..00000000000 --- a/arch/x86/kernel/traps_64.c +++ /dev/null @@ -1,1212 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 - */ - -/* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'entry.S'. - */ -#include <linux/moduleparam.h> -#include <linux/interrupt.h> -#include <linux/kallsyms.h> -#include <linux/spinlock.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/utsname.h> -#include <linux/kdebug.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/ptrace.h> -#include <linux/string.h> -#include <linux/unwind.h> -#include <linux/delay.h> -#include <linux/errno.h> -#include <linux/kexec.h> -#include <linux/sched.h> -#include <linux/timer.h> -#include <linux/init.h> -#include <linux/bug.h> -#include <linux/nmi.h> -#include <linux/mm.h> - -#if defined(CONFIG_EDAC) -#include <linux/edac.h> -#endif - -#include <asm/stacktrace.h> -#include <asm/processor.h> -#include <asm/debugreg.h> -#include <asm/atomic.h> -#include <asm/system.h> -#include <asm/unwind.h> -#include <asm/desc.h> -#include <asm/i387.h> -#include <asm/nmi.h> -#include <asm/smp.h> -#include <asm/io.h> -#include <asm/pgalloc.h> -#include <asm/proto.h> -#include <asm/pda.h> -#include <asm/traps.h> - -#include <mach_traps.h> - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 12; -static unsigned int code_bytes = 64; -static int ignore_nmis; -static int die_counter; - -static inline void conditional_sti(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_sti(struct pt_regs *regs) -{ - inc_preempt_count(); - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_cli(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); - /* Make sure to not schedule here because we could be running - on an exception stack. */ - dec_preempt_count(); -} - -void printk_address(unsigned long address, int reliable) -{ - printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address); -} - -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) -{ - static char ids[][8] = { - [DEBUG_STACK - 1] = "#DB", - [NMI_STACK - 1] = "NMI", - [DOUBLEFAULT_STACK - 1] = "#DF", - [STACKFAULT_STACK - 1] = "#SS", - [MCE_STACK - 1] = "#MC", -#if DEBUG_STKSZ > EXCEPTION_STKSZ - [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" -#endif - }; - unsigned k; - - /* - * Iterate over all exception stacks, and figure out whether - * 'stack' is in one of them: - */ - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end = per_cpu(orig_ist, cpu).ist[k]; - /* - * Is 'stack' above this exception frame's end? - * If yes then skip to the next frame. - */ - if (stack >= end) - continue; - /* - * Is 'stack' above this exception frame's start address? - * If yes then we found the right frame. - */ - if (stack >= end - EXCEPTION_STKSZ) { - /* - * Make sure we only iterate through an exception - * stack once. If it comes up for the second time - * then there's something wrong going on - just - * break out and return NULL: - */ - if (*usedp & (1U << k)) - break; - *usedp |= 1U << k; - *idp = ids[k]; - return (unsigned long *)end; - } - /* - * If this is a debug stack, and if it has a larger size than - * the usual exception stacks, then 'stack' might still - * be within the lower portion of the debug stack: - */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { - unsigned j = N_EXCEPTION_STACKS - 1; - - /* - * Black magic. A large debug stack is composed of - * multiple exception stack entries, which we - * iterate through now. Dont look: - */ - do { - ++j; - end -= EXCEPTION_STKSZ; - ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); - } while (stack < end - EXCEPTION_STKSZ); - if (*usedp & (1U << j)) - break; - *usedp |= 1U << j; - *idp = ids[j]; - return (unsigned long *)end; - } -#endif - } - return NULL; -} - -/* - * x86-64 can have up to three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + 8) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} - -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - const unsigned cpu = get_cpu(); - unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; - unsigned used = 0; - struct thread_info *tinfo; - - if (!task) - task = current; - - if (!stack) { - unsigned long dummy; - stack = &dummy; - if (task && task != current) - stack = (unsigned long *)task->thread.sp; - } - -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - asm("movq %%rbp, %0" : "=r" (bp) :); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - - /* - * Print function call entries in all stacks, starting at the - * current stack address. If the stacks consist of nested - * exceptions - */ - tinfo = task_thread_info(task); - for (;;) { - char *id; - unsigned long *estack_end; - estack_end = in_exception_stack(cpu, (unsigned long)stack, - &used, &id); - - if (estack_end) { - if (ops->stack(data, id) < 0) - break; - - bp = print_context_stack(tinfo, stack, bp, ops, - data, estack_end); - ops->stack(data, "<EOE>"); - /* - * We link to the next stack via the - * second-to-last pointer (index -2 to end) in the - * exception stack: - */ - stack = (unsigned long *) estack_end[-2]; - continue; - } - if (irqstack_end) { - unsigned long *irqstack; - irqstack = irqstack_end - - (IRQSTACKSIZE - 64) / sizeof(*irqstack); - - if (stack >= irqstack && stack < irqstack_end) { - if (ops->stack(data, "IRQ") < 0) - break; - bp = print_context_stack(tinfo, stack, bp, - ops, data, irqstack_end); - /* - * We link to the next stack (which would be - * the process stack normally) the last - * pointer (index -1 to end) in the IRQ stack: - */ - stack = (unsigned long *) (irqstack_end[-1]); - irqstack_end = NULL; - ops->stack(data, "EOI"); - continue; - } - } - break; - } - - /* - * This handles the process stack: - */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); - put_cpu(); -} -EXPORT_SYMBOL(dump_trace); - -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s\n", msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk(" <%s> ", name); - return 0; -} - -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("\nCall Trace:\n"); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); - printk("\n"); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) -{ - unsigned long *stack; - int i; - const int cpu = smp_processor_id(); - unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); - unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); - - // debugging aid: "show_stack(NULL, NULL);" prints the - // back trace for this cpu. - - if (sp == NULL) { - if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (stack >= irqstack && stack <= irqstack_end) { - if (stack == irqstack_end) { - stack = (unsigned long *) (irqstack_end[-1]); - printk(" <EOI> "); - } - } else { - if (((long) stack & (THREAD_SIZE-1)) == 0) - break; - } - if (i && ((i % 4) == 0)) - printk("\n"); - printk(" %016lx", *stack++); - touch_nmi_watchdog(); - } - show_trace_log_lvl(task, regs, sp, bp, log_lvl); -} - -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - asm("movq %%rbp, %0" : "=r" (bp):); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} - -EXPORT_SYMBOL(dump_stack); - -void show_registers(struct pt_regs *regs) -{ - int i; - unsigned long sp; - const int cpu = smp_processor_id(); - struct task_struct *cur = cpu_pda(cpu)->pcurrent; - - sp = regs->sp; - printk("CPU %d ", cpu); - __show_regs(regs); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, task_thread_info(cur), cur); - - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. - */ - if (!user_mode(regs)) { - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; - unsigned char c; - u8 *ip; - - printk("Stack: "); - show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - regs->bp, ""); - printk("\n"); - - printk(KERN_EMERG "Code: "); - - ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at RIP */ - ip = (u8 *)regs->ip; - code_len = code_len - code_prologue + 1; - } - for (i = 0; i < code_len; i++, ip++) { - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { - printk(" Bad RIP value."); - break; - } - if (ip == (u8 *)regs->ip) - printk("<%02x> ", c); - else - printk("%02x ", c); - } - } - printk("\n"); -} - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) - return 0; - - return ud2 == 0x0b0f; -} - -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - int cpu; - unsigned long flags; - - oops_enter(); - - /* racy, but better than risking deadlock. */ - raw_local_irq_save(flags); - cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { - if (cpu == die_owner) - /* nested oops. should stop eventually */; - else - __raw_spin_lock(&die_lock); - } - die_nest_count++; - die_owner = cpu; - console_verbose(); - bust_spinlocks(1); - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - die_owner = -1; - bust_spinlocks(0); - die_nest_count--; - if (!die_nest_count) - /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - if (!regs) { - oops_exit(); - return; - } - if (panic_on_oops) - panic("Fatal exception"); - oops_exit(); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - add_taint(TAINT_DIE); - /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->ip, 1); - printk(" RSP <%016lx>\n", regs->sp); - if (kexec_should_crash(current)) - crash_kexec(regs); - return 0; -} - -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - - if (!user_mode(regs)) - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - oops_end(flags, regs, SIGSEGV); -} - -notrace __kprobes void -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - unsigned long flags; - - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - flags = oops_begin(); - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (kexec_should_crash(current)) - crash_kexec(regs); - if (do_panic || panic_on_oops) - panic("Non maskable interrupt"); - oops_end(flags, NULL, SIGBUS); - nmi_exit(); - local_irq_enable(); - do_exit(SIGBUS); -} - -static void __kprobes -do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, - long error_code, siginfo_t *info) -{ - struct task_struct *tsk = current; - - if (!user_mode(regs)) - goto kernel_trap; - - /* - * We want error_code and trap_no set for userspace faults and - * kernelspace faults which result in die(), but not - * kernelspace faults which are fixed up. die() gives the - * process no chance to handle the signal and notice the - * kernel fault information, so that won't result in polluting - * the information about previously queued, but not yet - * delivered, faults. See also do_general_protection below. - */ - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - - if (show_unhandled_signals && unhandled_signal(tsk, signr) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; - -kernel_trap: - if (!fixup_exception(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - die(str, regs, error_code); - } - return; -} - -#define DO_ERROR(trapnr, signr, str, name) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ -} - -DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_ERROR(4, SIGSEGV, "overflow", overflow) -DO_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) - -/* Runs on IST stack */ -asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) -{ - if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - 12, SIGBUS) == NOTIFY_STOP) - return; - preempt_conditional_sti(regs); - do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) -{ - static const char str[] = "double fault"; - struct task_struct *tsk = current; - - /* Return not checked because double check cannot be ignored */ - notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 8; - - /* This is always a kernel trap and never fixable (and thus must - never return). */ - for (;;) - die(str, regs, error_code); -} - -asmlinkage void __kprobes -do_general_protection(struct pt_regs *regs, long error_code) -{ - struct task_struct *tsk; - - conditional_sti(regs); - - tsk = current; - if (!user_mode(regs)) - goto gp_in_kernel; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - force_sig(SIGSEGV, tsk); - return; - -gp_in_kernel: - if (fixup_exception(regs)) - return; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); -} - -static notrace __kprobes void -mem_parity_error(unsigned char reason, struct pt_regs *regs) -{ - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", - reason); - printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); - -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); - - /* Clear and disable the memory parity error line. */ - reason = (reason & 0xf) | 4; - outb(reason, 0x61); -} - -static notrace __kprobes void -io_check_error(unsigned char reason, struct pt_regs *regs) -{ - printk("NMI: IOCK error (debug interrupt?)\n"); - show_registers(regs); - - /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & 0xf) | 8; - outb(reason, 0x61); - mdelay(2000); - reason &= ~8; - outb(reason, 0x61); -} - -static notrace __kprobes void -unknown_nmi_error(unsigned char reason, struct pt_regs * regs) -{ - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", - reason); - printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); - - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); -} - -/* Runs on IST stack. This code must keep interrupts off all the time. - Nested NMIs are prevented by the CPU. */ -asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) -{ - unsigned char reason = 0; - int cpu; - - cpu = smp_processor_id(); - - /* Only the BSP gets external NMIs from the system. */ - if (!cpu) - reason = get_nmi_reason(); - - if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; - /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. - */ - if (nmi_watchdog_tick(regs, reason)) - return; - if (!do_nmi_callback(regs, cpu)) - unknown_nmi_error(reason, regs); - - return; - } - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; - - /* AK: following checks seem to be broken on modern chipsets. FIXME */ - if (reason & 0x80) - mem_parity_error(reason, regs); - if (reason & 0x40) - io_check_error(reason, regs); -} - -asmlinkage notrace __kprobes void -do_nmi(struct pt_regs *regs, long error_code) -{ - nmi_enter(); - - add_pda(__nmi_count, 1); - - if (!ignore_nmis) - default_do_nmi(regs); - - nmi_exit(); -} - -void stop_nmi(void) -{ - acpi_nmi_disable(); - ignore_nmis++; -} - -void restart_nmi(void) -{ - ignore_nmis--; - acpi_nmi_enable(); -} - -/* runs on IST stack. */ -asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) -{ - trace_hardirqs_fixup(); - - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return; - - preempt_conditional_sti(regs); - do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -/* Help handler running on IST stack to switch back to user stack - for scheduling or signal handling. The actual stack switch is done in - entry.S */ -asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) -{ - struct pt_regs *regs = eregs; - /* Did already sync */ - if (eregs == (struct pt_regs *)eregs->sp) - ; - /* Exception from user space */ - else if (user_mode(eregs)) - regs = task_pt_regs(current); - /* Exception from kernel and interrupts are enabled. Move to - kernel process stack. */ - else if (eregs->flags & X86_EFLAGS_IF) - regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); - if (eregs != regs) - *regs = *eregs; - return regs; -} - -/* runs on IST stack. */ -asmlinkage void __kprobes do_debug(struct pt_regs * regs, - unsigned long error_code) -{ - struct task_struct *tsk = current; - unsigned long condition; - siginfo_t info; - - trace_hardirqs_fixup(); - - get_debugreg(condition, 6); - - /* - * The processor cleared BTF, so don't mark that we need it set. - */ - clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); - tsk->thread.debugctlmsr = 0; - - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) - return; - - preempt_conditional_sti(regs); - - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; - } - - tsk->thread.debugreg6 = condition; - - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). - */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; - } - - /* Ok, finally something we can handle */ - tsk->thread.trap_no = 1; - tsk->thread.error_code = error_code; - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = TRAP_BRKPT; - info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; - force_sig_info(SIGTRAP, &info, tsk); - -clear_dr7: - set_debugreg(0, 7); - preempt_conditional_cli(regs); - return; - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); - return; -} - -static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) -{ - if (fixup_exception(regs)) - return 1; - - notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); - /* Illegal floating point operation in the kernel */ - current->thread.trap_no = trapnr; - die(str, regs, 0); - return 0; -} - -/* - * Note that we play around with the 'TS' bit in an attempt to get - * the correct behaviour even in the presence of the asynchronous - * IRQ13 behaviour - */ -asmlinkage void do_coprocessor_error(struct pt_regs *regs) -{ - void __user *ip = (void __user *)(regs->ip); - struct task_struct *task; - siginfo_t info; - unsigned short cwd, swd; - - conditional_sti(regs); - if (!user_mode(regs) && - kernel_math_error(regs, "kernel x87 math error", 16)) - return; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 16; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -asmlinkage void bad_intr(void) -{ - printk("bad interrupt"); -} - -asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) -{ - void __user *ip = (void __user *)(regs->ip); - struct task_struct *task; - siginfo_t info; - unsigned short mxcsr; - - conditional_sti(regs); - if (!user_mode(regs) && - kernel_math_error(regs, "kernel simd math error", 19)) - return; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 19; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. - */ - mxcsr = get_fpu_mxcsr(task); - switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) -{ -} - -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} - -asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) -{ -} - -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - */ -asmlinkage void math_state_restore(void) -{ - struct task_struct *me = current; - - if (!used_math()) { - local_irq_enable(); - /* - * does a slab alloc which can sleep - */ - if (init_fpu(me)) { - /* - * ran out of memory! - */ - do_group_exit(SIGKILL); - return; - } - local_irq_disable(); - } - - clts(); /* Allow maths ops (or we recurse) */ - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) { - stts(); - force_sig(SIGSEGV, me); - return; - } - task_thread_info(me)->status |= TS_USEDFPU; - me->fpu_counter++; -} -EXPORT_SYMBOL_GPL(math_state_restore); - -void __init trap_init(void) -{ - set_intr_gate(0, ÷_error); - set_intr_gate_ist(1, &debug, DEBUG_STACK); - set_intr_gate_ist(2, &nmi, NMI_STACK); - set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */ - set_system_gate(4, &overflow); /* int4 can be called from all */ - set_intr_gate(5, &bounds); - set_intr_gate(6, &invalid_op); - set_intr_gate(7, &device_not_available); - set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); - set_intr_gate(9, &coprocessor_segment_overrun); - set_intr_gate(10, &invalid_TSS); - set_intr_gate(11, &segment_not_present); - set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); - set_intr_gate(13, &general_protection); - set_intr_gate(14, &page_fault); - set_intr_gate(15, &spurious_interrupt_bug); - set_intr_gate(16, &coprocessor_error); - set_intr_gate(17, &alignment_check); -#ifdef CONFIG_X86_MCE - set_intr_gate_ist(18, &machine_check, MCE_STACK); -#endif - set_intr_gate(19, &simd_coprocessor_error); - -#ifdef CONFIG_IA32_EMULATION - set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); -#endif - /* - * initialize the per thread extended state: - */ - init_thread_xstate(); - /* - * Should be a barrier for any external CPU state: - */ - cpu_init(); -} - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 46af7167673..161bb850fc4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -104,7 +104,7 @@ __setup("notsc", notsc_setup); /* * Read TSC and the reference counters. Take care of SMI disturbance */ -static u64 tsc_read_refs(u64 *pm, u64 *hpet) +static u64 tsc_read_refs(u64 *p, int hpet) { u64 t1, t2; int i; @@ -112,9 +112,9 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet) for (i = 0; i < MAX_RETRIES; i++) { t1 = get_cycles(); if (hpet) - *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; + *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; else - *pm = acpi_pm_read_early(); + *p = acpi_pm_read_early(); t2 = get_cycles(); if ((t2 - t1) < SMI_TRESHOLD) return t2; @@ -122,80 +122,390 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet) return ULLONG_MAX; } -/** - * native_calibrate_tsc - calibrate the tsc on boot +/* + * Calculate the TSC frequency from HPET reference */ -unsigned long native_calibrate_tsc(void) +static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2) { - unsigned long flags; - u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2; - int hpet = is_hpet_enabled(); - unsigned int tsc_khz_val = 0; + u64 tmp; - local_irq_save(flags); + if (hpet2 < hpet1) + hpet2 += 0x100000000ULL; + hpet2 -= hpet1; + tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); + do_div(tmp, 1000000); + do_div(deltatsc, tmp); + + return (unsigned long) deltatsc; +} + +/* + * Calculate the TSC frequency from PMTimer reference + */ +static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) +{ + u64 tmp; - tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); + if (!pm1 && !pm2) + return ULONG_MAX; + + if (pm2 < pm1) + pm2 += (u64)ACPI_PM_OVRRUN; + pm2 -= pm1; + tmp = pm2 * 1000000000LL; + do_div(tmp, PMTMR_TICKS_PER_SEC); + do_div(deltatsc, tmp); + + return (unsigned long) deltatsc; +} + +#define CAL_MS 10 +#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS)) +#define CAL_PIT_LOOPS 1000 + +#define CAL2_MS 50 +#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS)) +#define CAL2_PIT_LOOPS 5000 + + +/* + * Try to calibrate the TSC against the Programmable + * Interrupt Timer and return the frequency of the TSC + * in kHz. + * + * Return ULONG_MAX on failure to calibrate. + */ +static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) +{ + u64 tsc, t1, t2, delta; + unsigned long tscmin, tscmax; + int pitcnt; + /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); + /* + * Setup CTC channel 2* for mode 0, (interrupt on terminal + * count mode), binary count. Set the latch register to 50ms + * (LSB then MSB) to begin countdown. + */ outb(0xb0, 0x43); - outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); - outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); - tr1 = get_cycles(); - while ((inb(0x61) & 0x20) == 0); - tr2 = get_cycles(); + outb(latch & 0xff, 0x42); + outb(latch >> 8, 0x42); + + tsc = t1 = t2 = get_cycles(); + + pitcnt = 0; + tscmax = 0; + tscmin = ULONG_MAX; + while ((inb(0x61) & 0x20) == 0) { + t2 = get_cycles(); + delta = t2 - tsc; + tsc = t2; + if ((unsigned long) delta < tscmin) + tscmin = (unsigned int) delta; + if ((unsigned long) delta > tscmax) + tscmax = (unsigned int) delta; + pitcnt++; + } + + /* + * Sanity checks: + * + * If we were not able to read the PIT more than loopmin + * times, then we have been hit by a massive SMI + * + * If the maximum is 10 times larger than the minimum, + * then we got hit by an SMI as well. + */ + if (pitcnt < loopmin || tscmax > 10 * tscmin) + return ULONG_MAX; + + /* Calculate the PIT value */ + delta = t2 - t1; + do_div(delta, ms); + return delta; +} - tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); +/* + * This reads the current MSB of the PIT counter, and + * checks if we are running on sufficiently fast and + * non-virtualized hardware. + * + * Our expectations are: + * + * - the PIT is running at roughly 1.19MHz + * + * - each IO is going to take about 1us on real hardware, + * but we allow it to be much faster (by a factor of 10) or + * _slightly_ slower (ie we allow up to a 2us read+counter + * update - anything else implies a unacceptably slow CPU + * or PIT for the fast calibration to work. + * + * - with 256 PIT ticks to read the value, we have 214us to + * see the same MSB (and overhead like doing a single TSC + * read per MSB value etc). + * + * - We're doing 2 reads per loop (LSB, MSB), and we expect + * them each to take about a microsecond on real hardware. + * So we expect a count value of around 100. But we'll be + * generous, and accept anything over 50. + * + * - if the PIT is stuck, and we see *many* more reads, we + * return early (and the next caller of pit_expect_msb() + * then consider it a failure when they don't see the + * next expected value). + * + * These expectations mean that we know that we have seen the + * transition from one expected value to another with a fairly + * high accuracy, and we didn't miss any events. We can thus + * use the TSC value at the transitions to calculate a pretty + * good value for the TSC frequencty. + */ +static inline int pit_expect_msb(unsigned char val) +{ + int count = 0; + for (count = 0; count < 50000; count++) { + /* Ignore LSB */ + inb(0x42); + if (inb(0x42) != val) + break; + } + return count > 50; +} + +/* + * How many MSB values do we want to see? We aim for a + * 15ms calibration, which assuming a 2us counter read + * error should give us roughly 150 ppm precision for + * the calibration. + */ +#define QUICK_PIT_MS 15 +#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) + +static unsigned long quick_pit_calibrate(void) +{ + /* Set the Gate high, disable speaker */ + outb((inb(0x61) & ~0x02) | 0x01, 0x61); + + /* + * Counter 2, mode 0 (one-shot), binary count + * + * NOTE! Mode 2 decrements by two (and then the + * output is flipped each time, giving the same + * final output frequency as a decrement-by-one), + * so mode 0 is much better when looking at the + * individual counts. + */ + outb(0xb0, 0x43); + + /* Start at 0xffff */ + outb(0xff, 0x42); + outb(0xff, 0x42); + + if (pit_expect_msb(0xff)) { + int i; + u64 t1, t2, delta; + unsigned char expect = 0xfe; + + t1 = get_cycles(); + for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) { + if (!pit_expect_msb(expect)) + goto failed; + } + t2 = get_cycles(); + + /* + * Make sure we can rely on the second TSC timestamp: + */ + if (!pit_expect_msb(expect)) + goto failed; + + /* + * Ok, if we get here, then we've seen the + * MSB of the PIT decrement QUICK_PIT_ITERATIONS + * times, and each MSB had many hits, so we never + * had any sudden jumps. + * + * As a result, we can depend on there not being + * any odd delays anywhere, and the TSC reads are + * reliable. + * + * kHz = ticks / time-in-seconds / 1000; + * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000 + * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000) + */ + delta = (t2 - t1)*PIT_TICK_RATE; + do_div(delta, QUICK_PIT_ITERATIONS*256*1000); + printk("Fast TSC calibration using PIT\n"); + return delta; + } +failed: + return 0; +} + +/** + * native_calibrate_tsc - calibrate the tsc on boot + */ +unsigned long native_calibrate_tsc(void) +{ + u64 tsc1, tsc2, delta, ref1, ref2; + unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; + unsigned long flags, latch, ms, fast_calibrate; + int hpet = is_hpet_enabled(), i, loopmin; + + local_irq_save(flags); + fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); + if (fast_calibrate) + return fast_calibrate; /* - * Preset the result with the raw and inaccurate PIT - * calibration value + * Run 5 calibration loops to get the lowest frequency value + * (the best estimate). We use two different calibration modes + * here: + * + * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and + * load a timeout of 50ms. We read the time right after we + * started the timer and wait until the PIT count down reaches + * zero. In each wait loop iteration we read the TSC and check + * the delta to the previous read. We keep track of the min + * and max values of that delta. The delta is mostly defined + * by the IO time of the PIT access, so we can detect when a + * SMI/SMM disturbance happend between the two reads. If the + * maximum time is significantly larger than the minimum time, + * then we discard the result and have another try. + * + * 2) Reference counter. If available we use the HPET or the + * PMTIMER as a reference to check the sanity of that value. + * We use separate TSC readouts and check inside of the + * reference read for a SMI/SMM disturbance. We dicard + * disturbed values here as well. We do that around the PIT + * calibration delay loop as we have to wait for a certain + * amount of time anyway. */ - delta = (tr2 - tr1); - do_div(delta, 50); - tsc_khz_val = delta; - - /* hpet or pmtimer available ? */ - if (!hpet && !pm1 && !pm2) { - printk(KERN_INFO "TSC calibrated against PIT\n"); - goto out; + + /* Preset PIT loop values */ + latch = CAL_LATCH; + ms = CAL_MS; + loopmin = CAL_PIT_LOOPS; + + for (i = 0; i < 3; i++) { + unsigned long tsc_pit_khz; + + /* + * Read the start value and the reference count of + * hpet/pmtimer when available. Then do the PIT + * calibration, which will take at least 50ms, and + * read the end value. + */ + local_irq_save(flags); + tsc1 = tsc_read_refs(&ref1, hpet); + tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin); + tsc2 = tsc_read_refs(&ref2, hpet); + local_irq_restore(flags); + + /* Pick the lowest PIT TSC calibration so far */ + tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); + + /* hpet or pmtimer available ? */ + if (!hpet && !ref1 && !ref2) + continue; + + /* Check, whether the sampling was disturbed by an SMI */ + if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) + continue; + + tsc2 = (tsc2 - tsc1) * 1000000LL; + if (hpet) + tsc2 = calc_hpet_ref(tsc2, ref1, ref2); + else + tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2); + + tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); + + /* Check the reference deviation */ + delta = ((u64) tsc_pit_min) * 100; + do_div(delta, tsc_ref_min); + + /* + * If both calibration results are inside a 10% window + * then we can be sure, that the calibration + * succeeded. We break out of the loop right away. We + * use the reference value, as it is more precise. + */ + if (delta >= 90 && delta <= 110) { + printk(KERN_INFO + "TSC: PIT calibration matches %s. %d loops\n", + hpet ? "HPET" : "PMTIMER", i + 1); + return tsc_ref_min; + } + + /* + * Check whether PIT failed more than once. This + * happens in virtualized environments. We need to + * give the virtual PC a slightly longer timeframe for + * the HPET/PMTIMER to make the result precise. + */ + if (i == 1 && tsc_pit_min == ULONG_MAX) { + latch = CAL2_LATCH; + ms = CAL2_MS; + loopmin = CAL2_PIT_LOOPS; + } } - /* Check, whether the sampling was disturbed by an SMI */ - if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) { - printk(KERN_WARNING "TSC calibration disturbed by SMI, " - "using PIT calibration result\n"); - goto out; + /* + * Now check the results. + */ + if (tsc_pit_min == ULONG_MAX) { + /* PIT gave no useful value */ + printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); + + /* We don't have an alternative source, disable TSC */ + if (!hpet && !ref1 && !ref2) { + printk("TSC: No reference (HPET/PMTIMER) available\n"); + return 0; + } + + /* The alternative source failed as well, disable TSC */ + if (tsc_ref_min == ULONG_MAX) { + printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " + "failed.\n"); + return 0; + } + + /* Use the alternative source */ + printk(KERN_INFO "TSC: using %s reference calibration\n", + hpet ? "HPET" : "PMTIMER"); + + return tsc_ref_min; } - tsc2 = (tsc2 - tsc1) * 1000000LL; - - if (hpet) { - printk(KERN_INFO "TSC calibrated against HPET\n"); - if (hpet2 < hpet1) - hpet2 += 0x100000000ULL; - hpet2 -= hpet1; - tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); - do_div(tsc1, 1000000); - } else { - printk(KERN_INFO "TSC calibrated against PM_TIMER\n"); - if (pm2 < pm1) - pm2 += (u64)ACPI_PM_OVRRUN; - pm2 -= pm1; - tsc1 = pm2 * 1000000000LL; - do_div(tsc1, PMTMR_TICKS_PER_SEC); + /* We don't have an alternative source, use the PIT calibration value */ + if (!hpet && !ref1 && !ref2) { + printk(KERN_INFO "TSC: Using PIT calibration value\n"); + return tsc_pit_min; } - do_div(tsc2, tsc1); - tsc_khz_val = tsc2; + /* The alternative source failed, use the PIT calibration value */ + if (tsc_ref_min == ULONG_MAX) { + printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. " + "Using PIT calibration\n"); + return tsc_pit_min; + } -out: - return tsc_khz_val; + /* + * The calibration values differ too much. In doubt, we use + * the PIT value as we know that there are PMTIMERs around + * running at double speed. At least we let the user know: + */ + printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", + hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); + printk(KERN_INFO "TSC: Using PIT calibration value\n"); + return tsc_pit_min; } - #ifdef CONFIG_X86_32 /* Only called from the Powernow K7 cpu freq driver */ int recalibrate_cpu_khz(void) @@ -314,7 +624,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, mark_tsc_unstable("cpufreq changes"); } - set_cyc2ns_scale(tsc_khz_ref, freq->cpu); + set_cyc2ns_scale(tsc_khz, freq->cpu); return 0; } @@ -325,6 +635,10 @@ static struct notifier_block time_cpufreq_notifier_block = { static int __init cpufreq_tsc(void) { + if (!cpu_has_tsc) + return 0; + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return 0; cpufreq_register_notifier(&time_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); return 0; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 0577825cf89..9ffb01c31c4 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -88,11 +88,9 @@ static __cpuinit void check_tsc_warp(void) __raw_spin_unlock(&sync_lock); } } - if (!(now-start)) { - printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", + WARN(!(now-start), + "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", now-start, end-start); - WARN_ON(1); - } } /* diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 594ef47f0a6..61a97e616f7 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -25,45 +25,31 @@ #include <asm/visws/cobalt.h> #include <asm/visws/piix4.h> #include <asm/arch_hooks.h> +#include <asm/io_apic.h> #include <asm/fixmap.h> #include <asm/reboot.h> #include <asm/setup.h> #include <asm/e820.h> -#include <asm/smp.h> #include <asm/io.h> #include <mach_ipi.h> #include "mach_apic.h" -#include <linux/init.h> -#include <linux/smp.h> - #include <linux/kernel_stat.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <asm/io.h> -#include <asm/apic.h> #include <asm/i8259.h> #include <asm/irq_vectors.h> -#include <asm/visws/cobalt.h> #include <asm/visws/lithium.h> -#include <asm/visws/piix4.h> #include <linux/sched.h> #include <linux/kernel.h> -#include <linux/init.h> #include <linux/pci.h> #include <linux/pci_ids.h> extern int no_broadcast; -#include <asm/io.h> #include <asm/apic.h> -#include <asm/arch_hooks.h> -#include <asm/visws/cobalt.h> -#include <asm/visws/lithium.h> char visws_board_type = -1; char visws_board_rev = -1; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 38f566fa27d..4eeb5cf9720 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -46,6 +46,7 @@ #include <asm/io.h> #include <asm/tlbflush.h> #include <asm/irq.h> +#include <asm/syscalls.h> /* * Known problems: diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 6ca515d6db5..8b6c393ab9f 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -235,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { u32 *ldt_entry = (u32 *)desc; - vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); + vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); } static void vmi_load_sp0(struct tss_struct *tss, @@ -393,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) } #endif -static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) +static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) { vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); } -static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) +static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn) { /* * This call comes in very early, before mem_map is setup. @@ -410,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); } -static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) +static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) { vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); vmi_check_page_type(clonepfn, VMI_PAGE_L2); vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); } -static void vmi_release_pte(u32 pfn) +static void vmi_release_pte(unsigned long pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L1); vmi_set_page_type(pfn, VMI_PAGE_NORMAL); } -static void vmi_release_pmd(u32 pfn) +static void vmi_release_pmd(unsigned long pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L2); vmi_set_page_type(pfn, VMI_PAGE_NORMAL); @@ -905,8 +905,8 @@ static inline int __init activate_vmi(void) #endif #ifdef CONFIG_X86_LOCAL_APIC - para_fill(pv_apic_ops.apic_read, APICRead); - para_fill(pv_apic_ops.apic_write, APICWrite); + para_fill(apic_ops->read, APICRead); + para_fill(apic_ops->write, APICWrite); #endif /* diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index af5bdad8460..a9b8560adbc 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -140,10 +140,10 @@ SECTIONS *(.con_initcall.init) __con_initcall_end = .; } - .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { - __x86cpuvendor_start = .; - *(.x86cpuvendor.init) - __x86cpuvendor_end = .; + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; } SECURITY_INIT . = ALIGN(4); @@ -180,6 +180,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { __per_cpu_start = .; + *(.data.percpu.page_aligned) *(.data.percpu) *(.data.percpu.shared_aligned) __per_cpu_end = .; diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 63e5c1a22e8..46e05447405 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -168,12 +168,11 @@ SECTIONS *(.con_initcall.init) } __con_initcall_end = .; - . = ALIGN(16); - __x86cpuvendor_start = .; - .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { - *(.x86cpuvendor.init) + __x86_cpu_dev_start = .; + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + *(.x86_cpu_dev.init) } - __x86cpuvendor_end = .; + __x86_cpu_dev_end = .; SECURITY_INIT . = ALIGN(8); diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 0c029e8959c..7766d36983f 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -61,7 +61,7 @@ static void vsmp_irq_enable(void) native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); } -static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, +static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { switch (type) { diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c new file mode 100644 index 00000000000..9abac8a9d82 --- /dev/null +++ b/arch/x86/kernel/xsave.c @@ -0,0 +1,345 @@ +/* + * xsave/xrstor support. + * + * Author: Suresh Siddha <suresh.b.siddha@intel.com> + */ +#include <linux/bootmem.h> +#include <linux/compat.h> +#include <asm/i387.h> +#ifdef CONFIG_IA32_EMULATION +#include <asm/sigcontext32.h> +#endif +#include <asm/xcr.h> + +/* + * Supported feature mask by the CPU and the kernel. + */ +u64 pcntxt_mask; + +struct _fpx_sw_bytes fx_sw_reserved; +#ifdef CONFIG_IA32_EMULATION +struct _fpx_sw_bytes fx_sw_reserved_ia32; +#endif + +/* + * Check for the presence of extended state information in the + * user fpstate pointer in the sigcontext. + */ +int check_for_xstate(struct i387_fxsave_struct __user *buf, + void __user *fpstate, + struct _fpx_sw_bytes *fx_sw_user) +{ + int min_xstate_size = sizeof(struct i387_fxsave_struct) + + sizeof(struct xsave_hdr_struct); + unsigned int magic2; + int err; + + err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], + sizeof(struct _fpx_sw_bytes)); + + if (err) + return err; + + /* + * First Magic check failed. + */ + if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) + return -1; + + /* + * Check for error scenarios. + */ + if (fx_sw_user->xstate_size < min_xstate_size || + fx_sw_user->xstate_size > xstate_size || + fx_sw_user->xstate_size > fx_sw_user->extended_size) + return -1; + + err = __get_user(magic2, (__u32 *) (((void *)fpstate) + + fx_sw_user->extended_size - + FP_XSTATE_MAGIC2_SIZE)); + /* + * Check for the presence of second magic word at the end of memory + * layout. This detects the case where the user just copied the legacy + * fpstate layout with out copying the extended state information + * in the memory layout. + */ + if (err || magic2 != FP_XSTATE_MAGIC2) + return -1; + + return 0; +} + +#ifdef CONFIG_X86_64 +/* + * Signal frame handlers. + */ + +int save_i387_xstate(void __user *buf) +{ + struct task_struct *tsk = current; + int err = 0; + + if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size)) + return -EACCES; + + BUG_ON(sig_xstate_size < xstate_size); + + if ((unsigned long)buf % 64) + printk("save_i387_xstate: bad fpstate %p\n", buf); + + if (!used_math()) + return 0; + clear_used_math(); /* trigger finit */ + if (task_thread_info(tsk)->status & TS_USEDFPU) { + /* + * Start with clearing the user buffer. This will present a + * clean context for the bytes not touched by the fxsave/xsave. + */ + err = __clear_user(buf, sig_xstate_size); + if (err) + return err; + + if (task_thread_info(tsk)->status & TS_XSAVE) + err = xsave_user(buf); + else + err = fxsave_user(buf); + + if (err) + return err; + task_thread_info(tsk)->status &= ~TS_USEDFPU; + stts(); + } else { + if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, + xstate_size)) + return -1; + } + + if (task_thread_info(tsk)->status & TS_XSAVE) { + struct _fpstate __user *fx = buf; + struct _xstate __user *x = buf; + u64 xstate_bv; + + err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved, + sizeof(struct _fpx_sw_bytes)); + + err |= __put_user(FP_XSTATE_MAGIC2, + (__u32 __user *) (buf + sig_xstate_size + - FP_XSTATE_MAGIC2_SIZE)); + + /* + * Read the xstate_bv which we copied (directly from the cpu or + * from the state in task struct) to the user buffers and + * set the FP/SSE bits. + */ + err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv); + + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. This will + * enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xstate_bv in the xsave header. + * + * xsave aware apps can change the xstate_bv in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + xstate_bv |= XSTATE_FPSSE; + + err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv); + + if (err) + return err; + } + + return 1; +} + +/* + * Restore the extended state if present. Otherwise, restore the FP/SSE + * state. + */ +int restore_user_xstate(void __user *buf) +{ + struct _fpx_sw_bytes fx_sw_user; + u64 mask; + int err; + + if (((unsigned long)buf % 64) || + check_for_xstate(buf, buf, &fx_sw_user)) + goto fx_only; + + mask = fx_sw_user.xstate_bv; + + /* + * restore the state passed by the user. + */ + err = xrestore_user(buf, mask); + if (err) + return err; + + /* + * init the state skipped by the user. + */ + mask = pcntxt_mask & ~mask; + + xrstor_state(init_xstate_buf, mask); + + return 0; + +fx_only: + /* + * couldn't find the extended state information in the + * memory layout. Restore just the FP/SSE and init all + * the other extended state. + */ + xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE); + return fxrstor_checking((__force struct i387_fxsave_struct *)buf); +} + +/* + * This restores directly out of user space. Exceptions are handled. + */ +int restore_i387_xstate(void __user *buf) +{ + struct task_struct *tsk = current; + int err = 0; + + if (!buf) { + if (used_math()) + goto clear; + return 0; + } else + if (!access_ok(VERIFY_READ, buf, sig_xstate_size)) + return -EACCES; + + if (!used_math()) { + err = init_fpu(tsk); + if (err) + return err; + } + + if (!(task_thread_info(current)->status & TS_USEDFPU)) { + clts(); + task_thread_info(current)->status |= TS_USEDFPU; + } + if (task_thread_info(tsk)->status & TS_XSAVE) + err = restore_user_xstate(buf); + else + err = fxrstor_checking((__force struct i387_fxsave_struct *) + buf); + if (unlikely(err)) { + /* + * Encountered an error while doing the restore from the + * user buffer, clear the fpu state. + */ +clear: + clear_fpu(tsk); + clear_used_math(); + } + return err; +} +#endif + +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed by the fpstate pointer in the sigcontext. + * This will be saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +void prepare_fx_sw_frame(void) +{ + int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) + + FP_XSTATE_MAGIC2_SIZE; + + sig_xstate_size = sizeof(struct _fpstate) + size_extended; + +#ifdef CONFIG_IA32_EMULATION + sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended; +#endif + + memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved)); + + fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; + fx_sw_reserved.extended_size = sig_xstate_size; + fx_sw_reserved.xstate_bv = pcntxt_mask; + fx_sw_reserved.xstate_size = xstate_size; +#ifdef CONFIG_IA32_EMULATION + memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved, + sizeof(struct _fpx_sw_bytes)); + fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size; +#endif +} + +/* + * Represents init state for the supported extended state. + */ +struct xsave_struct *init_xstate_buf; + +#ifdef CONFIG_X86_64 +unsigned int sig_xstate_size = sizeof(struct _fpstate); +#endif + +/* + * Enable the extended processor state save/restore feature + */ +void __cpuinit xsave_init(void) +{ + if (!cpu_has_xsave) + return; + + set_in_cr4(X86_CR4_OSXSAVE); + + /* + * Enable all the features that the HW is capable of + * and the Linux kernel is aware of. + */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); +} + +/* + * setup the xstate image representing the init state + */ +static void __init setup_xstate_init(void) +{ + init_xstate_buf = alloc_bootmem(xstate_size); + init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; +} + +/* + * Enable and initialize the xsave feature. + */ +void __init xsave_cntxt_init(void) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + pcntxt_mask = eax + ((u64)edx << 32); + + if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { + printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n", + pcntxt_mask); + BUG(); + } + + /* + * for now OS knows only about FP/SSE + */ + pcntxt_mask = pcntxt_mask & XCNTXT_MASK; + xsave_init(); + + /* + * Recompute the context size for enabled features + */ + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + xstate_size = ebx; + + prepare_fx_sw_frame(); + + setup_xstate_init(); + + printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, " + "cntxt size 0x%x\n", + pcntxt_mask, xstate_size); +} |