diff options
Diffstat (limited to 'arch/x86/kernel')
104 files changed, 2947 insertions, 1679 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index da140611bb5..3db651fc8ec 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -7,9 +7,10 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) ifdef CONFIG_FTRACE -# Do not profile debug utilities +# Do not profile debug and lowlevel utilities CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg +CFLAGS_REMOVE_paravirt.o = -pg endif # @@ -102,6 +103,7 @@ obj-$(CONFIG_OLPC) += olpc.o # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o + obj-y += bios_uv.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index f489d7a9be9..bfd10fd211c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -97,6 +97,8 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #warning ACPI uses CMPXCHG, i486 and later hardware #endif +static int acpi_mcfg_64bit_base_addr __initdata = FALSE; + /* -------------------------------------------------------------------------- Boot-time Configuration -------------------------------------------------------------------------- */ @@ -158,6 +160,14 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) struct acpi_mcfg_allocation *pci_mmcfg_config; int pci_mmcfg_config_num; +static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) +{ + if (!strcmp(mcfg->header.oem_id, "SGI")) + acpi_mcfg_64bit_base_addr = TRUE; + + return 0; +} + int __init acpi_parse_mcfg(struct acpi_table_header *header) { struct acpi_table_mcfg *mcfg; @@ -190,8 +200,12 @@ int __init acpi_parse_mcfg(struct acpi_table_header *header) } memcpy(pci_mmcfg_config, &mcfg[1], config_size); + + acpi_mcfg_oem_check(mcfg); + for (i = 0; i < pci_mmcfg_config_num; ++i) { - if (pci_mmcfg_config[i].address > 0xFFFFFFFF) { + if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && + !acpi_mcfg_64bit_base_addr) { printk(KERN_ERR PREFIX "MMCONFIG not in low 4GB of memory\n"); kfree(pci_mmcfg_config); @@ -1021,7 +1035,7 @@ void __init mp_config_acpi_legacy_irqs(void) mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; #endif set_bit(MP_ISA_BUS, mp_bus_not_pci); - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + pr_debug("Bus #%d is ISA\n", MP_ISA_BUS); #ifdef CONFIG_X86_ES7000 /* @@ -1127,8 +1141,8 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) return gsi; } if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); + pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n", + mp_ioapic_routing[ioapic].apic_id, ioapic_pin); #ifdef CONFIG_X86_32 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); #else diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index a3ddad18aaa..426e5d91b63 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -20,7 +20,7 @@ unsigned long acpi_realmode_flags; /* address in low memory of the wakeup routine. */ static unsigned long acpi_realmode; -#ifdef CONFIG_64BIT +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) static char temp_stack[10240]; #endif @@ -86,7 +86,7 @@ int acpi_save_state_mem(void) #endif /* !CONFIG_64BIT */ header->pmode_cr0 = read_cr0(); - header->pmode_cr4 = read_cr4(); + header->pmode_cr4 = read_cr4_safe(); header->realmode_flags = acpi_realmode_flags; header->real_magic = 0x12345678; @@ -150,6 +150,10 @@ static int __init acpi_sleep_setup(char *str) acpi_realmode_flags |= 2; if (strncmp(str, "s3_beep", 7) == 0) acpi_realmode_flags |= 4; +#ifdef CONFIG_HIBERNATION + if (strncmp(str, "s4_nohwsig", 10) == 0) + acpi_no_s4_hw_signature(); +#endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); str = strchr(str, ','); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2763cb37b55..65a0c1b4869 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -145,35 +145,25 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { extern char __vsyscall_0; const unsigned char *const *find_nop_table(void) { - return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_has(X86_FEATURE_NOPL)) + return p6_nops; + else + return k8_nops; } #else /* CONFIG_X86_64 */ -static const struct nop { - int cpuid; - const unsigned char *const *noptable; -} noptypes[] = { - { X86_FEATURE_K8, k8_nops }, - { X86_FEATURE_K7, k7_nops }, - { X86_FEATURE_P4, p6_nops }, - { X86_FEATURE_P3, p6_nops }, - { -1, NULL } -}; - const unsigned char *const *find_nop_table(void) { - const unsigned char *const *noptable = intel_nops; - int i; - - for (i = 0; noptypes[i].cpuid >= 0; i++) { - if (boot_cpu_has(noptypes[i].cpuid)) { - noptable = noptypes[i].noptable; - break; - } - } - return noptable; + if (boot_cpu_has(X86_FEATURE_K8)) + return k8_nops; + else if (boot_cpu_has(X86_FEATURE_K7)) + return k7_nops; + else if (boot_cpu_has(X86_FEATURE_NOPL)) + return p6_nops; + else + return intel_nops; } #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f2766d84c7a..042fdc27bc9 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -23,36 +23,49 @@ #include <linux/scatterlist.h> #include <linux/iommu-helper.h> #include <asm/proto.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) -#define to_pages(addr, size) \ - (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) +#define EXIT_LOOP_COUNT 10000000 static DEFINE_RWLOCK(amd_iommu_devtable_lock); -struct command { +/* + * general struct to manage commands send to an IOMMU + */ +struct iommu_cmd { u32 data[4]; }; static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); +/* returns !0 if the IOMMU is caching non-present entries in its TLB */ static int iommu_has_npcache(struct amd_iommu *iommu) { return iommu->cap & IOMMU_CAP_NPCACHE; } -static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) +/**************************************************************************** + * + * IOMMU command queuing functions + * + ****************************************************************************/ + +/* + * Writes the command to the IOMMUs command buffer and informs the + * hardware about the new command. Must be called with iommu->lock held. + */ +static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { u32 tail, head; u8 *target; tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); - target = (iommu->cmd_buf + tail); + target = iommu->cmd_buf + tail; memcpy_toio(target, cmd, sizeof(*cmd)); tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); @@ -63,7 +76,11 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) return 0; } -static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) +/* + * General queuing function for commands. Takes iommu->lock and calls + * __iommu_queue_command(). + */ +static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { unsigned long flags; int ret; @@ -75,35 +92,59 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) return ret; } +/* + * This function is called whenever we need to ensure that the IOMMU has + * completed execution of all commands we sent. It sends a + * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs + * us about that by writing a value to a physical address we pass with + * the command. + */ static int iommu_completion_wait(struct amd_iommu *iommu) { - int ret; - struct command cmd; - volatile u64 ready = 0; - unsigned long ready_phys = virt_to_phys(&ready); + int ret = 0, ready = 0; + unsigned status = 0; + struct iommu_cmd cmd; + unsigned long flags, i = 0; memset(&cmd, 0, sizeof(cmd)); - cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; - cmd.data[1] = HIGH_U32(ready_phys); - cmd.data[2] = 1; /* value written to 'ready' */ + cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); iommu->need_sync = 0; - ret = iommu_queue_command(iommu, &cmd); + spin_lock_irqsave(&iommu->lock, flags); + + ret = __iommu_queue_command(iommu, &cmd); if (ret) - return ret; + goto out; + + while (!ready && (i < EXIT_LOOP_COUNT)) { + ++i; + /* wait for the bit to become one */ + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); + ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; + } - while (!ready) - cpu_relax(); + /* set bit back to zero */ + status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; + writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); + + if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) + printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); +out: + spin_unlock_irqrestore(&iommu->lock, flags); return 0; } +/* + * Command send function for invalidating a device table entry + */ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) { - struct command cmd; + struct iommu_cmd cmd; + int ret; BUG_ON(iommu == NULL); @@ -111,37 +152,50 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); cmd.data[0] = devid; + ret = iommu_queue_command(iommu, &cmd); + iommu->need_sync = 1; - return iommu_queue_command(iommu, &cmd); + return ret; } +/* + * Generic command send function for invalidaing TLB entries + */ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, u64 address, u16 domid, int pde, int s) { - struct command cmd; + struct iommu_cmd cmd; + int ret; memset(&cmd, 0, sizeof(cmd)); address &= PAGE_MASK; CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); cmd.data[1] |= domid; - cmd.data[2] = LOW_U32(address); - cmd.data[3] = HIGH_U32(address); - if (s) + cmd.data[2] = lower_32_bits(address); + cmd.data[3] = upper_32_bits(address); + if (s) /* size bit - we flush more than one 4kb page */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - if (pde) + if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + ret = iommu_queue_command(iommu, &cmd); + iommu->need_sync = 1; - return iommu_queue_command(iommu, &cmd); + return ret; } +/* + * TLB invalidation function which is called from the mapping functions. + * It invalidates a single PTE if the range to flush is within a single + * page. Otherwise it flushes the whole TLB of the IOMMU. + */ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, u64 address, size_t size) { int s = 0; - unsigned pages = to_pages(address, size); + unsigned pages = iommu_num_pages(address, size); address &= PAGE_MASK; @@ -159,6 +213,20 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, return 0; } +/**************************************************************************** + * + * The functions below are used the create the page table mappings for + * unity mapped regions. + * + ****************************************************************************/ + +/* + * Generic mapping functions. It maps a physical address into a DMA + * address space. It allocates the page table pages if necessary. + * In the future it can be extended to a generic mapping function + * supporting all features of AMD IOMMU page tables like level skipping + * and full 64 bit address spaces. + */ static int iommu_map(struct protection_domain *dom, unsigned long bus_addr, unsigned long phys_addr, @@ -209,6 +277,10 @@ static int iommu_map(struct protection_domain *dom, return 0; } +/* + * This function checks if a specific unity mapping entry is needed for + * this specific IOMMU. + */ static int iommu_for_unity_map(struct amd_iommu *iommu, struct unity_map_entry *entry) { @@ -223,6 +295,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu, return 0; } +/* + * Init the unity mappings for a specific IOMMU in the system + * + * Basically iterates over all unity mapping entries and applies them to + * the default domain DMA of that IOMMU if necessary. + */ static int iommu_init_unity_mappings(struct amd_iommu *iommu) { struct unity_map_entry *entry; @@ -239,6 +317,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu) return 0; } +/* + * This function actually applies the mapping to the page table of the + * dma_ops domain. + */ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e) { @@ -261,6 +343,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, return 0; } +/* + * Inits the unity mappings required for a specific device + */ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, u16 devid) { @@ -278,12 +363,26 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, return 0; } +/**************************************************************************** + * + * The next functions belong to the address allocator for the dma_ops + * interface functions. They work like the allocators in the other IOMMU + * drivers. Its basically a bitmap which marks the allocated pages in + * the aperture. Maybe it could be enhanced in the future to a more + * efficient allocator. + * + ****************************************************************************/ static unsigned long dma_mask_to_pages(unsigned long mask) { return (mask >> PAGE_SHIFT) + (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); } +/* + * The address allocator core function. + * + * called with domain->lock held + */ static unsigned long dma_ops_alloc_addresses(struct device *dev, struct dma_ops_domain *dom, unsigned int pages) @@ -317,6 +416,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, return address; } +/* + * The address free function. + * + * called with domain->lock held + */ static void dma_ops_free_addresses(struct dma_ops_domain *dom, unsigned long address, unsigned int pages) @@ -325,6 +429,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, iommu_area_free(dom->bitmap, address, pages); } +/**************************************************************************** + * + * The next functions belong to the domain allocation. A domain is + * allocated for every IOMMU as the default domain. If device isolation + * is enabled, every device get its own domain. The most important thing + * about domains is the page table mapping the DMA address space they + * contain. + * + ****************************************************************************/ + static u16 domain_id_alloc(void) { unsigned long flags; @@ -342,6 +456,10 @@ static u16 domain_id_alloc(void) return id; } +/* + * Used to reserve address ranges in the aperture (e.g. for exclusion + * ranges. + */ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages) @@ -382,6 +500,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) free_page((unsigned long)p1); } +/* + * Free a domain, only used if something went wrong in the + * allocation path and we need to free an already allocated page table + */ static void dma_ops_domain_free(struct dma_ops_domain *dom) { if (!dom) @@ -396,6 +518,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) kfree(dom); } +/* + * Allocates a new protection domain usable for the dma_ops functions. + * It also intializes the page table and the address allocator data + * structures required for the dma_ops interface + */ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, unsigned order) { @@ -436,14 +563,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->bitmap[0] = 1; dma_dom->next_bit = 0; + /* Intialize the exclusion range if necessary */ if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = to_pages(iommu->exclusion_start, - iommu->exclusion_length); + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length); dma_ops_reserve_addresses(dma_dom, startpage, pages); } + /* + * At the last step, build the page tables so we don't need to + * allocate page table pages in the dma_ops mapping/unmapping + * path. + */ num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), GFP_KERNEL); @@ -472,6 +605,10 @@ free_dma_dom: return NULL; } +/* + * Find out the protection domain structure for a given PCI device. This + * will give us the pointer to the page table root for example. + */ static struct protection_domain *domain_for_device(u16 devid) { struct protection_domain *dom; @@ -484,6 +621,10 @@ static struct protection_domain *domain_for_device(u16 devid) return dom; } +/* + * If a device is not yet associated with a domain, this function does + * assigns it visible for the hardware + */ static void set_device_domain(struct amd_iommu *iommu, struct protection_domain *domain, u16 devid) @@ -508,6 +649,19 @@ static void set_device_domain(struct amd_iommu *iommu, iommu->need_sync = 1; } +/***************************************************************************** + * + * The next functions belong to the dma_ops mapping/unmapping code. + * + *****************************************************************************/ + +/* + * In the dma_ops path we only have the struct device. This function + * finds the corresponding IOMMU, the protection domain and the + * requestor id for a given device. + * If the device is not yet associated with a domain this is also done + * in this function. + */ static int get_device_resources(struct device *dev, struct amd_iommu **iommu, struct protection_domain **domain, @@ -520,9 +674,10 @@ static int get_device_resources(struct device *dev, BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); pcidev = to_pci_dev(dev); - _bdf = (pcidev->bus->number << 8) | pcidev->devfn; + _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - if (_bdf >= amd_iommu_last_bdf) { + /* device not translated by any IOMMU in the system? */ + if (_bdf > amd_iommu_last_bdf) { *iommu = NULL; *domain = NULL; *bdf = 0xffff; @@ -547,6 +702,10 @@ static int get_device_resources(struct device *dev, return 1; } +/* + * This is the generic map function. It maps one 4kb page at paddr to + * the given address in the DMA address space for the domain. + */ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address, @@ -578,6 +737,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, return (dma_addr_t)address; } +/* + * The generic unmapping function for on page in the DMA address space. + */ static void dma_ops_domain_unmap(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address) @@ -597,6 +759,12 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, *pte = 0ULL; } +/* + * This function contains common code for mapping of a physically + * contiguous memory region into DMA address space. It is uses by all + * mapping functions provided by this IOMMU driver. + * Must be called with the domain lock held. + */ static dma_addr_t __map_single(struct device *dev, struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, @@ -609,7 +777,7 @@ static dma_addr_t __map_single(struct device *dev, unsigned int pages; int i; - pages = to_pages(paddr, size); + pages = iommu_num_pages(paddr, size); paddr &= PAGE_MASK; address = dma_ops_alloc_addresses(dev, dma_dom, pages); @@ -628,6 +796,10 @@ out: return address; } +/* + * Does the reverse of the __map_single function. Must be called with + * the domain lock held too + */ static void __unmap_single(struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, dma_addr_t dma_addr, @@ -640,7 +812,7 @@ static void __unmap_single(struct amd_iommu *iommu, if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) return; - pages = to_pages(dma_addr, size); + pages = iommu_num_pages(dma_addr, size); dma_addr &= PAGE_MASK; start = dma_addr; @@ -652,6 +824,9 @@ static void __unmap_single(struct amd_iommu *iommu, dma_ops_free_addresses(dma_dom, dma_addr, pages); } +/* + * The exported map_single function for dma_ops. + */ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) { @@ -664,6 +839,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, get_device_resources(dev, &iommu, &domain, &devid); if (iommu == NULL || domain == NULL) + /* device not handled by any AMD IOMMU */ return (dma_addr_t)paddr; spin_lock_irqsave(&domain->lock, flags); @@ -683,6 +859,9 @@ out: return addr; } +/* + * The exported unmap_single function for dma_ops. + */ static void unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, int dir) { @@ -692,6 +871,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, u16 devid; if (!get_device_resources(dev, &iommu, &domain, &devid)) + /* device not handled by any AMD IOMMU */ return; spin_lock_irqsave(&domain->lock, flags); @@ -706,6 +886,10 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, spin_unlock_irqrestore(&domain->lock, flags); } +/* + * This is a special map_sg function which is used if we should map a + * device which is not handled by an AMD IOMMU in the system. + */ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { @@ -720,6 +904,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, return nelems; } +/* + * The exported map_sg function for dma_ops (handles scatter-gather + * lists). + */ static int map_sg(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { @@ -775,6 +963,10 @@ unmap: goto out; } +/* + * The exported map_sg function for dma_ops (handles scatter-gather + * lists). + */ static void unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { @@ -804,6 +996,9 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, spin_unlock_irqrestore(&domain->lock, flags); } +/* + * The exported alloc_coherent function for dma_ops. + */ static void *alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { @@ -851,6 +1046,11 @@ out: return virt_addr; } +/* + * The exported free_coherent function for dma_ops. + * FIXME: fix the generic x86 DMA layer so that it actually calls that + * function. + */ static void free_coherent(struct device *dev, size_t size, void *virt_addr, dma_addr_t dma_addr) { @@ -879,6 +1079,8 @@ free_mem: } /* + * The function for pre-allocating protection domains. + * * If the driver core informs the DMA layer if a driver grabs a device * we don't need to preallocate the protection domains anymore. * For now we have to. @@ -893,7 +1095,7 @@ void prealloc_protection_domains(void) while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { devid = (dev->bus->number << 8) | dev->devfn; - if (devid >= amd_iommu_last_bdf) + if (devid > amd_iommu_last_bdf) continue; devid = amd_iommu_alias_table[devid]; if (domain_for_device(devid)) @@ -921,12 +1123,20 @@ static struct dma_mapping_ops amd_iommu_dma_ops = { .unmap_sg = unmap_sg, }; +/* + * The function which clues the AMD IOMMU driver into dma_ops. + */ int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; int order = amd_iommu_aperture_order; int ret; + /* + * first allocate a default protection domain for every IOMMU we + * found in the system. Devices not assigned to any other + * protection domain will be assigned to the default one. + */ list_for_each_entry(iommu, &amd_iommu_list, list) { iommu->default_dom = dma_ops_domain_alloc(iommu, order); if (iommu->default_dom == NULL) @@ -936,6 +1146,10 @@ int __init amd_iommu_init_dma_ops(void) goto free_domains; } + /* + * If device isolation is enabled, pre-allocate the protection + * domains for each device. + */ if (amd_iommu_isolate) prealloc_protection_domains(); @@ -947,6 +1161,7 @@ int __init amd_iommu_init_dma_ops(void) gart_iommu_aperture = 0; #endif + /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; return 0; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 2a13e430437..a69cc0f5204 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -25,20 +25,13 @@ #include <asm/pci-direct.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> -#include <asm/gart.h> +#include <asm/iommu.h> /* * definitions for the ACPI scanning code */ -#define UPDATE_LAST_BDF(x) do {\ - if ((x) > amd_iommu_last_bdf) \ - amd_iommu_last_bdf = (x); \ - } while (0); - -#define DEVID(bus, devfn) (((bus) << 8) | (devfn)) #define PCI_BUS(x) (((x) >> 8) & 0xff) #define IVRS_HEADER_LENGTH 48 -#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x)))) #define ACPI_IVHD_TYPE 0x10 #define ACPI_IVMD_TYPE_ALL 0x20 @@ -71,6 +64,17 @@ #define ACPI_DEVFLAG_LINT1 0x80 #define ACPI_DEVFLAG_ATSDIS 0x10000000 +/* + * ACPI table definitions + * + * These data structures are laid over the table to parse the important values + * out of it. + */ + +/* + * structure describing one IOMMU in the ACPI table. Typically followed by one + * or more ivhd_entrys. + */ struct ivhd_header { u8 type; u8 flags; @@ -83,6 +87,10 @@ struct ivhd_header { u32 reserved; } __attribute__((packed)); +/* + * A device entry describing which devices a specific IOMMU translates and + * which requestor ids they use. + */ struct ivhd_entry { u8 type; u16 devid; @@ -90,6 +98,10 @@ struct ivhd_entry { u32 ext; } __attribute__((packed)); +/* + * An AMD IOMMU memory definition structure. It defines things like exclusion + * ranges for devices and regions that should be unity mapped. + */ struct ivmd_header { u8 type; u8 flags; @@ -103,22 +115,80 @@ struct ivmd_header { static int __initdata amd_iommu_detected; -u16 amd_iommu_last_bdf; -struct list_head amd_iommu_unity_map; -unsigned amd_iommu_aperture_order = 26; -int amd_iommu_isolate; +u16 amd_iommu_last_bdf; /* largest PCI device id we have + to handle */ +LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings + we find in ACPI */ +unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ +int amd_iommu_isolate; /* if 1, device isolation is enabled */ + +LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the + system */ -struct list_head amd_iommu_list; +/* + * Pointer to the device table which is shared by all AMD IOMMUs + * it is indexed by the PCI device id or the HT unit id and contains + * information about the domain the device belongs to as well as the + * page table root pointer. + */ struct dev_table_entry *amd_iommu_dev_table; + +/* + * The alias table is a driver specific data structure which contains the + * mappings of the PCI device ids to the actual requestor ids on the IOMMU. + * More than one device can share the same requestor id. + */ u16 *amd_iommu_alias_table; + +/* + * The rlookup table is used to find the IOMMU which is responsible + * for a specific device. It is also indexed by the PCI device id. + */ struct amd_iommu **amd_iommu_rlookup_table; + +/* + * The pd table (protection domain table) is used to find the protection domain + * data structure a device belongs to. Indexed with the PCI device id too. + */ struct protection_domain **amd_iommu_pd_table; + +/* + * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap + * to know which ones are already in use. + */ unsigned long *amd_iommu_pd_alloc_bitmap; -static u32 dev_table_size; -static u32 alias_table_size; -static u32 rlookup_table_size; +static u32 dev_table_size; /* size of the device table */ +static u32 alias_table_size; /* size of the alias table */ +static u32 rlookup_table_size; /* size if the rlookup table */ + +static inline void update_last_devid(u16 devid) +{ + if (devid > amd_iommu_last_bdf) + amd_iommu_last_bdf = devid; +} + +static inline unsigned long tbl_size(int entry_size) +{ + unsigned shift = PAGE_SHIFT + + get_order(amd_iommu_last_bdf * entry_size); + + return 1UL << shift; +} + +/**************************************************************************** + * + * AMD IOMMU MMIO register space handling functions + * + * These functions are used to program the IOMMU device registers in + * MMIO space required for that driver. + * + ****************************************************************************/ +/* + * This function set the exclusion range in the IOMMU. DMA accesses to the + * exclusion range are passed through untranslated + */ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) { u64 start = iommu->exclusion_start & PAGE_MASK; @@ -137,6 +207,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) &entry, sizeof(entry)); } +/* Programs the physical address of the device table into the IOMMU hardware */ static void __init iommu_set_device_table(struct amd_iommu *iommu) { u32 entry; @@ -149,6 +220,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu) &entry, sizeof(entry)); } +/* Generic functions to enable/disable certain features of the IOMMU. */ static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; @@ -167,6 +239,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } +/* Function to enable the hardware */ void __init iommu_enable(struct amd_iommu *iommu) { printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); @@ -176,6 +249,10 @@ void __init iommu_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } +/* + * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in + * the system has one. + */ static u8 * __init iommu_map_mmio_space(u64 address) { u8 *ret; @@ -199,16 +276,33 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu) release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); } +/**************************************************************************** + * + * The functions below belong to the first pass of AMD IOMMU ACPI table + * parsing. In this pass we try to find out the highest device id this + * code has to handle. Upon this information the size of the shared data + * structures is determined later. + * + ****************************************************************************/ + +/* + * This function reads the last device id the IOMMU has to handle from the PCI + * capability header for this IOMMU + */ static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) { u32 cap; cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); - UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); + update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); return 0; } +/* + * After reading the highest device id from the IOMMU PCI capability header + * this function looks if there is a higher device id defined in the ACPI table + */ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) { u8 *p = (void *)h, *end = (void *)h; @@ -229,7 +323,8 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) case IVHD_DEV_RANGE_END: case IVHD_DEV_ALIAS: case IVHD_DEV_EXT_SELECT: - UPDATE_LAST_BDF(dev->devid); + /* all the above subfield types refer to device ids */ + update_last_devid(dev->devid); break; default: break; @@ -242,6 +337,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) return 0; } +/* + * Iterate over all IVHD entries in the ACPI table and find the highest device + * id which we need to handle. This is the first of three functions which parse + * the ACPI table. So we check the checksum here. + */ static int __init find_last_devid_acpi(struct acpi_table_header *table) { int i; @@ -277,19 +377,31 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table) return 0; } +/**************************************************************************** + * + * The following functions belong the the code path which parses the ACPI table + * the second time. In this ACPI parsing iteration we allocate IOMMU specific + * data structures, initialize the device/alias/rlookup table and also + * basically initialize the hardware. + * + ****************************************************************************/ + +/* + * Allocates the command buffer. This buffer is per AMD IOMMU. We can + * write commands to that buffer later and the IOMMU will execute them + * asynchronously + */ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) { - u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, + u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(CMD_BUFFER_SIZE)); - u64 entry = 0; + u64 entry; if (cmd_buf == NULL) return NULL; iommu->cmd_buf_size = CMD_BUFFER_SIZE; - memset(cmd_buf, 0, CMD_BUFFER_SIZE); - entry = (u64)virt_to_phys(cmd_buf); entry |= MMIO_CMD_SIZE_512; memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, @@ -302,11 +414,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) static void __init free_command_buffer(struct amd_iommu *iommu) { - if (iommu->cmd_buf) - free_pages((unsigned long)iommu->cmd_buf, - get_order(CMD_BUFFER_SIZE)); + free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); } +/* sets a specific bit in the device table entry. */ static void set_dev_entry_bit(u16 devid, u8 bit) { int i = (bit >> 5) & 0x07; @@ -315,7 +426,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit) amd_iommu_dev_table[devid].data[i] |= (1 << _bit); } -static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) +/* Writes the specific IOMMU for a device into the rlookup table */ +static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) +{ + amd_iommu_rlookup_table[devid] = iommu; +} + +/* + * This function takes the device specific flags read from the ACPI + * table and sets up the device table entry with that information + */ +static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, + u16 devid, u32 flags, u32 ext_flags) { if (flags & ACPI_DEVFLAG_INITPASS) set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); @@ -331,13 +453,14 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); if (flags & ACPI_DEVFLAG_LINT1) set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); -} -static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) -{ - amd_iommu_rlookup_table[devid] = iommu; + set_iommu_for_device(iommu, devid); } +/* + * Reads the device exclusion range from ACPI and initialize IOMMU with + * it + */ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) { struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; @@ -346,12 +469,22 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) return; if (iommu) { + /* + * We only can configure exclusion ranges per IOMMU, not + * per device. But we can enable the exclusion range per + * device. This is done here + */ set_dev_entry_bit(m->devid, DEV_ENTRY_EX); iommu->exclusion_start = m->range_start; iommu->exclusion_length = m->range_length; } } +/* + * This function reads some important data from the IOMMU PCI space and + * initializes the driver data structure with it. It reads the hardware + * capabilities and the first/last device entries + */ static void __init init_iommu_from_pci(struct amd_iommu *iommu) { int bus = PCI_BUS(iommu->devid); @@ -363,10 +496,16 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); - iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range)); - iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); + iommu->first_device = calc_devid(MMIO_GET_BUS(range), + MMIO_GET_FD(range)); + iommu->last_device = calc_devid(MMIO_GET_BUS(range), + MMIO_GET_LD(range)); } +/* + * Takes a pointer to an AMD IOMMU entry in the ACPI table and + * initializes the hardware and our data structures with it. + */ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, struct ivhd_header *h) { @@ -374,7 +513,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, u8 *end = p, flags = 0; u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; u32 ext_flags = 0; - bool alias = 0; + bool alias = false; struct ivhd_entry *e; /* @@ -414,22 +553,23 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, case IVHD_DEV_ALL: for (dev_i = iommu->first_device; dev_i <= iommu->last_device; ++dev_i) - set_dev_entry_from_acpi(dev_i, e->flags, 0); + set_dev_entry_from_acpi(iommu, dev_i, + e->flags, 0); break; case IVHD_DEV_SELECT: devid = e->devid; - set_dev_entry_from_acpi(devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid, e->flags, 0); break; case IVHD_DEV_SELECT_RANGE_START: devid_start = e->devid; flags = e->flags; ext_flags = 0; - alias = 0; + alias = false; break; case IVHD_DEV_ALIAS: devid = e->devid; devid_to = e->ext >> 8; - set_dev_entry_from_acpi(devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; case IVHD_DEV_ALIAS_RANGE: @@ -437,24 +577,25 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, flags = e->flags; devid_to = e->ext >> 8; ext_flags = 0; - alias = 1; + alias = true; break; case IVHD_DEV_EXT_SELECT: devid = e->devid; - set_dev_entry_from_acpi(devid, e->flags, e->ext); + set_dev_entry_from_acpi(iommu, devid, e->flags, + e->ext); break; case IVHD_DEV_EXT_SELECT_RANGE: devid_start = e->devid; flags = e->flags; ext_flags = e->ext; - alias = 0; + alias = false; break; case IVHD_DEV_RANGE_END: devid = e->devid; for (dev_i = devid_start; dev_i <= devid; ++dev_i) { if (alias) amd_iommu_alias_table[dev_i] = devid_to; - set_dev_entry_from_acpi( + set_dev_entry_from_acpi(iommu, amd_iommu_alias_table[dev_i], flags, ext_flags); } @@ -467,6 +608,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, } } +/* Initializes the device->iommu mapping for the driver */ static int __init init_iommu_devices(struct amd_iommu *iommu) { u16 i; @@ -494,6 +636,11 @@ static void __init free_iommu_all(void) } } +/* + * This function clues the initialization function for one IOMMU + * together and also allocates the command buffer and programs the + * hardware. It does NOT enable the IOMMU. This is done afterwards. + */ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) { spin_lock_init(&iommu->lock); @@ -521,6 +668,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) return 0; } +/* + * Iterates over all IOMMU entries in the ACPI table, allocates the + * IOMMU structure and initializes it with init_iommu_one() + */ static int __init init_iommu_all(struct acpi_table_header *table) { u8 *p = (u8 *)table, *end = (u8 *)table; @@ -528,8 +679,6 @@ static int __init init_iommu_all(struct acpi_table_header *table) struct amd_iommu *iommu; int ret; - INIT_LIST_HEAD(&amd_iommu_list); - end += table->length; p += IVRS_HEADER_LENGTH; @@ -555,6 +704,14 @@ static int __init init_iommu_all(struct acpi_table_header *table) return 0; } +/**************************************************************************** + * + * The next functions belong to the third pass of parsing the ACPI + * table. In this last pass the memory mapping requirements are + * gathered (like exclusion and unity mapping reanges). + * + ****************************************************************************/ + static void __init free_unity_maps(void) { struct unity_map_entry *entry, *next; @@ -565,6 +722,7 @@ static void __init free_unity_maps(void) } } +/* called when we find an exclusion range definition in ACPI */ static int __init init_exclusion_range(struct ivmd_header *m) { int i; @@ -574,7 +732,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) set_device_exclusion_range(m->devid, m); break; case ACPI_IVMD_TYPE_ALL: - for (i = 0; i < amd_iommu_last_bdf; ++i) + for (i = 0; i <= amd_iommu_last_bdf; ++i) set_device_exclusion_range(i, m); break; case ACPI_IVMD_TYPE_RANGE: @@ -588,6 +746,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) return 0; } +/* called for unity map ACPI definition */ static int __init init_unity_map_range(struct ivmd_header *m) { struct unity_map_entry *e = 0; @@ -619,13 +778,12 @@ static int __init init_unity_map_range(struct ivmd_header *m) return 0; } +/* iterates over all memory definitions we find in the ACPI table */ static int __init init_memory_definitions(struct acpi_table_header *table) { u8 *p = (u8 *)table, *end = (u8 *)table; struct ivmd_header *m; - INIT_LIST_HEAD(&amd_iommu_unity_map); - end += table->length; p += IVRS_HEADER_LENGTH; @@ -642,6 +800,25 @@ static int __init init_memory_definitions(struct acpi_table_header *table) return 0; } +/* + * Init the device table to not allow DMA access for devices and + * suppress all page faults + */ +static void init_device_table(void) +{ + u16 devid; + + for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { + set_dev_entry_bit(devid, DEV_ENTRY_VALID); + set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION); + set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT); + } +} + +/* + * This function finally enables all IOMMUs found in the system after + * they have been initialized + */ static void __init enable_iommus(void) { struct amd_iommu *iommu; @@ -678,6 +855,34 @@ static struct sys_device device_amd_iommu = { .cls = &amd_iommu_sysdev_class, }; +/* + * This is the core init function for AMD IOMMU hardware in the system. + * This function is called from the generic x86 DMA layer initialization + * code. + * + * This function basically parses the ACPI table for AMD IOMMU (IVRS) + * three times: + * + * 1 pass) Find the highest PCI device id the driver has to handle. + * Upon this information the size of the data structures is + * determined that needs to be allocated. + * + * 2 pass) Initialize the data structures just allocated with the + * information in the ACPI table about available AMD IOMMUs + * in the system. It also maps the PCI devices in the + * system to specific IOMMUs + * + * 3 pass) After the basic data structures are allocated and + * initialized we update them with information about memory + * remapping requirements parsed out of the ACPI table in + * this last pass. + * + * After that the hardware is initialized and ready to go. In the last + * step we do some Linux specific things like registering the driver in + * the dma_ops interface and initializing the suspend/resume support + * functions. Finally it prints some information about AMD IOMMUs and + * the driver state and enables the hardware. + */ int __init amd_iommu_init(void) { int i, ret = 0; @@ -699,14 +904,14 @@ int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) return -ENODEV; - dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE); - alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE); - rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE); + dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE); + alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE); + rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); ret = -ENOMEM; /* Device table - directly used by all IOMMUs */ - amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(dev_table_size)); if (amd_iommu_dev_table == NULL) goto out; @@ -730,27 +935,26 @@ int __init amd_iommu_init(void) * Protection Domain table - maps devices to protection domains * This table has the same size as the rlookup_table */ - amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(rlookup_table_size)); if (amd_iommu_pd_table == NULL) goto free; - amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( + GFP_KERNEL | __GFP_ZERO, get_order(MAX_DOMAIN_ID/8)); if (amd_iommu_pd_alloc_bitmap == NULL) goto free; + /* init the device table */ + init_device_table(); + /* - * memory is allocated now; initialize the device table with all zeroes - * and let all alias entries point to itself + * let all alias entries point to itself */ - memset(amd_iommu_dev_table, 0, dev_table_size); - for (i = 0; i < amd_iommu_last_bdf; ++i) + for (i = 0; i <= amd_iommu_last_bdf; ++i) amd_iommu_alias_table[i] = i; - memset(amd_iommu_pd_table, 0, rlookup_table_size); - memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8); - /* * never allocate domain 0 because its used as the non-allocated and * error value placeholder @@ -768,15 +972,15 @@ int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", init_memory_definitions) != 0) goto free; - ret = amd_iommu_init_dma_ops(); + ret = sysdev_class_register(&amd_iommu_sysdev_class); if (ret) goto free; - ret = sysdev_class_register(&amd_iommu_sysdev_class); + ret = sysdev_register(&device_amd_iommu); if (ret) goto free; - ret = sysdev_register(&device_amd_iommu); + ret = amd_iommu_init_dma_ops(); if (ret) goto free; @@ -795,24 +999,19 @@ out: return ret; free: - if (amd_iommu_pd_alloc_bitmap) - free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); + free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); - if (amd_iommu_pd_table) - free_pages((unsigned long)amd_iommu_pd_table, - get_order(rlookup_table_size)); + free_pages((unsigned long)amd_iommu_pd_table, + get_order(rlookup_table_size)); - if (amd_iommu_rlookup_table) - free_pages((unsigned long)amd_iommu_rlookup_table, - get_order(rlookup_table_size)); + free_pages((unsigned long)amd_iommu_rlookup_table, + get_order(rlookup_table_size)); - if (amd_iommu_alias_table) - free_pages((unsigned long)amd_iommu_alias_table, - get_order(alias_table_size)); + free_pages((unsigned long)amd_iommu_alias_table, + get_order(alias_table_size)); - if (amd_iommu_dev_table) - free_pages((unsigned long)amd_iommu_dev_table, - get_order(dev_table_size)); + free_pages((unsigned long)amd_iommu_dev_table, + get_order(dev_table_size)); free_iommu_all(); @@ -821,6 +1020,13 @@ free: goto out; } +/**************************************************************************** + * + * Early detect code. This code runs at IOMMU detection time in the DMA + * layer. It just looks if there is an IVRS ACPI table to detect AMD + * IOMMUs + * + ****************************************************************************/ static int __init early_amd_iommu_detect(struct acpi_table_header *table) { return 0; @@ -828,7 +1034,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) void __init amd_iommu_detect(void) { - if (swiotlb || no_iommu || iommu_detected) + if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) return; if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { @@ -841,6 +1047,13 @@ void __init amd_iommu_detect(void) } } +/**************************************************************************** + * + * Parsing functions for the AMD IOMMU specific kernel command line + * options. + * + ****************************************************************************/ + static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { @@ -853,20 +1066,10 @@ static int __init parse_amd_iommu_options(char *str) static int __init parse_amd_iommu_size_options(char *str) { - for (; *str; ++str) { - if (strcmp(str, "32M") == 0) - amd_iommu_aperture_order = 25; - if (strcmp(str, "64M") == 0) - amd_iommu_aperture_order = 26; - if (strcmp(str, "128M") == 0) - amd_iommu_aperture_order = 27; - if (strcmp(str, "256M") == 0) - amd_iommu_aperture_order = 28; - if (strcmp(str, "512M") == 0) - amd_iommu_aperture_order = 29; - if (strcmp(str, "1G") == 0) - amd_iommu_aperture_order = 30; - } + unsigned order = PAGE_SHIFT + get_order(memparse(str, &str)); + + if ((order > 24) && (order < 31)) + amd_iommu_aperture_order = order; return 1; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 9f907806c1a..44e21826db1 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -21,6 +21,7 @@ #include <linux/suspend.h> #include <asm/e820.h> #include <asm/io.h> +#include <asm/iommu.h> #include <asm/gart.h> #include <asm/pci-direct.h> #include <asm/dma.h> diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index a437d027f20..f88bd0d982b 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -75,7 +75,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; /* * Debug level, exported for io_apic.c */ -int apic_verbosity; +unsigned int apic_verbosity; int pic_mode; @@ -177,7 +177,7 @@ void __cpuinit enable_NMI_through_LVT0(void) /* Level triggered for 82489DX */ if (!lapic_is_integrated()) v |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT0, v); + apic_write(APIC_LVT0, v); } /** @@ -212,9 +212,6 @@ int lapic_get_maxlvt(void) * this function twice on the boot CPU, once with a bogus timeout * value, second time for real. The other (noncalibrating) CPUs * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. */ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) { @@ -229,18 +226,18 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) if (!irqen) lvtt_value |= APIC_LVT_MASKED; - apic_write_around(APIC_LVTT, lvtt_value); + apic_write(APIC_LVTT, lvtt_value); /* * Divide PICLK by 16 */ tmp_value = apic_read(APIC_TDCR); - apic_write_around(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); + apic_write(APIC_TDCR, + (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | + APIC_TDR_DIV_16); if (!oneshot) - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); + apic_write(APIC_TMICT, clocks / APIC_DIVISOR); } /* @@ -249,7 +246,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) static int lapic_next_event(unsigned long delta, struct clock_event_device *evt) { - apic_write_around(APIC_TMICT, delta); + apic_write(APIC_TMICT, delta); return 0; } @@ -278,7 +275,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, case CLOCK_EVT_MODE_SHUTDOWN: v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write_around(APIC_LVTT, v); + apic_write(APIC_LVTT, v); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ @@ -372,12 +369,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) } } -/* - * Setup the boot APIC - * - * Calibrate and verify the result. - */ -void __init setup_boot_APIC_clock(void) +static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); const long pm_100ms = PMTMR_TICKS_PER_SEC/10; @@ -387,24 +379,6 @@ void __init setup_boot_APIC_clock(void) long delta, deltapm; int pm_referenced = 0; - /* - * The local apic timer can be disabled via the kernel - * commandline or from the CPU detection code. Register the lapic - * timer as a dummy clock event source on SMP systems, so the - * broadcast mechanism is used. On UP systems simply ignore it. - */ - if (local_apic_timer_disabled) { - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) { - lapic_clockevent.mult = 1; - setup_APIC_timer(); - } - return; - } - - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); - local_irq_disable(); /* Replace the global interrupt handler */ @@ -489,8 +463,6 @@ void __init setup_boot_APIC_clock(void) calibration_result / (1000000 / HZ), calibration_result % (1000000 / HZ)); - local_apic_timer_verify_ok = 1; - /* * Do a sanity check on the APIC calibration result */ @@ -498,12 +470,11 @@ void __init setup_boot_APIC_clock(void) local_irq_enable(); printk(KERN_WARNING "APIC frequency too slow, disabling apic timer\n"); - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) - setup_APIC_timer(); - return; + return -1; } + local_apic_timer_verify_ok = 1; + /* We trust the pm timer based calibration */ if (!pm_referenced) { apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); @@ -543,22 +514,55 @@ void __init setup_boot_APIC_clock(void) if (!local_apic_timer_verify_ok) { printk(KERN_WARNING "APIC timer disabled due to verification failure.\n"); + return -1; + } + + return 0; +} + +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) +{ + /* + * The local apic timer can be disabled via the kernel + * commandline or from the CPU detection code. Register the lapic + * timer as a dummy clock event source on SMP systems, so the + * broadcast mechanism is used. On UP systems simply ignore it. + */ + if (local_apic_timer_disabled) { /* No broadcast on UP ! */ - if (num_possible_cpus() == 1) - return; - } else { - /* - * If nmi_watchdog is set to IO_APIC, we need the - * PIT/HPET going. Otherwise register lapic as a dummy - * device. - */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; + setup_APIC_timer(); + } + return; } + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + + if (calibrate_APIC_clock()) { + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } + + /* + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. + */ + if (nmi_watchdog != NMI_IO_APIC) + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=%d!\n", nmi_watchdog); + /* Setup the lapic or request the broadcast */ setup_APIC_timer(); } @@ -693,44 +697,44 @@ void clear_local_APIC(void) */ if (maxlvt >= 3) { v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ - apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); + apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); } /* * Careful: we have to set masks only first to deassert * any level-triggered sources. */ v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + apic_write(APIC_LVTT, v | APIC_LVT_MASKED); v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); v = apic_read(APIC_LVT1); - apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); + apic_write(APIC_LVT1, v | APIC_LVT_MASKED); if (maxlvt >= 4) { v = apic_read(APIC_LVTPC); - apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); + apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); } /* lets not touch this if we didn't frob it */ #ifdef CONFIG_X86_MCE_P4THERMAL if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); - apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); + apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); } #endif /* * Clean APIC state for other OSs: */ - apic_write_around(APIC_LVTT, APIC_LVT_MASKED); - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); - apic_write_around(APIC_LVT1, APIC_LVT_MASKED); + apic_write(APIC_LVTT, APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_LVT_MASKED); + apic_write(APIC_LVT1, APIC_LVT_MASKED); if (maxlvt >= 3) - apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); + apic_write(APIC_LVTERR, APIC_LVT_MASKED); if (maxlvt >= 4) - apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); + apic_write(APIC_LVTPC, APIC_LVT_MASKED); #ifdef CONFIG_X86_MCE_P4THERMAL if (maxlvt >= 5) - apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); + apic_write(APIC_LVTTHMR, APIC_LVT_MASKED); #endif /* Integrated APIC (!82489DX) ? */ if (lapic_is_integrated()) { @@ -756,7 +760,7 @@ void disable_local_APIC(void) */ value = apic_read(APIC_SPIV); value &= ~APIC_SPIV_APIC_ENABLED; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); /* * When LAPIC was disabled by the BIOS and enabled by the kernel, @@ -865,8 +869,8 @@ void __init sync_Arb_IDs(void) apic_wait_icr_idle(); apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG - | APIC_DM_INIT); + apic_write(APIC_ICR, + APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); } /* @@ -902,16 +906,16 @@ void __init init_bsp_APIC(void) else value |= APIC_SPIV_FOCUS_DISABLED; value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); /* * Set up the virtual wire mode. */ - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + apic_write(APIC_LVT0, APIC_DM_EXTINT); value = APIC_DM_NMI; if (!lapic_is_integrated()) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); + apic_write(APIC_LVT1, value); } static void __cpuinit lapic_setup_esr(void) @@ -926,7 +930,7 @@ static void __cpuinit lapic_setup_esr(void) /* enables sending errors */ value = ERROR_APIC_VECTOR; - apic_write_around(APIC_LVTERR, value); + apic_write(APIC_LVTERR, value); /* * spec says clear errors after enabling vector. */ @@ -989,7 +993,7 @@ void __cpuinit setup_local_APIC(void) */ value = apic_read(APIC_TASKPRI); value &= ~APIC_TPRI_MASK; - apic_write_around(APIC_TASKPRI, value); + apic_write(APIC_TASKPRI, value); /* * After a crash, we no longer service the interrupts and a pending @@ -1047,7 +1051,7 @@ void __cpuinit setup_local_APIC(void) * Set spurious IRQ vector */ value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); /* * Set up LVT0, LVT1: @@ -1069,7 +1073,7 @@ void __cpuinit setup_local_APIC(void) apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id()); } - apic_write_around(APIC_LVT0, value); + apic_write(APIC_LVT0, value); /* * only the BP should see the LINT1 NMI signal, obviously. @@ -1080,7 +1084,7 @@ void __cpuinit setup_local_APIC(void) value = APIC_DM_NMI | APIC_LVT_MASKED; if (!integrated) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); + apic_write(APIC_LVT1, value); } void __cpuinit end_local_APIC_setup(void) @@ -1091,7 +1095,7 @@ void __cpuinit end_local_APIC_setup(void) /* Disable the local apic timer */ value = apic_read(APIC_LVTT); value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write_around(APIC_LVTT, value); + apic_write(APIC_LVTT, value); setup_apic_nmi_watchdog(NULL); apic_pm_activate(); @@ -1214,9 +1218,6 @@ int apic_version[MAX_APICS]; int __init APIC_init_uniprocessor(void) { - if (disable_apic) - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - if (!smp_found_config && !cpu_has_apic) return -1; @@ -1419,7 +1420,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; value |= 0xf; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); if (!virt_wire_setup) { /* @@ -1432,10 +1433,10 @@ void disconnect_bsp_APIC(int virt_wire_setup) APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write_around(APIC_LVT0, value); + apic_write(APIC_LVT0, value); } else { /* Disable LVT0 */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_LVT_MASKED); } /* @@ -1449,12 +1450,10 @@ void disconnect_bsp_APIC(int virt_wire_setup) APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write_around(APIC_LVT1, value); + apic_write(APIC_LVT1, value); } } -unsigned int __cpuinitdata maxcpus = NR_CPUS; - void __cpuinit generic_processor_info(int apicid, int version) { int cpu; @@ -1481,12 +1480,6 @@ void __cpuinit generic_processor_info(int apicid, int version) return; } - if (num_processors >= maxcpus) { - printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." - " Processor ignored.\n", maxcpus); - return; - } - num_processors++; cpus_complement(tmp_map, cpu_present_map); cpu = first_cpu(tmp_map); @@ -1700,7 +1693,7 @@ early_param("lapic", parse_lapic); static int __init parse_nolapic(char *arg) { disable_apic = 1; - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } early_param("nolapic", parse_nolapic); @@ -1719,15 +1712,19 @@ static int __init parse_lapic_timer_c2_ok(char *arg) } early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); -static int __init apic_set_verbosity(char *str) +static int __init apic_set_verbosity(char *arg) { - if (strcmp("debug", str) == 0) + if (!arg) + return -EINVAL; + + if (strcmp(arg, "debug") == 0) apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) + else if (strcmp(arg, "verbose") == 0) apic_verbosity = APIC_VERBOSE; - return 1; + + return 0; } -__setup("apic=", apic_set_verbosity); +early_param("apic", apic_set_verbosity); static int __init lapic_insert_resource(void) { diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 1e3d32e27c1..446c062e831 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -54,7 +54,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); /* * Debug level, exported for io_apic.c */ -int apic_verbosity; +unsigned int apic_verbosity; /* Have we found an MP table */ int smp_found_config; @@ -90,7 +90,6 @@ static unsigned long apic_phys; unsigned long mp_lapic_addr; -unsigned int __cpuinitdata maxcpus = NR_CPUS; /* * Get the LAPIC version */ @@ -314,7 +313,7 @@ static void setup_APIC_timer(void) #define TICK_COUNT 100000000 -static void __init calibrate_APIC_clock(void) +static int __init calibrate_APIC_clock(void) { unsigned apic, apic_start; unsigned long tsc, tsc_start; @@ -368,6 +367,17 @@ static void __init calibrate_APIC_clock(void) clockevent_delta2ns(0xF, &lapic_clockevent); calibration_result = result / HZ; + + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + return -1; + } + + return 0; } /* @@ -394,14 +404,7 @@ void __init setup_boot_APIC_clock(void) } printk(KERN_INFO "Using local APIC timer interrupts.\n"); - calibrate_APIC_clock(); - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); + if (calibrate_APIC_clock()) { /* No broadcast on UP ! */ if (num_possible_cpus() > 1) setup_APIC_timer(); @@ -1058,12 +1061,6 @@ void __cpuinit generic_processor_info(int apicid, int version) return; } - if (num_processors >= maxcpus) { - printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." - " Processor ignored.\n", maxcpus); - return; - } - num_processors++; cpus_complement(tmp_map, cpu_present_map); cpu = first_cpu(tmp_map); @@ -1337,7 +1334,7 @@ early_param("apic", apic_set_verbosity); static __init int setup_disableapic(char *str) { disable_apic = 1; - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } early_param("disableapic", setup_disableapic); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index bf9b441331e..732d1f4e10e 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -219,7 +219,6 @@ #include <linux/time.h> #include <linux/sched.h> #include <linux/pm.h> -#include <linux/pm_legacy.h> #include <linux/capability.h> #include <linux/device.h> #include <linux/kernel.h> @@ -235,6 +234,7 @@ #include <asm/uaccess.h> #include <asm/desc.h> #include <asm/i8253.h> +#include <asm/olpc.h> #include <asm/paravirt.h> #include <asm/reboot.h> @@ -2218,7 +2218,7 @@ static int __init apm_init(void) dmi_check_system(apm_dmi_table); - if (apm_info.bios.version == 0 || paravirt_enabled()) { + if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) { printk(KERN_INFO "apm: BIOS not found.\n"); return -ENODEV; } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index bacf5deeec2..aa89387006f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -18,6 +18,8 @@ #include <asm/ia32.h> #include <asm/bootparam.h> +#include <xen/interface/xen.h> + #define __NO_STUBS 1 #undef __SYSCALL #undef _ASM_X86_64_UNISTD_H_ @@ -131,5 +133,14 @@ int main(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + + BLANK(); + DEFINE(PAGE_SIZE_asm, PAGE_SIZE); +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#undef ENTRY +#endif return 0; } diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c new file mode 100644 index 00000000000..c639bd55391 --- /dev/null +++ b/arch/x86/kernel/bios_uv.c @@ -0,0 +1,48 @@ +/* + * BIOS run time interface routines. + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <asm/uv/bios.h> + +const char * +x86_bios_strerror(long status) +{ + const char *str; + switch (status) { + case 0: str = "Call completed without error"; break; + case -1: str = "Not implemented"; break; + case -2: str = "Invalid argument"; break; + case -3: str = "Call completed with error"; break; + default: str = "Unknown BIOS status code"; break; + } + return str; +} + +long +x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second, + unsigned long *drift_info) +{ + struct uv_bios_retval isrv; + + BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0); + *ticks_per_second = isrv.v0; + *drift_info = isrv.v1; + return isrv.status; +} +EXPORT_SYMBOL_GPL(x86_bios_freq_base); diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 84a8220a607..a6ef672adbb 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -56,9 +56,22 @@ void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: - if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) + /* + * There is a known erratum on Pentium III and Core Solo + * and Core Duo CPUs. + * " Page with PAT set to WC while associated MTRR is UC + * may consolidate to UC " + * Because of this erratum, it is better to stick with + * setting WC in MTRR rather than using PAT on these CPUs. + * + * Enable PAT WC only on P4, Core 2 or later CPUs. + */ + if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15)) return; - break; + + pat_disable("PAT WC disabled due to known CPU erratum."); + return; + case X86_VENDOR_AMD: case X86_VENDOR_CENTAUR: case X86_VENDOR_TRANSMETA: diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 81a07ca65d4..18514ed2610 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -24,8 +24,6 @@ extern void vide(void); __asm__(".align 4\nvide: ret"); -int force_mwait __cpuinitdata; - static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { if (cpuid_eax(0x80000000) >= 0x80000007) { @@ -33,6 +31,11 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1<<8)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } + + /* Set MTRR capability flag if appropriate */ + if (c->x86_model == 13 || c->x86_model == 9 || + (c->x86_model == 8 && c->x86_mask >= 8)) + set_cpu_cap(c, X86_FEATURE_K6_MTRR); } static void __cpuinit init_amd(struct cpuinfo_x86 *c) @@ -168,10 +171,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) mbytes); } - /* Set MTRR capability flag if appropriate */ - if (c->x86_model == 13 || c->x86_model == 9 || - (c->x86_model == 8 && c->x86_mask >= 8)) - set_cpu_cap(c, X86_FEATURE_K6_MTRR); break; } diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 7c36fb8a28d..d1692b2a41f 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ if (c->x86_power & (1<<8)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + + set_cpu_cap(c, X86_FEATURE_SYSCALL32); } static void __cpuinit init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1b1c56bb338..c8e315f1aa8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -50,6 +50,8 @@ static double __initdata y = 3145727.0; */ static void __init check_fpu(void) { + s32 fdiv_bug; + if (!boot_cpu_data.hard_math) { #ifndef CONFIG_MATH_EMULATION printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); @@ -74,8 +76,10 @@ static void __init check_fpu(void) "fistpl %0\n\t" "fwait\n\t" "fninit" - : "=m" (*&boot_cpu_data.fdiv_bug) + : "=m" (*&fdiv_bug) : "m" (*&x), "m" (*&y)); + + boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) printk("Hmm, FPU with FDIV bug.\n"); } @@ -131,13 +135,7 @@ static void __init check_popad(void) * (for due to lack of "invlpg" and working WP on a i386) * - In order to run on anything without a TSC, we need to be * compiled for a i486. - * - In order to support the local APIC on a buggy Pentium machine, - * we need to be compiled with CONFIG_X86_GOOD_APIC disabled, - * which happens implicitly if compiled for a Pentium or lower - * (unless an advanced selection of CPU features is used) as an - * otherwise config implies a properly working local APIC without - * the need to do extra reads from the APIC. -*/ + */ static void __init check_config(void) { @@ -151,21 +149,6 @@ static void __init check_config(void) if (boot_cpu_data.x86 == 3) panic("Kernel requires i486+ for 'invlpg' and other features"); #endif - -/* - * If we were told we had a good local APIC, check for buggy Pentia, - * i.e. all B steppings and the C2 stepping of P54C when using their - * integrated APIC (see 11AP erratum in "Pentium Processor - * Specification Update"). - */ -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC) - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL - && cpu_has_apic - && boot_cpu_data.x86 == 5 - && boot_cpu_data.x86_model == 2 - && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11)) - panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!"); -#endif } diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e0f45edd6a5..a0534c04d38 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -314,6 +314,16 @@ enum { EAMD3D = 1<<20, }; +static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 5: + /* Emulate MTRRs using Centaur's MCR. */ + set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); + break; + } +} + static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { @@ -462,6 +472,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, + .c_early_init = early_init_centaur, .c_init = init_centaur, .c_size_cache = centaur_size_cache, }; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 80ab20d4fa3..4e456bd955b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,6 +13,7 @@ #include <asm/mtrr.h> #include <asm/mce.h> #include <asm/pat.h> +#include <asm/asm.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/mpspec.h> #include <asm/apic.h> @@ -334,11 +335,24 @@ static void __init early_cpu_detect(void) get_cpu_vendor(c, 1); + early_get_cap(c); + if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); +} - early_get_cap(c); +/* + * The NOPL instruction is supposed to exist on all CPUs with + * family >= 6; unfortunately, that's not true in practice because + * of early VIA chips and (more importantly) broken virtualizers that + * are not easy to detect. In the latter case it doesn't even *fail* + * reliably, so probing for it doesn't even work. Disable it completely + * unless we can find a reliable way to detect all the broken cases. + */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +{ + clear_cpu_cap(c, X86_FEATURE_NOPL); } static void __cpuinit generic_identify(struct cpuinfo_x86 *c) @@ -395,8 +409,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) } init_scattered_cpuid_features(c); + detect_nopl(c); } - } static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 7b8cc72feb4..a11f5d4477c 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -7,19 +7,18 @@ #include <linux/module.h> #include <linux/kgdb.h> #include <linux/topology.h> -#include <linux/string.h> #include <linux/delay.h> #include <linux/smp.h> -#include <linux/module.h> #include <linux/percpu.h> -#include <asm/processor.h> #include <asm/i387.h> #include <asm/msr.h> #include <asm/io.h> +#include <asm/linkage.h> #include <asm/mmu_context.h> #include <asm/mtrr.h> #include <asm/mce.h> #include <asm/pat.h> +#include <asm/asm.h> #include <asm/numa.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/mpspec.h> @@ -217,6 +216,39 @@ static void __init early_cpu_support_print(void) } } +/* + * The NOPL instruction is supposed to exist on all CPUs with + * family >= 6, unfortunately, that's not true in practice because + * of early VIA chips and (more importantly) broken virtualizers that + * are not easy to detect. Hence, probe for it based on first + * principles. + * + * Note: no 64-bit chip is known to lack these, but put the code here + * for consistency with 32 bits, and to make it utterly trivial to + * diagnose the problem should it ever surface. + */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +{ + const u32 nopl_signature = 0x888c53b1; /* Random number */ + u32 has_nopl = nopl_signature; + + clear_cpu_cap(c, X86_FEATURE_NOPL); + if (c->x86 >= 6) { + asm volatile("\n" + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */ + "2:\n" + " .section .fixup,\"ax\"\n" + "3: xor %0,%0\n" + " jmp 2b\n" + " .previous\n" + _ASM_EXTABLE(1b,3b) + : "+a" (has_nopl)); + + if (has_nopl == nopl_signature) + set_cpu_cap(c, X86_FEATURE_NOPL); + } +} + static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); void __init early_cpu_init(void) @@ -305,7 +337,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[2] = cpuid_edx(0x80860001); } - c->extended_cpuid_level = cpuid_eax(0x80000000); if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); @@ -316,18 +347,13 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_phys_bits = eax & 0xff; } - /* Assume all 64-bit CPUs support 32-bit syscall */ - set_cpu_cap(c, X86_FEATURE_SYSCALL32); + detect_nopl(c); if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); validate_pat_support(c); - - /* early_param could clear that, but recall get it set again */ - if (disable_apic) - clear_cpu_cap(c, X86_FEATURE_APIC); } /* @@ -503,22 +529,24 @@ void pda_init(int cpu) /* others are initialized in smpboot.c */ pda->pcurrent = &init_task; pda->irqstackptr = boot_cpu_stack; + pda->irqstackptr += IRQSTACKSIZE - 64; } else { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", cpu); + if (!pda->irqstackptr) { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", + cpu); + pda->irqstackptr += IRQSTACKSIZE - 64; + } if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) pda->nodenumber = cpu_to_node(cpu); } - - pda->irqstackptr += IRQSTACKSIZE-64; } char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] -__attribute__((section(".bss.page_aligned"))); + DEBUG_STKSZ] __page_aligned_bss; extern asmlinkage void ignore_sysret(void); @@ -612,19 +640,22 @@ void __cpuinit cpu_init(void) /* * set up and load the per-CPU TSS */ - for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (!orig_ist->ist[0]) { static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER }; - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); - if (!estacks) - panic("Cannot allocate exception stack %ld %d\n", - v, cpu); + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); + if (!estacks) + panic("Cannot allocate exception " + "stack %ld %d\n", v, cpu); + } + estacks += PAGE_SIZE << order[v]; + orig_ist->ist[v] = t->x86_tss.ist[v] = + (unsigned long)estacks; } - estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index cb7a5715596..efae3b22a0f 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -235,9 +235,9 @@ config X86_LONGHAUL If in doubt, say N. config X86_E_POWERSAVER - tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)" + tristate "VIA C7 Enhanced PowerSaver" select CPU_FREQ_TABLE - depends on X86_32 && EXPERIMENTAL + depends on X86_32 help This adds the CPUFreq driver for VIA C7 processors. diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index b0c8208df9f..dd097b83583 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -202,7 +202,7 @@ static void drv_write(struct drv_cmd *cmd) cpumask_t saved_mask = current->cpus_allowed; unsigned int i; - for_each_cpu_mask(i, cmd->mask) { + for_each_cpu_mask_nr(i, cmd->mask) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); do_drv_write(cmd); } @@ -451,7 +451,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, freqs.old = perf->states[perf->state].core_frequency * 1000; freqs.new = data->freq_table[next_state].frequency; - for_each_cpu_mask(i, cmd.mask) { + for_each_cpu_mask_nr(i, cmd.mask) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -466,7 +466,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - for_each_cpu_mask(i, cmd.mask) { + for_each_cpu_mask_nr(i, cmd.mask) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c index 94619c22f56..e4a4bf870e9 100644 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c @@ -44,7 +44,7 @@ struct s_elan_multiplier { * It is important that the frequencies * are listed in ascending order here! */ -struct s_elan_multiplier elan_multiplier[] = { +static struct s_elan_multiplier elan_multiplier[] = { {1000, 0x02, 0x18}, {2000, 0x02, 0x10}, {4000, 0x02, 0x08}, diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 199e4e05e5d..f1685fb91fb 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, return 0; /* notifiers */ - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software * Developer's Manual, Volume 3 */ - for_each_cpu_mask(i, policy->cpus) + for_each_cpu_mask_nr(i, policy->cpus) cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); /* notifiers */ - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h index f8a63b3664e..35fb4eaf6e1 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h @@ -1,5 +1,4 @@ /* - * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $ * (C) 2003 Dave Jones. * * Licensed under the terms of the GNU GPL License version 2. diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 206791eb46e..84bb395038d 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -66,7 +66,6 @@ static u32 find_freq_from_fid(u32 fid) return 800 + (fid * 100); } - /* Return a frequency in KHz, given an input fid */ static u32 find_khz_freq_from_fid(u32 fid) { @@ -78,7 +77,6 @@ static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 p return data[pstate].frequency; } - /* Return the vco fid for an input fid * * Each "low" fid has corresponding "high" fid, and you can get to "low" fids @@ -166,7 +164,6 @@ static void fidvid_msr_init(void) wrmsr(MSR_FIDVID_CTL, lo, hi); } - /* write the new fid value along with the other control fields to the msr */ static int write_new_fid(struct powernow_k8_data *data, u32 fid) { @@ -966,7 +963,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i freqs.old = find_khz_freq_from_fid(data->currfid); freqs.new = find_khz_freq_from_fid(fid); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -974,7 +971,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i res = transition_fid_vid(data, fid, vid); freqs.new = find_khz_freq_from_fid(data->currfid); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -997,7 +994,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -1005,7 +1002,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i res = transition_pstate(data, pstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 908dd347c67..15e13c01cc3 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -28,7 +28,8 @@ #define PFX "speedstep-centrino: " #define MAINTAINER "cpufreq@lists.linux.org.uk" -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) +#define dprintk(msg...) \ + cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) #define INTEL_MSR_RANGE (0xffff) @@ -66,11 +67,12 @@ struct cpu_model struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ }; -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x); +static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, + const struct cpu_id *x); /* Operating points for current CPU */ -static struct cpu_model *centrino_model[NR_CPUS]; -static const struct cpu_id *centrino_cpu[NR_CPUS]; +static DEFINE_PER_CPU(struct cpu_model *, centrino_model); +static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu); static struct cpufreq_driver centrino_driver; @@ -255,7 +257,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) return -ENOENT; } - centrino_model[policy->cpu] = model; + per_cpu(centrino_model, policy->cpu) = model; dprintk("found \"%s\": max frequency: %dkHz\n", model->model_name, model->max_freq); @@ -264,10 +266,14 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) } #else -static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } +static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) +{ + return -ENODEV; +} #endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x) +static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, + const struct cpu_id *x) { if ((c->x86 == x->x86) && (c->x86_model == x->x86_model) && @@ -286,23 +292,28 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe) * for centrino, as some DSDTs are buggy. * Ideally, this can be done using the acpi_data structure. */ - if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) || - (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) || - (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) { + if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) || + (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) || + (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) { msr = (msr >> 8) & 0xff; return msr * 100000; } - if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points)) + if ((!per_cpu(centrino_model, cpu)) || + (!per_cpu(centrino_model, cpu)->op_points)) return 0; msr &= 0xffff; - for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) { - if (msr == centrino_model[cpu]->op_points[i].index) - return centrino_model[cpu]->op_points[i].frequency; + for (i = 0; + per_cpu(centrino_model, cpu)->op_points[i].frequency + != CPUFREQ_TABLE_END; + i++) { + if (msr == per_cpu(centrino_model, cpu)->op_points[i].index) + return per_cpu(centrino_model, cpu)-> + op_points[i].frequency; } if (failsafe) - return centrino_model[cpu]->op_points[i-1].frequency; + return per_cpu(centrino_model, cpu)->op_points[i-1].frequency; else return 0; } @@ -347,7 +358,8 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) int i; /* Only Intel makes Enhanced Speedstep-capable CPUs */ - if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) + if (cpu->x86_vendor != X86_VENDOR_INTEL || + !cpu_has(cpu, X86_FEATURE_EST)) return -ENODEV; if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) @@ -361,9 +373,9 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) break; if (i != N_IDS) - centrino_cpu[policy->cpu] = &cpu_ids[i]; + per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; - if (!centrino_cpu[policy->cpu]) { + if (!per_cpu(centrino_cpu, policy->cpu)) { dprintk("found unsupported CPU with " "Enhanced SpeedStep: send /proc/cpuinfo to " MAINTAINER "\n"); @@ -386,23 +398,26 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) /* check to see if it stuck */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); if (!(l & (1<<16))) { - printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); + printk(KERN_INFO PFX + "couldn't enable Enhanced SpeedStep\n"); return -ENODEV; } } freq = get_cur_freq(policy->cpu); - - policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */ + policy->cpuinfo.transition_latency = 10000; + /* 10uS transition latency */ policy->cur = freq; dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); - ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points); + ret = cpufreq_frequency_table_cpuinfo(policy, + per_cpu(centrino_model, policy->cpu)->op_points); if (ret) return (ret); - cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu); + cpufreq_frequency_table_get_attr( + per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu); return 0; } @@ -411,12 +426,12 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy) { unsigned int cpu = policy->cpu; - if (!centrino_model[cpu]) + if (!per_cpu(centrino_model, cpu)) return -ENODEV; cpufreq_frequency_table_put_attr(cpu); - centrino_model[cpu] = NULL; + per_cpu(centrino_model, cpu) = NULL; return 0; } @@ -430,17 +445,26 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy) */ static int centrino_verify (struct cpufreq_policy *policy) { - return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points); + return cpufreq_frequency_table_verify(policy, + per_cpu(centrino_model, policy->cpu)->op_points); } /** * centrino_setpolicy - set a new CPUFreq policy * @policy: new policy * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) + * @relation: how that frequency relates to achieved frequency + * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) * * Sets a new CPUFreq policy. */ +struct allmasks { + cpumask_t online_policy_cpus; + cpumask_t saved_mask; + cpumask_t set_mask; + cpumask_t covered_cpus; +}; + static int centrino_target (struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) @@ -448,48 +472,55 @@ static int centrino_target (struct cpufreq_policy *policy, unsigned int newstate = 0; unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; struct cpufreq_freqs freqs; - cpumask_t online_policy_cpus; - cpumask_t saved_mask; - cpumask_t set_mask; - cpumask_t covered_cpus; int retval = 0; unsigned int j, k, first_cpu, tmp; - - if (unlikely(centrino_model[cpu] == NULL)) - return -ENODEV; + CPUMASK_ALLOC(allmasks); + CPUMASK_PTR(online_policy_cpus, allmasks); + CPUMASK_PTR(saved_mask, allmasks); + CPUMASK_PTR(set_mask, allmasks); + CPUMASK_PTR(covered_cpus, allmasks); + + if (unlikely(allmasks == NULL)) + return -ENOMEM; + + if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { + retval = -ENODEV; + goto out; + } if (unlikely(cpufreq_frequency_table_target(policy, - centrino_model[cpu]->op_points, + per_cpu(centrino_model, cpu)->op_points, target_freq, relation, &newstate))) { - return -EINVAL; + retval = -EINVAL; + goto out; } #ifdef CONFIG_HOTPLUG_CPU /* cpufreq holds the hotplug lock, so we are safe from here on */ - cpus_and(online_policy_cpus, cpu_online_map, policy->cpus); + cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus); #else - online_policy_cpus = policy->cpus; + *online_policy_cpus = policy->cpus; #endif - saved_mask = current->cpus_allowed; + *saved_mask = current->cpus_allowed; first_cpu = 1; - cpus_clear(covered_cpus); - for_each_cpu_mask(j, online_policy_cpus) { + cpus_clear(*covered_cpus); + for_each_cpu_mask_nr(j, *online_policy_cpus) { /* * Support for SMP systems. * Make sure we are running on CPU that wants to change freq */ - cpus_clear(set_mask); + cpus_clear(*set_mask); if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - cpus_or(set_mask, set_mask, online_policy_cpus); + cpus_or(*set_mask, *set_mask, *online_policy_cpus); else - cpu_set(j, set_mask); + cpu_set(j, *set_mask); - set_cpus_allowed_ptr(current, &set_mask); + set_cpus_allowed_ptr(current, set_mask); preempt_disable(); - if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { + if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) { dprintk("couldn't limit to CPUs in this domain\n"); retval = -EAGAIN; if (first_cpu) { @@ -500,7 +531,7 @@ static int centrino_target (struct cpufreq_policy *policy, break; } - msr = centrino_model[cpu]->op_points[newstate].index; + msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; if (first_cpu) { rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); @@ -517,7 +548,7 @@ static int centrino_target (struct cpufreq_policy *policy, dprintk("target=%dkHz old=%d new=%d msr=%04x\n", target_freq, freqs.old, freqs.new, msr); - for_each_cpu_mask(k, online_policy_cpus) { + for_each_cpu_mask_nr(k, *online_policy_cpus) { freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); @@ -536,11 +567,11 @@ static int centrino_target (struct cpufreq_policy *policy, break; } - cpu_set(j, covered_cpus); + cpu_set(j, *covered_cpus); preempt_enable(); } - for_each_cpu_mask(k, online_policy_cpus) { + for_each_cpu_mask_nr(k, *online_policy_cpus) { freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -553,30 +584,32 @@ static int centrino_target (struct cpufreq_policy *policy, * Best effort undo.. */ - if (!cpus_empty(covered_cpus)) { - for_each_cpu_mask(j, covered_cpus) { + if (!cpus_empty(*covered_cpus)) + for_each_cpu_mask_nr(j, *covered_cpus) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); } - } tmp = freqs.new; freqs.new = freqs.old; freqs.old = tmp; - for_each_cpu_mask(j, online_policy_cpus) { + for_each_cpu_mask_nr(j, *online_policy_cpus) { freqs.cpu = j; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } } - set_cpus_allowed_ptr(current, &saved_mask); - return 0; + set_cpus_allowed_ptr(current, saved_mask); + retval = 0; + goto out; migrate_end: preempt_enable(); - set_cpus_allowed_ptr(current, &saved_mask); - return 0; + set_cpus_allowed_ptr(current, saved_mask); +out: + CPUMASK_FREE(allmasks); + return retval; } static struct freq_attr* centrino_attr[] = { diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 1b50244b1fd..191f7263c61 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -279,7 +279,7 @@ static int speedstep_target (struct cpufreq_policy *policy, cpus_allowed = current->cpus_allowed; - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -292,7 +292,7 @@ static int speedstep_target (struct cpufreq_policy *policy, /* allow to be run on all CPUs */ set_cpus_allowed_ptr(current, &cpus_allowed); - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 3fd7a67bb06..898a5a2002e 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -15,13 +15,11 @@ /* * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU */ -static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned char ccr2, ccr3; - unsigned long flags; /* we test for DEVID by checking whether CCR3 is writable */ - local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, ccr3 ^ 0x80); getCx86(0xc0); /* dummy to change bus */ @@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) *dir0 = getCx86(CX86_DIR0); *dir1 = getCx86(CX86_DIR1); } - local_irq_restore(flags); } +static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +{ + unsigned long flags; + + local_irq_save(flags); + __do_cyrix_devid(dir0, dir1); + local_irq_restore(flags); +} /* * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in * order to identify the Cyrix CPU model after we're out of setup.c @@ -134,23 +139,6 @@ static void __cpuinit set_cx86_memwb(void) setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); } -static void __cpuinit set_cx86_inc(void) -{ - unsigned char ccr3; - - printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n"); - - ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - /* PCR1 -- Performance Control */ - /* Incrementor on, whatever that is */ - setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02); - /* PCR0 -- Performance Control */ - /* Incrementor Margin 10 */ - setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04); - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ -} - /* * Configure later MediaGX and/or Geode processor. */ @@ -174,11 +162,28 @@ static void __cpuinit geode_configure(void) set_cx86_memwb(); set_cx86_reorder(); - set_cx86_inc(); local_irq_restore(flags); } +static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c) +{ + unsigned char dir0, dir0_msn, dir1 = 0; + + __do_cyrix_devid(&dir0, &dir1); + dir0_msn = dir0 >> 4; /* identifies CPU "family" */ + + switch (dir0_msn) { + case 3: /* 6x86/6x86L */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + case 5: /* 6x86MX/M II */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + } +} static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) { @@ -434,6 +439,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, + .c_early_init = early_init_cyrix, .c_init = init_cyrix, .c_identify = cyrix_identify, }; diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c index e43ad4ad4cb..c9017799497 100644 --- a/arch/x86/kernel/cpu/feature_names.c +++ b/arch/x86/kernel/cpu/feature_names.c @@ -39,7 +39,8 @@ const char * const x86_cap_flags[NCAPINTS*32] = { NULL, NULL, NULL, NULL, "constant_tsc", "up", NULL, "arch_perfmon", "pebs", "bts", NULL, NULL, - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "rep_good", NULL, NULL, NULL, + "nopl", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Intel-defined (#2) */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 70609efdf1d..b75f2569b8f 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -227,6 +227,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has_bts) ds_init_intel(c); + /* + * See if we have a good local APIC by checking for buggy Pentia, + * i.e. all B steppings and the C2 stepping of P54C when using their + * integrated APIC (see 11AP erratum in "Pentium Processor + * Specification Update"). + */ + if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && + (c->x86_mask < 0x6 || c->x86_mask == 0xb)) + set_cpu_cap(c, X86_FEATURE_11AP); + #ifdef CONFIG_X86_NUMAQ numaq_tsc_disable(); #endif diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 2c8afafa18e..6b0a10b002f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -489,7 +489,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) int sibling; this_leaf = CPUID4_INFO_IDX(cpu, index); - for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) { + for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { sibling_leaf = CPUID4_INFO_IDX(sibling, index); cpu_clear(cpu, sibling_leaf->shared_cpu_map); } @@ -780,15 +780,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) } kobject_put(per_cpu(cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); - break; + return retval; } kobject_uevent(&(this_object->kobj), KOBJ_ADD); } - if (!retval) - cpu_set(cpu, cache_dev_map); + cpu_set(cpu, cache_dev_map); kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); - return retval; + return 0; } static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index c4a7ec31394..726a5fcdf34 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -580,7 +580,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, char __user *buf = ubuf; int i, err; - cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); + cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); if (!cpu_tsc) return -ENOMEM; @@ -759,13 +759,18 @@ static struct sysdev_class mce_sysclass = { }; DEFINE_PER_CPU(struct sys_device, device_mce); +void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata; /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ - static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ + static ssize_t show_ ## name(struct sys_device *s, \ + struct sysdev_attribute *attr, \ + char *buf) { \ return sprintf(buf, "%lx\n", (unsigned long)var); \ } \ - static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ + static ssize_t set_ ## name(struct sys_device *s, \ + struct sysdev_attribute *attr, \ + const char *buf, size_t siz) { \ char *end; \ unsigned long new = simple_strtoul(buf, &end, 0); \ if (end == buf) return -EINVAL; \ @@ -786,14 +791,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart()) ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(bank5ctl,bank[5],mce_restart()) -static ssize_t show_trigger(struct sys_device *s, char *buf) +static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, + char *buf) { strcpy(buf, trigger); strcat(buf, "\n"); return strlen(trigger) + 1; } -static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) +static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf,size_t siz) { char *p; int len; @@ -806,12 +813,12 @@ static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) } static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -ACCESSOR(tolerant,tolerant,) +static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval,check_interval,mce_restart()) static struct sysdev_attribute *mce_attributes[] = { &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, - &attr_tolerant, &attr_check_interval, &attr_trigger, + &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; @@ -877,9 +884,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, case CPU_ONLINE: case CPU_ONLINE_FROZEN: mce_create_device(cpu); + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); mce_remove_device(cpu); break; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 7c9a813e119..5eb390a4b2e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -527,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out_free; - for_each_cpu_mask(i, b->cpus) { + for_each_cpu_mask_nr(i, b->cpus) { if (i == cpu) continue; @@ -617,7 +617,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #endif /* remove all sibling symlinks before unregistering */ - for_each_cpu_mask(i, b->cpus) { + for_each_cpu_mask_nr(i, b->cpus) { if (i == cpu) continue; @@ -628,6 +628,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) deallocate_threshold_block(cpu, bank); free_out: + kobject_del(b->kobj); kobject_put(b->kobj); kfree(b); per_cpu(threshold_banks, cpu)[bank] = NULL; @@ -645,14 +646,11 @@ static void threshold_remove_device(unsigned int cpu) } /* get notified when a cpu comes on/off */ -static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, + unsigned int cpu) { - /* cpu was unsigned int to begin with */ - unsigned int cpu = (unsigned long)hcpu; - if (cpu >= NR_CPUS) - goto out; + return; switch (action) { case CPU_ONLINE: @@ -666,14 +664,8 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, default: break; } - out: - return NOTIFY_OK; } -static struct notifier_block threshold_cpu_notifier __cpuinitdata = { - .notifier_call = threshold_cpu_callback, -}; - static __init int threshold_init_device(void) { unsigned lcpu = 0; @@ -684,7 +676,7 @@ static __init int threshold_init_device(void) if (err) return err; } - register_hotcpu_notifier(&threshold_cpu_notifier); + threshold_cpu_callback = amd_64_threshold_cpu_callback; return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index eef001ad3bd..9b60fce09f7 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) /* The temperature transition interrupt handler setup */ h = THERMAL_APIC_VECTOR; /* our delivery vector */ h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ - apic_write_around(APIC_LVTTHMR, h); + apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); @@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); l = apic_read(APIC_LVTTHMR); - apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); /* enable thermal throttle processing */ diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 1f4cc48c14c..d5ae2243f0b 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0); #define define_therm_throt_sysdev_show_func(name) \ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ char *buf) \ { \ unsigned int cpu = dev->id; \ diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 509bd3d9eac..cb7d3b6a80e 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -379,6 +379,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type *type) { unsigned int mask_lo, mask_hi, base_lo, base_hi; + unsigned int tmp, hi; rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); if ((mask_lo & 0x800) == 0) { @@ -392,8 +393,23 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); /* Work out the shifted address mask. */ - mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) - | mask_lo >> PAGE_SHIFT; + tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; + mask_lo = size_or_mask | tmp; + /* Expand tmp with high bits to all 1s*/ + hi = fls(tmp); + if (hi > 0) { + tmp |= ~((1<<(hi - 1)) - 1); + + if (tmp != mask_lo) { + static int once = 1; + + if (once) { + printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + once = 0; + } + mask_lo = tmp; + } + } /* This works correctly if size is a power of two, i.e. a contiguous range. */ diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 6f23969c8fa..b117d7f8a56 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1496,11 +1496,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* kvm/qemu doesn't have mtrr set right, don't trim them all */ if (!highest_pfn) { - if (!kvm_para_available()) { - printk(KERN_WARNING + WARN(!kvm_para_available(), KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); - WARN_ON(1); - } return 0; } diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 6d4bdc02388..05cc22dbd4f 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -250,7 +250,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr, do_div(count, nmi_hz); if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); + pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsrl(perfctr_msr, 0 - count); } @@ -261,7 +261,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr, do_div(count, nmi_hz); if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); + pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsr(perfctr_msr, (u32)(-count), 0); } @@ -478,7 +478,13 @@ static int setup_p4_watchdog(unsigned nmi_hz) perfctr_msr = MSR_P4_IQ_PERFCTR1; evntsel_msr = MSR_P4_CRU_ESCR0; cccr_msr = MSR_P4_IQ_CCCR1; - cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); + + /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */ + if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4) + cccr_val = P4_CCCR_OVF_PMI0; + else + cccr_val = P4_CCCR_OVF_PMI1; + cccr_val |= P4_CCCR_ESCR_SELECT(4); } evntsel = P4_ESCR_EVENT_SELECT(0x3F) diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 0d0d9057e7c..a26c480b949 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -160,7 +160,7 @@ static void *c_start(struct seq_file *m, loff_t *pos) { if (*pos == 0) /* just in case, cpu 0 is not the first */ *pos = first_cpu(cpu_online_map); - if ((*pos) < NR_CPUS && cpu_online(*pos)) + if ((*pos) < nr_cpu_ids && cpu_online(*pos)) return &cpu_data(*pos); return NULL; } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2de5fa2bbf7..8e9cd6a8ec1 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -89,6 +89,8 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, struct cpuid_regs cmd; int cpu = iminor(file->f_path.dentry->d_inode); u64 pos = *ppos; + ssize_t bytes = 0; + int err = 0; if (count % 16) return -EINVAL; /* Invalid chunk size */ @@ -96,14 +98,19 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, for (; count; count -= 16) { cmd.eax = pos; cmd.ecx = pos >> 32; - smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); - if (copy_to_user(tmp, &cmd, 16)) - return -EFAULT; + err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); + if (err) + break; + if (copy_to_user(tmp, &cmd, 16)) { + err = -EFAULT; + break; + } tmp += 16; + bytes += 16; *ppos = ++pos; } - return tmp - buf; + return bytes ? bytes : err; } static int cpuid_open(struct inode *inode, struct file *file) @@ -141,8 +148,8 @@ static __cpuinit int cpuid_device_create(int cpu) { struct device *dev; - dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), - "cpu%d", cpu); + dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), + NULL, "cpu%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 28c29180b38..66e48aa2dd1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -877,7 +877,8 @@ void __init early_res_to_bootmem(u64 start, u64 end) for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) count++; - printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count); + printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n", + count, start, end); for (i = 0; i < count; i++) { struct early_res *r = &early_res[i]; printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, @@ -1202,7 +1203,7 @@ static int __init parse_memmap_opt(char *p) if (!p) return -EINVAL; - if (!strcmp(p, "exactmap")) { + if (!strncmp(p, "exactmap", 8)) { #ifdef CONFIG_CRASH_DUMP /* * If we are doing a crash dump, we still need to know @@ -1298,11 +1299,6 @@ void __init e820_reserve_resources(void) } } -/* - * Non-standard memory setup can be specified via this quirk: - */ -char * (*arch_memory_setup_quirk)(void); - char *__init default_machine_specific_memory_setup(void) { char *who = "BIOS-e820"; @@ -1343,8 +1339,8 @@ char *__init default_machine_specific_memory_setup(void) char *__init __attribute__((weak)) machine_specific_memory_setup(void) { - if (arch_memory_setup_quirk) { - char *who = arch_memory_setup_quirk(); + if (x86_quirks->arch_memory_setup) { + char *who = x86_quirks->arch_memory_setup(); if (who) return who; @@ -1367,24 +1363,3 @@ void __init setup_memory_map(void) printk(KERN_INFO "BIOS-provided physical RAM map:\n"); e820_print_map(who); } - -#ifdef CONFIG_X86_64 -int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) -{ - int i; - - if (slot < 0 || slot >= e820.nr_map) - return -1; - for (i = slot; i < e820.nr_map; i++) { - if (e820.map[i].type != E820_RAM) - continue; - break; - } - if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT)) - return -1; - *addr = e820.map[i].addr; - *size = min_t(u64, e820.map[i].size + e820.map[i].addr, - max_pfn << PAGE_SHIFT) - *addr; - return i + 1; -} -#endif diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index a0e11c0cc87..4353cf5e6fa 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -16,10 +16,7 @@ #include <asm/dma.h> #include <asm/io_apic.h> #include <asm/apic.h> - -#ifdef CONFIG_GART_IOMMU -#include <asm/gart.h> -#endif +#include <asm/iommu.h> static void __init fix_hypertransport_config(int num, int slot, int func) { diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c index 4b63c8e1f13..5cab48ee61a 100644 --- a/arch/x86/kernel/efi_32.c +++ b/arch/x86/kernel/efi_32.c @@ -53,7 +53,7 @@ void efi_call_phys_prelog(void) * directory. If I have PAE, I just need to duplicate one entry in * page directory. */ - cr4 = read_cr4(); + cr4 = read_cr4_safe(); if (cr4 & X86_CR4_PAE) { efi_bak_pg_dir_pointer[0].pgd = @@ -91,7 +91,7 @@ void efi_call_phys_epilog(void) gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); - cr4 = read_cr4(); + cr4 = read_cr4_safe(); if (cr4 & X86_CR4_PAE) { swapper_pg_dir[pgd_index(0)].pgd = diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 6bc07f0f120..109792bc7cf 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -54,6 +54,16 @@ #include <asm/ftrace.h> #include <asm/irq_vectors.h> +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ +#include <linux/elf-em.h> +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysenter_audit syscall_trace_entry +#define sysexit_audit syscall_exit_work +#endif + /* * We use macros for low-level operations which need to be overridden * for paravirtualization. The following will never clobber any registers: @@ -332,8 +342,9 @@ sysenter_past_esp: GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) - jnz syscall_trace_entry + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz sysenter_audit +sysenter_do_call: cmpl $(nr_syscalls), %eax jae syscall_badsys call *sys_call_table(,%eax,4) @@ -343,7 +354,8 @@ sysenter_past_esp: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx - jne syscall_exit_work + jne sysexit_audit +sysenter_exit: /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx movl PT_OLDESP(%esp), %ecx @@ -351,6 +363,45 @@ sysenter_past_esp: TRACE_IRQS_ON 1: mov PT_FS(%esp), %fs ENABLE_INTERRUPTS_SYSEXIT + +#ifdef CONFIG_AUDITSYSCALL +sysenter_audit: + testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */ + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */ + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */ + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ + movl %eax,%edx /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ + call audit_syscall_entry + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + movl PT_EAX(%esp),%eax /* reload syscall number */ + jmp sysenter_do_call + +sysexit_audit: + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + jne syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + movl %eax,%edx /* second arg, syscall return value */ + cmpl $0,%eax /* is it < 0? */ + setl %al /* 1 if so, 0 if not */ + movzbl %al,%eax /* zero-extend that */ + inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + jne syscall_exit_work + movl PT_EAX(%esp),%eax /* reload syscall return value */ + jmp sysenter_exit +#endif + CFI_ENDPROC .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) @@ -370,7 +421,7 @@ ENTRY(system_call) GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -383,10 +434,6 @@ syscall_exit: # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF - testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit - jz no_singlestep - orl $_TIF_SINGLESTEP,TI_flags(%ebp) -no_singlestep: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work @@ -514,12 +561,8 @@ END(work_pending) syscall_trace_entry: movl $-ENOSYS,PT_EAX(%esp) movl %esp, %eax - xorl %edx,%edx - call do_syscall_trace - cmpl $0, %eax - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, - # so must skip actual syscall - movl PT_ORIG_EAX(%esp), %eax + call syscall_trace_enter + /* What it returned is what we'll actually use. */ cmpl $(nr_syscalls), %eax jnae syscall_call jmp syscall_exit @@ -528,14 +571,13 @@ END(syscall_trace_entry) # perform syscall exit tracing ALIGN syscall_exit_work: - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl + testb $_TIF_WORK_SYSCALL_EXIT, %cl jz work_pending TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call # schedule() instead movl %esp, %eax - movl $1, %edx - call do_syscall_trace + call syscall_trace_leave jmp resume_userspace END(syscall_exit_work) CFI_ENDPROC @@ -1024,6 +1066,7 @@ ENDPROC(kernel_thread_helper) ENTRY(xen_sysenter_target) RING0_INT_FRAME addl $5*4, %esp /* remove xen-provided frame */ + CFI_ADJUST_CFA_OFFSET -5*4 jmp sysenter_past_esp CFI_ENDPROC diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index ae63e584c34..89434d43960 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -53,6 +53,12 @@ #include <asm/paravirt.h> #include <asm/ftrace.h> +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ +#include <linux/elf-em.h> +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_64BIT 0x80000000 +#define __AUDIT_ARCH_LE 0x40000000 + .code64 #ifdef CONFIG_FTRACE @@ -349,9 +355,9 @@ ENTRY(system_call_after_swapgs) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ - TI_flags(%rcx) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) jnz tracesys +system_call_fastpath: cmpq $__NR_syscall_max,%rax ja badsys movq %r10,%rcx @@ -403,16 +409,16 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - testl $_TIF_DO_NOTIFY_MASK,%edx - jz 1f - - /* Really a signal */ +#ifdef CONFIG_AUDITSYSCALL + bt $TIF_SYSCALL_AUDIT,%edx + jc sysret_audit +#endif /* edx: work flags (arg3) */ leaq do_notify_resume(%rip),%rax leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_WORK_MASK,%edi + movl $_TIF_WORK_MASK,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ DISABLE_INTERRUPTS(CLBR_NONE) @@ -423,14 +429,56 @@ badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) jmp ret_from_sys_call +#ifdef CONFIG_AUDITSYSCALL + /* + * Fast path for syscall audit without full syscall trace. + * We just call audit_syscall_entry() directly, and then + * jump back to the normal fast path. + */ +auditsys: + movq %r10,%r9 /* 6th arg: 4th syscall arg */ + movq %rdx,%r8 /* 5th arg: 3rd syscall arg */ + movq %rsi,%rcx /* 4th arg: 2nd syscall arg */ + movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ + movq %rax,%rsi /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ + call audit_syscall_entry + LOAD_ARGS 0 /* reload call-clobbered registers */ + jmp system_call_fastpath + + /* + * Return fast path for syscall audit. Call audit_syscall_exit() + * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT + * masked off. + */ +sysret_audit: + movq %rax,%rsi /* second arg, syscall return value */ + cmpq $0,%rax /* is it < 0? */ + setl %al /* 1 if so, 0 if not */ + movzbl %al,%edi /* zero-extend that into %edi */ + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + jmp sysret_check +#endif /* CONFIG_AUDITSYSCALL */ + /* Do syscall tracing */ tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) + jz auditsys +#endif SAVE_REST movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ FIXUP_TOP_OF_STACK %rdi movq %rsp,%rdi call syscall_trace_enter - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %rax because syscall_trace_enter() returned + * the value it wants us to use in the table lookup. + */ + LOAD_ARGS ARGOFFSET, 1 RESTORE_REST cmpq $__NR_syscall_max,%rax ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ @@ -444,6 +492,7 @@ tracesys: * Has correct top of stack, but partial stack frame. */ .globl int_ret_from_sys_call + .globl int_with_check int_ret_from_sys_call: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -483,7 +532,7 @@ int_very_careful: ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST /* Check for syscall exit trace */ - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx + testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -491,7 +540,7 @@ int_very_careful: call syscall_trace_leave popq %rdi CFI_ADJUST_CFA_OFFSET -8 - andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest int_signal: @@ -1189,6 +1238,7 @@ END(device_not_available) /* runs on exception stack */ KPROBE_ENTRY(debug) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK @@ -1198,6 +1248,7 @@ KPROBE_END(debug) /* runs on exception stack */ KPROBE_ENTRY(nmi) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $-1 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_nmi, 0, 0 @@ -1211,6 +1262,7 @@ KPROBE_END(nmi) KPROBE_ENTRY(int3) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_int3, DEBUG_STACK @@ -1237,6 +1289,7 @@ END(coprocessor_segment_overrun) /* runs on exception stack */ ENTRY(double_fault) XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME paranoidentry do_double_fault jmp paranoid_exit1 CFI_ENDPROC @@ -1253,6 +1306,7 @@ END(segment_not_present) /* runs on exception stack */ ENTRY(stack_segment) XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME paranoidentry do_stack_segment jmp paranoid_exit1 CFI_ENDPROC @@ -1278,6 +1332,7 @@ END(spurious_interrupt_bug) /* runs on exception stack */ ENTRY(machine_check) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_machine_check @@ -1312,3 +1367,103 @@ KPROBE_ENTRY(ignore_sysret) sysret CFI_ENDPROC ENDPROC(ignore_sysret) + +#ifdef CONFIG_XEN +ENTRY(xen_hypervisor_callback) + zeroentry xen_do_hypervisor_callback +END(xen_hypervisor_callback) + +/* +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until we've done all processing. HOWEVER, we must enable events before +# popping the stack frame (can't be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so we'd +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. +*/ +ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) + CFI_STARTPROC +/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will + see the correct pointer to the pt_regs */ + movq %rdi, %rsp # we don't return, adjust the stack frame + CFI_ENDPROC + CFI_DEFAULT_STACK +11: incl %gs:pda_irqcount + movq %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp + cmovzq %gs:pda_irqstackptr,%rsp + pushq %rbp # backlink for old unwinder + call xen_evtchn_do_upcall + popq %rsp + CFI_DEF_CFA_REGISTER rsp + decl %gs:pda_irqcount + jmp error_exit + CFI_ENDPROC +END(do_hypervisor_callback) + +/* +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we do not need to fix up as Xen has already reloaded all segment +# registers that could be reloaded and zeroed the others. +# Category 2 we fix up by killing the current process. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by comparing each saved segment register +# with its current contents: any discrepancy means we in category 1. +*/ +ENTRY(xen_failsafe_callback) + framesz = (RIP-0x30) /* workaround buggy gas */ + _frame framesz + CFI_REL_OFFSET rcx, 0 + CFI_REL_OFFSET r11, 8 + movw %ds,%cx + cmpw %cx,0x10(%rsp) + CFI_REMEMBER_STATE + jne 1f + movw %es,%cx + cmpw %cx,0x18(%rsp) + jne 1f + movw %fs,%cx + cmpw %cx,0x20(%rsp) + jne 1f + movw %gs,%cx + cmpw %cx,0x28(%rsp) + jne 1f + /* All segments match their saved values => Category 2 (Bad IRET). */ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + pushq %r11 + CFI_ADJUST_CFA_OFFSET 8 + pushq %rcx + CFI_ADJUST_CFA_OFFSET 8 + jmp general_protection + CFI_RESTORE_STATE +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + SAVE_ALL + jmp error_exit + CFI_ENDPROC +END(xen_failsafe_callback) + +#endif /* CONFIG_XEN */ diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 1fa8be5bd21..eaff0bbb144 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -99,3 +99,4 @@ int is_uv_system(void) { return uv_system_type != UV_NONE; } +EXPORT_SYMBOL_GPL(is_uv_system); diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 1a9c68845ee..786548a62d3 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -168,7 +168,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) * May as well be the first. */ cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 711f11c30b0..bfa837cb16b 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -24,6 +24,7 @@ #include <asm/pgtable.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> +#include <asm/uv/bios.h> DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); @@ -40,6 +41,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade); short uv_possible_blades; EXPORT_SYMBOL_GPL(uv_possible_blades); +unsigned long sn_rtc_cycles_per_second; +EXPORT_SYMBOL(sn_rtc_cycles_per_second); + /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ static cpumask_t uv_target_cpus(void) @@ -94,7 +98,7 @@ static void uv_send_IPI_mask(cpumask_t mask, int vector) { unsigned int cpu; - for (cpu = 0; cpu < NR_CPUS; ++cpu) + for_each_possible_cpu(cpu) if (cpu_isset(cpu, mask)) uv_send_IPI_one(cpu, vector); } @@ -128,7 +132,7 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) * May as well be the first. */ cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; @@ -218,7 +222,7 @@ static __init void map_low_mmrs(void) enum map_type {map_wb, map_uc}; -static void map_high(char *id, unsigned long base, int shift, enum map_type map_type) +static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type) { unsigned long bytes, paddr; @@ -272,7 +276,26 @@ static __init void map_mmioh_high(int max_pnode) map_high("MMIOH", mmioh.s.base, shift, map_uc); } -static __init void uv_system_init(void) +static __init void uv_rtc_init(void) +{ + long status, ticks_per_sec, drift; + + status = + x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec, + &drift); + if (status != 0 || ticks_per_sec < 100000) { + printk(KERN_WARNING + "unable to determine platform RTC clock frequency, " + "guessing.\n"); + /* BIOS gives wrong value for clock freq. so guess */ + sn_rtc_cycles_per_second = 1000000000000UL / 30000UL; + } else + sn_rtc_cycles_per_second = ticks_per_sec; +} + +static bool uv_system_inited; + +void __init uv_system_init(void) { union uvh_si_addr_map_config_u m_n_config; union uvh_node_id_u node_id; @@ -326,6 +349,8 @@ static __init void uv_system_init(void) gnode_upper = (((unsigned long)node_id.s.node_id) & ~((1 << n_val) - 1)) << m_val; + uv_rtc_init(); + for_each_present_cpu(cpu) { nid = cpu_to_node(cpu); pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); @@ -360,6 +385,7 @@ static __init void uv_system_init(void) map_mmr_high(max_pnode); map_config_high(max_pnode); map_mmioh_high(max_pnode); + uv_system_inited = true; } /* @@ -368,8 +394,7 @@ static __init void uv_system_init(void) */ void __cpuinit uv_cpu_init(void) { - if (!uv_node_to_blade) - uv_system_init(); + BUG_ON(!uv_system_inited); uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index c9781982914..9bfc4d72fb2 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; #endif +void __init x86_64_init_pda(void) +{ + _cpu_pda = __cpu_pda; + cpu_pda(0) = &_boot_cpu_pda; + pda_init(0); +} + static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); @@ -81,6 +88,7 @@ void __init x86_64_start_kernel(char * real_mode_data) BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == (__START_KERNEL & PGDIR_MASK))); + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); @@ -102,9 +110,7 @@ void __init x86_64_start_kernel(char * real_mode_data) early_printk("Kernel alive\n"); - _cpu_pda = __cpu_pda; - cpu_pda(0) = &_boot_cpu_pda; - pda_init(0); + x86_64_init_pda(); early_printk("Kernel really alive\n"); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f67e93441ca..a7010c3a377 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -456,9 +456,6 @@ is386: movl $2,%ecx # set MP 1: #endif /* CONFIG_SMP */ jmp *(initial_code) -.align 4 -ENTRY(initial_code) - .long i386_start_kernel /* * We depend on ET to be correct. This checks for 287/387. @@ -601,6 +598,11 @@ ignore_int: #endif iret +.section .cpuinit.data,"wa" +.align 4 +ENTRY(initial_code) + .long i386_start_kernel + .section .text /* * Real beginning of normal "text" segment diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b07ac7b217c..db3280afe88 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -407,6 +407,7 @@ ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 +#include "../../x86/xen/xen-head.S" .section .bss, "aw", @nobits .align L1_CACHE_BYTES diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 0ea6a19bfdf..73deaffadd0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -210,8 +210,8 @@ static void hpet_legacy_clockevent_register(void) /* Calculate the min / max delta */ hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &hpet_clockevent); - hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, - &hpet_clockevent); + /* 5 usec minimum reprogramming delta. */ + hpet_clockevent.min_delta_ns = 5000; /* * Start hpet with the boot cpu mask and make it @@ -270,15 +270,22 @@ static void hpet_legacy_set_mode(enum clock_event_mode mode, } static int hpet_legacy_next_event(unsigned long delta, - struct clock_event_device *evt) + struct clock_event_device *evt) { - unsigned long cnt; + u32 cnt; cnt = hpet_readl(HPET_COUNTER); - cnt += delta; + cnt += (u32) delta; hpet_writel(cnt, HPET_T0_CMP); - return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0; + /* + * We need to read back the CMP register to make sure that + * what we wrote hit the chip before we compare it to the + * counter. + */ + WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt); + + return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } /* @@ -359,6 +366,7 @@ static int hpet_clocksource_register(void) int __init hpet_enable(void) { unsigned long id; + int i; if (!is_hpet_capable()) return 0; @@ -369,6 +377,29 @@ int __init hpet_enable(void) * Read the period and check for a sane value: */ hpet_period = hpet_readl(HPET_PERIOD); + + /* + * AMD SB700 based systems with spread spectrum enabled use a + * SMM based HPET emulation to provide proper frequency + * setting. The SMM code is initialized with the first HPET + * register access and takes some time to complete. During + * this time the config register reads 0xffffffff. We check + * for max. 1000 loops whether the config register reads a non + * 0xffffffff value to make sure that HPET is up and running + * before we go further. A counting loop is safe, as the HPET + * access takes thousands of CPU cycles. On non SB700 based + * machines this check is only done once and has no side + * effects. + */ + for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) { + if (i == 1000) { + printk(KERN_WARNING + "HPET config register value = 0xFFFFFFFF. " + "Disabling HPET\n"); + goto out_nohpet; + } + } + if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) goto out_nohpet; @@ -468,7 +499,7 @@ void hpet_disable(void) #define RTC_NUM_INTS 1 static unsigned long hpet_rtc_flags; -static unsigned long hpet_prev_update_sec; +static int hpet_prev_update_sec; static struct rtc_time hpet_alarm_time; static unsigned long hpet_pie_count; static unsigned long hpet_t1_cmp; @@ -575,6 +606,9 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask) hpet_rtc_flags |= bit_mask; + if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE)) + hpet_prev_update_sec = -1; + if (!oldbits) hpet_rtc_timer_init(); @@ -652,7 +686,7 @@ static void hpet_rtc_timer_reinit(void) if (hpet_rtc_flags & RTC_PIE) hpet_pie_count += lost_ints; if (printk_ratelimit()) - printk(KERN_WARNING "rtc: lost %d interrupts\n", + printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n", lost_ints); } } @@ -670,7 +704,8 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { - rtc_int_flag = RTC_UF; + if (hpet_prev_update_sec >= 0) + rtc_int_flag = RTC_UF; hpet_prev_update_sec = curr_time.tm_sec; } diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 558abf4c796..09cddb57bec 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -57,7 +57,7 @@ atomic_t irq_mis_count; static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +DEFINE_SPINLOCK(vector_lock); int timer_through_8259 __initdata; @@ -756,7 +756,7 @@ void send_IPI_self(int vector) /* * Send the IPI. The write to APIC_ICR fires this off. */ - apic_write_around(APIC_ICR, cfg); + apic_write(APIC_ICR, cfg); } #endif /* !CONFIG_SMP */ @@ -1209,10 +1209,6 @@ static int assign_irq_vector(int irq) return vector; } -void setup_vector_irq(int cpu) -{ -} - static struct irq_chip ioapic_chip; #define IOAPIC_AUTO -1 @@ -2030,7 +2026,7 @@ static void mask_lapic_irq(unsigned int irq) unsigned long v; v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); } static void unmask_lapic_irq(unsigned int irq) @@ -2038,7 +2034,7 @@ static void unmask_lapic_irq(unsigned int irq) unsigned long v; v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } static struct irq_chip lapic_chip __read_mostly = { @@ -2168,7 +2164,7 @@ static inline void __init check_timer(void) * The AEOI mode will finish them in the 8259A * automatically. */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); @@ -2177,8 +2173,9 @@ static inline void __init check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", - vector, apic1, pin1, apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + vector, apic1, pin1, apic2, pin2); /* * Some BIOS writers are clueless and report the ExtINTA @@ -2216,12 +2213,13 @@ static inline void __init check_timer(void) } clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) - printk(KERN_ERR "..MP-BIOS bug: " - "8254 timer not connected to IO-APIC\n"); + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " + "8254 timer not connected to IO-APIC\n"); - printk(KERN_INFO "...trying to set up timer (IRQ0) " - "through the 8259A ... "); - printk("\n..... (found pin %d) ...", pin2); + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); /* * legacy devices should be connected to IO APIC #0 */ @@ -2230,7 +2228,7 @@ static inline void __init check_timer(void) unmask_IO_APIC_irq(0); enable_8259A_irq(0); if (timer_irq_works()) { - printk("works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); @@ -2244,44 +2242,47 @@ static inline void __init check_timer(void) */ disable_8259A_irq(0); clear_IO_APIC_pin(apic2, pin2); - printk(" failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } if (nmi_watchdog == NMI_IO_APIC) { - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " + "through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = NMI_NONE; } timer_ack = 0; - printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); lapic_register_intr(0, vector); - apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ + apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); if (timer_irq_works()) { - printk(" works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } disable_8259A_irq(0); - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); - printk(" failed.\n"); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); init_8259A(0); make_8259A_irq(0); - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + apic_write(APIC_LVT0, APIC_DM_EXTINT); unlock_ExtINT_logic(); if (timer_irq_works()) { - printk(" works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - printk(" failed :(.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " - "report. Then try booting with the 'noapic' option"); + "report. Then try booting with the 'noapic' option.\n"); out: local_irq_restore(flags); } diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 6510cde36b3..61a83b70c18 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -45,6 +45,7 @@ #include <asm/proto.h> #include <asm/acpi.h> #include <asm/dma.h> +#include <asm/i8259.h> #include <asm/nmi.h> #include <asm/msidef.h> #include <asm/hypertransport.h> @@ -100,7 +101,7 @@ int timer_through_8259 __initdata; static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); -DEFINE_SPINLOCK(vector_lock); +static DEFINE_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -696,6 +697,19 @@ static int pin_2_irq(int idx, int apic, int pin) return irq; } +void lock_vector_lock(void) +{ + /* Used to the online set of cpus does not change + * during assign_irq_vector. + */ + spin_lock(&vector_lock); +} + +void unlock_vector_lock(void) +{ + spin_unlock(&vector_lock); +} + static int __assign_irq_vector(int irq, cpumask_t mask) { /* @@ -731,7 +745,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) return 0; } - for_each_cpu_mask(cpu, mask) { + for_each_cpu_mask_nr(cpu, mask) { cpumask_t domain, new_mask; int new_cpu; int vector, offset; @@ -752,7 +766,7 @@ next: continue; if (vector == IA32_SYSCALL_VECTOR) goto next; - for_each_cpu_mask(new_cpu, new_mask) + for_each_cpu_mask_nr(new_cpu, new_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; /* Found one! */ @@ -762,7 +776,7 @@ next: cfg->move_in_progress = 1; cfg->old_domain = cfg->domain; } - for_each_cpu_mask(new_cpu, new_mask) + for_each_cpu_mask_nr(new_cpu, new_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; cfg->vector = vector; cfg->domain = domain; @@ -794,14 +808,14 @@ static void __clear_irq_vector(int irq) vector = cfg->vector; cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask(cpu, mask) + for_each_cpu_mask_nr(cpu, mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; cpus_clear(cfg->domain); } -static void __setup_vector_irq(int cpu) +void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ /* This function must be called with vector_lock held */ @@ -824,14 +838,6 @@ static void __setup_vector_irq(int cpu) } } -void setup_vector_irq(int cpu) -{ - spin_lock(&vector_lock); - __setup_vector_irq(smp_processor_id()); - spin_unlock(&vector_lock); -} - - static struct irq_chip ioapic_chip; static void ioapic_register_intr(int irq, unsigned long trigger) @@ -1372,12 +1378,10 @@ static unsigned int startup_ioapic_irq(unsigned int irq) static int ioapic_retrigger_irq(unsigned int irq) { struct irq_cfg *cfg = &irq_cfg[irq]; - cpumask_t mask; unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - mask = cpumask_of_cpu(first_cpu(cfg->domain)); - send_IPI_mask(mask, cfg->vector); + send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -1696,8 +1700,9 @@ static inline void __init check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", - cfg->vector, apic1, pin1, apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); /* * Some BIOS writers are clueless and report the ExtINTA @@ -1735,14 +1740,13 @@ static inline void __init check_timer(void) } clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) - apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: " + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " "8254 timer not connected to IO-APIC\n"); - apic_printk(APIC_VERBOSE,KERN_INFO - "...trying to set up timer (IRQ0) " - "through the 8259A ... "); - apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", - apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); /* * legacy devices should be connected to IO APIC #0 */ @@ -1751,7 +1755,7 @@ static inline void __init check_timer(void) unmask_IO_APIC_irq(0); enable_8259A_irq(0); if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); @@ -1765,29 +1769,32 @@ static inline void __init check_timer(void) */ disable_8259A_irq(0); clear_IO_APIC_pin(apic2, pin2); - apic_printk(APIC_VERBOSE," failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } if (nmi_watchdog == NMI_IO_APIC) { - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " + "through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = NMI_NONE; } - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); lapic_register_intr(0); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } disable_8259A_irq(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); - apic_printk(APIC_VERBOSE," failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); init_8259A(0); make_8259A_irq(0); @@ -1796,11 +1803,12 @@ static inline void __init check_timer(void) unlock_ExtINT_logic(); if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - apic_printk(APIC_VERBOSE," failed :(.\n"); - panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option.\n"); out: local_irq_restore(flags); } diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index 5921e5f0a64..720d2607aac 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -92,6 +92,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "30BF") } }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "Presario F700", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30D3") + } + }, { } }; @@ -103,6 +111,9 @@ void __init io_delay_init(void) static int __init io_delay_param(char *s) { + if (!s) + return -EINVAL; + if (!strcmp(s, "0x80")) io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; else if (!strcmp(s, "0xed")) diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index 9d98cda39ad..3f7537b669d 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -70,7 +70,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector) /* * Send the IPI. The write to APIC_ICR fires this off. */ - apic_write_around(APIC_ICR, cfg); + apic_write(APIC_ICR, cfg); } void send_IPI_self(int vector) @@ -98,7 +98,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) * prepare target chip field */ cfg = __prepare_ICR2(mask); - apic_write_around(APIC_ICR2, cfg); + apic_write(APIC_ICR2, cfg); /* * program the ICR @@ -108,7 +108,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) /* * Send the IPI. The write to APIC_ICR fires this off. */ - apic_write_around(APIC_ICR, cfg); + apic_write(APIC_ICR, cfg); } /* diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 47a6f6f1247..1cf8c1fcc08 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -83,11 +83,8 @@ union irq_ctx { static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; -static char softirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); - -static char hardirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; static void call_on_stack(void *func, void *stack) { diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 0373e88de95..1f26fd9ec4f 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -43,10 +43,11 @@ #define BUILD_IRQ(nr) \ asmlinkage void IRQ_NAME(nr); \ - asm("\n.p2align\n" \ + asm("\n.text\n.p2align\n" \ "IRQ" #nr "_interrupt:\n\t" \ "push $~(" #nr ") ; " \ - "jmp common_interrupt"); + "jmp common_interrupt\n" \ + ".previous"); #define BI(x,y) \ BUILD_IRQ(x##y) diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index c0320599171..ff7d3b0124f 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -12,9 +12,13 @@ #include <linux/init.h> #include <linux/io.h> #include <linux/mm.h> +#include <linux/module.h> #include <asm/setup.h> +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + #ifdef CONFIG_DEBUG_BOOT_PARAMS struct setup_data_node { u64 paddr; @@ -135,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) if (PageHighMem(pg)) { data = ioremap_cache(pa_data, sizeof(*data)); if (!data) { + kfree(node); error = -ENXIO; goto err_dir; } @@ -209,6 +214,10 @@ static int __init arch_kdebugfs_init(void) { int error = 0; + arch_debugfs_dir = debugfs_create_dir("x86", NULL); + if (!arch_debugfs_dir) + return -ENOMEM; + #ifdef CONFIG_DEBUG_BOOT_PARAMS error = boot_params_kdebugfs_init(); #endif diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index f47f0eb886b..8282a213968 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -69,6 +69,9 @@ static int gdb_x86vector = -1; */ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) { +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif gdb_regs[GDB_AX] = regs->ax; gdb_regs[GDB_BX] = regs->bx; gdb_regs[GDB_CX] = regs->cx; @@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_SI] = regs->si; gdb_regs[GDB_DI] = regs->di; gdb_regs[GDB_BP] = regs->bp; - gdb_regs[GDB_PS] = regs->flags; gdb_regs[GDB_PC] = regs->ip; #ifdef CONFIG_X86_32 + gdb_regs[GDB_PS] = regs->flags; gdb_regs[GDB_DS] = regs->ds; gdb_regs[GDB_ES] = regs->es; gdb_regs[GDB_CS] = regs->cs; @@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_R13] = regs->r13; gdb_regs[GDB_R14] = regs->r14; gdb_regs[GDB_R15] = regs->r15; + gdb_regs32[GDB_PS] = regs->flags; + gdb_regs32[GDB_CS] = regs->cs; + gdb_regs32[GDB_SS] = regs->ss; #endif gdb_regs[GDB_SP] = regs->sp; } @@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) */ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) { +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif gdb_regs[GDB_AX] = 0; gdb_regs[GDB_BX] = 0; gdb_regs[GDB_CX] = 0; @@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; #else - gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); - gdb_regs[GDB_PC] = 0; + gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); + gdb_regs32[GDB_CS] = __KERNEL_CS; + gdb_regs32[GDB_SS] = __KERNEL_DS; + gdb_regs[GDB_PC] = p->thread.ip; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; @@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) */ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) { +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif regs->ax = gdb_regs[GDB_AX]; regs->bx = gdb_regs[GDB_BX]; regs->cx = gdb_regs[GDB_CX]; @@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) regs->si = gdb_regs[GDB_SI]; regs->di = gdb_regs[GDB_DI]; regs->bp = gdb_regs[GDB_BP]; - regs->flags = gdb_regs[GDB_PS]; regs->ip = gdb_regs[GDB_PC]; #ifdef CONFIG_X86_32 + regs->flags = gdb_regs[GDB_PS]; regs->ds = gdb_regs[GDB_DS]; regs->es = gdb_regs[GDB_ES]; regs->cs = gdb_regs[GDB_CS]; @@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) regs->r13 = gdb_regs[GDB_R13]; regs->r14 = gdb_regs[GDB_R14]; regs->r15 = gdb_regs[GDB_R15]; + regs->flags = gdb_regs32[GDB_PS]; + regs->cs = gdb_regs32[GDB_CS]; + regs->ss = gdb_regs32[GDB_SS]; #endif } @@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, if (remcomInBuffer[0] == 's') { linux_regs->flags |= X86_EFLAGS_TF; kgdb_single_step = 1; - if (kgdb_contthread) { - atomic_set(&kgdb_cpu_doing_single_step, - raw_smp_processor_id()); - } + atomic_set(&kgdb_cpu_doing_single_step, + raw_smp_processor_id()); } get_debugreg(dr6, 6); @@ -466,9 +481,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) case DIE_DEBUG: if (atomic_read(&kgdb_cpu_doing_single_step) == - raw_smp_processor_id() && - user_mode(regs)) - return single_step_cont(regs, args); + raw_smp_processor_id()) { + if (user_mode(regs)) + return single_step_cont(regs, args); + break; + } else if (test_thread_flag(TIF_SINGLESTEP)) + /* This means a user thread is single stepping + * a system call which should be ignored + */ + return NOTIFY_DONE; /* fall through */ default: if (user_mode(regs)) diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index b8c6743a13d..6c27679ec6a 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->ip = (unsigned long)p->ainsn.insn; } -/* Called with kretprobe_lock held */ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; INIT_HLIST_HEAD(&empty_rp); - spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); + kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ #ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; @@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) kretprobe_assert(ri, orig_ret_address, trampoline_address); - spin_unlock_irqrestore(&kretprobe_lock, flags); + kretprobe_hash_unlock(current, &flags); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); @@ -860,7 +858,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs) resume_execution(cur, regs, kcb); regs->flags |= kcb->kprobe_saved_flags; - trace_hardirqs_fixup_flags(regs->flags); if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { kcb->kprobe_status = KPROBE_HIT_SSDONE; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 87edf1ceb1d..d02def06ca9 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -113,7 +113,7 @@ static void kvm_setup_secondary_clock(void) #endif #ifdef CONFIG_SMP -void __init kvm_smp_prepare_boot_cpu(void) +static void __init kvm_smp_prepare_boot_cpu(void) { WARN_ON(kvm_register_clock("primary cpu clock")); native_smp_prepare_boot_cpu(); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index a8449571858..b68e21f06f4 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -62,12 +62,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) if (reload) { #ifdef CONFIG_SMP - cpumask_t mask; - preempt_disable(); load_LDT(pc); - mask = cpumask_of_cpu(smp_processor_id()); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + if (!cpus_equal(current->mm->cpu_vm_mask, + cpumask_of_cpu(smp_processor_id()))) smp_call_function(flush_ldt, current->mm, 1); preempt_enable(); #else diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 8864230d55a..0732adba05c 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -12,6 +12,7 @@ #include <linux/init.h> #include <linux/numa.h> #include <linux/ftrace.h> +#include <linux/suspend.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -22,6 +23,7 @@ #include <asm/cpufeature.h> #include <asm/desc.h> #include <asm/system.h> +#include <asm/cacheflush.h> #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) static u32 kexec_pgd[1024] PAGE_ALIGNED; @@ -77,7 +79,7 @@ static void load_segments(void) /* * A architecture hook called to validate the * proposed image and prepare the control pages - * as needed. The pages for KEXEC_CONTROL_CODE_SIZE + * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE * have been allocated, but the segments have yet * been copied into the kernel. * @@ -85,10 +87,12 @@ static void load_segments(void) * reboot code buffer to allow us to avoid allocations * later. * - * Currently nothing. + * Make control page executable. */ int machine_kexec_prepare(struct kimage *image) { + if (nx_enabled) + set_pages_x(image->control_code_page, 1); return 0; } @@ -98,27 +102,54 @@ int machine_kexec_prepare(struct kimage *image) */ void machine_kexec_cleanup(struct kimage *image) { + if (nx_enabled) + set_pages_nx(image->control_code_page, 1); } /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. */ -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; + int save_ftrace_enabled; + asmlinkage unsigned long + (*relocate_kernel_ptr)(unsigned long indirection_page, + unsigned long control_page, + unsigned long start_address, + unsigned int has_pae, + unsigned int preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + save_processor_state(); +#endif - tracer_disable(); + save_ftrace_enabled = __ftrace_enabled_save(); /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to disable_IO_APIC() in + * one form or other. kexec jump path also need + * one. + */ + disable_IO_APIC(); +#endif + } + control_page = page_address(image->control_code_page); - memcpy(control_page, relocate_kernel, PAGE_SIZE); + memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + relocate_kernel_ptr = control_page; page_list[PA_CONTROL_PAGE] = __pa(control_page); - page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; page_list[PA_PGD] = __pa(kexec_pgd); page_list[VA_PGD] = (unsigned long)kexec_pgd; #ifdef CONFIG_X86_PAE @@ -131,6 +162,7 @@ NORET_TYPE void machine_kexec(struct kimage *image) page_list[VA_PTE_0] = (unsigned long)kexec_pte0; page_list[PA_PTE_1] = __pa(kexec_pte1); page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); /* The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -149,8 +181,17 @@ NORET_TYPE void machine_kexec(struct kimage *image) set_idt(phys_to_virt(0),0); /* now call it */ - relocate_kernel((unsigned long)image->head, (unsigned long)page_list, - image->start, cpu_has_pae); + image->start = relocate_kernel_ptr((unsigned long)image->head, + (unsigned long)page_list, + image->start, cpu_has_pae, + image->preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + restore_processor_state(); +#endif + + __ftrace_enabled_restore(save_ftrace_enabled); } void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 9dd9262693a..c43caa3a91f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image) * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. */ -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 07c0f828f48..3b599518c32 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -33,6 +33,8 @@ #include <linux/module.h> #include <asm/geode.h> +#define MFGPT_DEFAULT_IRQ 7 + static struct mfgpt_timer_t { unsigned int avail:1; } mfgpt_timers[MFGPT_MAX_TIMERS]; @@ -157,29 +159,48 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable) } EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event); -int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) +int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable) { - u32 val, dummy; - int offset; + u32 zsel, lpc, dummy; + int shift; if (timer < 0 || timer >= MFGPT_MAX_TIMERS) return -EIO; - if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable)) + /* + * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA + * is using the same CMP of the timer's Siamese twin, the IRQ is set to + * 2, and we mustn't use nor change it. + * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the + * IRQ of the 1st. This can only happen if forcing an IRQ, calling this + * with *irq==0 is safe. Currently there _are_ no 2 drivers. + */ + rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy); + shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4; + if (((zsel >> shift) & 0xF) == 2) return -EIO; - rdmsr(MSR_PIC_ZSEL_LOW, val, dummy); + /* Choose IRQ: if none supplied, keep IRQ already set or use default */ + if (!*irq) + *irq = (zsel >> shift) & 0xF; + if (!*irq) + *irq = MFGPT_DEFAULT_IRQ; - offset = (timer % 4) * 4; - - val &= ~((0xF << offset) | (0xF << (offset + 16))); + /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */ + if (*irq < 1 || *irq == 2 || *irq > 15) + return -EIO; + rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy); + if (lpc & (1 << *irq)) + return -EIO; + /* All chosen and checked - go for it */ + if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable)) + return -EIO; if (enable) { - val |= (irq & 0x0F) << (offset); - val |= (irq & 0x0F) << (offset + 16); + zsel = (zsel & ~(0xF << shift)) | (*irq << shift); + wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy); } - wrmsr(MSR_PIC_ZSEL_LOW, val, dummy); return 0; } @@ -242,7 +263,7 @@ EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer); static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN; static u16 mfgpt_event_clock; -static int irq = 7; +static int irq; static int __init mfgpt_setup(char *str) { get_option(&str, &irq); @@ -346,7 +367,7 @@ int __init mfgpt_timer_setup(void) mfgpt_event_clock = timer; /* Set up the IRQ on the MFGPT side */ - if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) { + if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) { printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq); return -EIO; } @@ -374,13 +395,14 @@ int __init mfgpt_timer_setup(void) &mfgpt_clockevent); printk(KERN_INFO - "mfgpt-timer: registering the MFGPT timer as a clock event.\n"); + "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n", + timer, irq); clockevents_register_device(&mfgpt_clockevent); return 0; err: - geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, irq); + geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq); printk(KERN_ERR "mfgpt-timer: Unable to set up the MFGPT clock source\n"); return -EIO; diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 56b933119a0..652fa5c38eb 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -644,7 +644,9 @@ static void microcode_fini_cpu(int cpu) mutex_unlock(µcode_mutex); } -static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) +static ssize_t reload_store(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t sz) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; char *end; @@ -655,9 +657,7 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) if (end == buf) return -EINVAL; if (val == 1) { - cpumask_t old; - - old = current->cpus_allowed; + cpumask_t old = current->cpus_allowed; get_online_cpus(); set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); @@ -674,14 +674,16 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) return sz; } -static ssize_t version_show(struct sys_device *dev, char *buf) +static ssize_t version_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; return sprintf(buf, "0x%x\n", uci->rev); } -static ssize_t pf_show(struct sys_device *dev, char *buf) +static ssize_t pf_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index fdfdc550b36..efc2f361fe8 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -238,7 +238,7 @@ static struct dmi_system_id __devinitdata mmconf_dmi_table[] = { {} }; -void __init check_enable_amd_mmconf_dmi(void) +void __cpuinit check_enable_amd_mmconf_dmi(void) { dmi_check_system(mmconf_dmi_table); } diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c index a888e67f587..6ba87830d4b 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module_64.c @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/kernel.h> +#include <linux/mm.h> #include <linux/slab.h> #include <linux/bug.h> @@ -150,7 +151,8 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -160,6 +162,8 @@ int module_finalize(const Elf_Ehdr *hdr, alt = s; if (!strcmp(".smp_locks", secstrings + s->sh_name)) locks= s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; } if (alt) { @@ -175,6 +179,11 @@ int module_finalize(const Elf_Ehdr *hdr, tseg, tseg + text->sh_size); } + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + return module_bug_finalize(hdr, sechdrs, me); } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 3b25e49380c..b3fb430725c 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -27,6 +27,7 @@ #include <asm/bios_ebda.h> #include <asm/e820.h> #include <asm/trampoline.h> +#include <asm/setup.h> #include <mach_apic.h> #ifdef CONFIG_X86_32 @@ -48,77 +49,7 @@ static int __init mpf_checksum(unsigned char *mp, int len) return sum & 0xFF; } -#ifdef CONFIG_X86_NUMAQ -int found_numaq; -/* - * Have to match translation table entries to main table entries by counter - * hence the mpc_record variable .... can't see a less disgusting way of - * doing this .... - */ -struct mpc_config_translation { - unsigned char mpc_type; - unsigned char trans_len; - unsigned char trans_type; - unsigned char trans_quad; - unsigned char trans_global; - unsigned char trans_local; - unsigned short trans_reserved; -}; - - -static int mpc_record; -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] - __cpuinitdata; - -static inline int generate_logical_apicid(int quad, int phys_apicid) -{ - return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1); -} - - -static inline int mpc_apic_id(struct mpc_config_processor *m, - struct mpc_config_translation *translation_record) -{ - int quad = translation_record->trans_quad; - int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid); - - printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", - m->mpc_apicid, - (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, - (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, - m->mpc_apicver, quad, logical_apicid); - return logical_apicid; -} - -int mp_bus_id_to_node[MAX_MP_BUSSES]; - -int mp_bus_id_to_local[MAX_MP_BUSSES]; - -static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name, - struct mpc_config_translation *translation) -{ - int quad = translation->trans_quad; - int local = translation->trans_local; - - mp_bus_id_to_node[m->mpc_busid] = quad; - mp_bus_id_to_local[m->mpc_busid] = local; - printk(KERN_INFO "Bus #%d is %s (node %d)\n", - m->mpc_busid, name, quad); -} - -int quad_local_to_mp_bus_id [NR_CPUS/4][4]; -static void mpc_oem_pci_bus(struct mpc_config_bus *m, - struct mpc_config_translation *translation) -{ - int quad = translation->trans_quad; - int local = translation->trans_local; - - quad_local_to_mp_bus_id[quad][local] = m->mpc_busid; -} - -#endif - -static void __cpuinit MP_processor_info(struct mpc_config_processor *m) +static void __init MP_processor_info(struct mpc_config_processor *m) { int apicid; char *bootup_cpu = ""; @@ -127,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m) disabled_cpus++; return; } -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) - apicid = mpc_apic_id(m, translation_table[mpc_record]); + + if (x86_quirks->mpc_apic_id) + apicid = x86_quirks->mpc_apic_id(m); else apicid = m->mpc_apicid; -#else - apicid = m->mpc_apicid; -#endif + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; boot_cpu_physical_apicid = m->mpc_apicid; @@ -151,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m) memcpy(str, m->mpc_bustype, 6); str[6] = 0; -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) - mpc_oem_bus_info(m, str, translation_table[mpc_record]); -#else - printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str); -#endif + if (x86_quirks->mpc_oem_bus_info) + x86_quirks->mpc_oem_bus_info(m, str); + else + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str); #if MAX_MP_BUSSES < 256 if (m->mpc_busid >= MAX_MP_BUSSES) { @@ -173,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) - mpc_oem_pci_bus(m, translation_table[mpc_record]); -#endif + if (x86_quirks->mpc_oem_pci_bus) + x86_quirks->mpc_oem_pci_bus(m); + clear_bit(m->mpc_busid, mp_bus_not_pci); #if defined(CONFIG_EISA) || defined (CONFIG_MCA) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; @@ -228,7 +154,7 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m) static void print_MP_intsrc_info(struct mpc_config_intsrc *m) { - printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, @@ -237,7 +163,7 @@ static void print_MP_intsrc_info(struct mpc_config_intsrc *m) static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) { - printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, @@ -309,90 +235,13 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m) static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) { - printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x," + apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC LINT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); } -#ifdef CONFIG_X86_NUMAQ -static void __init MP_translation_info(struct mpc_config_translation *m) -{ - printk(KERN_INFO - "Translation: record %d, type %d, quad %d, global %d, local %d\n", - mpc_record, m->trans_type, m->trans_quad, m->trans_global, - m->trans_local); - - if (mpc_record >= MAX_MPC_ENTRY) - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); - else - translation_table[mpc_record] = m; /* stash this for later */ - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) - node_set_online(m->trans_quad); -} - -/* - * Read/parse the MPC oem tables - */ - -static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, - unsigned short oemsize) -{ - int count = sizeof(*oemtable); /* the header size */ - unsigned char *oemptr = ((unsigned char *)oemtable) + count; - - mpc_record = 0; - printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", - oemtable); - if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) { - printk(KERN_WARNING - "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", - oemtable->oem_signature[0], oemtable->oem_signature[1], - oemtable->oem_signature[2], oemtable->oem_signature[3]); - return; - } - if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) { - printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); - return; - } - while (count < oemtable->oem_length) { - switch (*oemptr) { - case MP_TRANSLATION: - { - struct mpc_config_translation *m = - (struct mpc_config_translation *)oemptr; - MP_translation_info(m); - oemptr += sizeof(*m); - count += sizeof(*m); - ++mpc_record; - break; - } - default: - { - printk(KERN_WARNING - "Unrecognised OEM table entry type! - %d\n", - (int)*oemptr); - return; - } - } - } -} - -void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, - char *productid) -{ - if (strncmp(oem, "IBM NUMA", 8)) - printk("Warning! Not a NUMA-Q system!\n"); - else - found_numaq = 1; - - if (mpc->mpc_oemptr) - smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr, - mpc->mpc_oemsize); -} -#endif /* CONFIG_X86_NUMAQ */ - /* * Read/parse the MPC */ @@ -457,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) } else mps_oem_check(mpc, oem, str); #endif - /* save the local APIC address, it might be non-default */ if (!acpi_lapic) mp_lapic_addr = mpc->mpc_lapic; @@ -465,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) if (early) return 1; + if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) { + struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr; + x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize); + } + /* * Now process the configuration blocks. */ -#ifdef CONFIG_X86_NUMAQ - mpc_record = 0; -#endif + if (x86_quirks->mpc_record) + *x86_quirks->mpc_record = 0; + while (count < mpc->mpc_length) { switch (*mpt) { case MP_PROCESSOR: @@ -536,9 +389,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) count = mpc->mpc_length; break; } -#ifdef CONFIG_X86_NUMAQ - ++mpc_record; -#endif + if (x86_quirks->mpc_record) + (*x86_quirks->mpc_record)++; } #ifdef CONFIG_X86_GENERICARCH @@ -632,7 +484,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) } -static void construct_ioapic_table(int mpc_default_type) +static void __init construct_ioapic_table(int mpc_default_type) { struct mpc_config_ioapic ioapic; struct mpc_config_bus bus; @@ -677,7 +529,7 @@ static void construct_ioapic_table(int mpc_default_type) construct_default_ioirq_mptable(mpc_default_type); } #else -static inline void construct_ioapic_table(int mpc_default_type) { } +static inline void __init construct_ioapic_table(int mpc_default_type) { } #endif static inline void __init construct_default_ISA_mptable(int mpc_default_type) @@ -726,20 +578,14 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) static struct intel_mp_floating *mpf_found; /* - * Machine specific quirk for finding the SMP config before other setup - * activities destroy the table: - */ -int (*mach_get_smp_config_quirk)(unsigned int early); - -/* * Scan the memory blocks for an SMP configuration block. */ static void __init __get_smp_config(unsigned int early) { struct intel_mp_floating *mpf = mpf_found; - if (mach_get_smp_config_quirk) { - if (mach_get_smp_config_quirk(early)) + if (x86_quirks->mach_get_smp_config) { + if (x86_quirks->mach_get_smp_config(early)) return; } if (acpi_lapic && early) @@ -849,7 +695,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, unsigned int *bp = phys_to_virt(base); struct intel_mp_floating *mpf; - printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length); + apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", + bp, length); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { @@ -899,14 +746,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, return 0; } -int (*mach_find_smp_config_quirk)(unsigned int reserve); - static void __init __find_smp_config(unsigned int reserve) { unsigned int address; - if (mach_find_smp_config_quirk) { - if (mach_find_smp_config_quirk(reserve)) + if (x86_quirks->mach_find_smp_config) { + if (x86_quirks->mach_find_smp_config(reserve)) return; } /* diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index a153b3905f6..2e2af5d1819 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -72,21 +72,28 @@ static ssize_t msr_read(struct file *file, char __user *buf, u32 data[2]; u32 reg = *ppos; int cpu = iminor(file->f_path.dentry->d_inode); - int err; + int err = 0; + ssize_t bytes = 0; if (count % 8) return -EINVAL; /* Invalid chunk size */ for (; count; count -= 8) { err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); - if (err) - return -EIO; - if (copy_to_user(tmp, &data, 8)) - return -EFAULT; + if (err) { + if (err == -EFAULT) /* Fix idiotic error code */ + err = -EIO; + break; + } + if (copy_to_user(tmp, &data, 8)) { + err = -EFAULT; + break; + } tmp += 2; + bytes += 8; } - return ((char __user *)tmp) - buf; + return bytes ? bytes : err; } static ssize_t msr_write(struct file *file, const char __user *buf, @@ -96,21 +103,28 @@ static ssize_t msr_write(struct file *file, const char __user *buf, u32 data[2]; u32 reg = *ppos; int cpu = iminor(file->f_path.dentry->d_inode); - int err; + int err = 0; + ssize_t bytes = 0; if (count % 8) return -EINVAL; /* Invalid chunk size */ for (; count; count -= 8) { - if (copy_from_user(&data, tmp, 8)) - return -EFAULT; + if (copy_from_user(&data, tmp, 8)) { + err = -EFAULT; + break; + } err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); - if (err) - return -EIO; + if (err) { + if (err == -EFAULT) /* Fix idiotic error code */ + err = -EIO; + break; + } tmp += 2; + bytes += 8; } - return ((char __user *)tmp) - buf; + return bytes ? bytes : err; } static int msr_open(struct inode *inode, struct file *file) @@ -131,7 +145,7 @@ static int msr_open(struct inode *inode, struct file *file) ret = -EIO; /* MSR not supported */ out: unlock_kernel(); - return 0; + return ret; } /* @@ -149,8 +163,8 @@ static int __cpuinit msr_device_create(int cpu) { struct device *dev; - dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), - "msr%d", cpu); + dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), + NULL, "msr%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ec024b3baad..abb78a2cc4a 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -114,6 +114,23 @@ static __init void nmi_cpu_busy(void *data) } #endif +static void report_broken_nmi(int cpu, int *prev_nmi_count) +{ + printk(KERN_CONT "\n"); + + printk(KERN_WARNING + "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n", + cpu, prev_nmi_count[cpu], get_nmi_count(cpu)); + + printk(KERN_WARNING + "Please report this to bugzilla.kernel.org,\n"); + printk(KERN_WARNING + "and attach the output of the 'dmesg' command.\n"); + + per_cpu(wd_enabled, cpu) = 0; + atomic_dec(&nmi_active); +} + int __init check_nmi_watchdog(void) { unsigned int *prev_nmi_count; @@ -141,15 +158,8 @@ int __init check_nmi_watchdog(void) for_each_online_cpu(cpu) { if (!per_cpu(wd_enabled, cpu)) continue; - if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { - printk(KERN_WARNING "WARNING: CPU#%d: NMI " - "appears to be stuck (%d->%d)!\n", - cpu, - prev_nmi_count[cpu], - get_nmi_count(cpu)); - per_cpu(wd_enabled, cpu) = 0; - atomic_dec(&nmi_active); - } + if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) + report_broken_nmi(cpu, prev_nmi_count); } endflag = 1; if (!atomic_read(&nmi_active)) { @@ -263,7 +273,7 @@ late_initcall(init_lapic_nmi_sysfs); static void __acpi_nmi_enable(void *__unused) { - apic_write_around(APIC_LVT0, APIC_DM_NMI); + apic_write(APIC_LVT0, APIC_DM_NMI); } /* @@ -277,7 +287,7 @@ void acpi_nmi_enable(void) static void __acpi_nmi_disable(void *__unused) { - apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); } /* @@ -448,6 +458,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) #ifdef CONFIG_SYSCTL +static int __init setup_unknown_nmi_panic(char *str) +{ + unknown_nmi_panic = 1; + return 1; +} +__setup("unknown_nmi_panic", setup_unknown_nmi_panic); + static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) { unsigned char reason = get_nmi_reason(); diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index a23e8233b9a..eecc8c18f01 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -33,6 +33,7 @@ #include <asm/processor.h> #include <asm/mpspec.h> #include <asm/e820.h> +#include <asm/setup.h> #define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) @@ -71,6 +72,188 @@ static void __init smp_dump_qct(void) } } + +void __cpuinit numaq_tsc_disable(void) +{ + if (!found_numaq) + return; + + if (num_online_nodes() > 1) { + printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); + setup_clear_cpu_cap(X86_FEATURE_TSC); + } +} + +static int __init numaq_pre_time_init(void) +{ + numaq_tsc_disable(); + return 0; +} + +int found_numaq; +/* + * Have to match translation table entries to main table entries by counter + * hence the mpc_record variable .... can't see a less disgusting way of + * doing this .... + */ +struct mpc_config_translation { + unsigned char mpc_type; + unsigned char trans_len; + unsigned char trans_type; + unsigned char trans_quad; + unsigned char trans_global; + unsigned char trans_local; + unsigned short trans_reserved; +}; + +/* x86_quirks member */ +static int mpc_record; +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] + __cpuinitdata; + +static inline int generate_logical_apicid(int quad, int phys_apicid) +{ + return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1); +} + +/* x86_quirks member */ +static int mpc_apic_id(struct mpc_config_processor *m) +{ + int quad = translation_table[mpc_record]->trans_quad; + int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid); + + printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", + m->mpc_apicid, + (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, + (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, + m->mpc_apicver, quad, logical_apicid); + return logical_apicid; +} + +int mp_bus_id_to_node[MAX_MP_BUSSES]; + +int mp_bus_id_to_local[MAX_MP_BUSSES]; + +/* x86_quirks member */ +static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name) +{ + int quad = translation_table[mpc_record]->trans_quad; + int local = translation_table[mpc_record]->trans_local; + + mp_bus_id_to_node[m->mpc_busid] = quad; + mp_bus_id_to_local[m->mpc_busid] = local; + printk(KERN_INFO "Bus #%d is %s (node %d)\n", + m->mpc_busid, name, quad); +} + +int quad_local_to_mp_bus_id [NR_CPUS/4][4]; + +/* x86_quirks member */ +static void mpc_oem_pci_bus(struct mpc_config_bus *m) +{ + int quad = translation_table[mpc_record]->trans_quad; + int local = translation_table[mpc_record]->trans_local; + + quad_local_to_mp_bus_id[quad][local] = m->mpc_busid; +} + +static void __init MP_translation_info(struct mpc_config_translation *m) +{ + printk(KERN_INFO + "Translation: record %d, type %d, quad %d, global %d, local %d\n", + mpc_record, m->trans_type, m->trans_quad, m->trans_global, + m->trans_local); + + if (mpc_record >= MAX_MPC_ENTRY) + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); + else + translation_table[mpc_record] = m; /* stash this for later */ + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) + node_set_online(m->trans_quad); +} + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +/* + * Read/parse the MPC oem tables + */ + +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, + unsigned short oemsize) +{ + int count = sizeof(*oemtable); /* the header size */ + unsigned char *oemptr = ((unsigned char *)oemtable) + count; + + mpc_record = 0; + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", + oemtable); + if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) { + printk(KERN_WARNING + "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", + oemtable->oem_signature[0], oemtable->oem_signature[1], + oemtable->oem_signature[2], oemtable->oem_signature[3]); + return; + } + if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) { + printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); + return; + } + while (count < oemtable->oem_length) { + switch (*oemptr) { + case MP_TRANSLATION: + { + struct mpc_config_translation *m = + (struct mpc_config_translation *)oemptr; + MP_translation_info(m); + oemptr += sizeof(*m); + count += sizeof(*m); + ++mpc_record; + break; + } + default: + { + printk(KERN_WARNING + "Unrecognised OEM table entry type! - %d\n", + (int)*oemptr); + return; + } + } + } +} + +static struct x86_quirks numaq_x86_quirks __initdata = { + .arch_pre_time_init = numaq_pre_time_init, + .arch_time_init = NULL, + .arch_pre_intr_init = NULL, + .arch_memory_setup = NULL, + .arch_intr_init = NULL, + .arch_trap_init = NULL, + .mach_get_smp_config = NULL, + .mach_find_smp_config = NULL, + .mpc_record = &mpc_record, + .mpc_apic_id = mpc_apic_id, + .mpc_oem_bus_info = mpc_oem_bus_info, + .mpc_oem_pci_bus = mpc_oem_pci_bus, + .smp_read_mpc_oem = smp_read_mpc_oem, +}; + +void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) +{ + if (strncmp(oem, "IBM NUMA", 8)) + printk("Warning! Not a NUMA-Q system!\n"); + else + found_numaq = 1; +} + static __init void early_check_numaq(void) { /* @@ -82,6 +265,9 @@ static __init void early_check_numaq(void) */ if (smp_found_config) early_get_smp_config(); + + if (found_numaq) + x86_quirks = &numaq_x86_quirks; } int __init get_memcfg_numaq(void) @@ -92,14 +278,3 @@ int __init get_memcfg_numaq(void) smp_dump_qct(); return 1; } - -void __init numaq_tsc_disable(void) -{ - if (!found_numaq) - return; - - if (num_online_nodes() > 1) { - printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); - setup_clear_cpu_cap(X86_FEATURE_TSC); - } -} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e0f571d58c1..300da17e61c 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -29,6 +29,7 @@ #include <asm/desc.h> #include <asm/setup.h> #include <asm/arch_hooks.h> +#include <asm/pgtable.h> #include <asm/time.h> #include <asm/pgalloc.h> #include <asm/irq.h> @@ -123,6 +124,7 @@ static void *get_call_destination(u8 type) .pv_irq_ops = pv_irq_ops, .pv_apic_ops = pv_apic_ops, .pv_mmu_ops = pv_mmu_ops, + .pv_lock_ops = pv_lock_ops, }; return *((void **)&tmpl + type); } @@ -266,6 +268,17 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) return __get_cpu_var(paravirt_lazy_mode); } +void __init paravirt_use_bytelocks(void) +{ +#ifdef CONFIG_SMP + pv_lock_ops.spin_is_locked = __byte_spin_is_locked; + pv_lock_ops.spin_is_contended = __byte_spin_is_contended; + pv_lock_ops.spin_lock = __byte_spin_lock; + pv_lock_ops.spin_trylock = __byte_spin_trylock; + pv_lock_ops.spin_unlock = __byte_spin_unlock; +#endif +} + struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, @@ -361,7 +374,6 @@ struct pv_cpu_ops pv_cpu_ops = { struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = native_apic_write, - .apic_write_atomic = native_apic_write_atomic, .apic_read = native_apic_read, .setup_boot_clock = setup_boot_APIC_clock, .setup_secondary_clock = setup_secondary_APIC_clock, @@ -373,6 +385,9 @@ struct pv_mmu_ops pv_mmu_ops = { #ifndef CONFIG_X86_64 .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, +#else + .pagetable_setup_start = paravirt_nop, + .pagetable_setup_done = paravirt_nop, #endif .read_cr2 = native_read_cr2, @@ -428,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = { #endif /* PAGETABLE_LEVELS >= 3 */ .pte_val = native_pte_val, - .pte_flags = native_pte_val, + .pte_flags = native_pte_flags, .pgd_val = native_pgd_val, .make_pte = native_make_pte, @@ -446,6 +461,18 @@ struct pv_mmu_ops pv_mmu_ops = { .set_fixmap = native_set_fixmap, }; +struct pv_lock_ops pv_lock_ops = { +#ifdef CONFIG_SMP + .spin_is_locked = __ticket_spin_is_locked, + .spin_is_contended = __ticket_spin_is_contended, + + .spin_lock = __ticket_spin_lock, + .spin_trylock = __ticket_spin_trylock, + .spin_unlock = __ticket_spin_unlock, +#endif +}; +EXPORT_SYMBOL(pv_lock_ops); + EXPORT_SYMBOL_GPL(pv_time_ops); EXPORT_SYMBOL (pv_cpu_ops); EXPORT_SYMBOL (pv_mmu_ops); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 6959b5c45df..dcdac6c826e 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -29,6 +29,7 @@ #include <linux/mm.h> #include <linux/spinlock.h> #include <linux/string.h> +#include <linux/crash_dump.h> #include <linux/dma-mapping.h> #include <linux/bitops.h> #include <linux/pci_ids.h> @@ -36,7 +37,8 @@ #include <linux/delay.h> #include <linux/scatterlist.h> #include <linux/iommu-helper.h> -#include <asm/gart.h> + +#include <asm/iommu.h> #include <asm/calgary.h> #include <asm/tce.h> #include <asm/pci-direct.h> @@ -167,6 +169,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl); static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); static void calioc2_tce_cache_blast(struct iommu_table *tbl); static void calioc2_dump_error_regs(struct iommu_table *tbl); +static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl); +static void get_tce_space_from_tar(void); static struct cal_chipset_ops calgary_chip_ops = { .handle_quirks = calgary_handle_quirks, @@ -339,9 +343,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, /* were we called with bad_dma_address? */ badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { - printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " + WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); - WARN_ON(1); return; } @@ -410,22 +413,6 @@ static void calgary_unmap_sg(struct device *dev, } } -static int calgary_nontranslate_map_sg(struct device* dev, - struct scatterlist *sg, int nelems, int direction) -{ - struct scatterlist *s; - int i; - - for_each_sg(sg, s, nelems, i) { - struct page *p = sg_page(s); - - BUG_ON(!p); - s->dma_address = virt_to_bus(sg_virt(s)); - s->dma_length = s->length; - } - return nelems; -} - static int calgary_map_sg(struct device *dev, struct scatterlist *sg, int nelems, int direction) { @@ -436,9 +423,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, unsigned long entry; int i; - if (!translation_enabled(tbl)) - return calgary_nontranslate_map_sg(dev, sg, nelems, direction); - for_each_sg(sg, s, nelems, i) { BUG_ON(!sg_page(s)); @@ -474,7 +458,6 @@ error: static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, size_t size, int direction) { - dma_addr_t dma_handle = bad_dma_address; void *vaddr = phys_to_virt(paddr); unsigned long uaddr; unsigned int npages; @@ -483,12 +466,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, uaddr = (unsigned long)vaddr; npages = num_dma_pages(uaddr, size); - if (translation_enabled(tbl)) - dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction); - else - dma_handle = virt_to_bus(vaddr); - - return dma_handle; + return iommu_alloc(dev, tbl, vaddr, npages, direction); } static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, @@ -497,9 +475,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; - if (!translation_enabled(tbl)) - return; - npages = num_dma_pages(dma_handle, size); iommu_free(tbl, dma_handle, npages); } @@ -522,18 +497,12 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, goto error; memset(ret, 0, size); - if (translation_enabled(tbl)) { - /* set up tces to cover the allocated range */ - mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == bad_dma_address) - goto free; - - *dma_handle = mapping; - } else /* non translated slot */ - *dma_handle = virt_to_bus(ret); - + /* set up tces to cover the allocated range */ + mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); + if (mapping == bad_dma_address) + goto free; + *dma_handle = mapping; return ret; - free: free_pages((unsigned long)ret, get_order(size)); ret = NULL; @@ -541,7 +510,7 @@ error: return ret; } -static const struct dma_mapping_ops calgary_dma_ops = { +static struct dma_mapping_ops calgary_dma_ops = { .alloc_coherent = calgary_alloc_coherent, .map_single = calgary_map_single, .unmap_single = calgary_unmap_single, @@ -830,7 +799,11 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) tbl = pci_iommu(dev->bus); tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; - tce_free(tbl, 0, tbl->it_size); + + if (is_kdump_kernel()) + calgary_init_bitmap_from_tce_table(tbl); + else + tce_free(tbl, 0, tbl->it_size); if (is_calgary(dev->device)) tbl->chip_ops = &calgary_chip_ops; @@ -1209,6 +1182,10 @@ static int __init calgary_init(void) if (ret) return ret; + /* Purely for kdump kernel case */ + if (is_kdump_kernel()) + get_tce_space_from_tar(); + do { dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); if (!dev) @@ -1230,6 +1207,16 @@ static int __init calgary_init(void) goto error; } while (1); + dev = NULL; + for_each_pci_dev(dev) { + struct iommu_table *tbl; + + tbl = find_iommu_table(&dev->dev); + + if (translation_enabled(tbl)) + dev->dev.archdata.dma_ops = &calgary_dma_ops; + } + return ret; error: @@ -1251,6 +1238,7 @@ error: calgary_disable_translation(dev); calgary_free_bus(dev); pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ + dev->dev.archdata.dma_ops = NULL; } while (1); return ret; @@ -1280,13 +1268,15 @@ static inline int __init determine_tce_table_size(u64 ram) static int __init build_detail_arrays(void) { unsigned long ptr; - int i, scal_detail_size, rio_detail_size; + unsigned numnodes, i; + int scal_detail_size, rio_detail_size; - if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ + numnodes = rio_table_hdr->num_scal_dev; + if (numnodes > MAX_NUMNODES){ printk(KERN_WARNING "Calgary: MAX_NUMNODES too low! Defined as %d, " "but system has %d nodes.\n", - MAX_NUMNODES, rio_table_hdr->num_scal_dev); + MAX_NUMNODES, numnodes); return -ENODEV; } @@ -1307,8 +1297,7 @@ static int __init build_detail_arrays(void) } ptr = ((unsigned long)rio_table_hdr) + 3; - for (i = 0; i < rio_table_hdr->num_scal_dev; - i++, ptr += scal_detail_size) + for (i = 0; i < numnodes; i++, ptr += scal_detail_size) scal_devs[i] = (struct scal_detail *)ptr; for (i = 0; i < rio_table_hdr->num_rio_dev; @@ -1339,6 +1328,61 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) return (val != 0xffffffff); } +/* + * calgary_init_bitmap_from_tce_table(): + * Funtion for kdump case. In the second/kdump kernel initialize + * the bitmap based on the tce table entries obtained from first kernel + */ +static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) +{ + u64 *tp; + unsigned int index; + tp = ((u64 *)tbl->it_base); + for (index = 0 ; index < tbl->it_size; index++) { + if (*tp != 0x0) + set_bit(index, tbl->it_map); + tp++; + } +} + +/* + * get_tce_space_from_tar(): + * Function for kdump case. Get the tce tables from first kernel + * by reading the contents of the base adress register of calgary iommu + */ +static void __init get_tce_space_from_tar(void) +{ + int bus; + void __iomem *target; + unsigned long tce_space; + + for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { + struct calgary_bus_info *info = &bus_info[bus]; + unsigned short pci_device; + u32 val; + + val = read_pci_config(bus, 0, 0, 0); + pci_device = (val & 0xFFFF0000) >> 16; + + if (!is_cal_pci_dev(pci_device)) + continue; + if (info->translation_disabled) + continue; + + if (calgary_bus_has_devices(bus, pci_device) || + translate_empty_slots) { + target = calgary_reg(bus_info[bus].bbar, + tar_offset(bus)); + tce_space = be64_to_cpu(readq(target)); + tce_space = tce_space & TAR_SW_BITS; + + tce_space = tce_space & (~specified_table_size); + info->tce_space = (u64 *)__va(tce_space); + } + } + return; +} + void __init detect_calgary(void) { int bus; @@ -1394,7 +1438,8 @@ void __init detect_calgary(void) return; } - specified_table_size = determine_tce_table_size(max_pfn * PAGE_SIZE); + specified_table_size = determine_tce_table_size((is_kdump_kernel() ? + saved_max_pfn : max_pfn) * PAGE_SIZE); for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { struct calgary_bus_info *info = &bus_info[bus]; @@ -1412,10 +1457,16 @@ void __init detect_calgary(void) if (calgary_bus_has_devices(bus, pci_device) || translate_empty_slots) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - info->tce_space = tbl; + /* + * If it is kdump kernel, find and use tce tables + * from first kernel, else allocate tce tables here + */ + if (!is_kdump_kernel()) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + info->tce_space = tbl; + } calgary_found = 1; } } @@ -1430,6 +1481,10 @@ void __init detect_calgary(void) printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, debugging ? "enabled" : "disabled"); + + /* swiotlb for devices that aren't behind the Calgary. */ + if (max_pfn > MAX_DMA32_PFN) + swiotlb = 1; } return; @@ -1446,7 +1501,7 @@ int __init calgary_iommu_init(void) { int ret; - if (no_iommu || swiotlb) + if (no_iommu || (swiotlb && !calgary_detected)) return -ENODEV; if (!calgary_detected) @@ -1459,15 +1514,14 @@ int __init calgary_iommu_init(void) if (ret) { printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " "falling back to no_iommu\n", ret); - if (max_pfn > MAX_DMA32_PFN) - printk(KERN_ERR "WARNING more than 4GB of memory, " - "32bit PCI may malfunction.\n"); return ret; } force_iommu = 1; bad_dma_address = 0x0; - dma_ops = &calgary_dma_ops; + /* dma_ops is set to swiotlb or nommu */ + if (!dma_ops) + dma_ops = &nommu_dma_ops; return 0; } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 8467ec2320f..87d4d6964ec 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -5,14 +5,13 @@ #include <asm/proto.h> #include <asm/dma.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/calgary.h> #include <asm/amd_iommu.h> -int forbid_dac __read_mostly; -EXPORT_SYMBOL(forbid_dac); +static int forbid_dac __read_mostly; -const struct dma_mapping_ops *dma_ops; +struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -114,22 +113,24 @@ void __init pci_iommu_alloc(void) * The order of these functions is important for * fall-back/fail-over reasons */ -#ifdef CONFIG_GART_IOMMU gart_iommu_hole_init(); -#endif -#ifdef CONFIG_CALGARY_IOMMU detect_calgary(); -#endif detect_intel_iommu(); amd_iommu_detect(); -#ifdef CONFIG_SWIOTLB pci_swiotlb_init(); -#endif } + +unsigned long iommu_num_pages(unsigned long addr, unsigned long len) +{ + unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); + + return size >> PAGE_SHIFT; +} +EXPORT_SYMBOL(iommu_num_pages); #endif /* @@ -184,9 +185,7 @@ static __init int iommu_setup(char *p) swiotlb = 1; #endif -#ifdef CONFIG_GART_IOMMU gart_parse_options(p); -#endif #ifdef CONFIG_CALGARY_IOMMU if (!strncmp(p, "calgary", 7)) @@ -201,136 +200,19 @@ static __init int iommu_setup(char *p) } early_param("iommu", iommu_setup); -#ifdef CONFIG_X86_32 -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void __iomem *mem_base = NULL; - int pages = size >> PAGE_SHIFT; - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; - if (!size) - goto out; - if (dev->dma_mem) - goto out; - - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - - mem_base = ioremap(bus_addr, size); - if (!mem_base) - goto out; - - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dev->dma_mem) - goto out; - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dev->dma_mem->bitmap) - goto free1_out; - - dev->dma_mem->virt_base = mem_base; - dev->dma_mem->device_base = device_addr; - dev->dma_mem->size = pages; - dev->dma_mem->flags = flags; - - if (flags & DMA_MEMORY_MAP) - return DMA_MEMORY_MAP; - - return DMA_MEMORY_IO; - - free1_out: - kfree(dev->dma_mem); - out: - if (mem_base) - iounmap(mem_base); - return 0; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if (!mem) - return; - dev->dma_mem = NULL; - iounmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - int pos, err; - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); - - pages >>= PAGE_SHIFT; - - if (!mem) - return ERR_PTR(-EINVAL); - - pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - -static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - int order = get_order(size); - - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - *ret = mem->virt_base + (page << PAGE_SHIFT); - memset(*ret, 0, size); - } - if (mem->flags & DMA_MEMORY_EXCLUSIVE) - *ret = NULL; - } - return (mem != NULL); -} - -static int dma_release_coherent(struct device *dev, int order, void *vaddr) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - - if (mem && vaddr >= mem->virt_base && vaddr < - (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - - bitmap_release_region(mem->bitmap, page, order); - return 1; - } - return 0; -} -#else -#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0) -#define dma_release_coherent(dev, order, vaddr) (0) -#endif /* CONFIG_X86_32 */ - int dma_supported(struct device *dev, u64 mask) { + struct dma_mapping_ops *ops = get_dma_ops(dev); + #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { - printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", - dev->bus_id); + dev_info(dev, "PCI: Disallowing DAC for device\n"); return 0; } #endif - if (dma_ops->dma_supported) - return dma_ops->dma_supported(dev, mask); + if (ops->dma_supported) + return ops->dma_supported(dev, mask); /* Copied from i386. Doesn't make much sense, because it will only work for pci_alloc_coherent. @@ -351,8 +233,7 @@ int dma_supported(struct device *dev, u64 mask) type. Normally this doesn't make any difference, but gives more gentle handling of IOMMU overflow. */ if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { - printk(KERN_INFO "%s: Force SAC with mask %Lx\n", - dev->bus_id, mask); + dev_info(dev, "Force SAC with mask %Lx\n", mask); return 0; } @@ -378,6 +259,7 @@ void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { + struct dma_mapping_ops *ops = get_dma_ops(dev); void *memory = NULL; struct page *page; unsigned long dma_mask = 0; @@ -387,7 +269,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, /* ignore region specifiers */ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory)) + if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) return memory; if (!dev) { @@ -446,8 +328,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, /* Let low level make its own zone decisions */ gfp &= ~(GFP_DMA32|GFP_DMA); - if (dma_ops->alloc_coherent) - return dma_ops->alloc_coherent(dev, size, + if (ops->alloc_coherent) + return ops->alloc_coherent(dev, size, dma_handle, gfp); return NULL; } @@ -459,14 +341,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, } } - if (dma_ops->alloc_coherent) { + if (ops->alloc_coherent) { free_pages((unsigned long)memory, get_order(size)); gfp &= ~(GFP_DMA|GFP_DMA32); - return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); + return ops->alloc_coherent(dev, size, dma_handle, gfp); } - if (dma_ops->map_simple) { - *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory), + if (ops->map_simple) { + *dma_handle = ops->map_simple(dev, virt_to_phys(memory), size, PCI_DMA_BIDIRECTIONAL); if (*dma_handle != bad_dma_address) @@ -488,29 +370,27 @@ EXPORT_SYMBOL(dma_alloc_coherent); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t bus) { + struct dma_mapping_ops *ops = get_dma_ops(dev); + int order = get_order(size); WARN_ON(irqs_disabled()); /* for portability */ - if (dma_release_coherent(dev, order, vaddr)) + if (dma_release_from_coherent(dev, order, vaddr)) return; - if (dma_ops->unmap_single) - dma_ops->unmap_single(dev, bus, size, 0); + if (ops->unmap_single) + ops->unmap_single(dev, bus, size, 0); free_pages((unsigned long)vaddr, order); } EXPORT_SYMBOL(dma_free_coherent); static int __init pci_iommu_init(void) { -#ifdef CONFIG_CALGARY_IOMMU calgary_iommu_init(); -#endif intel_iommu_init(); amd_iommu_init(); -#ifdef CONFIG_GART_IOMMU gart_iommu_init(); -#endif no_iommu_init(); return 0; diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index c3fe78406d1..49285f8fd4d 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -32,6 +32,7 @@ #include <asm/mtrr.h> #include <asm/pgtable.h> #include <asm/proto.h> +#include <asm/iommu.h> #include <asm/gart.h> #include <asm/cacheflush.h> #include <asm/swiotlb.h> @@ -66,9 +67,6 @@ static u32 gart_unmapped_entry; (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) -#define to_pages(addr, size) \ - (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) - #define EMERGENCY_PAGES 32 /* = 128KB */ #ifdef CONFIG_AGP @@ -197,9 +195,7 @@ static void iommu_full(struct device *dev, size_t size, int dir) * out. Hopefully no network devices use single mappings that big. */ - printk(KERN_ERR - "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", - size, dev->bus_id); + dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size); if (size > PAGE_SIZE*EMERGENCY_PAGES) { if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) @@ -242,7 +238,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size) static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir) { - unsigned long npages = to_pages(phys_mem, size); + unsigned long npages = iommu_num_pages(phys_mem, size); unsigned long iommu_page = alloc_iommu(dev, npages); int i; @@ -305,7 +301,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, return; iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = to_pages(dma_addr, size); + npages = iommu_num_pages(dma_addr, size); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; CLEAR_LEAK(iommu_page + i); @@ -388,7 +384,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, } addr = phys_addr; - pages = to_pages(s->offset, s->length); + pages = iommu_num_pages(s->offset, s->length); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); SET_LEAK(iommu_page); @@ -471,7 +467,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) seg_size += s->length; need = nextneed; - pages += to_pages(s->offset, s->length); + pages += iommu_num_pages(s->offset, s->length); ps = s; } if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) @@ -693,8 +689,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) extern int agp_amd64_init(void); -static const struct dma_mapping_ops gart_dma_ops = { - .mapping_error = NULL, +static struct dma_mapping_ops gart_dma_ops = { .map_single = gart_map_single, .map_simple = gart_map_simple, .unmap_single = gart_unmap_single, diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index aec43d56f49..3f91f71cdc3 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -7,7 +7,7 @@ #include <linux/dma-mapping.h> #include <linux/scatterlist.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/processor.h> #include <asm/dma.h> @@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, return nents; } -/* Make sure we keep the same behaviour */ -static int nommu_mapping_error(dma_addr_t dma_addr) -{ -#ifdef CONFIG_X86_32 - return 0; -#else - return (dma_addr == bad_dma_address); -#endif -} - - -const struct dma_mapping_ops nommu_dma_ops = { +struct dma_mapping_ops nommu_dma_ops = { .map_single = nommu_map_single, .map_sg = nommu_map_sg, - .mapping_error = nommu_mapping_error, .is_phys = 1, }; diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 82299cd1d04..c4ce0332759 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -5,7 +5,7 @@ #include <linux/module.h> #include <linux/dma-mapping.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/swiotlb.h> #include <asm/dma.h> @@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); } -const struct dma_mapping_ops swiotlb_dma_ops = { +struct dma_mapping_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, .alloc_coherent = swiotlb_alloc_coherent, .free_coherent = swiotlb_free_coherent, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4d629c62f4f..876e9189077 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -15,6 +15,7 @@ unsigned long idle_nomwait; EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +static int force_mwait __cpuinitdata; int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { @@ -199,6 +200,7 @@ static void poll_idle(void) * * idle=mwait overrides this decision and forces the usage of mwait. */ +static int __cpuinitdata force_mwait; #define MWAIT_INFO 0x05 #define MWAIT_ECX_EXTENDED_INFO 0x01 @@ -244,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) return 1; } +static cpumask_t c1e_mask = CPU_MASK_NONE; +static int c1e_detected; + +void c1e_remove_cpu(int cpu) +{ + cpu_clear(cpu, c1e_mask); +} + /* * C1E aware idle routine. We check for C1E active in the interrupt * pending message MSR. If we detect C1E, then we handle it the same @@ -251,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) */ static void c1e_idle(void) { - static cpumask_t c1e_mask = CPU_MASK_NONE; - static int c1e_detected; - if (need_resched()) return; @@ -263,8 +270,10 @@ static void c1e_idle(void) rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (lo & K8_INTP_C1E_ACTIVE_MASK) { c1e_detected = 1; - mark_tsc_unstable("TSC halt in C1E"); - printk(KERN_INFO "System has C1E enabled\n"); + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + mark_tsc_unstable("TSC halt in AMD C1E"); + printk(KERN_INFO "System has AMD C1E enabled\n"); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); } } @@ -326,6 +335,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) static int __init idle_setup(char *str) { + if (!str) + return -EINVAL; + if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0c3927accb0..31f40b24bf5 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -55,6 +55,7 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> #include <asm/kdebug.h> +#include <asm/idle.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -88,6 +89,7 @@ static void cpu_exit_clear(void) cpu_clear(cpu, cpu_callin_map); numa_remove_cpu(cpu); + c1e_remove_cpu(cpu); } /* We don't actually take CPU down, just spin without interrupts. */ @@ -95,7 +97,6 @@ static inline void play_dead(void) { /* This must be done before dead CPU ack */ cpu_exit_clear(); - wbinvd(); mb(); /* Ack it */ __get_cpu_var(cpu_state) = CPU_DEAD; @@ -104,8 +105,8 @@ static inline void play_dead(void) * With physical CPU hotplug, we should halt the cpu */ local_irq_disable(); - while (1) - halt(); + /* mask all interrupts, flush any and all caches, and halt */ + wbinvd_halt(); } #else static inline void play_dead(void) @@ -128,7 +129,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { check_pgt_cache(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a8e53626ac9..e12e0e4dd25 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -93,14 +93,15 @@ DECLARE_PER_CPU(int, cpu_state); static inline void play_dead(void) { idle_task_exit(); - wbinvd(); + c1e_remove_cpu(raw_smp_processor_id()); + mb(); /* Ack it */ __get_cpu_var(cpu_state) = CPU_DEAD; local_irq_disable(); - while (1) - halt(); + /* mask all interrupts, flush any and all caches, and halt */ + wbinvd_halt(); } #else static inline void play_dead(void) @@ -120,7 +121,7 @@ void cpu_idle(void) current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { rmb(); @@ -537,8 +538,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { - struct thread_struct *prev = &prev_p->thread, - *next = &next_p->thread; + struct thread_struct *prev = &prev_p->thread; + struct thread_struct *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; @@ -586,35 +587,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch FS and GS. + * + * Segment register != 0 always requires a reload. Also + * reload when it has changed. When prev process used 64bit + * base always reload to avoid an information leak. */ - { - /* segment register != 0 always requires a reload. - also reload when it has changed. - when prev process used 64bit base always reload - to avoid an information leak. */ - if (unlikely(fsindex | next->fsindex | prev->fs)) { - loadsegment(fs, next->fsindex); - /* check if the user used a selector != 0 - * if yes clear 64bit base, since overloaded base - * is always mapped to the Null selector - */ - if (fsindex) + if (unlikely(fsindex | next->fsindex | prev->fs)) { + loadsegment(fs, next->fsindex); + /* + * Check if the user used a selector != 0; if yes + * clear 64bit base, since overloaded base is always + * mapped to the Null selector + */ + if (fsindex) prev->fs = 0; - } - /* when next process has a 64bit base use it */ - if (next->fs) - wrmsrl(MSR_FS_BASE, next->fs); - prev->fsindex = fsindex; - - if (unlikely(gsindex | next->gsindex | prev->gs)) { - load_gs_index(next->gsindex); - if (gsindex) + } + /* when next process has a 64bit base use it */ + if (next->fs) + wrmsrl(MSR_FS_BASE, next->fs); + prev->fsindex = fsindex; + + if (unlikely(gsindex | next->gsindex | prev->gs)) { + load_gs_index(next->gsindex); + if (gsindex) prev->gs = 0; - } - if (next->gs) - wrmsrl(MSR_KERNEL_GS_BASE, next->gs); - prev->gsindex = gsindex; } + if (next->gs) + wrmsrl(MSR_KERNEL_GS_BASE, next->gs); + prev->gsindex = gsindex; /* Must be after DS reload */ unlazy_fpu(prev_p); @@ -627,7 +627,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) write_pda(pcurrent, next_p); write_pda(kernelstack, - (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - PDA_STACKOFFSET); #ifdef CONFIG_CC_STACKPROTECTOR write_pda(stack_canary, next_p->stack_canary); /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 77040b6070e..e37dccce85d 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1357,8 +1357,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif } -#ifdef CONFIG_X86_32 - void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) { struct siginfo info; @@ -1377,89 +1375,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) force_sig_info(SIGTRAP, &info, tsk); } -/* notification of system call entry/exit - * - triggered by current->work.syscall_trace - */ -int do_syscall_trace(struct pt_regs *regs, int entryexit) -{ - int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); - /* - * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall - * interception - */ - int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); - int ret = 0; - - /* do the secure computing check first */ - if (!entryexit) - secure_computing(regs->orig_ax); - - if (unlikely(current->audit_context)) { - if (entryexit) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), - regs->ax); - /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only - * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is - * not used, entry.S will call us only on syscall exit, not - * entry; so when TIF_SYSCALL_AUDIT is used we must avoid - * calling send_sigtrap() on syscall entry. - * - * Note that when PTRACE_SYSEMU_SINGLESTEP is used, - * is_singlestep is false, despite his name, so we will still do - * the correct thing. - */ - else if (is_singlestep) - goto out; - } - - if (!(current->ptrace & PT_PTRACED)) - goto out; - - /* If a process stops on the 1st tracepoint with SYSCALL_TRACE - * and then is resumed with SYSEMU_SINGLESTEP, it will come in - * here. We have to check this and return */ - if (is_sysemu && entryexit) - return 0; - - /* Fake a debug trap */ - if (is_singlestep) - send_sigtrap(current, regs, 0); - - if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) - goto out; - - /* the 0x80 provides a way for the tracing parent to distinguish - between a syscall stop and SIGTRAP delivery */ - /* Note that the debugger could change the result of test_thread_flag!*/ - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0)); - - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } - ret = is_sysemu; -out: - if (unlikely(current->audit_context) && !entryexit) - audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, - regs->bx, regs->cx, regs->dx, regs->si); - if (ret == 0) - return 0; - - regs->orig_ax = -1; /* force skip of syscall restarting */ - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - return 1; -} - -#else /* CONFIG_X86_64 */ - static void syscall_trace(struct pt_regs *regs) { + if (!(current->ptrace & PT_PTRACED)) + return; #if 0 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", @@ -1481,39 +1400,81 @@ static void syscall_trace(struct pt_regs *regs) } } -asmlinkage void syscall_trace_enter(struct pt_regs *regs) +#ifdef CONFIG_X86_32 +# define IS_IA32 1 +#elif defined CONFIG_IA32_EMULATION +# define IS_IA32 test_thread_flag(TIF_IA32) +#else +# define IS_IA32 0 +#endif + +/* + * We must return the syscall number to actually look up in the table. + * This can be -1L to skip running any syscall at all. + */ +asmregparm long syscall_trace_enter(struct pt_regs *regs) { + long ret = 0; + + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state. If we entered on the slow path, TF was already set. + */ + if (test_thread_flag(TIF_SINGLESTEP)) + regs->flags |= X86_EFLAGS_TF; + /* do the secure computing check first */ secure_computing(regs->orig_ax); - if (test_thread_flag(TIF_SYSCALL_TRACE) - && (current->ptrace & PT_PTRACED)) + if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) + ret = -1L; + + if (ret || test_thread_flag(TIF_SYSCALL_TRACE)) syscall_trace(regs); if (unlikely(current->audit_context)) { - if (test_thread_flag(TIF_IA32)) { + if (IS_IA32) audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, regs->bx, regs->cx, regs->dx, regs->si); - } else { +#ifdef CONFIG_X86_64 + else audit_syscall_entry(AUDIT_ARCH_X86_64, regs->orig_ax, regs->di, regs->si, regs->dx, regs->r10); - } +#endif } + + return ret ?: regs->orig_ax; } -asmlinkage void syscall_trace_leave(struct pt_regs *regs) +asmregparm void syscall_trace_leave(struct pt_regs *regs) { if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - if ((test_thread_flag(TIF_SYSCALL_TRACE) - || test_thread_flag(TIF_SINGLESTEP)) - && (current->ptrace & PT_PTRACED)) + if (test_thread_flag(TIF_SYSCALL_TRACE)) syscall_trace(regs); -} -#endif /* CONFIG_X86_32 */ + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(), so don't do any more now. + */ + if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) + return; + + /* + * If we are single-stepping, synthesize a trap to follow the + * system call instruction. + */ + if (test_thread_flag(TIF_SINGLESTEP) && + (current->ptrace & PT_PTRACED)) + send_sigtrap(current, regs, 0); +} diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f8a62160e15..724adfc63cb 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -177,6 +177,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), }, }, + { /* Handle problems with rebooting on Dell T5400's */ + .callback = set_bios_reboot, + .ident = "Dell Precision T5400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), + }, + }, { /* Handle problems with rebooting on HP laptops */ .callback = set_bios_reboot, .ident = "HP Compaq Laptop", @@ -403,10 +411,9 @@ void native_machine_shutdown(void) { /* Stop the cpus and apics */ #ifdef CONFIG_SMP - int reboot_cpu_id; /* The boot cpu is always logical cpu 0 */ - reboot_cpu_id = 0; + int reboot_cpu_id = 0; #ifdef CONFIG_X86_32 /* See if there has been given a command line override */ diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index c30fe25d470..6f50664b2ba 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -20,11 +20,45 @@ #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) #define PAE_PGD_ATTR (_PAGE_PRESENT) +/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE + * ~ control_page + PAGE_SIZE are used as data storage and stack for + * jumping back + */ +#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + +/* Minimal CPU state */ +#define ESP DATA(0x0) +#define CR0 DATA(0x4) +#define CR3 DATA(0x8) +#define CR4 DATA(0xc) + +/* other data */ +#define CP_VA_CONTROL_PAGE DATA(0x10) +#define CP_PA_PGD DATA(0x14) +#define CP_PA_SWAP_PAGE DATA(0x18) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) + .text .align PAGE_SIZE .globl relocate_kernel relocate_kernel: - movl 8(%esp), %ebp /* list of pages */ + /* Save the CPU context, used for jumping back */ + + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + pushf + + movl 20+8(%esp), %ebp /* list of pages */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %esp, ESP(%edi) + movl %cr0, %eax + movl %eax, CR0(%edi) + movl %cr3, %eax + movl %eax, CR3(%edi) + movl %cr4, %eax + movl %eax, CR4(%edi) #ifdef CONFIG_X86_PAE /* map the control page at its virtual address */ @@ -138,15 +172,25 @@ relocate_kernel: relocate_new_kernel: /* read the arguments and say goodbye to the stack */ - movl 4(%esp), %ebx /* page_list */ - movl 8(%esp), %ebp /* list of pages */ - movl 12(%esp), %edx /* start address */ - movl 16(%esp), %ecx /* cpu_has_pae */ + movl 20+4(%esp), %ebx /* page_list */ + movl 20+8(%esp), %ebp /* list of pages */ + movl 20+12(%esp), %edx /* start address */ + movl 20+16(%esp), %ecx /* cpu_has_pae */ + movl 20+20(%esp), %esi /* preserve_context */ /* zero out flags, and disable interrupts */ pushl $0 popfl + /* save some information for jumping back */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %edi, CP_VA_CONTROL_PAGE(%edi) + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, CP_PA_PGD(%edi) + movl PTR(PA_SWAP_PAGE)(%ebp), %eax + movl %eax, CP_PA_SWAP_PAGE(%edi) + movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) + /* get physical address of control page now */ /* this is impossible after page table switch */ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi @@ -197,8 +241,90 @@ identity_mapped: xorl %eax, %eax movl %eax, %cr3 + movl CP_PA_SWAP_PAGE(%edi), %eax + pushl %eax + pushl %ebx + call swap_pages + addl $8, %esp + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + + /* set all of the registers to known values */ + /* leave %esp alone */ + + testl %esi, %esi + jnz 1f + xorl %edi, %edi + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %ebp, %ebp + ret +1: + popl %edx + movl CP_PA_SWAP_PAGE(%edi), %esp + addl $PAGE_SIZE, %esp +2: + call *%edx + + /* get the re-entry point of the peer system */ + movl 0(%esp), %ebp + call 1f +1: + popl %ebx + subl $(1b - relocate_kernel), %ebx + movl CP_VA_CONTROL_PAGE(%ebx), %edi + lea PAGE_SIZE(%ebx), %esp + movl CP_PA_SWAP_PAGE(%ebx), %eax + movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx + pushl %eax + pushl %edx + call swap_pages + addl $8, %esp + movl CP_PA_PGD(%ebx), %eax + movl %eax, %cr3 + movl %cr0, %eax + orl $(1<<31), %eax + movl %eax, %cr0 + lea PAGE_SIZE(%edi), %esp + movl %edi, %eax + addl $(virtual_mapped - relocate_kernel), %eax + pushl %eax + ret + +virtual_mapped: + movl CR4(%edi), %eax + movl %eax, %cr4 + movl CR3(%edi), %eax + movl %eax, %cr3 + movl CR0(%edi), %eax + movl %eax, %cr0 + movl ESP(%edi), %esp + movl %ebp, %eax + + popf + popl %ebp + popl %edi + popl %esi + popl %ebx + ret + /* Do the copies */ - movl %ebx, %ecx +swap_pages: + movl 8(%esp), %edx + movl 4(%esp), %ecx + pushl %ebp + pushl %ebx + pushl %edi + pushl %esi + movl %ecx, %ebx jmp 1f 0: /* top, read another word from the indirection page */ @@ -226,27 +352,31 @@ identity_mapped: movl %ecx, %esi /* For every source page do a copy */ andl $0xfffff000, %esi + movl %edi, %eax + movl %esi, %ebp + + movl %edx, %edi movl $1024, %ecx rep ; movsl - jmp 0b - -3: - /* To be certain of avoiding problems with self-modifying code - * I need to execute a serializing instruction here. - * So I flush the TLB, it's handy, and not processor dependent. - */ - xorl %eax, %eax - movl %eax, %cr3 + movl %ebp, %edi + movl %eax, %esi + movl $1024, %ecx + rep ; movsl - /* set all of the registers to known values */ - /* leave %esp alone */ + movl %eax, %edi + movl %edx, %esi + movl $1024, %ecx + rep ; movsl - xorl %eax, %eax - xorl %ebx, %ebx - xorl %ecx, %ecx - xorl %edx, %edx - xorl %esi, %esi - xorl %edi, %edi - xorl %ebp, %ebp + lea PAGE_SIZE(%ebp), %esi + jmp 0b +3: + popl %esi + popl %edi + popl %ebx + popl %ebp ret + + .globl kexec_control_code_size +.set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 531b55b8e81..9838f2539df 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -57,12 +57,8 @@ #include <linux/slab.h> #include <linux/user.h> #include <linux/delay.h> -#include <linux/highmem.h> #include <linux/kallsyms.h> -#include <linux/edd.h> -#include <linux/iscsi_ibft.h> -#include <linux/kexec.h> #include <linux/cpufreq.h> #include <linux/dma-mapping.h> #include <linux/ctype.h> @@ -96,7 +92,7 @@ #include <asm/smp.h> #include <asm/desc.h> #include <asm/dma.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/mmu_context.h> #include <asm/proto.h> @@ -104,7 +100,6 @@ #include <asm/paravirt.h> #include <asm/percpu.h> -#include <asm/sections.h> #include <asm/topology.h> #include <asm/apicdef.h> #ifdef CONFIG_X86_64 @@ -450,7 +445,7 @@ static void __init reserve_early_setup_data(void) * @size: Size of the crashkernel memory to reserve. * Returns the base address on success, and -1ULL on failure. */ -unsigned long long find_and_reserve_crashkernel(unsigned long long size) +unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) { const unsigned long long alignment = 16<<20; /* 16M */ unsigned long long start = 0LL; @@ -579,6 +574,10 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif +static struct x86_quirks default_x86_quirks __initdata; + +struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -598,11 +597,11 @@ void __init setup_arch(char **cmdline_p) memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); pre_setup_arch_hook(); - early_cpu_init(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + early_cpu_init(); early_ioremap_init(); ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); @@ -666,14 +665,23 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = virt_to_phys(&__bss_start); bss_resource.end = virt_to_phys(&__bss_stop)-1; -#ifdef CONFIG_X86_64 - early_cpu_init(); -#endif strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; parse_early_param(); +#ifdef CONFIG_X86_64 + check_efer(); +#endif + +#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) + /* + * Must be before kernel pagetables are setup + * or fixmap area is touched. + */ + vmi_init(); +#endif + /* after early param, so could get panic from serial */ reserve_early_setup_data(); @@ -681,7 +689,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_LOCAL_APIC disable_apic = 1; #endif - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + setup_clear_cpu_cap(X86_FEATURE_APIC); } #ifdef CONFIG_PCI @@ -734,7 +742,6 @@ void __init setup_arch(char **cmdline_p) #else num_physpages = max_pfn; - check_efer(); /* How many end-of-memory variables you have, grandma! */ /* need this before calling reserve_initrd */ @@ -792,10 +799,6 @@ void __init setup_arch(char **cmdline_p) initmem_init(0, max_pfn); -#ifdef CONFIG_X86_64 - dma32_reserve_bootmem(); -#endif - #ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. @@ -810,21 +813,25 @@ void __init setup_arch(char **cmdline_p) #endif reserve_crashkernel(); +#ifdef CONFIG_X86_64 + /* + * dma32_reserve_bootmem() allocates bootmem which may conflict + * with the crashkernel command line, so do that after + * reserve_crashkernel() + */ + dma32_reserve_bootmem(); +#endif + reserve_ibft_region(); #ifdef CONFIG_KVM_CLOCK kvmclock_init(); #endif -#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) - /* - * Must be after max_low_pfn is determined, and before kernel - * pagetables are setup. - */ - vmi_init(); -#endif - + paravirt_pagetable_setup_start(swapper_pg_dir); paging_init(); + paravirt_pagetable_setup_done(swapper_pg_dir); + paravirt_post_allocator_init(); #ifdef CONFIG_X86_64 map_vsyscall(); @@ -854,23 +861,9 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); #endif -#ifdef CONFIG_X86_NUMAQ - /* - * need to check online nodes num, call it - * here before time_init/tsc_init - */ - numaq_tsc_disable(); -#endif - init_apic_mappings(); ioapic_init_mappings(); -#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32) - if (def_to_bigsmp) - printk(KERN_WARNING "More than 8 CPUs detected and " - "CONFIG_X86_PC cannot handle it.\nUse " - "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); -#endif kvm_guest_init(); e820_reserve_resources(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index cac68430d31..76e305e064f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -80,24 +80,6 @@ static void __init setup_per_cpu_maps(void) #endif } -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP -cpumask_t *cpumask_of_cpu_map __read_mostly; -EXPORT_SYMBOL(cpumask_of_cpu_map); - -/* requires nr_cpu_ids to be initialized */ -static void __init setup_cpumask_of_cpu(void) -{ - int i; - - /* alloc_bootmem zeroes memory */ - cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); - for (i = 0; i < nr_cpu_ids; i++) - cpu_set(i, cpumask_of_cpu_map[i]); -} -#else -static inline void setup_cpumask_of_cpu(void) { } -#endif - #ifdef CONFIG_X86_32 /* * Great future not-so-futuristic plan: make i386 and x86_64 do it @@ -197,9 +179,6 @@ void __init setup_per_cpu_areas(void) /* Setup node to cpumask map */ setup_node_to_cpumask_map(); - - /* Setup cpumask_of_cpu map */ - setup_cpumask_of_cpu(); } #endif @@ -227,8 +206,8 @@ static void __init setup_node_to_cpumask_map(void) /* allocate the map */ map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); - Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", - map, nr_node_ids); + pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", + map, nr_node_ids); /* node_to_cpumask() will now work */ node_to_cpumask_map = map; @@ -248,7 +227,7 @@ void __cpuinit numa_set_node(int cpu, int node) per_cpu(x86_cpu_to_node_map, cpu) = node; else - Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); + pr_debug("Setting node for non-present cpu %d\n", cpu); } void __cpuinit numa_clear_node(int cpu) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index d9237363096..6fb5bcdd893 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -212,7 +212,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused) badframe: if (show_unhandled_signals && printk_ratelimit()) { - printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:" + printk("%s%s[%d] bad frame in sigreturn frame:" "%p ip:%lx sp:%lx oeax:%lx", task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, current->comm, task_pid_nr(current), frame, regs->ip, @@ -657,18 +657,9 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { - /* Pending single-step? */ - if (thread_info_flags & _TIF_SINGLESTEP) { - regs->flags |= X86_EFLAGS_TF; - clear_thread_flag(TIF_SINGLESTEP); - } - /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); - if (thread_info_flags & _TIF_HRTICK_RESCHED) - hrtick_resched(); - clear_thread_flag(TIF_IRET); } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index e53b267662e..ca316b5b742 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -53,6 +53,68 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, return do_sigaltstack(uss, uoss, regs->sp); } +/* + * Signal frame handlers. + */ + +static inline int save_i387(struct _fpstate __user *buf) +{ + struct task_struct *tsk = current; + int err = 0; + + BUILD_BUG_ON(sizeof(struct user_i387_struct) != + sizeof(tsk->thread.xstate->fxsave)); + + if ((unsigned long)buf % 16) + printk("save_i387: bad fpstate %p\n", buf); + + if (!used_math()) + return 0; + clear_used_math(); /* trigger finit */ + if (task_thread_info(tsk)->status & TS_USEDFPU) { + err = save_i387_checking((struct i387_fxsave_struct __user *) + buf); + if (err) + return err; + task_thread_info(tsk)->status &= ~TS_USEDFPU; + stts(); + } else { + if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, + sizeof(struct i387_fxsave_struct))) + return -1; + } + return 1; +} + +/* + * This restores directly out of user space. Exceptions are handled. + */ +static inline int restore_i387(struct _fpstate __user *buf) +{ + struct task_struct *tsk = current; + int err; + + if (!used_math()) { + err = init_fpu(tsk); + if (err) + return err; + } + + if (!(task_thread_info(current)->status & TS_USEDFPU)) { + clts(); + task_thread_info(current)->status |= TS_USEDFPU; + } + err = restore_fpu_checking((__force struct i387_fxsave_struct *)buf); + if (unlikely(err)) { + /* + * Encountered an error while doing the restore from the + * user buffer, clear the fpu state. + */ + clear_fpu(tsk); + clear_used_math(); + } + return err; +} /* * Do a signal return; undo the signal stack. @@ -487,12 +549,6 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { - /* Pending single-step? */ - if (thread_info_flags & _TIF_SINGLESTEP) { - regs->flags |= X86_EFLAGS_TF; - clear_thread_flag(TIF_SINGLESTEP); - } - #ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) @@ -502,9 +558,6 @@ void do_notify_resume(struct pt_regs *regs, void *unused, /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); - - if (thread_info_flags & _TIF_HRTICK_RESCHED) - hrtick_resched(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 687376ab07e..7985c5b3f91 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -216,7 +216,7 @@ static void __cpuinit smp_callin(void) panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, phys_id, cpuid); } - Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); + pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); /* * STARTUP IPIs are fragile beasts as they might sometimes @@ -251,7 +251,7 @@ static void __cpuinit smp_callin(void) * boards) */ - Dprintk("CALLIN, before setup_local_APIC().\n"); + pr_debug("CALLIN, before setup_local_APIC().\n"); smp_callin_clear_local_apic(); setup_local_APIC(); end_local_APIC_setup(); @@ -266,7 +266,7 @@ static void __cpuinit smp_callin(void) local_irq_enable(); calibrate_delay(); local_irq_disable(); - Dprintk("Stack at about %p\n", &cpuid); + pr_debug("Stack at about %p\n", &cpuid); /* * Save our processor parameters @@ -326,12 +326,16 @@ static void __cpuinit start_secondary(void *unused) * for which cpus receive the IPI. Holding this * lock helps us to not include this cpu in a currently in progress * smp_call_function(). + * + * We need to hold vector_lock so there the set of online cpus + * does not change while we are assigning vectors to cpus. Holding + * this lock ensures we don't half assign or remove an irq from a cpu. */ ipi_call_lock_irq(); -#ifdef CONFIG_X86_IO_APIC - setup_vector_irq(smp_processor_id()); -#endif + lock_vector_lock(); + __setup_vector_irq(smp_processor_id()); cpu_set(smp_processor_id(), cpu_online_map); + unlock_vector_lock(); ipi_call_unlock_irq(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; @@ -438,7 +442,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) cpu_set(cpu, cpu_sibling_setup_map); if (smp_num_siblings > 1) { - for_each_cpu_mask(i, cpu_sibling_setup_map) { + for_each_cpu_mask_nr(i, cpu_sibling_setup_map) { if (c->phys_proc_id == cpu_data(i).phys_proc_id && c->cpu_core_id == cpu_data(i).cpu_core_id) { cpu_set(i, per_cpu(cpu_sibling_map, cpu)); @@ -461,7 +465,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) return; } - for_each_cpu_mask(i, cpu_sibling_setup_map) { + for_each_cpu_mask_nr(i, cpu_sibling_setup_map) { if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { cpu_set(i, c->llc_shared_map); @@ -513,7 +517,7 @@ static void impress_friends(void) /* * Allow the user to impress friends. */ - Dprintk("Before bogomips.\n"); + pr_debug("Before bogomips.\n"); for_each_possible_cpu(cpu) if (cpu_isset(cpu, cpu_callout_map)) bogosum += cpu_data(cpu).loops_per_jiffy; @@ -523,7 +527,7 @@ static void impress_friends(void) bogosum/(500000/HZ), (bogosum/(5000/HZ))%100); - Dprintk("Before bogocount - setting activated=1.\n"); + pr_debug("Before bogocount - setting activated=1.\n"); } static inline void __inquire_remote_apic(int apicid) @@ -546,8 +550,8 @@ static inline void __inquire_remote_apic(int apicid) printk(KERN_CONT "a previous APIC delivery may have failed\n"); - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); - apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); + apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); timeout = 0; do { @@ -579,29 +583,24 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) int maxlvt; /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); + apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); /* * Give the other CPU some time to accept the IPI. */ udelay(200); - /* - * Due to the Pentium erratum 3AP. - */ maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) { - apic_read_around(APIC_SPIV); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); - } accept_status = (apic_read(APIC_ESR) & 0xEF); - Dprintk("NMI sent.\n"); + pr_debug("NMI sent.\n"); if (send_status) printk(KERN_ERR "APIC never delivered???\n"); @@ -625,42 +624,44 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) return send_status; } + maxlvt = lapic_get_maxlvt(); + /* * Be paranoid about clearing APIC errors. */ if (APIC_INTEGRATED(apic_version[phys_apicid])) { - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } - Dprintk("Asserting INIT.\n"); + pr_debug("Asserting INIT.\n"); /* * Turn INIT on target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* * Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT - | APIC_DM_INIT); + apic_write(APIC_ICR, + APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); mdelay(10); - Dprintk("Deasserting INIT.\n"); + pr_debug("Deasserting INIT.\n"); /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); mb(); @@ -687,55 +688,47 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) /* * Run STARTUP IPI loop. */ - Dprintk("#startup loops: %d.\n", num_starts); - - maxlvt = lapic_get_maxlvt(); + pr_debug("#startup loops: %d.\n", num_starts); for (j = 1; j <= num_starts; j++) { - Dprintk("Sending STARTUP #%d.\n", j); - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); + pr_debug("Sending STARTUP #%d.\n", j); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - Dprintk("After apic_write.\n"); + pr_debug("After apic_write.\n"); /* * STARTUP IPI */ /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_STARTUP - | (start_eip >> 12)); + apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12)); /* * Give the other CPU some time to accept the IPI. */ udelay(300); - Dprintk("Startup point 1.\n"); + pr_debug("Startup point 1.\n"); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); /* * Give the other CPU some time to accept the IPI. */ udelay(200); - /* - * Due to the Pentium erratum 3AP. - */ - if (maxlvt > 3) { - apic_read_around(APIC_SPIV); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); - } accept_status = (apic_read(APIC_ESR) & 0xEF); if (send_status || accept_status) break; } - Dprintk("After Startup.\n"); + pr_debug("After Startup.\n"); if (send_status) printk(KERN_ERR "APIC never delivered???\n"); @@ -763,12 +756,20 @@ static void __cpuinit do_fork_idle(struct work_struct *work) } #ifdef CONFIG_X86_64 + +/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */ +static void __ref free_bootmem_pda(struct x8664_pda *oldpda) +{ + if (!after_bootmem) + free_bootmem((unsigned long)oldpda, sizeof(*oldpda)); +} + /* * Allocate node local memory for the AP pda. * * Must be called after the _cpu_pda pointer table is initialized. */ -static int __cpuinit get_local_pda(int cpu) +int __cpuinit get_local_pda(int cpu) { struct x8664_pda *oldpda, *newpda; unsigned long size = sizeof(struct x8664_pda); @@ -791,8 +792,7 @@ static int __cpuinit get_local_pda(int cpu) if (oldpda) { memcpy(newpda, oldpda, size); - if (!after_bootmem) - free_bootmem((unsigned long)oldpda, size); + free_bootmem_pda(oldpda); } newpda->in_bootmem = 0; @@ -886,7 +886,7 @@ do_rest: if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { - Dprintk("Setting warm reset code and vector.\n"); + pr_debug("Setting warm reset code and vector.\n"); store_NMI_vector(&nmi_high, &nmi_low); @@ -907,9 +907,9 @@ do_rest: /* * allow APs to start initializing. */ - Dprintk("Before Callout %d.\n", cpu); + pr_debug("Before Callout %d.\n", cpu); cpu_set(cpu, cpu_callout_map); - Dprintk("After Callout %d.\n", cpu); + pr_debug("After Callout %d.\n", cpu); /* * Wait 5s total for a response @@ -922,10 +922,10 @@ do_rest: if (cpu_isset(cpu, cpu_callin_map)) { /* number CPUs logically, starting from 1 (BSP is 0) */ - Dprintk("OK.\n"); + pr_debug("OK.\n"); printk(KERN_INFO "CPU%d: ", cpu); print_cpu_info(&cpu_data(cpu)); - Dprintk("CPU has booted.\n"); + pr_debug("CPU has booted.\n"); } else { boot_error = 1; if (*((volatile unsigned char *)trampoline_base) @@ -970,7 +970,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) WARN_ON(irqs_disabled()); - Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); + pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || !physid_isset(apicid, phys_cpu_present_map)) { @@ -982,7 +982,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) * Already booted CPU? */ if (cpu_isset(cpu, cpu_callin_map)) { - Dprintk("do_boot_cpu %d Already started\n", cpu); + pr_debug("do_boot_cpu %d Already started\n", cpu); return -ENOSYS; } @@ -1009,7 +1009,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) err = do_boot_cpu(apicid, cpu); #endif if (err) { - Dprintk("do_boot_cpu failed %d\n", err); + pr_debug("do_boot_cpu failed %d\n", err); return -EIO; } @@ -1055,6 +1055,34 @@ static __init void disable_smp(void) static int __init smp_sanity_check(unsigned max_cpus) { preempt_disable(); + +#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32) + if (def_to_bigsmp && nr_cpu_ids > 8) { + unsigned int cpu; + unsigned nr; + + printk(KERN_WARNING + "More than 8 CPUs detected - skipping them.\n" + "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n"); + + nr = 0; + for_each_present_cpu(cpu) { + if (nr >= 8) + cpu_clear(cpu, cpu_present_map); + nr++; + } + + nr = 0; + for_each_possible_cpu(cpu) { + if (nr >= 8) + cpu_clear(cpu, cpu_possible_map); + nr++; + } + + nr_cpu_ids = 8; + } +#endif + if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk(KERN_WARNING "weird, boot CPU (#%d) not listed" "by the BIOS.\n", hard_smp_processor_id()); @@ -1193,6 +1221,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) printk(KERN_INFO "CPU%d: ", 0); print_cpu_info(&cpu_data(0)); setup_boot_clock(); + + if (is_uv_system()) + uv_system_init(); out: preempt_enable(); } @@ -1213,7 +1244,7 @@ void __init native_smp_prepare_boot_cpu(void) void __init native_smp_cpus_done(unsigned int max_cpus) { - Dprintk("Boot done.\n"); + pr_debug("Boot done.\n"); impress_friends(); smp_checks(); @@ -1230,7 +1261,7 @@ static void remove_siblinginfo(int cpu) int sibling; struct cpuinfo_x86 *c = &cpu_data(cpu); - for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) { + for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); /*/ * last thread sibling in this cpu core going down @@ -1239,7 +1270,7 @@ static void remove_siblinginfo(int cpu) cpu_data(sibling).booted_cores--; } - for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu)) + for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); cpus_clear(per_cpu(cpu_sibling_map, cpu)); cpus_clear(per_cpu(cpu_core_map, cpu)); @@ -1311,7 +1342,7 @@ static void __ref remove_cpu_from_maps(int cpu) cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); /* was set by cpu_init() */ - clear_bit(cpu, (unsigned long *)&cpu_initialized); + cpu_clear(cpu, cpu_initialized); numa_remove_cpu(cpu); } @@ -1347,7 +1378,9 @@ int __cpu_disable(void) remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ + lock_vector_lock(); remove_cpu_from_maps(cpu); + unlock_vector_lock(); fixup_irqs(cpu_online_map); return 0; } @@ -1381,16 +1414,3 @@ void __cpu_die(unsigned int cpu) BUG(); } #endif - -/* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. - */ -static int __init parse_maxcpus(char *arg) -{ - extern unsigned int maxcpus; - - maxcpus = simple_strtoul(arg, NULL, 0); - return 0; -} -early_param("maxcpus", parse_maxcpus); diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c index 99941b37eca..397e309839d 100644 --- a/arch/x86/kernel/smpcommon.c +++ b/arch/x86/kernel/smpcommon.c @@ -8,18 +8,21 @@ DEFINE_PER_CPU(unsigned long, this_cpu_off); EXPORT_PER_CPU_SYMBOL(this_cpu_off); -/* Initialize the CPU's GDT. This is either the boot CPU doing itself - (still using the master per-cpu area), or a CPU doing it for a - secondary which will soon come up. */ +/* + * Initialize the CPU's GDT. This is either the boot CPU doing itself + * (still using the master per-cpu area), or a CPU doing it for a + * secondary which will soon come up. + */ __cpuinit void init_gdt(int cpu) { - struct desc_struct *gdt = get_cpu_gdt_table(cpu); + struct desc_struct gdt; - pack_descriptor(&gdt[GDT_ENTRY_PERCPU], - __per_cpu_offset[cpu], 0xFFFFF, + pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF, 0x2 | DESCTYPE_S, 0x8); + gdt.s = 1; - gdt[GDT_ENTRY_PERCPU].s = 1; + write_gdt_entry(get_cpu_gdt_table(cpu), + GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; per_cpu(cpu_number, cpu) = cpu; diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c deleted file mode 100644 index 8b137891791..00000000000 --- a/arch/x86/kernel/smpcommon_32.c +++ /dev/null @@ -1 +0,0 @@ - diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 92c20fee678..e8b9863ef8c 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) static int enable_single_step(struct task_struct *child) { struct pt_regs *regs = task_pt_regs(child); + unsigned long oflags; + + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state so we don't wrongly set TIF_FORCED_TF below. + * If enable_single_step() was used last and that is what + * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are + * already set and our bookkeeping is fine. + */ + if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP))) + regs->flags |= X86_EFLAGS_TF; /* * Always set TIF_SINGLESTEP - this guarantees that @@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child) */ set_tsk_thread_flag(child, TIF_SINGLESTEP); - /* - * If TF was already set, don't do anything else - */ - if (regs->flags & X86_EFLAGS_TF) - return 0; + oflags = regs->flags; /* Set TF on the kernel stack.. */ regs->flags |= X86_EFLAGS_TF; @@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child) * ..but if TF is changed by the instruction we will trace, * don't mark it as being "us" that set it, so that we * won't clear it by hand later. + * + * Note that if we don't actually execute the popf because + * of a signal arriving right now or suchlike, we will lose + * track of the fact that it really was "us" that set it. */ - if (is_setting_trap_flag(child, regs)) + if (is_setting_trap_flag(child, regs)) { + clear_tsk_thread_flag(child, TIF_FORCED_TF); return 0; + } + + /* + * If TF was already set, check whether it was us who set it. + * If not, we should never attempt a block step. + */ + if (oflags & X86_EFLAGS_TF) + return test_tsk_thread_flag(child, TIF_FORCED_TF); set_tsk_thread_flag(child, TIF_FORCED_TF); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index adff5562f5f..d44395ff34c 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -326,3 +326,9 @@ ENTRY(sys_call_table) .long sys_fallocate .long sys_timerfd_settime /* 325 */ .long sys_timerfd_gettime + .long sys_signalfd4 + .long sys_eventfd2 + .long sys_epoll_create1 + .long sys_dup3 /* 330 */ + .long sys_pipe2 + .long sys_inotify_init1 diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 059ca6ee59b..ffe3c664afc 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -129,6 +129,7 @@ void __init hpet_time_init(void) */ void __init time_init(void) { + pre_time_init_hook(); tsc_init(); late_time_init = choose_time_init(); } diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index d0fbb7712ab..8b8c0d6640f 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -17,6 +17,7 @@ #include <asm/genapic.h> #include <asm/idle.h> #include <asm/tsc.h> +#include <asm/irq_vectors.h> #include <mach_apic.h> @@ -783,7 +784,7 @@ static int __init uv_bau_init(void) uv_init_blade(blade, node, cur_cpu); cur_cpu += uv_blade_nr_possible_cpus(blade); } - set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); + alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); uv_enable_timeouts(); return 0; diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 8a768973c4f..03df8e45e5a 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -58,6 +58,7 @@ #include <asm/nmi.h> #include <asm/smp.h> #include <asm/io.h> +#include <asm/traps.h> #include "mach_traps.h" @@ -77,26 +78,6 @@ char ignore_fpu_irq; gate_desc idt_table[256] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void nmi(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); -asmlinkage void alignment_check(void); -asmlinkage void spurious_interrupt_bug(void); -asmlinkage void machine_check(void); - int panic_on_unrecovered_nmi; int kstack_depth_to_print = 24; static unsigned int code_bytes = 64; @@ -256,7 +237,7 @@ static const struct stacktrace_ops print_trace_ops = { static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) + unsigned long *stack, unsigned long bp, char *log_lvl) { dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); printk("%s =======================\n", log_lvl); @@ -383,6 +364,54 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + unsigned long flags; + + oops_enter(); + + if (die_owner != raw_smp_processor_id()) { + console_verbose(); + raw_local_irq_save(flags); + __raw_spin_lock(&die_lock); + die_owner = smp_processor_id(); + die_nest_count = 0; + bust_spinlocks(1); + } else { + raw_local_irq_save(flags); + } + die_nest_count++; + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + + if (!regs) + return; + + if (kexec_should_crash(current)) + crash_kexec(regs); + + if (in_interrupt()) + panic("Fatal exception in interrupt"); + + if (panic_on_oops) + panic("Fatal exception"); + + oops_exit(); + do_exit(signr); +} + int __kprobes __die(const char *str, struct pt_regs *regs, long err) { unsigned short ss; @@ -423,31 +452,9 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) */ void die(const char *str, struct pt_regs *regs, long err) { - static struct { - raw_spinlock_t lock; - u32 lock_owner; - int lock_owner_depth; - } die = { - .lock = __RAW_SPIN_LOCK_UNLOCKED, - .lock_owner = -1, - .lock_owner_depth = 0 - }; - unsigned long flags; - - oops_enter(); - - if (die.lock_owner != raw_smp_processor_id()) { - console_verbose(); - raw_local_irq_save(flags); - __raw_spin_lock(&die.lock); - die.lock_owner = smp_processor_id(); - die.lock_owner_depth = 0; - bust_spinlocks(1); - } else { - raw_local_irq_save(flags); - } + unsigned long flags = oops_begin(); - if (++die.lock_owner_depth < 3) { + if (die_nest_count < 3) { report_bug(regs->ip, regs); if (__die(str, regs, err)) @@ -456,26 +463,7 @@ void die(const char *str, struct pt_regs *regs, long err) printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); } - bust_spinlocks(0); - die.lock_owner = -1; - add_taint(TAINT_DIE); - __raw_spin_unlock(&die.lock); - raw_local_irq_restore(flags); - - if (!regs) - return; - - if (kexec_should_crash(current)) - crash_kexec(regs); - - if (in_interrupt()) - panic("Fatal exception in interrupt"); - - if (panic_on_oops) - panic("Fatal exception"); - - oops_exit(); - do_exit(SIGSEGV); + oops_end(flags, regs, SIGSEGV); } static inline void diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 2696a683778..513caaca711 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -51,30 +51,10 @@ #include <asm/pgalloc.h> #include <asm/proto.h> #include <asm/pda.h> +#include <asm/traps.h> #include <mach_traps.h> -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void nmi(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -asmlinkage void double_fault(void); -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); -asmlinkage void alignment_check(void); -asmlinkage void spurious_interrupt_bug(void); -asmlinkage void machine_check(void); - int panic_on_unrecovered_nmi; int kstack_depth_to_print = 12; static unsigned int code_bytes = 64; @@ -355,17 +335,24 @@ static const struct stacktrace_ops print_trace_ops = { .address = print_trace_address, }; -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) +static void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) { printk("\nCall Trace:\n"); - dump_trace(task, regs, stack, bp, &print_trace_ops, NULL); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); printk("\n"); } +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + static void -_show_stack(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp) +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; @@ -399,12 +386,12 @@ _show_stack(struct task_struct *task, struct pt_regs *regs, printk(" %016lx", *stack++); touch_nmi_watchdog(); } - show_trace(task, regs, sp, bp); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); } void show_stack(struct task_struct *task, unsigned long *sp) { - _show_stack(task, NULL, sp, 0); + show_stack_log_lvl(task, NULL, sp, 0, ""); } /* @@ -454,7 +441,8 @@ void show_registers(struct pt_regs *regs) u8 *ip; printk("Stack: "); - _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); + show_stack_log_lvl(NULL, regs, (unsigned long *)sp, + regs->bp, ""); printk("\n"); printk(KERN_EMERG "Code: "); @@ -518,7 +506,7 @@ unsigned __kprobes long oops_begin(void) } void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ +{ die_owner = -1; bust_spinlocks(0); die_nest_count--; @@ -1143,7 +1131,14 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ - restore_fpu_checking(&me->thread.xstate->fxsave); + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) { + stts(); + force_sig(SIGSEGV, me); + return; + } task_thread_info(me)->status |= TS_USEDFPU; me->fpu_counter++; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7603c055390..8f98e9de1b8 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -104,7 +104,7 @@ __setup("notsc", notsc_setup); /* * Read TSC and the reference counters. Take care of SMI disturbance */ -static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) +static u64 tsc_read_refs(u64 *pm, u64 *hpet) { u64 t1, t2; int i; @@ -122,80 +122,216 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) return ULLONG_MAX; } -/** - * native_calibrate_tsc - calibrate the tsc on boot +/* + * Try to calibrate the TSC against the Programmable + * Interrupt Timer and return the frequency of the TSC + * in kHz. + * + * Return ULONG_MAX on failure to calibrate. */ -unsigned long native_calibrate_tsc(void) +static unsigned long pit_calibrate_tsc(void) { - unsigned long flags; - u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2; - int hpet = is_hpet_enabled(); - unsigned int tsc_khz_val = 0; - - local_irq_save(flags); - - tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); + u64 tsc, t1, t2, delta; + unsigned long tscmin, tscmax; + int pitcnt; + /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); + /* + * Setup CTC channel 2* for mode 0, (interrupt on terminal + * count mode), binary count. Set the latch register to 50ms + * (LSB then MSB) to begin countdown. + */ outb(0xb0, 0x43); outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); - tr1 = get_cycles(); - while ((inb(0x61) & 0x20) == 0); - tr2 = get_cycles(); - tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); + tsc = t1 = t2 = get_cycles(); - local_irq_restore(flags); + pitcnt = 0; + tscmax = 0; + tscmin = ULONG_MAX; + while ((inb(0x61) & 0x20) == 0) { + t2 = get_cycles(); + delta = t2 - tsc; + tsc = t2; + if ((unsigned long) delta < tscmin) + tscmin = (unsigned int) delta; + if ((unsigned long) delta > tscmax) + tscmax = (unsigned int) delta; + pitcnt++; + } /* - * Preset the result with the raw and inaccurate PIT - * calibration value + * Sanity checks: + * + * If we were not able to read the PIT more than 5000 + * times, then we have been hit by a massive SMI + * + * If the maximum is 10 times larger than the minimum, + * then we got hit by an SMI as well. */ - delta = (tr2 - tr1); + if (pitcnt < 5000 || tscmax > 10 * tscmin) + return ULONG_MAX; + + /* Calculate the PIT value */ + delta = t2 - t1; do_div(delta, 50); - tsc_khz_val = delta; + return delta; +} + + +/** + * native_calibrate_tsc - calibrate the tsc on boot + */ +unsigned long native_calibrate_tsc(void) +{ + u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2; + unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; + unsigned long flags; + int hpet = is_hpet_enabled(), i; + + /* + * Run 5 calibration loops to get the lowest frequency value + * (the best estimate). We use two different calibration modes + * here: + * + * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and + * load a timeout of 50ms. We read the time right after we + * started the timer and wait until the PIT count down reaches + * zero. In each wait loop iteration we read the TSC and check + * the delta to the previous read. We keep track of the min + * and max values of that delta. The delta is mostly defined + * by the IO time of the PIT access, so we can detect when a + * SMI/SMM disturbance happend between the two reads. If the + * maximum time is significantly larger than the minimum time, + * then we discard the result and have another try. + * + * 2) Reference counter. If available we use the HPET or the + * PMTIMER as a reference to check the sanity of that value. + * We use separate TSC readouts and check inside of the + * reference read for a SMI/SMM disturbance. We dicard + * disturbed values here as well. We do that around the PIT + * calibration delay loop as we have to wait for a certain + * amount of time anyway. + */ + for (i = 0; i < 5; i++) { + unsigned long tsc_pit_khz; + + /* + * Read the start value and the reference count of + * hpet/pmtimer when available. Then do the PIT + * calibration, which will take at least 50ms, and + * read the end value. + */ + local_irq_save(flags); + tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); + tsc_pit_khz = pit_calibrate_tsc(); + tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); + local_irq_restore(flags); + + /* Pick the lowest PIT TSC calibration so far */ + tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); + + /* hpet or pmtimer available ? */ + if (!hpet && !pm1 && !pm2) + continue; + + /* Check, whether the sampling was disturbed by an SMI */ + if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) + continue; + + tsc2 = (tsc2 - tsc1) * 1000000LL; + + if (hpet) { + if (hpet2 < hpet1) + hpet2 += 0x100000000ULL; + hpet2 -= hpet1; + tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); + do_div(tsc1, 1000000); + } else { + if (pm2 < pm1) + pm2 += (u64)ACPI_PM_OVRRUN; + pm2 -= pm1; + tsc1 = pm2 * 1000000000LL; + do_div(tsc1, PMTMR_TICKS_PER_SEC); + } + + do_div(tsc2, tsc1); + tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); + } + + /* + * Now check the results. + */ + if (tsc_pit_min == ULONG_MAX) { + /* PIT gave no useful value */ + printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); + + /* We don't have an alternative source, disable TSC */ + if (!hpet && !pm1 && !pm2) { + printk("TSC: No reference (HPET/PMTIMER) available\n"); + return 0; + } + + /* The alternative source failed as well, disable TSC */ + if (tsc_ref_min == ULONG_MAX) { + printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " + "failed due to SMI disturbance.\n"); + return 0; + } + + /* Use the alternative source */ + printk(KERN_INFO "TSC: using %s reference calibration\n", + hpet ? "HPET" : "PMTIMER"); + + return tsc_ref_min; + } - /* hpet or pmtimer available ? */ + /* We don't have an alternative source, use the PIT calibration value */ if (!hpet && !pm1 && !pm2) { - printk(KERN_INFO "TSC calibrated against PIT\n"); - goto out; + printk(KERN_INFO "TSC: Using PIT calibration value\n"); + return tsc_pit_min; } - /* Check, whether the sampling was disturbed by an SMI */ - if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) { - printk(KERN_WARNING "TSC calibration disturbed by SMI, " - "using PIT calibration result\n"); - goto out; + /* The alternative source failed, use the PIT calibration value */ + if (tsc_ref_min == ULONG_MAX) { + printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due " + "to SMI disturbance. Using PIT calibration\n"); + return tsc_pit_min; } - tsc2 = (tsc2 - tsc1) * 1000000LL; - - if (hpet) { - printk(KERN_INFO "TSC calibrated against HPET\n"); - if (hpet2 < hpet1) - hpet2 += 0x100000000ULL; - hpet2 -= hpet1; - tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); - do_div(tsc1, 1000000); - } else { - printk(KERN_INFO "TSC calibrated against PM_TIMER\n"); - if (pm2 < pm1) - pm2 += (u64)ACPI_PM_OVRRUN; - pm2 -= pm1; - tsc1 = pm2 * 1000000000LL; - do_div(tsc1, PMTMR_TICKS_PER_SEC); + /* Check the reference deviation */ + delta = ((u64) tsc_pit_min) * 100; + do_div(delta, tsc_ref_min); + + /* + * If both calibration results are inside a 5% window, the we + * use the lower frequency of those as it is probably the + * closest estimate. + */ + if (delta >= 95 && delta <= 105) { + printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n", + hpet ? "HPET" : "PMTIMER"); + printk(KERN_INFO "TSC: using %s calibration value\n", + tsc_pit_min <= tsc_ref_min ? "PIT" : + hpet ? "HPET" : "PMTIMER"); + return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min; } - do_div(tsc2, tsc1); - tsc_khz_val = tsc2; + printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", + hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); -out: - return tsc_khz_val; + /* + * The calibration values differ too much. In doubt, we use + * the PIT value as we know that there are PMTIMERs around + * running at double speed. + */ + printk(KERN_INFO "TSC: Using PIT calibration value\n"); + return tsc_pit_min; } - #ifdef CONFIG_X86_32 /* Only called from the Powernow K7 cpu freq driver */ int recalibrate_cpu_khz(void) @@ -314,7 +450,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, mark_tsc_unstable("cpufreq changes"); } - set_cyc2ns_scale(tsc_khz_ref, freq->cpu); + set_cyc2ns_scale(tsc_khz, freq->cpu); return 0; } @@ -325,6 +461,10 @@ static struct notifier_block time_cpufreq_notifier_block = { static int __init cpufreq_tsc(void) { + if (!cpu_has_tsc) + return 0; + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return 0; cpufreq_register_notifier(&time_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); return 0; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 0577825cf89..9ffb01c31c4 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -88,11 +88,9 @@ static __cpuinit void check_tsc_warp(void) __raw_spin_unlock(&sync_lock); } } - if (!(now-start)) { - printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", + WARN(!(now-start), + "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", now-start, end-start); - WARN_ON(1); - } } /* diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index e94bdb6add1..594ef47f0a6 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -73,7 +73,7 @@ int is_visws_box(void) return visws_board_type >= 0; } -static int __init visws_time_init_quirk(void) +static int __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -93,7 +93,7 @@ static int __init visws_time_init_quirk(void) return 0; } -static int __init visws_pre_intr_init_quirk(void) +static int __init visws_pre_intr_init(void) { init_VISWS_APIC_irqs(); @@ -114,7 +114,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size); long long mem_size __initdata = 0; -static char * __init visws_memory_setup_quirk(void) +static char * __init visws_memory_setup(void) { long long gfx_mem_size = 8 * MB; @@ -176,7 +176,7 @@ static void visws_machine_power_off(void) outl(PIIX_SPECIAL_STOP, 0xCFC); } -static int __init visws_get_smp_config_quirk(unsigned int early) +static int __init visws_get_smp_config(unsigned int early) { /* * Prevent MP-table parsing by the generic code: @@ -184,15 +184,13 @@ static int __init visws_get_smp_config_quirk(unsigned int early) return 1; } -extern unsigned int __cpuinitdata maxcpus; - /* * The Visual Workstation is Intel MP compliant in the hardware * sense, but it doesn't have a BIOS(-configuration table). * No problem for Linux. */ -static void __init MP_processor_info (struct mpc_config_processor *m) +static void __init MP_processor_info(struct mpc_config_processor *m) { int ver, logical_apicid; physid_mask_t apic_cpus; @@ -232,7 +230,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m) apic_version[m->mpc_apicid] = ver; } -int __init visws_find_smp_config_quirk(unsigned int reserve) +static int __init visws_find_smp_config(unsigned int reserve) { struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); @@ -244,8 +242,8 @@ int __init visws_find_smp_config_quirk(unsigned int reserve) ncpus = CO_CPU_MAX; } - if (ncpus > maxcpus) - ncpus = maxcpus; + if (ncpus > setup_max_cpus) + ncpus = setup_max_cpus; #ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 1; @@ -258,7 +256,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve) return 1; } -extern int visws_trap_init_quirk(void); +static int visws_trap_init(void); + +static struct x86_quirks visws_x86_quirks __initdata = { + .arch_time_init = visws_time_init, + .arch_pre_intr_init = visws_pre_intr_init, + .arch_memory_setup = visws_memory_setup, + .arch_intr_init = NULL, + .arch_trap_init = visws_trap_init, + .mach_get_smp_config = visws_get_smp_config, + .mach_find_smp_config = visws_find_smp_config, +}; void __init visws_early_detect(void) { @@ -272,16 +280,10 @@ void __init visws_early_detect(void) /* * Install special quirks for timer, interrupt and memory setup: - */ - arch_time_init_quirk = visws_time_init_quirk; - arch_pre_intr_init_quirk = visws_pre_intr_init_quirk; - arch_memory_setup_quirk = visws_memory_setup_quirk; - - /* * Fall back to generic behavior for traps: + * Override generic MP-table parsing: */ - arch_intr_init_quirk = NULL; - arch_trap_init_quirk = visws_trap_init_quirk; + x86_quirks = &visws_x86_quirks; /* * Install reboot quirks: @@ -294,12 +296,6 @@ void __init visws_early_detect(void) */ no_broadcast = 0; - /* - * Override generic MP-table parsing: - */ - mach_get_smp_config_quirk = visws_get_smp_config_quirk; - mach_find_smp_config_quirk = visws_find_smp_config_quirk; - #ifdef CONFIG_X86_IO_APIC /* * Turn off IO-APIC detection and initialization: @@ -426,7 +422,7 @@ static __init void cobalt_init(void) co_apic_read(CO_APIC_ID)); } -int __init visws_trap_init_quirk(void) +static int __init visws_trap_init(void) { lithium_init(); cobalt_init(); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b15346092b7..6ca515d6db5 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -37,6 +37,7 @@ #include <asm/timer.h> #include <asm/vmi_time.h> #include <asm/kmap_types.h> +#include <asm/setup.h> /* Convenient for calling VMI functions indirectly in the ROM */ typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); @@ -683,7 +684,7 @@ void vmi_bringup(void) { /* We must establish the lowmem mapping for MMU ops to work */ if (vmi_ops.set_linear_mapping) - vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0); + vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0); } /* @@ -906,7 +907,6 @@ static inline int __init activate_vmi(void) #ifdef CONFIG_X86_LOCAL_APIC para_fill(pv_apic_ops.apic_read, APICRead); para_fill(pv_apic_ops.apic_write, APICWrite); - para_fill(pv_apic_ops.apic_write_atomic, APICWrite); #endif /* diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index cdb2363697d..af5bdad8460 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -209,3 +209,11 @@ SECTIONS DWARF_DEBUG } + +#ifdef CONFIG_KEXEC +/* Link time checks */ +#include <asm/kexec.h> + +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big") +#endif diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 0c029e8959c..7766d36983f 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -61,7 +61,7 @@ static void vsmp_irq_enable(void) native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); } -static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, +static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { switch (type) { |