diff options
Diffstat (limited to 'arch/x86/kernel')
59 files changed, 2051 insertions, 1128 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 801127cd975..80a93dc9907 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -11,6 +11,8 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg +CFLAGS_REMOVE_pvclock.o = -pg +CFLAGS_REMOVE_kvmclock.o = -pg CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif @@ -32,7 +34,8 @@ GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o -obj-y += setup.o x86_init.o i8259.o irqinit.o +obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o +obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o @@ -83,11 +86,12 @@ obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_APB_TIMER) += apb_timer.o -obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o @@ -103,6 +107,7 @@ obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o obj-$(CONFIG_X86_MRST) += mrst.o @@ -119,7 +124,6 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o - obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index fb7a5f052e2..5812404a0d4 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -13,6 +13,7 @@ #include <acpi/processor.h> #include <asm/acpi.h> +#include <asm/mwait.h> /* * Initialize bm_flags based on the CPU cache properties @@ -61,20 +62,10 @@ struct cstate_entry { unsigned int ecx; } states[ACPI_PROCESSOR_MAX_POWER]; }; -static struct cstate_entry *cpu_cstate_entry; /* per CPU ptr */ +static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */ static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; -#define MWAIT_SUBSTATE_MASK (0xf) -#define MWAIT_CSTATE_MASK (0xf) -#define MWAIT_SUBSTATE_SIZE (4) - -#define CPUID_MWAIT_LEAF (5) -#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1) -#define CPUID5_ECX_INTERRUPT_BREAK (0x2) - -#define MWAIT_ECX_INTERRUPT_BREAK (0x1) - #define NATIVE_CSTATE_BEYOND_HALT (2) static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f65ab8b014c..a36bb90aef5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -195,7 +195,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; -static void *text_poke_early(void *addr, const void *opcode, size_t len); +void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. This runs before SMP is initialized to avoid SMP problems with @@ -522,7 +522,7 @@ void __init alternative_instructions(void) * instructions. And on the local CPU you need to be protected again NMI or MCE * handlers seeing an inconsistent instruction while you patch. */ -static void *__init_or_module text_poke_early(void *addr, const void *opcode, +void *__init_or_module text_poke_early(void *addr, const void *opcode, size_t len) { unsigned long flags; @@ -637,7 +637,72 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) tpp.len = len; atomic_set(&stop_machine_first, 1); wrote_text = 0; - stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); + /* Use __stop_machine() because the caller already got online_cpus. */ + __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); return addr; } +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) + +unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; + +void __init arch_init_ideal_nop5(void) +{ + extern const unsigned char ftrace_test_p6nop[]; + extern const unsigned char ftrace_test_nop5[]; + extern const unsigned char ftrace_test_jmp[]; + int faulted = 0; + + /* + * There is no good nop for all x86 archs. + * We will default to using the P6_NOP5, but first we + * will test to make sure that the nop will actually + * work on this CPU. If it faults, we will then + * go to a lesser efficient 5 byte nop. If that fails + * we then just use a jmp as our nop. This isn't the most + * efficient nop, but we can not use a multi part nop + * since we would then risk being preempted in the middle + * of that nop, and if we enabled tracing then, it might + * cause a system crash. + * + * TODO: check the cpuid to determine the best nop. + */ + asm volatile ( + "ftrace_test_jmp:" + "jmp ftrace_test_p6nop\n" + "nop\n" + "nop\n" + "nop\n" /* 2 byte jmp + 3 bytes */ + "ftrace_test_p6nop:" + P6_NOP5 + "jmp 1f\n" + "ftrace_test_nop5:" + ".byte 0x66,0x66,0x66,0x66,0x90\n" + "1:" + ".section .fixup, \"ax\"\n" + "2: movl $1, %0\n" + " jmp ftrace_test_nop5\n" + "3: movl $2, %0\n" + " jmp 1b\n" + ".previous\n" + _ASM_EXTABLE(ftrace_test_p6nop, 2b) + _ASM_EXTABLE(ftrace_test_nop5, 3b) + : "=r"(faulted) : "0" (faulted)); + + switch (faulted) { + case 0: + pr_info("converting mcount calls to 0f 1f 44 00 00\n"); + memcpy(ideal_nop5, ftrace_test_p6nop, IDEAL_NOP_SIZE_5); + break; + case 1: + pr_info("converting mcount calls to 66 66 66 66 90\n"); + memcpy(ideal_nop5, ftrace_test_nop5, IDEAL_NOP_SIZE_5); + break; + case 2: + pr_info("converting mcount calls to jmp . + 5\n"); + memcpy(ideal_nop5, ftrace_test_jmp, IDEAL_NOP_SIZE_5); + break; + } + +} +#endif diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index fa044e1e30a..d2fdb0826df 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -1953,6 +1953,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, size_t size, int dir) { + dma_addr_t flush_addr; dma_addr_t i, start; unsigned int pages; @@ -1960,6 +1961,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, (dma_addr + size > dma_dom->aperture_size)) return; + flush_addr = dma_addr; pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); dma_addr &= PAGE_MASK; start = dma_addr; @@ -1974,7 +1976,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { - iommu_flush_pages(&dma_dom->domain, dma_addr, size); + iommu_flush_pages(&dma_dom->domain, flush_addr, size); dma_dom->need_flush = false; } } diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 3cc63e2b8dd..3cb482e123d 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size) return 1UL << shift; } +/* Access to l1 and l2 indexed register spaces */ + +static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address) +{ + u32 val; + + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); + pci_read_config_dword(iommu->dev, 0xfc, &val); + return val; +} + +static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val) +{ + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31)); + pci_write_config_dword(iommu->dev, 0xfc, val); + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); +} + +static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address) +{ + u32 val; + + pci_write_config_dword(iommu->dev, 0xf0, address); + pci_read_config_dword(iommu->dev, 0xf4, &val); + return val; +} + +static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val) +{ + pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8)); + pci_write_config_dword(iommu->dev, 0xf4, val); +} + /**************************************************************************** * * AMD IOMMU MMIO register space handling functions @@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) { int cap_ptr = iommu->cap_ptr; u32 range, misc; + int i, j; pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, &iommu->cap); @@ -632,6 +666,30 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) iommu->last_device = calc_devid(MMIO_GET_BUS(range), MMIO_GET_LD(range)); iommu->evt_msi_num = MMIO_MSI_NUM(misc); + + if (!is_rd890_iommu(iommu->dev)) + return; + + /* + * Some rd890 systems may not be fully reconfigured by the BIOS, so + * it's necessary for us to store this information so it can be + * reprogrammed on resume + */ + + pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4, + &iommu->stored_addr_lo); + pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8, + &iommu->stored_addr_hi); + + /* Low bit locks writes to configuration space */ + iommu->stored_addr_lo &= ~1; + + for (i = 0; i < 6; i++) + for (j = 0; j < 0x12; j++) + iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j); + + for (i = 0; i < 0x83; i++) + iommu->stored_l2[i] = iommu_read_l2(iommu, i); } /* @@ -649,29 +707,9 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, struct ivhd_entry *e; /* - * First set the recommended feature enable bits from ACPI - * into the IOMMU control registers - */ - h->flags & IVHD_FLAG_HT_TUN_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) : - iommu_feature_disable(iommu, CONTROL_HT_TUN_EN); - - h->flags & IVHD_FLAG_PASSPW_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_PASSPW_EN) : - iommu_feature_disable(iommu, CONTROL_PASSPW_EN); - - h->flags & IVHD_FLAG_RESPASSPW_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) : - iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN); - - h->flags & IVHD_FLAG_ISOC_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_ISOC_EN) : - iommu_feature_disable(iommu, CONTROL_ISOC_EN); - - /* - * make IOMMU memory accesses cache coherent + * First save the recommended feature enable bits from ACPI */ - iommu_feature_enable(iommu, CONTROL_COHERENT_EN); + iommu->acpi_flags = h->flags; /* * Done. Now parse the device entries @@ -1116,6 +1154,79 @@ static void init_device_table(void) } } +static void iommu_init_flags(struct amd_iommu *iommu) +{ + iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) : + iommu_feature_disable(iommu, CONTROL_HT_TUN_EN); + + iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_PASSPW_EN) : + iommu_feature_disable(iommu, CONTROL_PASSPW_EN); + + iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) : + iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN); + + iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_ISOC_EN) : + iommu_feature_disable(iommu, CONTROL_ISOC_EN); + + /* + * make IOMMU memory accesses cache coherent + */ + iommu_feature_enable(iommu, CONTROL_COHERENT_EN); +} + +static void iommu_apply_resume_quirks(struct amd_iommu *iommu) +{ + int i, j; + u32 ioc_feature_control; + struct pci_dev *pdev = NULL; + + /* RD890 BIOSes may not have completely reconfigured the iommu */ + if (!is_rd890_iommu(iommu->dev)) + return; + + /* + * First, we need to ensure that the iommu is enabled. This is + * controlled by a register in the northbridge + */ + pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0)); + + if (!pdev) + return; + + /* Select Northbridge indirect register 0x75 and enable writing */ + pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7)); + pci_read_config_dword(pdev, 0x64, &ioc_feature_control); + + /* Enable the iommu */ + if (!(ioc_feature_control & 0x1)) + pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1); + + pci_dev_put(pdev); + + /* Restore the iommu BAR */ + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, + iommu->stored_addr_lo); + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8, + iommu->stored_addr_hi); + + /* Restore the l1 indirect regs for each of the 6 l1s */ + for (i = 0; i < 6; i++) + for (j = 0; j < 0x12; j++) + iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]); + + /* Restore the l2 indirect regs */ + for (i = 0; i < 0x83; i++) + iommu_write_l2(iommu, i, iommu->stored_l2[i]); + + /* Lock PCI setup registers */ + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, + iommu->stored_addr_lo | 1); +} + /* * This function finally enables all IOMMUs found in the system after * they have been initialized @@ -1126,6 +1237,7 @@ static void enable_iommus(void) for_each_iommu(iommu) { iommu_disable(iommu); + iommu_init_flags(iommu); iommu_set_device_table(iommu); iommu_enable_command_buffer(iommu); iommu_enable_event_buffer(iommu); @@ -1150,6 +1262,11 @@ static void disable_iommus(void) static int amd_iommu_resume(struct sys_device *dev) { + struct amd_iommu *iommu; + + for_each_iommu(iommu) + iommu_apply_resume_quirks(iommu); + /* re-load the hardware */ enable_iommus(); diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/amd_nb.c index 0f7bc20cfcd..8f6463d8ed0 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/amd_nb.c @@ -8,21 +8,19 @@ #include <linux/errno.h> #include <linux/module.h> #include <linux/spinlock.h> -#include <asm/k8.h> - -int num_k8_northbridges; -EXPORT_SYMBOL(num_k8_northbridges); +#include <asm/amd_nb.h> static u32 *flush_words; struct pci_device_id k8_nb_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, {} }; EXPORT_SYMBOL(k8_nb_ids); -struct pci_dev **k8_northbridges; +struct k8_northbridge_info k8_northbridges; EXPORT_SYMBOL(k8_northbridges); static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) @@ -40,36 +38,45 @@ int cache_k8_northbridges(void) int i; struct pci_dev *dev; - if (num_k8_northbridges) + if (k8_northbridges.num) return 0; dev = NULL; while ((dev = next_k8_northbridge(dev)) != NULL) - num_k8_northbridges++; + k8_northbridges.num++; + + /* some CPU families (e.g. family 0x11) do not support GART */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + boot_cpu_data.x86 == 0x15) + k8_northbridges.gart_supported = 1; - k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *), - GFP_KERNEL); - if (!k8_northbridges) + k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * + sizeof(void *), GFP_KERNEL); + if (!k8_northbridges.nb_misc) return -ENOMEM; - if (!num_k8_northbridges) { - k8_northbridges[0] = NULL; + if (!k8_northbridges.num) { + k8_northbridges.nb_misc[0] = NULL; return 0; } - flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL); - if (!flush_words) { - kfree(k8_northbridges); - return -ENOMEM; + if (k8_northbridges.gart_supported) { + flush_words = kmalloc(k8_northbridges.num * sizeof(u32), + GFP_KERNEL); + if (!flush_words) { + kfree(k8_northbridges.nb_misc); + return -ENOMEM; + } } dev = NULL; i = 0; while ((dev = next_k8_northbridge(dev)) != NULL) { - k8_northbridges[i] = dev; - pci_read_config_dword(dev, 0x9c, &flush_words[i++]); + k8_northbridges.nb_misc[i] = dev; + if (k8_northbridges.gart_supported) + pci_read_config_dword(dev, 0x9c, &flush_words[i++]); } - k8_northbridges[i] = NULL; + k8_northbridges.nb_misc[i] = NULL; return 0; } EXPORT_SYMBOL_GPL(cache_k8_northbridges); @@ -93,22 +100,25 @@ void k8_flush_garts(void) unsigned long flags; static DEFINE_SPINLOCK(gart_lock); + if (!k8_northbridges.gart_supported) + return; + /* Avoid races between AGP and IOMMU. In theory it's not needed but I'm not sure if the hardware won't lose flush requests when another is pending. This whole thing is so expensive anyways that it doesn't matter to serialize more. -AK */ spin_lock_irqsave(&gart_lock, flags); flushed = 0; - for (i = 0; i < num_k8_northbridges; i++) { - pci_write_config_dword(k8_northbridges[i], 0x9c, + for (i = 0; i < k8_northbridges.num; i++) { + pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, flush_words[i]|1); flushed++; } - for (i = 0; i < num_k8_northbridges; i++) { + for (i = 0; i < k8_northbridges.num; i++) { u32 w; /* Make sure the hardware actually executed the flush*/ for (;;) { - pci_read_config_dword(k8_northbridges[i], + pci_read_config_dword(k8_northbridges.nb_misc[i], 0x9c, &w); if (!(w & 1)) break; diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 8dd77800ff5..6fe2b5cb4f3 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -343,7 +343,7 @@ void apbt_setup_secondary_clock(void) /* Don't register boot CPU clockevent */ cpu = smp_processor_id(); - if (cpu == boot_cpu_id) + if (!cpu) return; /* * We need to calculate the scaled math multiplication factor for @@ -398,7 +398,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n, } break; default: - pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); + pr_debug("APBT notified %lu, no action\n", action); } return NOTIFY_OK; } @@ -552,7 +552,7 @@ bad_count: pr_debug("APB CS going back %lx:%lx:%lx ", t2, last_read, t2 - last_read); bad_count_x3: - pr_debug(KERN_INFO "tripple check enforced\n"); + pr_debug("triple check enforced\n"); t0 = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE); udelay(1); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index a2e0caf26e1..377f5db3b8b 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -27,7 +27,7 @@ #include <asm/gart.h> #include <asm/pci-direct.h> #include <asm/dma.h> -#include <asm/k8.h> +#include <asm/amd_nb.h> #include <asm/x86_init.h> int gart_iommu_aperture; @@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); - aper_enabled = ctl & AMD64_GARTEN; + aper_enabled = ctl & GARTEN; aper_order = (ctl >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; @@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); - ctl &= ~AMD64_GARTEN; + ctl &= ~GARTEN; write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); } } @@ -505,8 +505,13 @@ out: /* Fix up the north bridges */ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { - int bus; - int dev_base, dev_limit; + int bus, dev_base, dev_limit; + + /* + * Don't enable translation yet but enable GART IO and CPU + * accesses and set DISTLBWALKPRB since GART table memory is UC. + */ + u32 ctl = DISTLBWALKPRB | aper_order << 1; bus = bus_dev_ranges[i].bus; dev_base = bus_dev_ranges[i].dev_base; @@ -515,10 +520,7 @@ out: if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) continue; - /* Don't enable translation yet. That is done later. - Assume this BIOS didn't initialise the GART so - just overwrite all previous bits */ - write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1); + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); } } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f1efebaf551..f1e78940d90 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -162,7 +162,7 @@ int __init arch_early_irq_init(void) cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); - node= cpu_to_node(boot_cpu_id); + node = cpu_to_node(0); for (i = 0; i < count; i++) { desc = irq_to_desc(i); @@ -306,14 +306,19 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc, old_cfg = old_desc->chip_data; - memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); + cfg->vector = old_cfg->vector; + cfg->move_in_progress = old_cfg->move_in_progress; + cpumask_copy(cfg->domain, old_cfg->domain); + cpumask_copy(cfg->old_domain, old_cfg->old_domain); init_copy_irq_2_pin(old_cfg, cfg, node); } -static void free_irq_cfg(struct irq_cfg *old_cfg) +static void free_irq_cfg(struct irq_cfg *cfg) { - kfree(old_cfg); + free_cpumask_var(cfg->domain); + free_cpumask_var(cfg->old_domain); + kfree(cfg); } void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) @@ -1483,7 +1488,7 @@ static void __init setup_IO_APIC_irqs(void) int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1548,7 +1553,7 @@ static void __init setup_IO_APIC_irqs(void) void setup_IO_APIC_irq_extra(u32 gsi) { int apic_id = 0, pin, idx, irq; - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); struct irq_desc *desc; struct irq_cfg *cfg; @@ -2927,7 +2932,7 @@ static inline void __init check_timer(void) { struct irq_desc *desc = irq_to_desc(0); struct irq_cfg *cfg = desc->chip_data; - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; int no_pin1 = 0; @@ -3281,7 +3286,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) int create_irq(void) { - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); unsigned int irq_want; int irq; @@ -3903,7 +3908,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, if (dev) node = dev_to_node(dev); else - node = cpu_to_node(boot_cpu_id); + node = cpu_to_node(0); desc = irq_to_desc_alloc_node(irq, node); if (!desc) { diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 7b598b84c90..f744f54cb24 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -698,9 +698,11 @@ void __init uv_system_init(void) for (j = 0; j < 64; j++) { if (!test_bit(j, &present)) continue; - uv_blade_info[blade].pnode = (i * 64 + j); + pnode = (i * 64 + j); + uv_blade_info[blade].pnode = pnode; uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; + max_pnode = max(pnode, max_pnode); blade++; } } @@ -738,7 +740,6 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; - max_pnode = max(pnode, max_pnode); } /* Add blade/pnode info for nodes without cpus */ @@ -750,7 +751,6 @@ void __init uv_system_init(void) pnode = (paddr >> m_val) & pnode_mask; blade = boot_pnode_to_blade(pnode); uv_node_to_blade[nid] = blade; - max_pnode = max(pnode, max_pnode); } map_gru_high(max_pnode); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index ba5f62f45f0..9e093f8fe78 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -148,7 +148,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ - if (c->cpu_index == boot_cpu_id) + if (!c->cpu_index) return; /* @@ -253,37 +253,51 @@ static int __cpuinit nearby_node(int apicid) #endif /* - * Fixup core topology information for AMD multi-node processors. - * Assumption: Number of cores in each internal node is the same. + * Fixup core topology information for + * (1) AMD multi-node processors + * Assumption: Number of cores in each internal node is the same. + * (2) AMD processors supporting compute units */ #ifdef CONFIG_X86_HT -static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) +static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) { - unsigned long long value; - u32 nodes, cores_per_node; + u32 nodes; + u8 node_id; int cpu = smp_processor_id(); - if (!cpu_has(c, X86_FEATURE_NODEID_MSR)) - return; + /* get information required for multi-node processors */ + if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + u32 eax, ebx, ecx, edx; - /* fixup topology information only once for a core */ - if (cpu_has(c, X86_FEATURE_AMD_DCM)) - return; + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + nodes = ((ecx >> 8) & 7) + 1; + node_id = ecx & 7; - rdmsrl(MSR_FAM10H_NODE_ID, value); + /* get compute unit information */ + smp_num_siblings = ((ebx >> 8) & 3) + 1; + c->compute_unit_id = ebx & 0xff; + } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { + u64 value; - nodes = ((value >> 3) & 7) + 1; - if (nodes == 1) + rdmsrl(MSR_FAM10H_NODE_ID, value); + nodes = ((value >> 3) & 7) + 1; + node_id = value & 7; + } else return; - set_cpu_cap(c, X86_FEATURE_AMD_DCM); - cores_per_node = c->x86_max_cores / nodes; + /* fixup multi-node processor information */ + if (nodes > 1) { + u32 cores_per_node; + + set_cpu_cap(c, X86_FEATURE_AMD_DCM); + cores_per_node = c->x86_max_cores / nodes; - /* store NodeID, use llc_shared_map to store sibling info */ - per_cpu(cpu_llc_id, cpu) = value & 7; + /* store NodeID, use llc_shared_map to store sibling info */ + per_cpu(cpu_llc_id, cpu) = node_id; - /* fixup core id to be in range from 0 to (cores_per_node - 1) */ - c->cpu_core_id = c->cpu_core_id % cores_per_node; + /* core id to be in range from 0 to (cores_per_node - 1) */ + c->cpu_core_id = c->cpu_core_id % cores_per_node; + } } #endif @@ -304,9 +318,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) c->phys_proc_id = c->initial_apicid >> bits; /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; - /* fixup topology information on multi-node processors */ - if ((c->x86 == 0x10) && (c->x86_model == 9)) - amd_fixup_dcm(c); + amd_get_topology(c); #endif } @@ -412,6 +424,23 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_EXTD_APICID); } #endif + + /* We need to do the following only once */ + if (c != &boot_cpu_data) + return; + + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { + + if (c->x86 > 0x10 || + (c->x86 == 0x10 && c->x86_model >= 0x2)) { + u64 val; + + rdmsrl(MSR_K7_HWCR, val); + if (!(val & BIT(24))) + printk(KERN_WARNING FW_BUG "TSC doesn't count " + "with P0 frequency!\n"); + } + } } static void __cpuinit init_amd(struct cpuinfo_x86 *c) @@ -523,7 +552,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) #endif if (c->extended_cpuid_level >= 0x80000006) { - if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) + if (cpuid_edx(0x80000006) & 0xf000) num_cache_leaves = 4; else num_cache_leaves = 3; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 490dac63c2d..4b68bda3093 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -545,7 +545,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c) } } -static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) +void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) { u32 tfms, xlvl; u32 ebx; @@ -665,7 +665,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) this_cpu->c_early_init(c); #ifdef CONFIG_SMP - c->cpu_index = boot_cpu_id; + c->cpu_index = 0; #endif filter_cpuid_features(c, false); } @@ -704,16 +704,21 @@ void __init early_cpu_init(void) } /* - * The NOPL instruction is supposed to exist on all CPUs with - * family >= 6; unfortunately, that's not true in practice because - * of early VIA chips and (more importantly) broken virtualizers that - * are not easy to detect. In the latter case it doesn't even *fail* - * reliably, so probing for it doesn't even work. Disable it completely + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). */ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_32 clear_cpu_cap(c, X86_FEATURE_NOPL); +#else + set_cpu_cap(c, X86_FEATURE_NOPL); +#endif } static void __cpuinit generic_identify(struct cpuinfo_x86 *c) @@ -1264,13 +1269,6 @@ void __cpuinit cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - /* - * Force FPU initialization: - */ - current_thread_info()->status = 0; - clear_used_math(); - mxcsr_feature_mask_init(); - fpu_init(); xsave_init(); } diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 3624e8a0f71..e765633f210 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -32,6 +32,8 @@ struct cpu_dev { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; +extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); +extern void get_cpu_cap(struct cpuinfo_x86 *c); #endif diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 994230d4dc4..4f6f679f279 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -368,16 +368,22 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle) return -ENODEV; out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) - return -ENODEV; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) - return -ENODEV; + if (errors) { + ret = -ENODEV; + goto out_free; + } supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) - return -ENODEV; + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } out_free: kfree(output.pointer); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 85f69cdeae1..695f17731e2 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -39,6 +39,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID; wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); c->cpuid_level = cpuid_eax(0); + get_cpu_cap(c); } } @@ -169,7 +170,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ - if (c->cpu_index == boot_cpu_id) + if (!c->cpu_index) return; /* diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 898c2f4eab8..12cd823c8d0 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -17,7 +17,7 @@ #include <asm/processor.h> #include <linux/smp.h> -#include <asm/k8.h> +#include <asm/amd_nb.h> #include <asm/smp.h> #define LVL_1_INST 1 @@ -306,7 +306,7 @@ struct _cache_attr { ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); }; -#ifdef CONFIG_CPU_SUP_AMD +#ifdef CONFIG_AMD_NB /* * L3 cache descriptors @@ -369,7 +369,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, return; /* not in virtualized environments */ - if (num_k8_northbridges == 0) + if (k8_northbridges.num == 0) return; /* @@ -377,7 +377,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, * never freed but this is done only on shutdown so it doesn't matter. */ if (!l3_caches) { - int size = num_k8_northbridges * sizeof(struct amd_l3_cache *); + int size = k8_northbridges.num * sizeof(struct amd_l3_cache *); l3_caches = kzalloc(size, GFP_ATOMIC); if (!l3_caches) @@ -556,12 +556,12 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); -#else /* CONFIG_CPU_SUP_AMD */ +#else /* CONFIG_AMD_NB */ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) { }; -#endif /* CONFIG_CPU_SUP_AMD */ +#endif /* CONFIG_AMD_NB */ static int __cpuinit cpuid4_cache_lookup_regs(int index, @@ -1000,7 +1000,7 @@ static struct attribute *default_attrs[] = { static struct attribute *default_l3_attrs[] = { DEFAULT_SYSFS_CACHE_ATTRS, -#ifdef CONFIG_CPU_SUP_AMD +#ifdef CONFIG_AMD_NB &cache_disable_0.attr, &cache_disable_1.attr, #endif diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 224392d8fe8..39aaee5c1ab 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -141,6 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) address = (low & MASK_BLKPTR_LO) >> 21; if (!address) break; + address += MCG_XBLK_ADDR; } else ++address; @@ -148,12 +149,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (rdmsr_safe(address, &low, &high)) break; - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } + if (!(high & MASK_VALID_HI)) + continue; if (!(high & MASK_CNTP_HI) || (high & MASK_LOCKED_HI)) @@ -530,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) err = -ENOMEM; goto out; } - if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) { kfree(b); err = -ENOMEM; goto out; @@ -543,7 +540,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) #ifndef CONFIG_SMP cpumask_setall(b->cpus); #else - cpumask_copy(b->cpus, c->llc_shared_map); + cpumask_set_cpu(cpu, b->cpus); #endif per_cpu(threshold_banks, cpu)[bank] = b; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index c2a8b26d4fe..4b683267eca 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -202,10 +202,11 @@ static int therm_throt_process(bool new_event, int event, int level) #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ -static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) +static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, + unsigned int cpu) { int err; - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + struct cpuinfo_x86 *c = &cpu_data(cpu); err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); if (err) @@ -215,7 +216,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_core_power_limit_count.attr, thermal_attr_group.name); - if (cpu_has(c, X86_FEATURE_PTS)) + if (cpu_has(c, X86_FEATURE_PTS)) { err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_package_throttle_count.attr, thermal_attr_group.name); @@ -223,6 +224,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_package_power_limit_count.attr, thermal_attr_group.name); + } return err; } @@ -251,7 +253,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: mutex_lock(&therm_cpu_lock); - err = thermal_throttle_add_dev(sys_dev); + err = thermal_throttle_add_dev(sys_dev, cpu); mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; @@ -287,7 +289,7 @@ static __init int thermal_throttle_init_device(void) #endif /* connect live CPUs to sysfs */ for_each_online_cpu(cpu) { - err = thermal_throttle_add_dev(get_cpu_sysdev(cpu)); + err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); WARN_ON(err); } #ifdef CONFIG_HOTPLUG_CPU @@ -348,7 +350,7 @@ static void intel_thermal_interrupt(void) static void unexpected_thermal_interrupt(void) { - printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", + printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", smp_processor_id()); add_taint(TAINT_MACHINE_CHECK); } diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index c5f59d07142..ac140c7be39 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return 0; - if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + if (boot_cpu_data.x86 < 0xf) return 0; /* In case some hypervisor doesn't pass SYSCFG through: */ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7d28d7d0388..9f27228ceff 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -64,18 +64,59 @@ static inline void k8_check_syscfg_dram_mod_en(void) } } +/* Get the size of contiguous MTRR range */ +static u64 get_mtrr_size(u64 mask) +{ + u64 size; + + mask >>= PAGE_SHIFT; + mask |= size_or_mask; + size = -mask; + size <<= PAGE_SHIFT; + return size; +} + /* - * Returns the effective MTRR type for the region - * Error returns: - * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR - * - 0xFF - when MTRR is not enabled + * Check and return the effective type for MTRR-MTRR type overlap. + * Returns 1 if the effective type is UNCACHEABLE, else returns 0 */ -u8 mtrr_type_lookup(u64 start, u64 end) +static int check_type_overlap(u8 *prev, u8 *curr) +{ + if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) { + *prev = MTRR_TYPE_UNCACHABLE; + *curr = MTRR_TYPE_UNCACHABLE; + return 1; + } + + if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) || + (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) { + *prev = MTRR_TYPE_WRTHROUGH; + *curr = MTRR_TYPE_WRTHROUGH; + } + + if (*prev != *curr) { + *prev = MTRR_TYPE_UNCACHABLE; + *curr = MTRR_TYPE_UNCACHABLE; + return 1; + } + + return 0; +} + +/* + * Error/Semi-error returns: + * 0xFF - when MTRR is not enabled + * *repeat == 1 implies [start:end] spanned across MTRR range and type returned + * corresponds only to [start:*partial_end]. + * Caller has to lookup again for [*partial_end:end]. + */ +static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) { int i; u64 base, mask; u8 prev_match, curr_match; + *repeat = 0; if (!mtrr_state_set) return 0xFF; @@ -126,8 +167,34 @@ u8 mtrr_type_lookup(u64 start, u64 end) start_state = ((start & mask) == (base & mask)); end_state = ((end & mask) == (base & mask)); - if (start_state != end_state) - return 0xFE; + + if (start_state != end_state) { + /* + * We have start:end spanning across an MTRR. + * We split the region into + * either + * (start:mtrr_end) (mtrr_end:end) + * or + * (start:mtrr_start) (mtrr_start:end) + * depending on kind of overlap. + * Return the type for first region and a pointer to + * the start of second region so that caller will + * lookup again on the second region. + * Note: This way we handle multiple overlaps as well. + */ + if (start_state) + *partial_end = base + get_mtrr_size(mask); + else + *partial_end = base; + + if (unlikely(*partial_end <= start)) { + WARN_ON(1); + *partial_end = start + PAGE_SIZE; + } + + end = *partial_end - 1; /* end is inclusive */ + *repeat = 1; + } if ((start & mask) != (base & mask)) continue; @@ -138,21 +205,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) continue; } - if (prev_match == MTRR_TYPE_UNCACHABLE || - curr_match == MTRR_TYPE_UNCACHABLE) { - return MTRR_TYPE_UNCACHABLE; - } - - if ((prev_match == MTRR_TYPE_WRBACK && - curr_match == MTRR_TYPE_WRTHROUGH) || - (prev_match == MTRR_TYPE_WRTHROUGH && - curr_match == MTRR_TYPE_WRBACK)) { - prev_match = MTRR_TYPE_WRTHROUGH; - curr_match = MTRR_TYPE_WRTHROUGH; - } - - if (prev_match != curr_match) - return MTRR_TYPE_UNCACHABLE; + if (check_type_overlap(&prev_match, &curr_match)) + return curr_match; } if (mtrr_tom2) { @@ -166,6 +220,36 @@ u8 mtrr_type_lookup(u64 start, u64 end) return mtrr_state.def_type; } +/* + * Returns the effective MTRR type for the region + * Error return: + * 0xFF - when MTRR is not enabled + */ +u8 mtrr_type_lookup(u64 start, u64 end) +{ + u8 type, prev_type; + int repeat; + u64 partial_end; + + type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + + /* + * Common path is with repeat = 0. + * However, we can have cases where [start:end] spans across some + * MTRR range. Do repeated lookups for that case here. + */ + while (repeat) { + prev_type = type; + start = partial_end; + type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + + if (check_type_overlap(&prev_type, &type)) + return type; + } + + return type; +} + /* Get the MSR pair relating to a var range */ static void get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index f2da20fda02..fe73c1844a9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -102,6 +102,7 @@ struct cpu_hw_events { */ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; int n_events; @@ -530,7 +531,7 @@ static int x86_pmu_hw_config(struct perf_event *event) /* * Setup the hardware configuration for a given attr_type */ -static int __hw_perf_event_init(struct perf_event *event) +static int __x86_pmu_event_init(struct perf_event *event) { int err; @@ -583,7 +584,7 @@ static void x86_pmu_disable_all(void) } } -void hw_perf_disable(void) +static void x86_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -618,7 +619,7 @@ static void x86_pmu_enable_all(int added) } } -static const struct pmu pmu; +static struct pmu pmu; static inline int is_x86_event(struct perf_event *event) { @@ -800,10 +801,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc, hwc->last_tag == cpuc->tags[i]; } -static int x86_pmu_start(struct perf_event *event); -static void x86_pmu_stop(struct perf_event *event); +static void x86_pmu_start(struct perf_event *event, int flags); +static void x86_pmu_stop(struct perf_event *event, int flags); -void hw_perf_enable(void) +static void x86_pmu_enable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct perf_event *event; @@ -839,7 +840,14 @@ void hw_perf_enable(void) match_prev_assignment(hwc, cpuc, i)) continue; - x86_pmu_stop(event); + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. + */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + x86_pmu_stop(event, PERF_EF_UPDATE); } for (i = 0; i < cpuc->n_events; i++) { @@ -851,7 +859,10 @@ void hw_perf_enable(void) else if (i < n_running) continue; - x86_pmu_start(event); + if (hwc->state & PERF_HES_ARCH) + continue; + + x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; perf_events_lapic_init(); @@ -952,15 +963,12 @@ static void x86_pmu_enable_event(struct perf_event *event) } /* - * activate a single event + * Add a single event to the PMU. * * The event is added to the group of enabled events * but only if it can be scehduled with existing events. - * - * Called with PMU disabled. If successful and return value 1, - * then guaranteed to call perf_enable() and hw_perf_enable() */ -static int x86_pmu_enable(struct perf_event *event) +static int x86_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc; @@ -969,57 +977,67 @@ static int x86_pmu_enable(struct perf_event *event) hwc = &event->hw; + perf_pmu_disable(event->pmu); n0 = cpuc->n_events; - n = collect_events(cpuc, event, false); - if (n < 0) - return n; + ret = n = collect_events(cpuc, event, false); + if (ret < 0) + goto out; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; /* * If group events scheduling transaction was started, * skip the schedulability test here, it will be peformed - * at commit time(->commit_txn) as a whole + * at commit time (->commit_txn) as a whole */ if (cpuc->group_flag & PERF_EVENT_TXN) - goto out; + goto done_collect; ret = x86_pmu.schedule_events(cpuc, n, assign); if (ret) - return ret; + goto out; /* * copy new assignment, now we know it is possible * will be used by hw_perf_enable() */ memcpy(cpuc->assign, assign, n*sizeof(int)); -out: +done_collect: cpuc->n_events = n; cpuc->n_added += n - n0; cpuc->n_txn += n - n0; - return 0; + ret = 0; +out: + perf_pmu_enable(event->pmu); + return ret; } -static int x86_pmu_start(struct perf_event *event) +static void x86_pmu_start(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx = event->hw.idx; - if (idx == -1) - return -EAGAIN; + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1)) + return; + + if (flags & PERF_EF_RELOAD) { + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + x86_perf_event_set_period(event); + } + + event->hw.state = 0; - x86_perf_event_set_period(event); cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); + __set_bit(idx, cpuc->running); x86_pmu.enable(event); perf_event_update_userpage(event); - - return 0; -} - -static void x86_pmu_unthrottle(struct perf_event *event) -{ - int ret = x86_pmu_start(event); - WARN_ON_ONCE(ret); } void perf_event_print_debug(void) @@ -1076,27 +1094,29 @@ void perf_event_print_debug(void) local_irq_restore(flags); } -static void x86_pmu_stop(struct perf_event *event) +static void x86_pmu_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - - if (!__test_and_clear_bit(idx, cpuc->active_mask)) - return; - - x86_pmu.disable(event); - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - x86_perf_event_update(event); + if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { + x86_pmu.disable(event); + cpuc->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } - cpuc->events[idx] = NULL; + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + x86_perf_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } } -static void x86_pmu_disable(struct perf_event *event) +static void x86_pmu_del(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int i; @@ -1109,7 +1129,7 @@ static void x86_pmu_disable(struct perf_event *event) if (cpuc->group_flag & PERF_EVENT_TXN) return; - x86_pmu_stop(event); + x86_pmu_stop(event, PERF_EF_UPDATE); for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) { @@ -1132,7 +1152,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) struct perf_sample_data data; struct cpu_hw_events *cpuc; struct perf_event *event; - struct hw_perf_event *hwc; int idx, handled = 0; u64 val; @@ -1141,11 +1160,18 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active_mask)) + if (!test_bit(idx, cpuc->active_mask)) { + /* + * Though we deactivated the counter some cpus + * might still deliver spurious interrupts still + * in flight. Catch them: + */ + if (__test_and_clear_bit(idx, cpuc->running)) + handled++; continue; + } event = cpuc->events[idx]; - hwc = &event->hw; val = x86_perf_event_update(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) @@ -1154,14 +1180,14 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) /* * event overflow */ - handled = 1; + handled++; data.period = event->hw.last_period; if (!x86_perf_event_set_period(event)) continue; if (perf_event_overflow(event, 1, &data, regs)) - x86_pmu_stop(event); + x86_pmu_stop(event, 0); } if (handled) @@ -1170,25 +1196,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) return handled; } -void smp_perf_pending_interrupt(struct pt_regs *regs) -{ - irq_enter(); - ack_APIC_irq(); - inc_irq_stat(apic_pending_irqs); - perf_event_do_pending(); - irq_exit(); -} - -void set_perf_event_pending(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - if (!x86_pmu.apic || !x86_pmu_initialized()) - return; - - apic->send_IPI_self(LOCAL_PENDING_VECTOR); -#endif -} - void perf_events_lapic_init(void) { if (!x86_pmu.apic || !x86_pmu_initialized()) @@ -1200,12 +1207,20 @@ void perf_events_lapic_init(void) apic_write(APIC_LVTPC, APIC_DM_NMI); } +struct pmu_nmi_state { + unsigned int marked; + int handled; +}; + +static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi); + static int __kprobes perf_event_nmi_handler(struct notifier_block *self, unsigned long cmd, void *__args) { struct die_args *args = __args; - struct pt_regs *regs; + unsigned int this_nmi; + int handled; if (!atomic_read(&active_events)) return NOTIFY_DONE; @@ -1214,22 +1229,47 @@ perf_event_nmi_handler(struct notifier_block *self, case DIE_NMI: case DIE_NMI_IPI: break; - + case DIE_NMIUNKNOWN: + this_nmi = percpu_read(irq_stat.__nmi_count); + if (this_nmi != __get_cpu_var(pmu_nmi).marked) + /* let the kernel handle the unknown nmi */ + return NOTIFY_DONE; + /* + * This one is a PMU back-to-back nmi. Two events + * trigger 'simultaneously' raising two back-to-back + * NMIs. If the first NMI handles both, the latter + * will be empty and daze the CPU. So, we drop it to + * avoid false-positive 'unknown nmi' messages. + */ + return NOTIFY_STOP; default: return NOTIFY_DONE; } - regs = args->regs; - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* - * Can't rely on the handled return value to say it was our NMI, two - * events could trigger 'simultaneously' raising two back-to-back NMIs. - * - * If the first NMI handles both, the latter will be empty and daze - * the CPU. - */ - x86_pmu.handle_irq(regs); + + handled = x86_pmu.handle_irq(args->regs); + if (!handled) + return NOTIFY_DONE; + + this_nmi = percpu_read(irq_stat.__nmi_count); + if ((handled > 1) || + /* the next nmi could be a back-to-back nmi */ + ((__get_cpu_var(pmu_nmi).marked == this_nmi) && + (__get_cpu_var(pmu_nmi).handled > 1))) { + /* + * We could have two subsequent back-to-back nmis: The + * first handles more than one counter, the 2nd + * handles only one counter and the 3rd handles no + * counter. + * + * This is the 2nd nmi because the previous was + * handling more than one counter. We will mark the + * next (3rd) and then drop it if unhandled. + */ + __get_cpu_var(pmu_nmi).marked = this_nmi + 1; + __get_cpu_var(pmu_nmi).handled = handled; + } return NOTIFY_STOP; } @@ -1345,7 +1385,6 @@ void __init init_hw_perf_events(void) x86_pmu.num_counters = X86_PMC_MAX_GENERIC; } x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - perf_max_events = x86_pmu.num_counters; if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", @@ -1381,6 +1420,7 @@ void __init init_hw_perf_events(void) pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); + perf_pmu_register(&pmu); perf_cpu_notifier(x86_pmu_notifier); } @@ -1394,10 +1434,11 @@ static inline void x86_pmu_read(struct perf_event *event) * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time */ -static void x86_pmu_start_txn(const struct pmu *pmu) +static void x86_pmu_start_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + perf_pmu_disable(pmu); cpuc->group_flag |= PERF_EVENT_TXN; cpuc->n_txn = 0; } @@ -1407,7 +1448,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu) * Clear the flag and pmu::enable() will perform the * schedulability test. */ -static void x86_pmu_cancel_txn(const struct pmu *pmu) +static void x86_pmu_cancel_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1417,6 +1458,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) */ cpuc->n_added -= cpuc->n_txn; cpuc->n_events -= cpuc->n_txn; + perf_pmu_enable(pmu); } /* @@ -1424,7 +1466,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) * Perform the group schedulability test as a whole * Return 0 if success */ -static int x86_pmu_commit_txn(const struct pmu *pmu) +static int x86_pmu_commit_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int assign[X86_PMC_IDX_MAX]; @@ -1446,22 +1488,10 @@ static int x86_pmu_commit_txn(const struct pmu *pmu) memcpy(cpuc->assign, assign, n*sizeof(int)); cpuc->group_flag &= ~PERF_EVENT_TXN; - + perf_pmu_enable(pmu); return 0; } -static const struct pmu pmu = { - .enable = x86_pmu_enable, - .disable = x86_pmu_disable, - .start = x86_pmu_start, - .stop = x86_pmu_stop, - .read = x86_pmu_read, - .unthrottle = x86_pmu_unthrottle, - .start_txn = x86_pmu_start_txn, - .cancel_txn = x86_pmu_cancel_txn, - .commit_txn = x86_pmu_commit_txn, -}; - /* * validate that we can schedule this event */ @@ -1536,12 +1566,22 @@ out: return ret; } -const struct pmu *hw_perf_event_init(struct perf_event *event) +int x86_pmu_event_init(struct perf_event *event) { - const struct pmu *tmp; + struct pmu *tmp; int err; - err = __hw_perf_event_init(event); + switch (event->attr.type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + break; + + default: + return -ENOENT; + } + + err = __x86_pmu_event_init(event); if (!err) { /* * we temporarily connect event to its pmu @@ -1561,26 +1601,31 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) if (err) { if (event->destroy) event->destroy(event); - return ERR_PTR(err); } - return &pmu; + return err; } -/* - * callchain support - */ +static struct pmu pmu = { + .pmu_enable = x86_pmu_enable, + .pmu_disable = x86_pmu_disable, -static inline -void callchain_store(struct perf_callchain_entry *entry, u64 ip) -{ - if (entry->nr < PERF_MAX_STACK_DEPTH) - entry->ip[entry->nr++] = ip; -} + .event_init = x86_pmu_event_init, -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); + .add = x86_pmu_add, + .del = x86_pmu_del, + .start = x86_pmu_start, + .stop = x86_pmu_stop, + .read = x86_pmu_read, + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, +}; + +/* + * callchain support + */ static void backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) @@ -1602,7 +1647,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - callchain_store(entry, addr); + perf_callchain_store(entry, addr); } static const struct stacktrace_ops backtrace_ops = { @@ -1613,11 +1658,15 @@ static const struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack_bp, }; -static void -perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { - callchain_store(entry, PERF_CONTEXT_KERNEL); - callchain_store(entry, regs->ip); + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return; + } + + perf_callchain_store(entry, regs->ip); dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); } @@ -1646,7 +1695,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) if (fp < compat_ptr(regs->sp)) break; - callchain_store(entry, frame.return_address); + perf_callchain_store(entry, frame.return_address); fp = compat_ptr(frame.next_frame); } return 1; @@ -1659,19 +1708,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) } #endif -static void -perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { struct stack_frame frame; const void __user *fp; - if (!user_mode(regs)) - regs = task_pt_regs(current); + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return; + } fp = (void __user *)regs->bp; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, regs->ip); + perf_callchain_store(entry, regs->ip); if (perf_callchain_user32(regs, entry)) return; @@ -1688,52 +1738,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) if ((unsigned long)fp < regs->sp) break; - callchain_store(entry, frame.return_address); + perf_callchain_store(entry, frame.return_address); fp = frame.next_frame; } } -static void -perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - int is_user; - - if (!regs) - return; - - is_user = user_mode(regs); - - if (is_user && current->state != TASK_RUNNING) - return; - - if (!is_user) - perf_callchain_kernel(regs, entry); - - if (current->mm) - perf_callchain_user(regs, entry); -} - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - struct perf_callchain_entry *entry; - - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - /* TODO: We don't support guest os callchain now */ - return NULL; - } - - if (in_nmi()) - entry = &__get_cpu_var(pmc_nmi_entry); - else - entry = &__get_cpu_var(pmc_irq_entry); - - entry->nr = 0; - - perf_do_callchain(regs, entry); - - return entry; -} - unsigned long perf_instruction_pointer(struct pt_regs *regs) { unsigned long ip; diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index c2897b7b4a3..46d58448c3a 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -52,7 +52,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(DTLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ + [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0, @@ -66,7 +66,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(ITLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index d8d86d01400..c8f5c088cad 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -712,22 +712,24 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) struct perf_sample_data data; struct cpu_hw_events *cpuc; int bit, loops; - u64 ack, status; + u64 status; + int handled; perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); intel_pmu_disable_all(); - intel_pmu_drain_bts_buffer(); + handled = intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); if (!status) { intel_pmu_enable_all(0); - return 0; + return handled; } loops = 0; again: + intel_pmu_ack_status(status); if (++loops > 100) { WARN_ONCE(1, "perfevents: irq loop stuck!\n"); perf_event_print_debug(); @@ -736,19 +738,22 @@ again: } inc_irq_stat(apic_perf_irqs); - ack = status; intel_pmu_lbr_read(); /* * PEBS overflow sets bit 62 in the global status register */ - if (__test_and_clear_bit(62, (unsigned long *)&status)) + if (__test_and_clear_bit(62, (unsigned long *)&status)) { + handled++; x86_pmu.drain_pebs(regs); + } for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; + handled++; + if (!test_bit(bit, cpuc->active_mask)) continue; @@ -758,11 +763,9 @@ again: data.period = event->hw.last_period; if (perf_event_overflow(event, 1, &data, regs)) - x86_pmu_stop(event); + x86_pmu_stop(event, 0); } - intel_pmu_ack_status(ack); - /* * Repeat if there is more work to be done: */ @@ -772,7 +775,7 @@ again: done: intel_pmu_enable_all(0); - return 1; + return handled; } static struct event_constraint * diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 18018d1311c..4977f9c400e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -214,7 +214,7 @@ static void intel_pmu_disable_bts(void) update_debugctlmsr(debugctlmsr); } -static void intel_pmu_drain_bts_buffer(void) +static int intel_pmu_drain_bts_buffer(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct debug_store *ds = cpuc->ds; @@ -231,16 +231,16 @@ static void intel_pmu_drain_bts_buffer(void) struct pt_regs regs; if (!event) - return; + return 0; if (!ds) - return; + return 0; at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; top = (struct bts_record *)(unsigned long)ds->bts_index; if (top <= at) - return; + return 0; ds->bts_index = ds->bts_buffer_base; @@ -256,7 +256,7 @@ static void intel_pmu_drain_bts_buffer(void) perf_prepare_sample(&header, &data, event, ®s); if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) - return; + return 1; for (; at < top; at++) { data.ip = at->from; @@ -270,6 +270,7 @@ static void intel_pmu_drain_bts_buffer(void) /* There's new data available. */ event->hw.interrupts++; event->pending_kill = POLL_IN; + return 1; } /* @@ -491,7 +492,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, regs.flags &= ~PERF_EFLAGS_EXACT; if (perf_event_overflow(event, 1, &data, ®s)) - x86_pmu_stop(event); + x86_pmu_stop(event, 0); } static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index febb12cea79..81400b93e69 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -18,6 +18,8 @@ struct p4_event_bind { unsigned int opcode; /* Event code and ESCR selector */ unsigned int escr_msr[2]; /* ESCR MSR for this event */ + unsigned int escr_emask; /* valid ESCR EventMask bits */ + unsigned int shared; /* event is shared across threads */ char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ }; @@ -66,231 +68,435 @@ static struct p4_event_bind p4_event_bind_map[] = { [P4_EVENT_TC_DELIVER_MODE] = { .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID), + .shared = 1, .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_BPU_FETCH_REQUEST] = { .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_ITLB_REFERENCE] = { .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_MEMORY_CANCEL] = { .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_MEMORY_COMPLETE] = { .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_LOAD_PORT_REPLAY] = { .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_STORE_PORT_REPLAY] = { .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_MOB_LOAD_REPLAY] = { .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_PAGE_WALK_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS), + .shared = 1, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_BSQ_CACHE_REFERENCE] = { .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_IOQ_ALLOCATION] = { .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH), .cntr = { {2, -1, -1}, {3, -1, -1} }, }, [P4_EVENT_FSB_DATA_ACTIVITY] = { .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER), + .shared = 1, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2), .cntr = { {0, -1, -1}, {1, -1, -1} }, }, [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2), .cntr = { {2, -1, -1}, {3, -1, -1} }, }, [P4_EVENT_SSE_INPUT_ASSIST] = { .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_PACKED_SP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_PACKED_DP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_SCALAR_SP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_SCALAR_DP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_64BIT_MMX_UOP] = { .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_128BIT_MMX_UOP] = { .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_X87_FP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_TC_MISC] = { .opcode = P4_OPCODE(P4_EVENT_TC_MISC), .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_GLOBAL_POWER_EVENTS] = { .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_TC_MS_XFER] = { .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_UOP_QUEUE_WRITES] = { .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_RETIRED_BRANCH_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_RESOURCE_STALL] = { .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_WC_BUFFER] = { .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_B2B_CYCLES] = { .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_BNR] = { .opcode = P4_OPCODE(P4_EVENT_BNR), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_SNOOP] = { .opcode = P4_OPCODE(P4_EVENT_SNOOP), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_RESPONSE] = { .opcode = P4_OPCODE(P4_EVENT_RESPONSE), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_FRONT_END_EVENT] = { .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_EXECUTION_EVENT] = { .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_REPLAY_EVENT] = { .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_INSTR_RETIRED] = { .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_UOPS_RETIRED] = { .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_UOP_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_BRANCH_RETIRED] = { .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_MISPRED_BRANCH_RETIRED] = { .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_X87_ASSIST] = { .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_MACHINE_CLEAR] = { .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) | + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) | + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_INSTR_COMPLETED] = { .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, }; @@ -428,29 +634,73 @@ static u64 p4_pmu_event_map(int hw_event) return config; } +/* check cpu model specifics */ +static bool p4_event_match_cpu_model(unsigned int event_idx) +{ + /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */ + if (event_idx == P4_EVENT_INSTR_COMPLETED) { + if (boot_cpu_data.x86_model != 3 && + boot_cpu_data.x86_model != 4 && + boot_cpu_data.x86_model != 6) + return false; + } + + /* + * For info + * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2 + */ + + return true; +} + static int p4_validate_raw_event(struct perf_event *event) { - unsigned int v; + unsigned int v, emask; - /* user data may have out-of-bound event index */ + /* User data may have out-of-bound event index */ v = p4_config_unpack_event(event->attr.config); - if (v >= ARRAY_SIZE(p4_event_bind_map)) { - pr_warning("P4 PMU: Unknown event code: %d\n", v); + if (v >= ARRAY_SIZE(p4_event_bind_map)) + return -EINVAL; + + /* It may be unsupported: */ + if (!p4_event_match_cpu_model(v)) return -EINVAL; + + /* + * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as + * in Architectural Performance Monitoring, it means not + * on _which_ logical cpu to count but rather _when_, ie it + * depends on logical cpu state -- count event if one cpu active, + * none, both or any, so we just allow user to pass any value + * desired. + * + * In turn we always set Tx_OS/Tx_USR bits bound to logical + * cpu without their propagation to another cpu + */ + + /* + * if an event is shared accross the logical threads + * the user needs special permissions to be able to use it + */ + if (p4_event_bind_map[v].shared) { + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; } + /* ESCR EventMask bits may be invalid */ + emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK; + if (emask & ~p4_event_bind_map[v].escr_emask) + return -EINVAL; + /* - * it may have some screwed PEBS bits + * it may have some invalid PEBS bits */ - if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { - pr_warning("P4 PMU: PEBS are not supported yet\n"); + if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) return -EINVAL; - } + v = p4_config_unpack_metric(event->attr.config); - if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { - pr_warning("P4 PMU: Unknown metric code: %d\n", v); + if (v >= ARRAY_SIZE(p4_pebs_bind_map)) return -EINVAL; - } return 0; } @@ -478,25 +728,21 @@ static int p4_hw_config(struct perf_event *event) if (event->attr.type == PERF_TYPE_RAW) { + /* + * Clear bits we reserve to be managed by kernel itself + * and never allowed from a user space + */ + event->attr.config &= P4_CONFIG_MASK; + rc = p4_validate_raw_event(event); if (rc) goto out; /* - * We don't control raw events so it's up to the caller - * to pass sane values (and we don't count the thread number - * on HT machine but allow HT-compatible specifics to be - * passed on) - * * Note that for RAW events we allow user to use P4_CCCR_RESERVED * bits since we keep additional info here (for cache events and etc) - * - * XXX: HT wide things should check perf_paranoid_cpu() && - * CAP_SYS_ADMIN */ - event->hw.config |= event->attr.config & - (p4_config_pack_escr(P4_ESCR_MASK_HT) | - p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); + event->hw.config |= event->attr.config; } rc = x86_setup_perfctr(event); @@ -658,8 +904,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) for (idx = 0; idx < x86_pmu.num_counters; idx++) { int overflow; - if (!test_bit(idx, cpuc->active_mask)) + if (!test_bit(idx, cpuc->active_mask)) { + /* catch in-flight IRQs */ + if (__test_and_clear_bit(idx, cpuc->running)) + handled++; continue; + } event = cpuc->events[idx]; hwc = &event->hw; @@ -690,7 +940,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) inc_irq_stat(apic_perf_irqs); } - return handled > 0; + return handled; } /* diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index fb329e9f849..d9f4ff8fcd6 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -700,11 +700,10 @@ static void probe_nmi_watchdog(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && - boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17) - return; - wd_ops = &k7_wd_ops; - break; + if (boot_cpu_data.x86 == 6 || + (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15)) + wd_ops = &k7_wd_ops; + return; case X86_VENDOR_INTEL: /* Work around where perfctr1 doesn't have a working enable * bit as described in the following errata: diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 34b4dad6f0b..c7f64e6f537 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,6 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) const struct cpuid_bit *cb; static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { + { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 }, { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, @@ -43,6 +44,12 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 }, + { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 }, + { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 }, + { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 }, + { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 }, + { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 045b36cada6..994828899e0 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -34,7 +34,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!csize) return 0; - vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); + vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); if (!vaddr) return -ENOMEM; @@ -46,6 +46,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, } else memcpy(buf, vaddr + offset, csize); + set_iounmap_nonlazy(); iounmap(vaddr); return csize; } diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index e5cc7e82e60..76b8cd953de 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -18,7 +18,6 @@ #include <asm/apic.h> #include <asm/iommu.h> #include <asm/gart.h> -#include <asm/hpet.h> static void __init fix_hypertransport_config(int num, int slot, int func) { @@ -98,7 +97,6 @@ static void __init nvidia_bugs(int num, int slot, int func) } #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) -#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) static u32 __init ati_ixp4x0_rev(int num, int slot, int func) { u32 d; @@ -116,7 +114,6 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func) d &= 0xff; return d; } -#endif static void __init ati_bugs(int num, int slot, int func) { @@ -192,21 +189,6 @@ static void __init ati_bugs_contd(int num, int slot, int func) } #endif -/* - * Force the read back of the CMP register in hpet_next_event() - * to work around the problem that the CMP register write seems to be - * delayed. See hpet_next_event() for details. - * - * We do this on all SMBUS incarnations for now until we have more - * information about the affected chipsets. - */ -static void __init ati_hpet_bugs(int num, int slot, int func) -{ -#ifdef CONFIG_HPET_TIMER - hpet_readback_cmp = 1; -#endif -} - #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -236,8 +218,6 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, - { PCI_VENDOR_ID_ATI, PCI_ANY_ID, - PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs }, {} }; diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index fa99bae75ac..4572f25f932 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -14,6 +14,7 @@ #include <xen/hvc-console.h> #include <asm/pci-direct.h> #include <asm/fixmap.h> +#include <asm/mrst.h> #include <asm/pgtable.h> #include <linux/usb/ehci_def.h> @@ -239,6 +240,18 @@ static int __init setup_early_printk(char *buf) if (!strncmp(buf, "xen", 3)) early_console_register(&xenboot_console, keep); #endif +#ifdef CONFIG_X86_MRST_EARLY_PRINTK + if (!strncmp(buf, "mrst", 4)) { + mrst_early_console_init(); + early_console_register(&early_mrst_console, keep); + } + + if (!strncmp(buf, "hsu", 3)) { + hsu_early_console_init(); + early_console_register(&early_hsu_console, keep); + } + +#endif buf++; } return 0; diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c new file mode 100644 index 00000000000..65df603622b --- /dev/null +++ b/arch/x86/kernel/early_printk_mrst.c @@ -0,0 +1,319 @@ +/* + * early_printk_mrst.c - early consoles for Intel MID platforms + * + * Copyright (c) 2008-2010, Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +/* + * This file implements two early consoles named mrst and hsu. + * mrst is based on Maxim3110 spi-uart device, it exists in both + * Moorestown and Medfield platforms, while hsu is based on a High + * Speed UART device which only exists in the Medfield platform + */ + +#include <linux/serial_reg.h> +#include <linux/serial_mfd.h> +#include <linux/kmsg_dump.h> +#include <linux/console.h> +#include <linux/kernel.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/io.h> + +#include <asm/fixmap.h> +#include <asm/pgtable.h> +#include <asm/mrst.h> + +#define MRST_SPI_TIMEOUT 0x200000 +#define MRST_REGBASE_SPI0 0xff128000 +#define MRST_REGBASE_SPI1 0xff128400 +#define MRST_CLK_SPI0_REG 0xff11d86c + +/* Bit fields in CTRLR0 */ +#define SPI_DFS_OFFSET 0 + +#define SPI_FRF_OFFSET 4 +#define SPI_FRF_SPI 0x0 +#define SPI_FRF_SSP 0x1 +#define SPI_FRF_MICROWIRE 0x2 +#define SPI_FRF_RESV 0x3 + +#define SPI_MODE_OFFSET 6 +#define SPI_SCPH_OFFSET 6 +#define SPI_SCOL_OFFSET 7 +#define SPI_TMOD_OFFSET 8 +#define SPI_TMOD_TR 0x0 /* xmit & recv */ +#define SPI_TMOD_TO 0x1 /* xmit only */ +#define SPI_TMOD_RO 0x2 /* recv only */ +#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */ + +#define SPI_SLVOE_OFFSET 10 +#define SPI_SRL_OFFSET 11 +#define SPI_CFS_OFFSET 12 + +/* Bit fields in SR, 7 bits */ +#define SR_MASK 0x7f /* cover 7 bits */ +#define SR_BUSY (1 << 0) +#define SR_TF_NOT_FULL (1 << 1) +#define SR_TF_EMPT (1 << 2) +#define SR_RF_NOT_EMPT (1 << 3) +#define SR_RF_FULL (1 << 4) +#define SR_TX_ERR (1 << 5) +#define SR_DCOL (1 << 6) + +struct dw_spi_reg { + u32 ctrl0; + u32 ctrl1; + u32 ssienr; + u32 mwcr; + u32 ser; + u32 baudr; + u32 txfltr; + u32 rxfltr; + u32 txflr; + u32 rxflr; + u32 sr; + u32 imr; + u32 isr; + u32 risr; + u32 txoicr; + u32 rxoicr; + u32 rxuicr; + u32 msticr; + u32 icr; + u32 dmacr; + u32 dmatdlr; + u32 dmardlr; + u32 idr; + u32 version; + + /* Currently operates as 32 bits, though only the low 16 bits matter */ + u32 dr; +} __packed; + +#define dw_readl(dw, name) __raw_readl(&(dw)->name) +#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name) + +/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */ +static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0; + +static u32 *pclk_spi0; +/* Always contains an accessable address, start with 0 */ +static struct dw_spi_reg *pspi; + +static struct kmsg_dumper dw_dumper; +static int dumper_registered; + +static void dw_kmsg_dump(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + const char *s1, unsigned long l1, + const char *s2, unsigned long l2) +{ + int i; + + /* When run to this, we'd better re-init the HW */ + mrst_early_console_init(); + + for (i = 0; i < l1; i++) + early_mrst_console.write(&early_mrst_console, s1 + i, 1); + for (i = 0; i < l2; i++) + early_mrst_console.write(&early_mrst_console, s2 + i, 1); +} + +/* Set the ratio rate to 115200, 8n1, IRQ disabled */ +static void max3110_write_config(void) +{ + u16 config; + + config = 0xc001; + dw_writel(pspi, dr, config); +} + +/* Translate char to a eligible word and send to max3110 */ +static void max3110_write_data(char c) +{ + u16 data; + + data = 0x8000 | c; + dw_writel(pspi, dr, data); +} + +void mrst_early_console_init(void) +{ + u32 ctrlr0 = 0; + u32 spi0_cdiv; + u32 freq; /* Freqency info only need be searched once */ + + /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */ + pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, + MRST_CLK_SPI0_REG); + spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9; + freq = 100000000 / (spi0_cdiv + 1); + + if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL) + mrst_spi_paddr = MRST_REGBASE_SPI1; + + pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, + mrst_spi_paddr); + + /* Disable SPI controller */ + dw_writel(pspi, ssienr, 0); + + /* Set control param, 8 bits, transmit only mode */ + ctrlr0 = dw_readl(pspi, ctrl0); + + ctrlr0 &= 0xfcc0; + ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET) + | (SPI_TMOD_TO << SPI_TMOD_OFFSET); + dw_writel(pspi, ctrl0, ctrlr0); + + /* + * Change the spi0 clk to comply with 115200 bps, use 100000 to + * calculate the clk dividor to make the clock a little slower + * than real baud rate. + */ + dw_writel(pspi, baudr, freq/100000); + + /* Disable all INT for early phase */ + dw_writel(pspi, imr, 0x0); + + /* Set the cs to spi-uart */ + dw_writel(pspi, ser, 0x2); + + /* Enable the HW, the last step for HW init */ + dw_writel(pspi, ssienr, 0x1); + + /* Set the default configuration */ + max3110_write_config(); + + /* Register the kmsg dumper */ + if (!dumper_registered) { + dw_dumper.dump = dw_kmsg_dump; + kmsg_dump_register(&dw_dumper); + dumper_registered = 1; + } +} + +/* Slave select should be called in the read/write function */ +static void early_mrst_spi_putc(char c) +{ + unsigned int timeout; + u32 sr; + + timeout = MRST_SPI_TIMEOUT; + /* Early putc needs to make sure the TX FIFO is not full */ + while (--timeout) { + sr = dw_readl(pspi, sr); + if (!(sr & SR_TF_NOT_FULL)) + cpu_relax(); + else + break; + } + + if (!timeout) + pr_warning("MRST earlycon: timed out\n"); + else + max3110_write_data(c); +} + +/* Early SPI only uses polling mode */ +static void early_mrst_spi_write(struct console *con, const char *str, unsigned n) +{ + int i; + + for (i = 0; i < n && *str; i++) { + if (*str == '\n') + early_mrst_spi_putc('\r'); + early_mrst_spi_putc(*str); + str++; + } +} + +struct console early_mrst_console = { + .name = "earlymrst", + .write = early_mrst_spi_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* + * Following is the early console based on Medfield HSU (High + * Speed UART) device. + */ +#define HSU_PORT2_PADDR 0xffa28180 + +static void __iomem *phsu; + +void hsu_early_console_init(void) +{ + u8 lcr; + + phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, + HSU_PORT2_PADDR); + + /* Disable FIFO */ + writeb(0x0, phsu + UART_FCR); + + /* Set to default 115200 bps, 8n1 */ + lcr = readb(phsu + UART_LCR); + writeb((0x80 | lcr), phsu + UART_LCR); + writeb(0x18, phsu + UART_DLL); + writeb(lcr, phsu + UART_LCR); + writel(0x3600, phsu + UART_MUL*4); + + writeb(0x8, phsu + UART_MCR); + writeb(0x7, phsu + UART_FCR); + writeb(0x3, phsu + UART_LCR); + + /* Clear IRQ status */ + readb(phsu + UART_LSR); + readb(phsu + UART_RX); + readb(phsu + UART_IIR); + readb(phsu + UART_MSR); + + /* Enable FIFO */ + writeb(0x7, phsu + UART_FCR); +} + +#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) + +static void early_hsu_putc(char ch) +{ + unsigned int timeout = 10000; /* 10ms */ + u8 status; + + while (--timeout) { + status = readb(phsu + UART_LSR); + if (status & BOTH_EMPTY) + break; + udelay(1); + } + + /* Only write the char when there was no timeout */ + if (timeout) + writeb(ch, phsu + UART_TX); +} + +static void early_hsu_write(struct console *con, const char *str, unsigned n) +{ + int i; + + for (i = 0; i < n && *str; i++) { + if (*str == '\n') + early_hsu_putc('\r'); + early_hsu_putc(*str); + str++; + } +} + +struct console early_hsu_console = { + .name = "earlyhsu", + .write = early_hsu_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 227d00920d2..9fb188d7bc7 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -115,8 +115,7 @@ /* unfortunately push/pop can't be no-op */ .macro PUSH_GS - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 .endm .macro POP_GS pop=0 addl $(4 + \pop), %esp @@ -140,14 +139,12 @@ #else /* CONFIG_X86_32_LAZY_GS */ .macro PUSH_GS - pushl %gs - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %gs /*CFI_REL_OFFSET gs, 0*/ .endm .macro POP_GS pop=0 -98: popl %gs - CFI_ADJUST_CFA_OFFSET -4 +98: popl_cfi %gs /*CFI_RESTORE gs*/ .if \pop <> 0 add $\pop, %esp @@ -195,35 +192,25 @@ .macro SAVE_ALL cld PUSH_GS - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %fs /*CFI_REL_OFFSET fs, 0;*/ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %es /*CFI_REL_OFFSET es, 0;*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ds /*CFI_REL_OFFSET ds, 0;*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl $(__USER_DS), %edx movl %edx, %ds @@ -234,39 +221,29 @@ .endm .macro RESTORE_INT_REGS - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx CFI_RESTORE ecx - popl %edx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edx CFI_RESTORE edx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi - popl %edi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edi CFI_RESTORE edi - popl %ebp - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebp CFI_RESTORE ebp - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax CFI_RESTORE eax .endm .macro RESTORE_REGS pop=0 RESTORE_INT_REGS -1: popl %ds - CFI_ADJUST_CFA_OFFSET -4 +1: popl_cfi %ds /*CFI_RESTORE ds;*/ -2: popl %es - CFI_ADJUST_CFA_OFFSET -4 +2: popl_cfi %es /*CFI_RESTORE es;*/ -3: popl %fs - CFI_ADJUST_CFA_OFFSET -4 +3: popl_cfi %fs /*CFI_RESTORE fs;*/ POP_GS \pop .pushsection .fixup, "ax" @@ -320,16 +297,12 @@ ENTRY(ret_from_fork) CFI_STARTPROC - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax call schedule_tail GET_THREAD_INFO(%ebp) - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - pushl $0x0202 # Reset kernel eflags - CFI_ADJUST_CFA_OFFSET 4 - popfl - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax + pushl_cfi $0x0202 # Reset kernel eflags + popfl_cfi jmp syscall_exit CFI_ENDPROC END(ret_from_fork) @@ -409,29 +382,23 @@ sysenter_past_esp: * enough kernel state to call TRACE_IRQS_OFF can be called - but * we immediately enable interrupts at that point anyway. */ - pushl $(__USER_DS) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $(__USER_DS) /*CFI_REL_OFFSET ss, 0*/ - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET esp, 0 - pushfl + pushfl_cfi orl $X86_EFLAGS_IF, (%esp) - CFI_ADJUST_CFA_OFFSET 4 - pushl $(__USER_CS) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $(__USER_CS) /*CFI_REL_OFFSET cs, 0*/ /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ - pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) CFI_REL_OFFSET eip, 0 - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL ENABLE_INTERRUPTS(CLBR_NONE) @@ -486,8 +453,7 @@ sysenter_audit: movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ call audit_syscall_entry - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call @@ -529,8 +495,7 @@ ENDPROC(ia32_sysenter_target) # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway - pushl %eax # save orig_eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation @@ -566,7 +531,6 @@ restore_all_notrace: je ldt_ss # returning to user-space with LDT SS restore_nocheck: RESTORE_REGS 4 # skip orig_eax/error_code - CFI_ADJUST_CFA_OFFSET -4 irq_return: INTERRUPT_RETURN .section .fixup,"ax" @@ -619,10 +583,8 @@ ldt_ss: shr $16, %edx mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl $__ESPFIX_SS - CFI_ADJUST_CFA_OFFSET 4 - push %eax /* new kernel esp */ - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $__ESPFIX_SS + pushl_cfi %eax /* new kernel esp */ /* Disable interrupts, but do not irqtrace this section: we * will soon execute iret and the tracer was already set to * the irqstate after the iret */ @@ -666,11 +628,9 @@ work_notifysig: # deal with pending signals and ALIGN work_notifysig_v86: - pushl %ecx # save ti_flags for do_notify_resume - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx # save ti_flags for do_notify_resume call save_v86_state # %eax contains pt_regs pointer - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx movl %eax, %esp #else movl %esp, %eax @@ -750,14 +710,18 @@ ptregs_##name: \ #define PTREGSCALL3(name) \ ALIGN; \ ptregs_##name: \ + CFI_STARTPROC; \ leal 4(%esp),%eax; \ - pushl %eax; \ + pushl_cfi %eax; \ movl PT_EDX(%eax),%ecx; \ movl PT_ECX(%eax),%edx; \ movl PT_EBX(%eax),%eax; \ call sys_##name; \ addl $4,%esp; \ - ret + CFI_ADJUST_CFA_OFFSET -4; \ + ret; \ + CFI_ENDPROC; \ +ENDPROC(ptregs_##name) PTREGSCALL1(iopl) PTREGSCALL0(fork) @@ -772,15 +736,19 @@ PTREGSCALL1(vm86old) /* Clone is an oddball. The 4th arg is in %edi */ ALIGN; ptregs_clone: + CFI_STARTPROC leal 4(%esp),%eax - pushl %eax - pushl PT_EDI(%eax) + pushl_cfi %eax + pushl_cfi PT_EDI(%eax) movl PT_EDX(%eax),%ecx movl PT_ECX(%eax),%edx movl PT_EBX(%eax),%eax call sys_clone addl $8,%esp + CFI_ADJUST_CFA_OFFSET -8 ret + CFI_ENDPROC +ENDPROC(ptregs_clone) .macro FIXUP_ESPFIX_STACK /* @@ -795,10 +763,8 @@ ptregs_clone: mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ - pushl $__KERNEL_DS - CFI_ADJUST_CFA_OFFSET 4 - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $__KERNEL_DS + pushl_cfi %eax lss (%esp), %esp /* switch to the normal stack segment */ CFI_ADJUST_CFA_OFFSET -8 .endm @@ -835,8 +801,7 @@ vector=FIRST_EXTERNAL_VECTOR .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -4 .endif -1: pushl $(~vector+0x80) /* Note: always in signed byte range */ - CFI_ADJUST_CFA_OFFSET 4 +1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 jmp 2f .endif @@ -876,8 +841,7 @@ ENDPROC(common_interrupt) #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ - pushl $~(nr); \ - CFI_ADJUST_CFA_OFFSET 4; \ + pushl_cfi $~(nr); \ SAVE_ALL; \ TRACE_IRQS_OFF \ movl %esp,%eax; \ @@ -893,21 +857,18 @@ ENDPROC(name) ENTRY(coprocessor_error) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_coprocessor_error - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_coprocessor_error jmp error_code CFI_ENDPROC END(coprocessor_error) ENTRY(simd_coprocessor_error) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ -661: pushl $do_general_protection +661: pushl_cfi $do_general_protection 662: .section .altinstructions,"a" .balign 4 @@ -922,19 +883,16 @@ ENTRY(simd_coprocessor_error) 664: .previous #else - pushl $do_simd_coprocessor_error + pushl_cfi $do_simd_coprocessor_error #endif - CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC END(simd_coprocessor_error) ENTRY(device_not_available) RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_device_not_available - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $-1 # mark this as an int + pushl_cfi $do_device_not_available jmp error_code CFI_ENDPROC END(device_not_available) @@ -956,82 +914,68 @@ END(native_irq_enable_sysexit) ENTRY(overflow) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_overflow - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_overflow jmp error_code CFI_ENDPROC END(overflow) ENTRY(bounds) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_bounds - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_bounds jmp error_code CFI_ENDPROC END(bounds) ENTRY(invalid_op) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_invalid_op - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_invalid_op jmp error_code CFI_ENDPROC END(invalid_op) ENTRY(coprocessor_segment_overrun) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_coprocessor_segment_overrun - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_coprocessor_segment_overrun jmp error_code CFI_ENDPROC END(coprocessor_segment_overrun) ENTRY(invalid_TSS) RING0_EC_FRAME - pushl $do_invalid_TSS - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_invalid_TSS jmp error_code CFI_ENDPROC END(invalid_TSS) ENTRY(segment_not_present) RING0_EC_FRAME - pushl $do_segment_not_present - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_segment_not_present jmp error_code CFI_ENDPROC END(segment_not_present) ENTRY(stack_segment) RING0_EC_FRAME - pushl $do_stack_segment - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_stack_segment jmp error_code CFI_ENDPROC END(stack_segment) ENTRY(alignment_check) RING0_EC_FRAME - pushl $do_alignment_check - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_alignment_check jmp error_code CFI_ENDPROC END(alignment_check) ENTRY(divide_error) RING0_INT_FRAME - pushl $0 # no error code - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_divide_error - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 # no error code + pushl_cfi $do_divide_error jmp error_code CFI_ENDPROC END(divide_error) @@ -1039,10 +983,8 @@ END(divide_error) #ifdef CONFIG_X86_MCE ENTRY(machine_check) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl machine_check_vector - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi machine_check_vector jmp error_code CFI_ENDPROC END(machine_check) @@ -1050,10 +992,8 @@ END(machine_check) ENTRY(spurious_interrupt_bug) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_spurious_interrupt_bug - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_spurious_interrupt_bug jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) @@ -1084,8 +1024,7 @@ ENTRY(xen_sysenter_target) ENTRY(xen_hypervisor_callback) CFI_STARTPROC - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 SAVE_ALL TRACE_IRQS_OFF @@ -1121,23 +1060,20 @@ ENDPROC(xen_hypervisor_callback) # We distinguish between categories by maintaining a status value in EAX. ENTRY(xen_failsafe_callback) CFI_STARTPROC - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax movl $1,%eax 1: mov 4(%esp),%ds 2: mov 8(%esp),%es 3: mov 12(%esp),%fs 4: mov 16(%esp),%gs testl %eax,%eax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax lea 16(%esp),%esp CFI_ADJUST_CFA_OFFSET -16 jz 5f addl $16,%esp jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) -5: pushl $0 # EAX == 0 => Category 1 (Bad segment) - CFI_ADJUST_CFA_OFFSET 4 +5: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment) SAVE_ALL jmp ret_from_exception CFI_ENDPROC @@ -1287,40 +1223,29 @@ syscall_table_size=(.-sys_call_table) ENTRY(page_fault) RING0_EC_FRAME - pushl $do_page_fault - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_page_fault ALIGN error_code: /* the function address is in %gs's slot on the stack */ - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %fs /*CFI_REL_OFFSET fs, 0*/ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %es /*CFI_REL_OFFSET es, 0*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ds /*CFI_REL_OFFSET ds, 0*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 cld movl $(__KERNEL_PERCPU), %ecx @@ -1362,12 +1287,9 @@ END(page_fault) movl TSS_sysenter_sp0 + \offset(%esp), %esp CFI_DEF_CFA esp, 0 CFI_UNDEFINED eip - pushfl - CFI_ADJUST_CFA_OFFSET 4 - pushl $__KERNEL_CS - CFI_ADJUST_CFA_OFFSET 4 - pushl $sysenter_past_esp - CFI_ADJUST_CFA_OFFSET 4 + pushfl_cfi + pushl_cfi $__KERNEL_CS + pushl_cfi $sysenter_past_esp CFI_REL_OFFSET eip, 0 .endm @@ -1377,8 +1299,7 @@ ENTRY(debug) jne debug_stack_correct FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn debug_stack_correct: - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # error code 0 @@ -1398,32 +1319,27 @@ END(debug) */ ENTRY(nmi) RING0_INT_FRAME - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax je nmi_espfix_stack cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax movl %esp,%eax /* Do not access memory above the end of our stack page, * it might not exist. */ andl $(THREAD_SIZE-1),%eax cmpl $(THREAD_SIZE-20),%eax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax jae nmi_stack_correct cmpl $ia32_sysenter_target,12(%esp) je nmi_debug_stack_check nmi_stack_correct: /* We have a RING0_INT_FRAME here */ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer @@ -1452,18 +1368,14 @@ nmi_espfix_stack: * * create the pointer to lss back */ - pushl %ss - CFI_ADJUST_CFA_OFFSET 4 - pushl %esp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ss + pushl_cfi %esp addl $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 - pushl 16(%esp) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi 16(%esp) .endr - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx,%edx # zero error code @@ -1477,8 +1389,7 @@ END(nmi) ENTRY(int3) RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # zero error code @@ -1490,8 +1401,7 @@ END(int3) ENTRY(general_protection) RING0_EC_FRAME - pushl $do_general_protection - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_general_protection jmp error_code CFI_ENDPROC END(general_protection) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 17be5ec7cbb..a7ae7fd1010 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -213,23 +213,17 @@ ENDPROC(native_usergs_sysret64) .macro FAKE_STACK_FRAME child_rip /* push in order ss, rsp, eflags, cs, rip */ xorl %eax, %eax - pushq $__KERNEL_DS /* ss */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $__KERNEL_DS /* ss */ /*CFI_REL_OFFSET ss,0*/ - pushq %rax /* rsp */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rax /* rsp */ CFI_REL_OFFSET rsp,0 - pushq $X86_EFLAGS_IF /* eflags - interrupts on */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ /*CFI_REL_OFFSET rflags,0*/ - pushq $__KERNEL_CS /* cs */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $__KERNEL_CS /* cs */ /*CFI_REL_OFFSET cs,0*/ - pushq \child_rip /* rip */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi \child_rip /* rip */ CFI_REL_OFFSET rip,0 - pushq %rax /* orig rax */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rax /* orig rax */ .endm .macro UNFAKE_STACK_FRAME @@ -398,10 +392,8 @@ ENTRY(ret_from_fork) LOCK ; btr $TIF_FORK,TI_flags(%r8) - push kernel_eflags(%rip) - CFI_ADJUST_CFA_OFFSET 8 - popf # reset kernel eflags - CFI_ADJUST_CFA_OFFSET -8 + pushq_cfi kernel_eflags(%rip) + popfq_cfi # reset kernel eflags call schedule_tail # rdi: 'prev' task parameter @@ -521,11 +513,9 @@ sysret_careful: jnc sysret_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi jmp sysret_check /* Handle a signal */ @@ -634,11 +624,9 @@ int_careful: jnc int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -652,12 +640,10 @@ int_check_syscall_exit_work: /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi leaq 8(%rsp),%rdi # &ptregs -> arg1 call syscall_trace_leave - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest @@ -714,9 +700,8 @@ END(ptregscall_common) ENTRY(stub_execve) CFI_STARTPROC - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - CFI_REGISTER rip, r11 + addq $8, %rsp + PARTIAL_FRAME 0 SAVE_REST FIXUP_TOP_OF_STACK %r11 movq %rsp, %rcx @@ -735,7 +720,7 @@ END(stub_execve) ENTRY(stub_rt_sigreturn) CFI_STARTPROC addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 + PARTIAL_FRAME 0 SAVE_REST movq %rsp,%rdi FIXUP_TOP_OF_STACK %r11 @@ -766,8 +751,7 @@ vector=FIRST_EXTERNAL_VECTOR .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -8 .endif -1: pushq $(~vector+0x80) /* Note: always in signed byte range */ - CFI_ADJUST_CFA_OFFSET 8 +1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 jmp 2f .endif @@ -796,8 +780,8 @@ END(interrupt) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func - subq $10*8, %rsp - CFI_ADJUST_CFA_OFFSET 10*8 + subq $ORIG_RAX-ARGOFFSET+8, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8 call save_args PARTIAL_FRAME 0 call \func @@ -822,6 +806,7 @@ ret_from_intr: TRACE_IRQS_OFF decl PER_CPU_VAR(irq_count) leaveq + CFI_RESTORE rbp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET -8 exit_intr: @@ -903,11 +888,9 @@ retint_careful: jnc retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -956,8 +939,7 @@ END(common_interrupt) .macro apicinterrupt num sym do_sym ENTRY(\sym) INTR_FRAME - pushq $~(\num) - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $~(\num) interrupt \do_sym jmp ret_from_intr CFI_ENDPROC @@ -1023,9 +1005,9 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_EVENTS -apicinterrupt LOCAL_PENDING_VECTOR \ - perf_pending_interrupt smp_perf_pending_interrupt +#ifdef CONFIG_IRQ_WORK +apicinterrupt IRQ_WORK_VECTOR \ + irq_work_interrupt smp_irq_work_interrupt #endif /* @@ -1036,8 +1018,8 @@ ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $15*8,%rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call error_entry DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ @@ -1052,9 +1034,9 @@ END(\sym) ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $-1 /* ORIG_RAX: no syscall to restart */ - CFI_ADJUST_CFA_OFFSET 8 - subq $15*8, %rsp + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ @@ -1070,9 +1052,9 @@ END(\sym) ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $-1 /* ORIG_RAX: no syscall to restart */ - CFI_ADJUST_CFA_OFFSET 8 - subq $15*8, %rsp + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ @@ -1089,8 +1071,8 @@ END(\sym) ENTRY(\sym) XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - subq $15*8,%rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call error_entry DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ @@ -1107,8 +1089,8 @@ END(\sym) ENTRY(\sym) XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - subq $15*8,%rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid DEFAULT_FRAME 0 TRACE_IRQS_OFF @@ -1139,16 +1121,14 @@ zeroentry simd_coprocessor_error do_simd_coprocessor_error /* edi: new selector */ ENTRY(native_load_gs_index) CFI_STARTPROC - pushf - CFI_ADJUST_CFA_OFFSET 8 + pushfq_cfi DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS gs_change: movl %edi,%gs 2: mfence /* workaround */ SWAPGS - popf - CFI_ADJUST_CFA_OFFSET -8 + popfq_cfi ret CFI_ENDPROC END(native_load_gs_index) @@ -1215,8 +1195,7 @@ END(kernel_execve) /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(call_softirq) CFI_STARTPROC - push %rbp - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rbp CFI_REL_OFFSET rbp,0 mov %rsp,%rbp CFI_DEF_CFA_REGISTER rbp @@ -1225,6 +1204,7 @@ ENTRY(call_softirq) push %rbp # backlink for old unwinder call __do_softirq leaveq + CFI_RESTORE rbp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET -8 decl PER_CPU_VAR(irq_count) @@ -1368,7 +1348,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip) /* ebx: no swapgs flag */ ENTRY(paranoid_exit) - INTR_FRAME + DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl %ebx,%ebx /* swapgs needed? */ @@ -1445,7 +1425,6 @@ error_swapgs: error_sti: TRACE_IRQS_OFF ret - CFI_ENDPROC /* * There are two places in the kernel that can potentially fault with @@ -1470,6 +1449,7 @@ bstep_iret: /* Fix truncated RIP */ movq %rcx,RIP+8(%rsp) jmp error_swapgs + CFI_ENDPROC END(error_entry) @@ -1498,8 +1478,8 @@ ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 - subq $15*8, %rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid DEFAULT_FRAME 0 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index cd37469b54e..3afb33f14d2 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -257,14 +257,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) return mod_code_status; } - - - -static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; - static unsigned char *ftrace_nop_replace(void) { - return ftrace_nop; + return ideal_nop5; } static int @@ -338,62 +333,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) int __init ftrace_dyn_arch_init(void *data) { - extern const unsigned char ftrace_test_p6nop[]; - extern const unsigned char ftrace_test_nop5[]; - extern const unsigned char ftrace_test_jmp[]; - int faulted = 0; - - /* - * There is no good nop for all x86 archs. - * We will default to using the P6_NOP5, but first we - * will test to make sure that the nop will actually - * work on this CPU. If it faults, we will then - * go to a lesser efficient 5 byte nop. If that fails - * we then just use a jmp as our nop. This isn't the most - * efficient nop, but we can not use a multi part nop - * since we would then risk being preempted in the middle - * of that nop, and if we enabled tracing then, it might - * cause a system crash. - * - * TODO: check the cpuid to determine the best nop. - */ - asm volatile ( - "ftrace_test_jmp:" - "jmp ftrace_test_p6nop\n" - "nop\n" - "nop\n" - "nop\n" /* 2 byte jmp + 3 bytes */ - "ftrace_test_p6nop:" - P6_NOP5 - "jmp 1f\n" - "ftrace_test_nop5:" - ".byte 0x66,0x66,0x66,0x66,0x90\n" - "1:" - ".section .fixup, \"ax\"\n" - "2: movl $1, %0\n" - " jmp ftrace_test_nop5\n" - "3: movl $2, %0\n" - " jmp 1b\n" - ".previous\n" - _ASM_EXTABLE(ftrace_test_p6nop, 2b) - _ASM_EXTABLE(ftrace_test_nop5, 3b) - : "=r"(faulted) : "0" (faulted)); - - switch (faulted) { - case 0: - pr_info("converting mcount calls to 0f 1f 44 00 00\n"); - memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); - break; - case 1: - pr_info("converting mcount calls to 66 66 66 66 90\n"); - memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); - break; - case 2: - pr_info("converting mcount calls to jmp . + 5\n"); - memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); - break; - } - /* The return code is retured via data */ *(unsigned long *)data = 0; diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 351f9c0fea1..7494999141b 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -35,7 +35,6 @@ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ u8 hpet_msi_disable; -u8 hpet_readback_cmp; #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; @@ -395,23 +394,27 @@ static int hpet_next_event(unsigned long delta, * at that point and we would wait for the next hpet interrupt * forever. We found out that reading the CMP register back * forces the transfer so we can rely on the comparison with - * the counter register below. + * the counter register below. If the read back from the + * compare register does not match the value we programmed + * then we might have a real hardware problem. We can not do + * much about it here, but at least alert the user/admin with + * a prominent warning. * - * That works fine on those ATI chipsets, but on newer Intel - * chipsets (ICH9...) this triggers due to an erratum: Reading - * the comparator immediately following a write is returning - * the old value. + * An erratum on some chipsets (ICH9,..), results in + * comparator read immediately following a write returning old + * value. Workaround for this is to read this value second + * time, when first read returns old value. * - * We restrict the read back to the affected ATI chipsets (set - * by quirks) and also run it with hpet=verbose for debugging - * purposes. + * In fact the write to the comparator register is delayed up + * to two HPET cycles so the workaround we tried to restrict + * the readback to those known to be borked ATI chipsets + * failed miserably. So we give up on optimizations forever + * and penalize all HPET incarnations unconditionally. */ - if (hpet_readback_cmp || hpet_verbose) { - u32 cmp = hpet_readl(HPET_Tn_CMP(timer)); - - if (cmp != cnt) + if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { + if (hpet_readl(HPET_Tn_CMP(timer)) != cnt) printk_once(KERN_WARNING - "hpet: compare register read back failed.\n"); + "hpet: compare register read back failed.\n"); } return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; @@ -503,7 +506,7 @@ static int hpet_assign_irq(struct hpet_dev *dev) { unsigned int irq; - irq = create_irq(); + irq = create_irq_nr(0, -1); if (!irq) return -EINVAL; diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index a474ec37c32..ff15c9dcc25 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -206,11 +206,27 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp) int arch_bp_generic_fields(int x86_len, int x86_type, int *gen_len, int *gen_type) { - /* Len */ - switch (x86_len) { - case X86_BREAKPOINT_LEN_X: + /* Type */ + switch (x86_type) { + case X86_BREAKPOINT_EXECUTE: + if (x86_len != X86_BREAKPOINT_LEN_X) + return -EINVAL; + + *gen_type = HW_BREAKPOINT_X; *gen_len = sizeof(long); + return 0; + case X86_BREAKPOINT_WRITE: + *gen_type = HW_BREAKPOINT_W; break; + case X86_BREAKPOINT_RW: + *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; + break; + default: + return -EINVAL; + } + + /* Len */ + switch (x86_len) { case X86_BREAKPOINT_LEN_1: *gen_len = HW_BREAKPOINT_LEN_1; break; @@ -229,21 +245,6 @@ int arch_bp_generic_fields(int x86_len, int x86_type, return -EINVAL; } - /* Type */ - switch (x86_type) { - case X86_BREAKPOINT_EXECUTE: - *gen_type = HW_BREAKPOINT_X; - break; - case X86_BREAKPOINT_WRITE: - *gen_type = HW_BREAKPOINT_W; - break; - case X86_BREAKPOINT_RW: - *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; - break; - default: - return -EINVAL; - } - return 0; } @@ -316,9 +317,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) ret = -EINVAL; switch (info->len) { - case X86_BREAKPOINT_LEN_X: - align = sizeof(long) -1; - break; case X86_BREAKPOINT_LEN_1: align = 0; break; diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index a46cb3522c0..58bb239a2fd 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -68,19 +68,22 @@ static void __cpuinit init_thread_xstate(void) */ if (!HAVE_HWFP) { + /* + * Disable xsave as we do not support it if i387 + * emulation is enabled. + */ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); xstate_size = sizeof(struct i387_soft_struct); return; } if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); -#ifdef CONFIG_X86_32 else xstate_size = sizeof(struct i387_fsave_struct); -#endif } -#ifdef CONFIG_X86_64 /* * Called at bootup to set up the initial FPU state that is later cloned * into all processes. @@ -88,12 +91,21 @@ static void __cpuinit init_thread_xstate(void) void __cpuinit fpu_init(void) { - unsigned long oldcr0 = read_cr0(); - - set_in_cr4(X86_CR4_OSFXSR); - set_in_cr4(X86_CR4_OSXMMEXCPT); + unsigned long cr0; + unsigned long cr4_mask = 0; - write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ + if (cpu_has_fxsr) + cr4_mask |= X86_CR4_OSFXSR; + if (cpu_has_xmm) + cr4_mask |= X86_CR4_OSXMMEXCPT; + if (cr4_mask) + set_in_cr4(cr4_mask); + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ + if (!HAVE_HWFP) + cr0 |= X86_CR0_EM; + write_cr0(cr0); if (!smp_processor_id()) init_thread_xstate(); @@ -104,24 +116,12 @@ void __cpuinit fpu_init(void) clear_used_math(); } -#else /* CONFIG_X86_64 */ - -void __cpuinit fpu_init(void) -{ - if (!smp_processor_id()) - init_thread_xstate(); -} - -#endif /* CONFIG_X86_32 */ - void fpu_finit(struct fpu *fpu) { -#ifdef CONFIG_X86_32 if (!HAVE_HWFP) { finit_soft_fpu(&fpu->state->soft); return; } -#endif if (cpu_has_fxsr) { struct i387_fxsave_struct *fx = &fpu->state->fxsave; @@ -386,19 +386,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) #ifdef CONFIG_X86_64 env->fip = fxsave->rip; env->foo = fxsave->rdp; + /* + * should be actually ds/cs at fpu exception time, but + * that information is not available in 64bit mode. + */ + env->fcs = task_pt_regs(tsk)->cs; if (tsk == current) { - /* - * should be actually ds/cs at fpu exception time, but - * that information is not available in 64bit mode. - */ - asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos)); - asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs)); + savesegment(ds, env->fos); } else { - struct pt_regs *regs = task_pt_regs(tsk); - - env->fos = 0xffff0000 | tsk->thread.ds; - env->fcs = regs->cs; + env->fos = tsk->thread.ds; } + env->fos |= 0xffff0000; #else env->fip = fxsave->fip; env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 91fd0c70a18..44edb03fc9e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -67,10 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance monitoring interrupts\n"); - seq_printf(p, "%*s: ", prec, "PND"); + seq_printf(p, "%*s: ", prec, "IWI"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); - seq_printf(p, " Performance pending work\n"); + seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); + seq_printf(p, " IRQ work interrupts\n"); #endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); @@ -185,7 +185,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; - sum += irq_stats(cpu)->apic_pending_irqs; + sum += irq_stats(cpu)->apic_irq_work_irqs; #endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c new file mode 100644 index 00000000000..ca8f703a1e7 --- /dev/null +++ b/arch/x86/kernel/irq_work.c @@ -0,0 +1,30 @@ +/* + * x86 specific code for irq_work + * + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + */ + +#include <linux/kernel.h> +#include <linux/irq_work.h> +#include <linux/hardirq.h> +#include <asm/apic.h> + +void smp_irq_work_interrupt(struct pt_regs *regs) +{ + irq_enter(); + ack_APIC_irq(); + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); + irq_exit(); +} + +void arch_irq_work_raise(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + if (!cpu_has_apic) + return; + + apic->send_IPI_self(IRQ_WORK_VECTOR); + apic_wait_icr_idle(); +#endif +} diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 990ae7cfc57..713969b9266 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -224,9 +224,9 @@ static void __init apic_intr_init(void) alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - /* Performance monitoring interrupts: */ -# ifdef CONFIG_PERF_EVENTS - alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); + /* IRQ work interrupts: */ +# ifdef CONFIG_IRQ_WORK + alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt); # endif #endif diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c new file mode 100644 index 00000000000..961b6b30ba9 --- /dev/null +++ b/arch/x86/kernel/jump_label.c @@ -0,0 +1,50 @@ +/* + * jump label x86 support + * + * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> + * + */ +#include <linux/jump_label.h> +#include <linux/memory.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/cpu.h> +#include <asm/kprobes.h> +#include <asm/alternative.h> + +#ifdef HAVE_JUMP_LABEL + +union jump_code_union { + char code[JUMP_LABEL_NOP_SIZE]; + struct { + char jump; + int offset; + } __attribute__((packed)); +}; + +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + union jump_code_union code; + + if (type == JUMP_LABEL_ENABLE) { + code.jump = 0xe9; + code.offset = entry->target - + (entry->code + JUMP_LABEL_NOP_SIZE); + } else + memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE); + get_online_cpus(); + mutex_lock(&text_mutex); + text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); + mutex_unlock(&text_mutex); + put_online_cpus(); +} + +void arch_jump_label_text_poke_early(jump_label_t addr) +{ + text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE); +} + +#endif diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 770ebfb349e..1cbd54c0df9 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -230,9 +230,6 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) return 0; } -/* Dummy buffers for kallsyms_lookup */ -static char __dummy_buf[KSYM_NAME_LEN]; - /* Check if paddr is at an instruction boundary */ static int __kprobes can_probe(unsigned long paddr) { @@ -241,7 +238,7 @@ static int __kprobes can_probe(unsigned long paddr) struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) + if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) return 0; /* Decode instructions */ @@ -1129,7 +1126,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, *(unsigned long *)addr = val; } -void __kprobes kprobes_optinsn_template_holder(void) +static void __used __kprobes kprobes_optinsn_template_holder(void) { asm volatile ( ".global optprobe_template_entry\n" @@ -1221,7 +1218,8 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) } /* Check whether the address range is reserved */ if (ftrace_text_reserved(src, src + len - 1) || - alternatives_text_reserved(src, src + len - 1)) + alternatives_text_reserved(src, src + len - 1) || + jump_label_text_reserved(src, src + len - 1)) return -EBUSY; return len; @@ -1269,11 +1267,9 @@ static int __kprobes can_optimize(unsigned long paddr) unsigned long addr, size = 0, offset = 0; struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - /* Dummy buffers for lookup_symbol_attrs */ - static char __dummy_buf[KSYM_NAME_LEN]; /* Lookup symbol including addr */ - if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) + if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) return 0; /* Check there is enough space for a relative jump. */ diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 035c8c52918..b3ea9db39db 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -36,7 +36,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd, if (!page) goto out; pud = (pud_t *)page_address(page); - memset(pud, 0, PAGE_SIZE); + clear_page(pud); set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); } pud = pud_offset(pgd, addr); @@ -45,7 +45,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd, if (!page) goto out; pmd = (pmd_t *)page_address(page); - memset(pmd, 0, PAGE_SIZE); + clear_page(pmd); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); } pmd = pmd_offset(pud, addr); diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index e0bc186d750..8f295609173 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -239,11 +239,13 @@ int module_finalize(const Elf_Ehdr *hdr, apply_paravirt(pseg, pseg + para->sh_size); } - return module_bug_finalize(hdr, sechdrs, me); + /* make jump label nops */ + jump_label_apply_nops(me); + + return 0; } void module_arch_cleanup(struct module *mod) { alternatives_smp_module_del(mod); - module_bug_cleanup(mod); } diff --git a/arch/x86/kernel/olpc-xo1.c b/arch/x86/kernel/olpc-xo1.c new file mode 100644 index 00000000000..f5442c03abc --- /dev/null +++ b/arch/x86/kernel/olpc-xo1.c @@ -0,0 +1,140 @@ +/* + * Support for features of the OLPC XO-1 laptop + * + * Copyright (C) 2010 One Laptop per Child + * Copyright (C) 2006 Red Hat, Inc. + * Copyright (C) 2006 Advanced Micro Devices, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/pci_ids.h> +#include <linux/platform_device.h> +#include <linux/pm.h> + +#include <asm/io.h> +#include <asm/olpc.h> + +#define DRV_NAME "olpc-xo1" + +#define PMS_BAR 4 +#define ACPI_BAR 5 + +/* PMC registers (PMS block) */ +#define PM_SCLK 0x10 +#define PM_IN_SLPCTL 0x20 +#define PM_WKXD 0x34 +#define PM_WKD 0x30 +#define PM_SSC 0x54 + +/* PM registers (ACPI block) */ +#define PM1_CNT 0x08 +#define PM_GPE0_STS 0x18 + +static unsigned long acpi_base; +static unsigned long pms_base; + +static void xo1_power_off(void) +{ + printk(KERN_INFO "OLPC XO-1 power off sequence...\n"); + + /* Enable all of these controls with 0 delay */ + outl(0x40000000, pms_base + PM_SCLK); + outl(0x40000000, pms_base + PM_IN_SLPCTL); + outl(0x40000000, pms_base + PM_WKXD); + outl(0x40000000, pms_base + PM_WKD); + + /* Clear status bits (possibly unnecessary) */ + outl(0x0002ffff, pms_base + PM_SSC); + outl(0xffffffff, acpi_base + PM_GPE0_STS); + + /* Write SLP_EN bit to start the machinery */ + outl(0x00002000, acpi_base + PM1_CNT); +} + +/* Read the base addresses from the PCI BAR info */ +static int __devinit setup_bases(struct pci_dev *pdev) +{ + int r; + + r = pci_enable_device_io(pdev); + if (r) { + dev_err(&pdev->dev, "can't enable device IO\n"); + return r; + } + + r = pci_request_region(pdev, ACPI_BAR, DRV_NAME); + if (r) { + dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR); + return r; + } + + r = pci_request_region(pdev, PMS_BAR, DRV_NAME); + if (r) { + dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR); + pci_release_region(pdev, ACPI_BAR); + return r; + } + + acpi_base = pci_resource_start(pdev, ACPI_BAR); + pms_base = pci_resource_start(pdev, PMS_BAR); + + return 0; +} + +static int __devinit olpc_xo1_probe(struct platform_device *pdev) +{ + struct pci_dev *pcidev; + int r; + + pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, + NULL); + if (!pdev) + return -ENODEV; + + r = setup_bases(pcidev); + if (r) + return r; + + pm_power_off = xo1_power_off; + + printk(KERN_INFO "OLPC XO-1 support registered\n"); + return 0; +} + +static int __devexit olpc_xo1_remove(struct platform_device *pdev) +{ + pm_power_off = NULL; + return 0; +} + +static struct platform_driver olpc_xo1_driver = { + .driver = { + .name = DRV_NAME, + .owner = THIS_MODULE, + }, + .probe = olpc_xo1_probe, + .remove = __devexit_p(olpc_xo1_remove), +}; + +static int __init olpc_xo1_init(void) +{ + return platform_driver_register(&olpc_xo1_driver); +} + +static void __exit olpc_xo1_exit(void) +{ + platform_driver_unregister(&olpc_xo1_driver); +} + +MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:olpc-xo1"); + +module_init(olpc_xo1_init); +module_exit(olpc_xo1_exit); diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 0e0cdde519b..edaf3fe8dc5 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -17,6 +17,7 @@ #include <linux/spinlock.h> #include <linux/io.h> #include <linux/string.h> +#include <linux/platform_device.h> #include <asm/geode.h> #include <asm/setup.h> @@ -114,6 +115,7 @@ int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, unsigned long flags; int ret = -EIO; int i; + int restarts = 0; spin_lock_irqsave(&ec_lock, flags); @@ -169,7 +171,9 @@ restart: if (wait_on_obf(0x6c, 1)) { printk(KERN_ERR "olpc-ec: timeout waiting for" " EC to provide data!\n"); - goto restart; + if (restarts++ < 10) + goto restart; + goto err; } outbuf[i] = inb(0x68); pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); @@ -183,8 +187,21 @@ err: } EXPORT_SYMBOL_GPL(olpc_ec_cmd); -#ifdef CONFIG_OLPC_OPENFIRMWARE -static void __init platform_detect(void) +static bool __init check_ofw_architecture(void) +{ + size_t propsize; + char olpc_arch[5]; + const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 }; + void *res[] = { &propsize }; + + if (olpc_ofw("getprop", args, res)) { + printk(KERN_ERR "ofw: getprop call failed!\n"); + return false; + } + return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0; +} + +static u32 __init get_board_revision(void) { size_t propsize; __be32 rev; @@ -193,45 +210,43 @@ static void __init platform_detect(void) if (olpc_ofw("getprop", args, res) || propsize != 4) { printk(KERN_ERR "ofw: getprop call failed!\n"); - rev = cpu_to_be32(0); + return cpu_to_be32(0); } - olpc_platform_info.boardrev = be32_to_cpu(rev); + return be32_to_cpu(rev); } -#else -static void __init platform_detect(void) + +static bool __init platform_detect(void) { - /* stopgap until OFW support is added to the kernel */ - olpc_platform_info.boardrev = olpc_board(0xc2); + if (!check_ofw_architecture()) + return false; + olpc_platform_info.flags |= OLPC_F_PRESENT; + olpc_platform_info.boardrev = get_board_revision(); + return true; } -#endif -static int __init olpc_init(void) +static int __init add_xo1_platform_devices(void) { - unsigned char *romsig; + struct platform_device *pdev; - /* The ioremap check is dangerous; limit what we run it on */ - if (!is_geode() || cs5535_has_vsa2()) - return 0; + pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); - spin_lock_init(&ec_lock); + pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); - romsig = ioremap(0xffffffc0, 16); - if (!romsig) - return 0; + return 0; +} - if (strncmp(romsig, "CL1 Q", 7)) - goto unmap; - if (strncmp(romsig+6, romsig+13, 3)) { - printk(KERN_INFO "OLPC BIOS signature looks invalid. " - "Assuming not OLPC\n"); - goto unmap; - } +static int __init olpc_init(void) +{ + int r = 0; - printk(KERN_INFO "OLPC board with OpenFirmware %.16s\n", romsig); - olpc_platform_info.flags |= OLPC_F_PRESENT; + if (!olpc_ofw_present() || !platform_detect()) + return 0; - /* get the platform revision */ - platform_detect(); + spin_lock_init(&ec_lock); /* assume B1 and above models always have a DCON */ if (olpc_board_at_least(olpc_board(0xb1))) @@ -242,8 +257,10 @@ static int __init olpc_init(void) (unsigned char *) &olpc_platform_info.ecver, 1); #ifdef CONFIG_PCI_OLPC - /* If the VSA exists let it emulate PCI, if not emulate in kernel */ - if (!cs5535_has_vsa2()) + /* If the VSA exists let it emulate PCI, if not emulate in kernel. + * XO-1 only. */ + if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) && + !cs5535_has_vsa2()) x86_init.pci.arch_init = pci_olpc_init; #endif @@ -252,8 +269,12 @@ static int __init olpc_init(void) olpc_platform_info.boardrev >> 4, olpc_platform_info.ecver); -unmap: - iounmap(romsig); + if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */ + r = add_xo1_platform_devices(); + if (r) + return r; + } + return 0; } diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c index 3218aa71ab5..78732046437 100644 --- a/arch/x86/kernel/olpc_ofw.c +++ b/arch/x86/kernel/olpc_ofw.c @@ -74,6 +74,12 @@ int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, } EXPORT_SYMBOL_GPL(__olpc_ofw); +bool olpc_ofw_present(void) +{ + return olpc_ofw_cif != NULL; +} +EXPORT_SYMBOL_GPL(olpc_ofw_present); + /* OFW cif _should_ be above this address */ #define OFW_MIN 0xff000000 diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 0f7f130caa6..c562207b1b3 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -39,7 +39,7 @@ #include <asm/cacheflush.h> #include <asm/swiotlb.h> #include <asm/dma.h> -#include <asm/k8.h> +#include <asm/amd_nb.h> #include <asm/x86_init.h> static unsigned long iommu_bus_base; /* GART remapping area (physical) */ @@ -560,8 +560,11 @@ static void enable_gart_translations(void) { int i; - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + if (!k8_northbridges.gart_supported) + return; + + for (i = 0; i < k8_northbridges.num; i++) { + struct pci_dev *dev = k8_northbridges.nb_misc[i]; enable_gart_translation(dev, __pa(agp_gatt_table)); } @@ -592,16 +595,19 @@ static void gart_fixup_northbridges(struct sys_device *dev) if (!fix_up_north_bridges) return; + if (!k8_northbridges.gart_supported) + return; + pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + for (i = 0; i < k8_northbridges.num; i++) { + struct pci_dev *dev = k8_northbridges.nb_misc[i]; /* * Don't enable translations just yet. That is the next * step. Restore the pre-suspend aperture settings. */ - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); + gart_set_size_and_enable(dev, aperture_order); pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); } } @@ -649,8 +655,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) aper_size = aper_base = info->aper_size = 0; dev = NULL; - for (i = 0; i < num_k8_northbridges; i++) { - dev = k8_northbridges[i]; + for (i = 0; i < k8_northbridges.num; i++) { + dev = k8_northbridges.nb_misc[i]; new_aper_base = read_aperture(dev, &new_aper_size); if (!new_aper_base) goto nommu; @@ -718,10 +724,13 @@ static void gart_iommu_shutdown(void) if (!no_agp) return; - for (i = 0; i < num_k8_northbridges; i++) { + if (!k8_northbridges.gart_supported) + return; + + for (i = 0; i < k8_northbridges.num; i++) { u32 ctl; - dev = k8_northbridges[i]; + dev = k8_northbridges.nb_misc[i]; pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); ctl &= ~GARTEN; @@ -739,7 +748,7 @@ int __init gart_iommu_init(void) unsigned long scratch; long i; - if (num_k8_northbridges == 0) + if (!k8_northbridges.gart_supported) return 0; #ifndef CONFIG_AGP_AMD64 diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c deleted file mode 100644 index b112406f199..00000000000 --- a/arch/x86/kernel/pmtimer_64.c +++ /dev/null @@ -1,69 +0,0 @@ -/* Ported over from i386 by AK, original copyright was: - * - * (C) Dominik Brodowski <linux@brodo.de> 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - * - * Dropped all the hardware bug workarounds for now. Hopefully they - * are not needed on 64bit chipsets. - */ - -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/time.h> -#include <linux/init.h> -#include <linux/cpumask.h> -#include <linux/acpi_pmtmr.h> - -#include <asm/io.h> -#include <asm/proto.h> -#include <asm/msr.h> -#include <asm/vsyscall.h> - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. - */ - cycles *= 286; - return (cycles >> 10); -} - -static unsigned pmtimer_wait_tick(void) -{ - u32 a, b; - for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK; - a == b; - b = inl(pmtmr_ioport) & ACPI_PM_MASK) - cpu_relax(); - return b; -} - -/* note: wait time is rounded up to one tick */ -void pmtimer_wait(unsigned us) -{ - u32 a, b; - a = pmtimer_wait_tick(); - do { - b = inl(pmtmr_ioport); - cpu_relax(); - } while (cyc2us(b - a) < us); -} - -static int __init nopmtimer_setup(char *s) -{ - pmtmr_ioport = 0; - return 1; -} - -__setup("nopmtimer", nopmtimer_setup); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3d9ea531ddd..b3d7a3a04f3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -424,7 +424,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_TLS(next, cpu); /* Must be after DS reload */ - unlazy_fpu(prev_p); + __unlazy_fpu(prev_p); /* Make sure cpu is ready for new context */ if (preload_fpu) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e3af342fe83..7a4cf14223b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -84,7 +84,7 @@ static int __init reboot_setup(char *str) } /* we will leave sorting out the final value when we are ready to reboot, since we might not - have set up boot_cpu_id or smp_num_cpu */ + have detected BSP APIC ID or smp_num_cpu */ break; #endif /* CONFIG_SMP */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index feb4c21e499..a59f6a6df5e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -106,11 +106,12 @@ #include <asm/percpu.h> #include <asm/topology.h> #include <asm/apicdef.h> -#include <asm/k8.h> +#include <asm/amd_nb.h> #ifdef CONFIG_X86_64 #include <asm/numa_64.h> #endif #include <asm/mce.h> +#include <asm/alternative.h> /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -124,7 +125,6 @@ unsigned long max_pfn_mapped; RESERVE_BRK(dmi_alloc, 65536); #endif -unsigned int boot_cpu_id __read_mostly; static __initdata unsigned long _brk_start = (unsigned long)__brk_base; unsigned long _brk_end = (unsigned long)__brk_base; @@ -617,79 +617,7 @@ static __init void reserve_ibft_region(void) reserve_early_overlap_ok(addr, addr + size, "ibft"); } -#ifdef CONFIG_X86_RESERVE_LOW_64K -static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) -{ - printk(KERN_NOTICE - "%s detected: BIOS may corrupt low RAM, working around it.\n", - d->ident); - - e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - - return 0; -} -#endif - -/* List of systems that have known low memory corruption BIOS problems */ -static struct dmi_system_id __initdata bad_bios_dmi_table[] = { -#ifdef CONFIG_X86_RESERVE_LOW_64K - { - .callback = dmi_low_memory_corruption, - .ident = "AMI BIOS", - .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), - }, - }, - { - .callback = dmi_low_memory_corruption, - .ident = "Phoenix BIOS", - .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), - }, - }, - { - .callback = dmi_low_memory_corruption, - .ident = "Phoenix/MSC BIOS", - .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), - }, - }, - /* - * AMI BIOS with low memory corruption was found on Intel DG45ID and - * DG45FC boards. - * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will - * match only DMI_BOARD_NAME and see if there is more bad products - * with this vendor. - */ - { - .callback = dmi_low_memory_corruption, - .ident = "AMI BIOS", - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), - }, - }, - { - .callback = dmi_low_memory_corruption, - .ident = "AMI BIOS", - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), - }, - }, - /* - * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so - * match on the product name. - */ - { - .callback = dmi_low_memory_corruption, - .ident = "Phoenix BIOS", - .matches = { - DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"), - }, - }, -#endif - {} -}; +static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; static void __init trim_bios_range(void) { @@ -697,8 +625,14 @@ static void __init trim_bios_range(void) * A special case is the first 4Kb of memory; * This is a BIOS owned area, not kernel ram, but generally * not listed as such in the E820 table. + * + * This typically reserves additional memory (64KiB by default) + * since some BIOSes are known to corrupt low memory. See the + * Kconfig help text for X86_RESERVE_LOW. */ - e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE), + E820_RAM, E820_RESERVED); + /* * special case: Some BIOSen report the PC BIOS * area (640->1Mb) as ram even though it is not. @@ -708,6 +642,28 @@ static void __init trim_bios_range(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } +static int __init parse_reservelow(char *p) +{ + unsigned long long size; + + if (!p) + return -EINVAL; + + size = memparse(p, &p); + + if (size < 4096) + size = 4096; + + if (size > 640*1024) + size = 640*1024; + + reserve_low = size; + + return 0; +} + +early_param("reservelow", parse_reservelow); + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -725,6 +681,7 @@ void __init setup_arch(char **cmdline_p) { int acpi = 0; int k8 = 0; + unsigned long flags; #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); @@ -859,8 +816,6 @@ void __init setup_arch(char **cmdline_p) dmi_scan_machine(); - dmi_check_system(bad_bios_dmi_table); - /* * VMware detection requires dmi to be available, so this * needs to be done after dmi_scan_machine, for the BP. @@ -1067,6 +1022,10 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.banner(); mcheck_init(); + + local_irq_save(flags); + arch_init_ideal_nop5(); + local_irq_restore(flags); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index a60df9ae645..2335c15c93a 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -253,7 +253,7 @@ void __init setup_per_cpu_areas(void) * Up to this point, the boot CPU has been using .init.data * area. Reload any changed state for the boot CPU. */ - if (cpu == boot_cpu_id) + if (!cpu) switch_to_new_gdt(cpu); } diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c index cb22acf3ed0..dd4c281ffe5 100644 --- a/arch/x86/kernel/sfi.c +++ b/arch/x86/kernel/sfi.c @@ -34,7 +34,7 @@ #ifdef CONFIG_X86_LOCAL_APIC static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; -void __init mp_sfi_register_lapic_address(unsigned long address) +static void __init mp_sfi_register_lapic_address(unsigned long address) { mp_lapic_addr = address; @@ -46,7 +46,7 @@ void __init mp_sfi_register_lapic_address(unsigned long address) } /* All CPUs enumerated by SFI must be present and enabled */ -void __cpuinit mp_sfi_register_lapic(u8 id) +static void __cpuinit mp_sfi_register_lapic(u8 id) { if (MAX_APICS - id <= 0) { pr_warning("Processor #%d invalid (max %d)\n", diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 63a1a5596ac..a3df9f83003 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,6 +62,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mtrr.h> +#include <asm/mwait.h> #include <asm/apic.h> #include <asm/setup.h> #include <asm/uv/uv.h> @@ -395,6 +396,19 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } +static void __cpuinit link_thread_siblings(int cpu1, int cpu2) +{ + struct cpuinfo_x86 *c1 = &cpu_data(cpu1); + struct cpuinfo_x86 *c2 = &cpu_data(cpu2); + + cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); + cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); + cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); + cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); + cpumask_set_cpu(cpu1, c2->llc_shared_map); + cpumask_set_cpu(cpu2, c1->llc_shared_map); +} + void __cpuinit set_cpu_sibling_map(int cpu) { @@ -407,14 +421,13 @@ void __cpuinit set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { struct cpuinfo_x86 *o = &cpu_data(i); - if (c->phys_proc_id == o->phys_proc_id && - c->cpu_core_id == o->cpu_core_id) { - cpumask_set_cpu(i, cpu_sibling_mask(cpu)); - cpumask_set_cpu(cpu, cpu_sibling_mask(i)); - cpumask_set_cpu(i, cpu_core_mask(cpu)); - cpumask_set_cpu(cpu, cpu_core_mask(i)); - cpumask_set_cpu(i, c->llc_shared_map); - cpumask_set_cpu(cpu, o->llc_shared_map); + if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + if (c->phys_proc_id == o->phys_proc_id && + c->compute_unit_id == o->compute_unit_id) + link_thread_siblings(cpu, i); + } else if (c->phys_proc_id == o->phys_proc_id && + c->cpu_core_id == o->cpu_core_id) { + link_thread_siblings(cpu, i); } } } else { @@ -1381,11 +1394,88 @@ void play_dead_common(void) local_irq_disable(); } +/* + * We need to flush the caches before going to sleep, lest we have + * dirty data in our caches when we come back up. + */ +static inline void mwait_play_dead(void) +{ + unsigned int eax, ebx, ecx, edx; + unsigned int highest_cstate = 0; + unsigned int highest_subcstate = 0; + int i; + void *mwait_ptr; + + if (!cpu_has(¤t_cpu_data, X86_FEATURE_MWAIT)) + return; + if (!cpu_has(¤t_cpu_data, X86_FEATURE_CLFLSH)) + return; + if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return; + + eax = CPUID_MWAIT_LEAF; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + /* + * eax will be 0 if EDX enumeration is not valid. + * Initialized below to cstate, sub_cstate value when EDX is valid. + */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { + eax = 0; + } else { + edx >>= MWAIT_SUBSTATE_SIZE; + for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { + if (edx & MWAIT_SUBSTATE_MASK) { + highest_cstate = i; + highest_subcstate = edx & MWAIT_SUBSTATE_MASK; + } + } + eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | + (highest_subcstate - 1); + } + + /* + * This should be a memory location in a cache line which is + * unlikely to be touched by other processors. The actual + * content is immaterial as it is not actually modified in any way. + */ + mwait_ptr = ¤t_thread_info()->flags; + + wbinvd(); + + while (1) { + /* + * The CLFLUSH is a workaround for erratum AAI65 for + * the Xeon 7400 series. It's not clear it is actually + * needed, but it should be harmless in either case. + * The WBINVD is insufficient due to the spurious-wakeup + * case where we return around the loop. + */ + clflush(mwait_ptr); + __monitor(mwait_ptr, 0, 0); + mb(); + __mwait(eax, 0); + } +} + +static inline void hlt_play_dead(void) +{ + if (current_cpu_data.x86 >= 4) + wbinvd(); + + while (1) { + native_halt(); + } +} + void native_play_dead(void) { play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); - wbinvd_halt(); + + mwait_play_dead(); /* Only returns on failure */ + hlt_play_dead(); } #else /* ... !CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index d5e06624e34..0b0cb5fede1 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -33,8 +33,8 @@ int kernel_execve(const char *filename, const char *const envp[]) { long __res; - asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" + asm volatile ("int $0x80" : "=a" (__res) - : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory"); + : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory"); return __res; } diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index a874495b367..e2a59525739 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -45,8 +45,7 @@ void __init setup_trampoline_page_table(void) /* Copy kernel address range */ clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min_t(unsigned long, KERNEL_PGD_PTRS, - KERNEL_PGD_BOUNDARY)); + KERNEL_PGD_PTRS); /* Initialize low mappings */ clone_pgd_range(trampoline_pg_dir, diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 60788dee0f8..d43968503dd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -776,21 +776,10 @@ asmlinkage void math_state_restore(void) } EXPORT_SYMBOL_GPL(math_state_restore); -#ifndef CONFIG_MATH_EMULATION -void math_emulate(struct math_emu_info *info) -{ - printk(KERN_EMERG - "math-emulation not enabled and no coprocessor found.\n"); - printk(KERN_EMERG "killing %s.\n", current->comm); - force_sig(SIGFPE, current); - schedule(); -} -#endif /* CONFIG_MATH_EMULATION */ - dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error_code) { -#ifdef CONFIG_X86_32 +#ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; @@ -798,12 +787,12 @@ do_device_not_available(struct pt_regs *regs, long error_code) info.regs = regs; math_emulate(&info); - } else { - math_state_restore(); /* interrupts still off */ - conditional_sti(regs); + return; } -#else - math_state_restore(); +#endif + math_state_restore(); /* interrupts still off */ +#ifdef CONFIG_X86_32 + conditional_sti(regs); #endif } @@ -881,18 +870,6 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - if (cpu_has_fxsr) { - printk(KERN_INFO "Enabling fast FPU save and restore... "); - set_in_cr4(X86_CR4_OSFXSR); - printk("done.\n"); - } - if (cpu_has_xmm) { - printk(KERN_INFO - "Enabling unmasked SIMD FPU exception support... "); - set_in_cr4(X86_CR4_OSXMMEXCPT); - printk("done.\n"); - } - set_system_trap_gate(SYSCALL_VECTOR, &system_call); set_bit(SYSCALL_VECTOR, used_vectors); #endif diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index ce8e5023933..0c40d8b7241 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -104,10 +104,14 @@ int __init notsc_setup(char *str) __setup("notsc", notsc_setup); +static int no_sched_irq_time; + static int __init tsc_setup(char *str) { if (!strcmp(str, "reliable")) tsc_clocksource_reliable = 1; + if (!strncmp(str, "noirqtime", 9)) + no_sched_irq_time = 1; return 1; } @@ -626,6 +630,44 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) local_irq_restore(flags); } +static unsigned long long cyc2ns_suspend; + +void save_sched_clock_state(void) +{ + if (!sched_clock_stable) + return; + + cyc2ns_suspend = sched_clock(); +} + +/* + * Even on processors with invariant TSC, TSC gets reset in some the + * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to + * arbitrary value (still sync'd across cpu's) during resume from such sleep + * states. To cope up with this, recompute the cyc2ns_offset for each cpu so + * that sched_clock() continues from the point where it was left off during + * suspend. + */ +void restore_sched_clock_state(void) +{ + unsigned long long offset; + unsigned long flags; + int cpu; + + if (!sched_clock_stable) + return; + + local_irq_save(flags); + + __get_cpu_var(cyc2ns_offset) = 0; + offset = cyc2ns_suspend - sched_clock(); + + for_each_possible_cpu(cpu) + per_cpu(cyc2ns_offset, cpu) = offset; + + local_irq_restore(flags); +} + #ifdef CONFIG_CPU_FREQ /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency @@ -763,6 +805,7 @@ void mark_tsc_unstable(char *reason) if (!tsc_unstable) { tsc_unstable = 1; sched_clock_stable = 0; + disable_sched_clock_irqtime(); printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) @@ -854,60 +897,6 @@ static void __init init_tsc_clocksource(void) clocksource_register_khz(&clocksource_tsc, tsc_khz); } -#ifdef CONFIG_X86_64 -/* - * calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency - */ -#define TICK_COUNT 100000000 -static unsigned long __init calibrate_cpu(void) -{ - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " - "cpu_khz value may be incorrect.\n"); - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start measuring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); -} -#else -static inline unsigned long calibrate_cpu(void) { return cpu_khz; } -#endif - void __init tsc_init(void) { u64 lpj; @@ -926,10 +915,6 @@ void __init tsc_init(void) return; } - if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && - (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) - cpu_khz = calibrate_cpu(); - printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); @@ -949,6 +934,9 @@ void __init tsc_init(void) /* now allow native_sched_clock() to use rdtsc */ tsc_disabled = 0; + if (!no_sched_irq_time) + enable_sched_clock_irqtime(); + lpj = ((u64)tsc_khz * 1000); do_div(lpj, HZ); lpj_fine = lpj; |