summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c170
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/alternative.c122
-rw-r--r--arch/x86/kernel/amd_iommu.c45
-rw-r--r--arch/x86/kernel/amd_iommu_init.c53
-rw-r--r--arch/x86/kernel/apb_timer.c785
-rw-r--r--arch/x86/kernel/aperture_64.c15
-rw-r--r--arch/x86/kernel/apic/apic.c30
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c7
-rw-r--r--arch/x86/kernel/apic/es7000_32.c1
-rw-r--r--arch/x86/kernel/apic/io_apic.c358
-rw-r--r--arch/x86/kernel/apic/nmi.c15
-rw-r--r--arch/x86/kernel/apic/numaq_32.c3
-rw-r--r--arch/x86/kernel/apic/probe_32.c29
-rw-r--r--arch/x86/kernel/apic/probe_64.c13
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c132
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/bios_uv.c39
-rw-r--r--arch/x86/kernel/bootflag.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c4
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c688
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig14
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/gx-suspmod.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c621
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c1
-rw-r--r--arch/x86/kernel/cpu/intel.c3
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c252
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c21
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c5
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mtrr/amd.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c208
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c11
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c7
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h6
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c94
-rw-r--r--arch/x86/kernel/cpu/perf_event.c1968
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c422
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c980
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c159
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c13
-rw-r--r--arch/x86/kernel/cpu/vmware.c2
-rw-r--r--arch/x86/kernel/cpuid.c3
-rw-r--r--arch/x86/kernel/crash.c6
-rw-r--r--arch/x86/kernel/crash_dump_32.c1
-rw-r--r--arch/x86/kernel/dumpstack.c14
-rw-r--r--arch/x86/kernel/dumpstack.h24
-rw-r--r--arch/x86/kernel/dumpstack_32.c5
-rw-r--r--arch/x86/kernel/dumpstack_64.c20
-rw-r--r--arch/x86/kernel/e820.c379
-rw-r--r--arch/x86/kernel/efi.c2
-rw-r--r--arch/x86/kernel/ftrace.c36
-rw-r--r--arch/x86/kernel/head32.c14
-rw-r--r--arch/x86/kernel/head64.c3
-rw-r--r--arch/x86/kernel/head_32.S6
-rw-r--r--arch/x86/kernel/head_64.S2
-rw-r--r--arch/x86/kernel/hpet.c20
-rw-r--r--arch/x86/kernel/hw_breakpoint.c38
-rw-r--r--arch/x86/kernel/i387.c72
-rw-r--r--arch/x86/kernel/i8259.c95
-rw-r--r--arch/x86/kernel/irqinit.c59
-rw-r--r--arch/x86/kernel/k8.c16
-rw-r--r--arch/x86/kernel/kdebugfs.c1
-rw-r--r--arch/x86/kernel/kgdb.c222
-rw-r--r--arch/x86/kernel/kprobes.c614
-rw-r--r--arch/x86/kernel/ldt.c1
-rw-r--r--arch/x86/kernel/machine_kexec_64.c1
-rw-r--r--arch/x86/kernel/mca_32.c1
-rw-r--r--arch/x86/kernel/microcode_amd.c44
-rw-r--r--arch/x86/kernel/microcode_core.c6
-rw-r--r--arch/x86/kernel/microcode_intel.c2
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c7
-rw-r--r--arch/x86/kernel/module.c1
-rw-r--r--arch/x86/kernel/mpparse.c11
-rw-r--r--arch/x86/kernel/mrst.c216
-rw-r--r--arch/x86/kernel/msr.c3
-rw-r--r--arch/x86/kernel/olpc.c10
-rw-r--r--arch/x86/kernel/paravirt.c4
-rw-r--r--arch/x86/kernel/pci-calgary_64.c2
-rw-r--r--arch/x86/kernel/pci-dma.c16
-rw-r--r--arch/x86/kernel/pci-gart_64.c6
-rw-r--r--arch/x86/kernel/pci-nommu.c1
-rw-r--r--arch/x86/kernel/process.c52
-rw-r--r--arch/x86/kernel/process_32.c14
-rw-r--r--arch/x86/kernel/process_64.c36
-rw-r--r--arch/x86/kernel/ptrace.c68
-rw-r--r--arch/x86/kernel/quirks.c13
-rw-r--r--arch/x86/kernel/reboot.c17
-rw-r--r--arch/x86/kernel/setup.c70
-rw-r--r--arch/x86/kernel/setup_percpu.c6
-rw-r--r--arch/x86/kernel/smp.c1
-rw-r--r--arch/x86/kernel/smpboot.c28
-rw-r--r--arch/x86/kernel/sys_i386_32.c185
-rw-r--r--arch/x86/kernel/sys_x86_64.c12
-rw-r--r--arch/x86/kernel/syscall_table_32.S4
-rw-r--r--arch/x86/kernel/time.c4
-rw-r--r--arch/x86/kernel/tlb_uv.c1
-rw-r--r--arch/x86/kernel/traps.c3
-rw-r--r--arch/x86/kernel/tsc.c6
-rw-r--r--arch/x86/kernel/uv_irq.c1
-rw-r--r--arch/x86/kernel/uv_sysfs.c6
-rw-r--r--arch/x86/kernel/uv_time.c14
-rw-r--r--arch/x86/kernel/visws_quirks.c27
-rw-r--r--arch/x86/kernel/vmi_32.c36
-rw-r--r--arch/x86/kernel/vmiclock_32.c8
-rw-r--r--arch/x86/kernel/vmlinux.lds.S6
-rw-r--r--arch/x86/kernel/vsyscall_64.c3
-rw-r--r--arch/x86/kernel/x86_init.c11
-rw-r--r--arch/x86/kernel/xsave.c1
126 files changed, 6079 insertions, 3876 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d87f09bc5a5..4c58352209e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -87,6 +87,7 @@ obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
+obj-$(CONFIG_APB_TIMER) += apb_timer.o
obj-$(CONFIG_K8_NB) += k8.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index fb1035cd9a6..cd40aba6aa9 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -31,10 +31,12 @@
#include <linux/module.h>
#include <linux/dmi.h>
#include <linux/irq.h>
+#include <linux/slab.h>
#include <linux/bootmem.h>
#include <linux/ioport.h>
#include <linux/pci.h>
+#include <asm/pci_x86.h>
#include <asm/pgtable.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
@@ -49,6 +51,7 @@ EXPORT_SYMBOL(acpi_disabled);
#ifdef CONFIG_X86_64
# include <asm/proto.h>
+# include <asm/numa_64.h>
#endif /* X86 */
#define BAD_MADT_ENTRY(entry, end) ( \
@@ -446,6 +449,12 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
{
*irq = gsi;
+
+#ifdef CONFIG_X86_IO_APIC
+ if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
+ setup_IO_APIC_irq_extra(gsi);
+#endif
+
return 0;
}
@@ -473,7 +482,8 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
}
#endif
- acpi_gsi_to_irq(plat_gsi, &irq);
+ irq = plat_gsi;
+
return irq;
}
@@ -481,6 +491,26 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
* ACPI based hotplug support for CPU
*/
#ifdef CONFIG_ACPI_HOTPLUG_CPU
+#include <acpi/processor.h>
+
+static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+{
+#ifdef CONFIG_ACPI_NUMA
+ int nid;
+
+ nid = acpi_get_node(handle);
+ if (nid == -1 || !node_online(nid))
+ return;
+#ifdef CONFIG_X86_64
+ apicid_to_node[physid] = nid;
+ numa_set_node(cpu, nid);
+#else /* CONFIG_X86_32 */
+ apicid_2_node[physid] = nid;
+ cpu_to_node_map[cpu] = nid;
+#endif
+
+#endif
+}
static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
{
@@ -539,7 +569,10 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
goto free_new_map;
}
+ acpi_processor_set_pdc(handle);
+
cpu = cpumask_first(new_map);
+ acpi_map_cpu2node(handle, cpu, physid);
*pcpu = cpu;
retval = 0;
@@ -1185,9 +1218,6 @@ static void __init acpi_process_madt(void)
if (!error) {
acpi_lapic = 1;
-#ifdef CONFIG_X86_BIGSMP
- generic_bigsmp_probe();
-#endif
/*
* Parse MADT IO-APIC entries
*/
@@ -1197,8 +1227,6 @@ static void __init acpi_process_madt(void)
acpi_ioapic = 1;
smp_found_config = 1;
- if (apic->setup_apic_routing)
- apic->setup_apic_routing();
}
}
if (error == -EINVAL) {
@@ -1269,23 +1297,6 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
}
/*
- * Limit ACPI to CPU enumeration for HT
- */
-static int __init force_acpi_ht(const struct dmi_system_id *d)
-{
- if (!acpi_force) {
- printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
- d->ident);
- disable_acpi();
- acpi_ht = 1;
- } else {
- printk(KERN_NOTICE
- "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
- }
- return 0;
-}
-
-/*
* Force ignoring BIOS IRQ0 pin2 override
*/
static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
@@ -1321,90 +1332,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
},
/*
- * Boxes that need acpi=ht
- */
- {
- .callback = force_acpi_ht,
- .ident = "FSC Primergy T850",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
- DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "HP VISUALIZE NT Workstation",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
- DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "Compaq Workstation W8000",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "ASUS P2B-DS",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
- DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "ASUS CUR-DLS",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
- DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "ABIT i440BX-W83977",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
- DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "IBM Bladecenter",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "IBM eServer xSeries 360",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "IBM eserver xSeries 330",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "IBM eserver xSeries 440",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
- },
- },
-
- /*
* Boxes that need ACPI PCI IRQ routing disabled
*/
{
@@ -1529,16 +1456,10 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
* if acpi_blacklisted() acpi_disabled = 1;
* acpi_irq_model=...
* ...
- *
- * return value: (currently ignored)
- * 0: success
- * !0: failure
*/
-int __init acpi_boot_table_init(void)
+void __init acpi_boot_table_init(void)
{
- int error;
-
dmi_check_system(acpi_dmi_table);
/*
@@ -1546,15 +1467,14 @@ int __init acpi_boot_table_init(void)
* One exception: acpi=ht continues far enough to enumerate LAPICs
*/
if (acpi_disabled && !acpi_ht)
- return 1;
+ return;
/*
* Initialize the ACPI boot-time table parser.
*/
- error = acpi_table_init();
- if (error) {
+ if (acpi_table_init()) {
disable_acpi();
- return error;
+ return;
}
acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1562,18 +1482,15 @@ int __init acpi_boot_table_init(void)
/*
* blacklist may disable ACPI entirely
*/
- error = acpi_blacklisted();
- if (error) {
+ if (acpi_blacklisted()) {
if (acpi_force) {
printk(KERN_WARNING PREFIX "acpi=force override\n");
} else {
printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
disable_acpi();
- return error;
+ return;
}
}
-
- return 0;
}
int __init early_acpi_boot_init(void)
@@ -1619,6 +1536,9 @@ int __init acpi_boot_init(void)
acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
+ if (!acpi_noirq)
+ x86_init.pci.init = pci_acpi_init;
+
return 0;
}
@@ -1643,8 +1563,10 @@ static int __init parse_acpi(char *arg)
}
/* Limit ACPI just to boot-time to enable HT */
else if (strcmp(arg, "ht") == 0) {
- if (!acpi_force)
+ if (!acpi_force) {
+ printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");
disable_acpi();
+ }
acpi_ht = 1;
}
/* acpi=rsdt use RSDT instead of XSDT */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 82e508677b9..f9961034e55 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str)
#endif
if (strncmp(str, "old_ordering", 12) == 0)
acpi_old_suspend_ordering();
+ if (strncmp(str, "sci_force_enable", 16) == 0)
+ acpi_set_sci_en_on_resume();
str = strchr(str, ',');
if (str != NULL)
str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 80b222ea4cf..70237732a6c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,8 @@
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
+#include <linux/stop_machine.h>
+#include <linux/slab.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
@@ -192,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
}
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern u8 *__smp_locks[], *__smp_locks_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
static void *text_poke_early(void *addr, const void *opcode, size_t len);
/* Replace instructions with better alternatives for this CPU type.
@@ -233,39 +235,41 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
#ifdef CONFIG_SMP
-static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_lock(const s32 *start, const s32 *end,
+ u8 *text, u8 *text_end)
{
- u8 **ptr;
+ const s32 *poff;
mutex_lock(&text_mutex);
- for (ptr = start; ptr < end; ptr++) {
- if (*ptr < text)
- continue;
- if (*ptr > text_end)
+ for (poff = start; poff < end; poff++) {
+ u8 *ptr = (u8 *)poff + *poff;
+
+ if (!*poff || ptr < text || ptr >= text_end)
continue;
/* turn DS segment override prefix into lock prefix */
- if (**ptr == 0x3e)
- text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+ if (*ptr == 0x3e)
+ text_poke(ptr, ((unsigned char []){0xf0}), 1);
};
mutex_unlock(&text_mutex);
}
-static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_unlock(const s32 *start, const s32 *end,
+ u8 *text, u8 *text_end)
{
- u8 **ptr;
+ const s32 *poff;
if (noreplace_smp)
return;
mutex_lock(&text_mutex);
- for (ptr = start; ptr < end; ptr++) {
- if (*ptr < text)
- continue;
- if (*ptr > text_end)
+ for (poff = start; poff < end; poff++) {
+ u8 *ptr = (u8 *)poff + *poff;
+
+ if (!*poff || ptr < text || ptr >= text_end)
continue;
/* turn lock prefix into DS segment override prefix */
- if (**ptr == 0xf0)
- text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+ if (*ptr == 0xf0)
+ text_poke(ptr, ((unsigned char []){0x3E}), 1);
};
mutex_unlock(&text_mutex);
}
@@ -276,8 +280,8 @@ struct smp_alt_module {
char *name;
/* ptrs to lock prefixes */
- u8 **locks;
- u8 **locks_end;
+ const s32 *locks;
+ const s32 *locks_end;
/* .text segment, needed to avoid patching init code ;) */
u8 *text;
@@ -394,6 +398,27 @@ void alternatives_smp_switch(int smp)
mutex_unlock(&smp_alt);
}
+/* Return 1 if the address range is reserved for smp-alternatives */
+int alternatives_text_reserved(void *start, void *end)
+{
+ struct smp_alt_module *mod;
+ const s32 *poff;
+ u8 *text_start = start;
+ u8 *text_end = end;
+
+ list_for_each_entry(mod, &smp_alt_modules, next) {
+ if (mod->text > text_end || mod->text_end < text_start)
+ continue;
+ for (poff = mod->locks; poff < mod->locks_end; poff++) {
+ const u8 *ptr = (const u8 *)poff + *poff;
+
+ if (text_start <= ptr && text_end > ptr)
+ return 1;
+ }
+ }
+
+ return 0;
+}
#endif
#ifdef CONFIG_PARAVIRT
@@ -556,3 +581,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
local_irq_restore(flags);
return addr;
}
+
+/*
+ * Cross-modifying kernel text with stop_machine().
+ * This code originally comes from immediate value.
+ */
+static atomic_t stop_machine_first;
+static int wrote_text;
+
+struct text_poke_params {
+ void *addr;
+ const void *opcode;
+ size_t len;
+};
+
+static int __kprobes stop_machine_text_poke(void *data)
+{
+ struct text_poke_params *tpp = data;
+
+ if (atomic_dec_and_test(&stop_machine_first)) {
+ text_poke(tpp->addr, tpp->opcode, tpp->len);
+ smp_wmb(); /* Make sure other cpus see that this has run */
+ wrote_text = 1;
+ } else {
+ while (!wrote_text)
+ cpu_relax();
+ smp_mb(); /* Load wrote_text before following execution */
+ }
+
+ flush_icache_range((unsigned long)tpp->addr,
+ (unsigned long)tpp->addr + tpp->len);
+ return 0;
+}
+
+/**
+ * text_poke_smp - Update instructions on a live kernel on SMP
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Modify multi-byte instruction by using stop_machine() on SMP. This allows
+ * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
+ * should be allowed, since stop_machine() does _not_ protect code against
+ * NMI and MCE.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
+{
+ struct text_poke_params tpp;
+
+ tpp.addr = addr;
+ tpp.opcode = opcode;
+ tpp.len = len;
+ atomic_set(&stop_machine_first, 1);
+ wrote_text = 0;
+ stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+ return addr;
+}
+
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 23824fef789..f854d89b7ed 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -18,8 +18,8 @@
*/
#include <linux/pci.h>
-#include <linux/gfp.h>
#include <linux/bitmap.h>
+#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>
@@ -118,7 +118,7 @@ static bool check_device(struct device *dev)
return false;
/* No device or no PCI device */
- if (!dev || dev->bus != &pci_bus_type)
+ if (dev->bus != &pci_bus_type)
return false;
devid = get_device_id(dev);
@@ -392,6 +392,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
u32 tail, head;
u8 *target;
+ WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
target = iommu->cmd_buf + tail;
memcpy_toio(target, cmd, sizeof(*cmd));
@@ -980,7 +981,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
{
int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
struct amd_iommu *iommu;
- int i;
+ unsigned long i;
#ifdef CONFIG_IOMMU_STRESS
populate = false;
@@ -1489,11 +1490,14 @@ static void __detach_device(struct device *dev)
{
struct iommu_dev_data *dev_data = get_dev_data(dev);
struct iommu_dev_data *alias_data;
+ struct protection_domain *domain;
unsigned long flags;
BUG_ON(!dev_data->domain);
- spin_lock_irqsave(&dev_data->domain->lock, flags);
+ domain = dev_data->domain;
+
+ spin_lock_irqsave(&domain->lock, flags);
if (dev_data->alias != dev) {
alias_data = get_dev_data(dev_data->alias);
@@ -1504,13 +1508,15 @@ static void __detach_device(struct device *dev)
if (atomic_dec_and_test(&dev_data->bind))
do_detach(dev);
- spin_unlock_irqrestore(&dev_data->domain->lock, flags);
+ spin_unlock_irqrestore(&domain->lock, flags);
/*
* If we run in passthrough mode the device must be assigned to the
- * passthrough domain if it is detached from any other domain
+ * passthrough domain if it is detached from any other domain.
+ * Make sure we can deassign from the pt_domain itself.
*/
- if (iommu_pass_through && dev_data->domain == NULL)
+ if (iommu_pass_through &&
+ (dev_data->domain == NULL && domain != pt_domain))
__attach_device(dev, pt_domain);
}
@@ -2181,7 +2187,7 @@ static void prealloc_protection_domains(void)
struct dma_ops_domain *dma_dom;
u16 devid;
- while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+ for_each_pci_dev(dev) {
/* Do we handle this device? */
if (!check_device(&dev->dev))
@@ -2218,6 +2224,12 @@ static struct dma_map_ops amd_iommu_dma_ops = {
/*
* The function which clues the AMD IOMMU driver into dma_ops.
*/
+
+void __init amd_iommu_init_api(void)
+{
+ register_iommu(&amd_iommu_ops);
+}
+
int __init amd_iommu_init_dma_ops(void)
{
struct amd_iommu *iommu;
@@ -2253,8 +2265,6 @@ int __init amd_iommu_init_dma_ops(void)
/* Make the driver finally visible to the drivers */
dma_ops = &amd_iommu_dma_ops;
- register_iommu(&amd_iommu_ops);
-
amd_iommu_stats_init();
return 0;
@@ -2289,7 +2299,7 @@ static void cleanup_domain(struct protection_domain *domain)
list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
struct device *dev = dev_data->dev;
- do_detach(dev);
+ __detach_device(dev);
atomic_set(&dev_data->bind, 0);
}
@@ -2318,6 +2328,7 @@ static struct protection_domain *protection_domain_alloc(void)
return NULL;
spin_lock_init(&domain->lock);
+ mutex_init(&domain->api_lock);
domain->id = domain_id_alloc();
if (!domain->id)
goto out_err;
@@ -2370,9 +2381,7 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
free_pagetable(domain);
- domain_id_free(domain->id);
-
- kfree(domain);
+ protection_domain_free(domain);
dom->priv = NULL;
}
@@ -2447,6 +2456,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
iova &= PAGE_MASK;
paddr &= PAGE_MASK;
+ mutex_lock(&domain->api_lock);
+
for (i = 0; i < npages; ++i) {
ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
if (ret)
@@ -2456,6 +2467,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
paddr += PAGE_SIZE;
}
+ mutex_unlock(&domain->api_lock);
+
return 0;
}
@@ -2468,12 +2481,16 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
iova &= PAGE_MASK;
+ mutex_lock(&domain->api_lock);
+
for (i = 0; i < npages; ++i) {
iommu_unmap_page(domain, iova, PM_MAP_4k);
iova += PAGE_SIZE;
}
iommu_flush_tlb_pde(domain);
+
+ mutex_unlock(&domain->api_lock);
}
static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 1dca9c34eae..6360abf993d 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -19,8 +19,8 @@
#include <linux/pci.h>
#include <linux/acpi.h>
-#include <linux/gfp.h>
#include <linux/list.h>
+#include <linux/slab.h>
#include <linux/sysdev.h>
#include <linux/interrupt.h>
#include <linux/msi.h>
@@ -138,6 +138,11 @@ int amd_iommus_present;
bool amd_iommu_np_cache __read_mostly;
/*
+ * The ACPI table parsing functions set this variable on an error
+ */
+static int __initdata amd_iommu_init_err;
+
+/*
* List of protection domains - used during resume
*/
LIST_HEAD(amd_iommu_pd_list);
@@ -386,9 +391,11 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
*/
for (i = 0; i < table->length; ++i)
checksum += p[i];
- if (checksum != 0)
+ if (checksum != 0) {
/* ACPI table corrupt */
- return -ENODEV;
+ amd_iommu_init_err = -ENODEV;
+ return 0;
+ }
p += IVRS_HEADER_LENGTH;
@@ -431,7 +438,7 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
if (cmd_buf == NULL)
return NULL;
- iommu->cmd_buf_size = CMD_BUFFER_SIZE;
+ iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
return cmd_buf;
}
@@ -467,12 +474,13 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
&entry, sizeof(entry));
amd_iommu_reset_cmd_buffer(iommu);
+ iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
}
static void __init free_command_buffer(struct amd_iommu *iommu)
{
free_pages((unsigned long)iommu->cmd_buf,
- get_order(iommu->cmd_buf_size));
+ get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
}
/* allocates the memory where the IOMMU will log its events to */
@@ -915,11 +923,16 @@ static int __init init_iommu_all(struct acpi_table_header *table)
h->mmio_phys);
iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
- if (iommu == NULL)
- return -ENOMEM;
+ if (iommu == NULL) {
+ amd_iommu_init_err = -ENOMEM;
+ return 0;
+ }
+
ret = init_iommu_one(iommu, h);
- if (ret)
- return ret;
+ if (ret) {
+ amd_iommu_init_err = ret;
+ return 0;
+ }
break;
default:
break;
@@ -1204,6 +1217,10 @@ static int __init amd_iommu_init(void)
if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
return -ENODEV;
+ ret = amd_iommu_init_err;
+ if (ret)
+ goto out;
+
dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
@@ -1263,9 +1280,19 @@ static int __init amd_iommu_init(void)
if (acpi_table_parse("IVRS", init_iommu_all) != 0)
goto free;
+ if (amd_iommu_init_err) {
+ ret = amd_iommu_init_err;
+ goto free;
+ }
+
if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
goto free;
+ if (amd_iommu_init_err) {
+ ret = amd_iommu_init_err;
+ goto free;
+ }
+
ret = sysdev_class_register(&amd_iommu_sysdev_class);
if (ret)
goto free;
@@ -1278,16 +1305,19 @@ static int __init amd_iommu_init(void)
if (ret)
goto free;
+ enable_iommus();
+
if (iommu_pass_through)
ret = amd_iommu_init_passthrough();
else
ret = amd_iommu_init_dma_ops();
+
if (ret)
goto free;
- amd_iommu_init_notifier();
+ amd_iommu_init_api();
- enable_iommus();
+ amd_iommu_init_notifier();
if (iommu_pass_through)
goto out;
@@ -1302,6 +1332,7 @@ out:
return ret;
free:
+ disable_iommus();
amd_iommu_uninit_devices();
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
new file mode 100644
index 00000000000..ff469e47005
--- /dev/null
+++ b/arch/x86/kernel/apb_timer.c
@@ -0,0 +1,785 @@
+/*
+ * apb_timer.c: Driver for Langwell APB timers
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Note:
+ * Langwell is the south complex of Intel Moorestown MID platform. There are
+ * eight external timers in total that can be used by the operating system.
+ * The timer information, such as frequency and addresses, is provided to the
+ * OS via SFI tables.
+ * Timer interrupts are routed via FW/HW emulated IOAPIC independently via
+ * individual redirection table entries (RTE).
+ * Unlike HPET, there is no master counter, therefore one of the timers are
+ * used as clocksource. The overall allocation looks like:
+ * - timer 0 - NR_CPUs for per cpu timer
+ * - one timer for clocksource
+ * - one timer for watchdog driver.
+ * It is also worth notice that APB timer does not support true one-shot mode,
+ * free-running mode will be used here to emulate one-shot mode.
+ * APB timer can also be used as broadcast timer along with per cpu local APIC
+ * timer, but by default APB timer has higher rating than local APIC timers.
+ */
+
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/sysdev.h>
+#include <linux/slab.h>
+#include <linux/pm.h>
+#include <linux/pci.h>
+#include <linux/sfi.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/irq.h>
+
+#include <asm/fixmap.h>
+#include <asm/apb_timer.h>
+
+#define APBT_MASK CLOCKSOURCE_MASK(32)
+#define APBT_SHIFT 22
+#define APBT_CLOCKEVENT_RATING 150
+#define APBT_CLOCKSOURCE_RATING 250
+#define APBT_MIN_DELTA_USEC 200
+
+#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
+#define APBT_CLOCKEVENT0_NUM (0)
+#define APBT_CLOCKEVENT1_NUM (1)
+#define APBT_CLOCKSOURCE_NUM (2)
+
+static unsigned long apbt_address;
+static int apb_timer_block_enabled;
+static void __iomem *apbt_virt_address;
+static int phy_cs_timer_id;
+
+/*
+ * Common DW APB timer info
+ */
+static uint64_t apbt_freq;
+
+static void apbt_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt);
+static int apbt_next_event(unsigned long delta,
+ struct clock_event_device *evt);
+static cycle_t apbt_read_clocksource(struct clocksource *cs);
+static void apbt_restart_clocksource(struct clocksource *cs);
+
+struct apbt_dev {
+ struct clock_event_device evt;
+ unsigned int num;
+ int cpu;
+ unsigned int irq;
+ unsigned int tick;
+ unsigned int count;
+ unsigned int flags;
+ char name[10];
+};
+
+int disable_apbt_percpu __cpuinitdata;
+
+static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
+
+#ifdef CONFIG_SMP
+static unsigned int apbt_num_timers_used;
+static struct apbt_dev *apbt_devs;
+#endif
+
+static inline unsigned long apbt_readl_reg(unsigned long a)
+{
+ return readl(apbt_virt_address + a);
+}
+
+static inline void apbt_writel_reg(unsigned long d, unsigned long a)
+{
+ writel(d, apbt_virt_address + a);
+}
+
+static inline unsigned long apbt_readl(int n, unsigned long a)
+{
+ return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
+}
+
+static inline void apbt_writel(int n, unsigned long d, unsigned long a)
+{
+ writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE);
+}
+
+static inline void apbt_set_mapping(void)
+{
+ struct sfi_timer_table_entry *mtmr;
+
+ if (apbt_virt_address) {
+ pr_debug("APBT base already mapped\n");
+ return;
+ }
+ mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
+ if (mtmr == NULL) {
+ printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
+ APBT_CLOCKEVENT0_NUM);
+ return;
+ }
+ apbt_address = (unsigned long)mtmr->phys_addr;
+ if (!apbt_address) {
+ printk(KERN_WARNING "No timer base from SFI, use default\n");
+ apbt_address = APBT_DEFAULT_BASE;
+ }
+ apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
+ if (apbt_virt_address) {
+ pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\
+ (void *)apbt_address, (void *)apbt_virt_address);
+ } else {
+ pr_debug("Failed mapping APBT phy address at %p\n",\
+ (void *)apbt_address);
+ goto panic_noapbt;
+ }
+ apbt_freq = mtmr->freq_hz / USEC_PER_SEC;
+ sfi_free_mtmr(mtmr);
+
+ /* Now figure out the physical timer id for clocksource device */
+ mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM);
+ if (mtmr == NULL)
+ goto panic_noapbt;
+
+ /* Now figure out the physical timer id */
+ phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff)
+ / APBTMRS_REG_SIZE;
+ pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id);
+ return;
+
+panic_noapbt:
+ panic("Failed to setup APB system timer\n");
+
+}
+
+static inline void apbt_clear_mapping(void)
+{
+ iounmap(apbt_virt_address);
+ apbt_virt_address = NULL;
+}
+
+/*
+ * APBT timer interrupt enable / disable
+ */
+static inline int is_apbt_capable(void)
+{
+ return apbt_virt_address ? 1 : 0;
+}
+
+static struct clocksource clocksource_apbt = {
+ .name = "apbt",
+ .rating = APBT_CLOCKSOURCE_RATING,
+ .read = apbt_read_clocksource,
+ .mask = APBT_MASK,
+ .shift = APBT_SHIFT,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .resume = apbt_restart_clocksource,
+};
+
+/* boot APB clock event device */
+static struct clock_event_device apbt_clockevent = {
+ .name = "apbt0",
+ .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
+ .set_mode = apbt_set_mode,
+ .set_next_event = apbt_next_event,
+ .shift = APBT_SHIFT,
+ .irq = 0,
+ .rating = APBT_CLOCKEVENT_RATING,
+};
+
+/*
+ * if user does not want to use per CPU apb timer, just give it a lower rating
+ * than local apic timer and skip the late per cpu timer init.
+ */
+static inline int __init setup_x86_mrst_timer(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp("apbt_only", arg) == 0)
+ disable_apbt_percpu = 0;
+ else if (strcmp("lapic_and_apbt", arg) == 0)
+ disable_apbt_percpu = 1;
+ else {
+ pr_warning("X86 MRST timer option %s not recognised"
+ " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
+ arg);
+ return -EINVAL;
+ }
+ return 0;
+}
+__setup("x86_mrst_timer=", setup_x86_mrst_timer);
+
+/*
+ * start count down from 0xffff_ffff. this is done by toggling the enable bit
+ * then load initial load count to ~0.
+ */
+static void apbt_start_counter(int n)
+{
+ unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
+
+ ctrl &= ~APBTMR_CONTROL_ENABLE;
+ apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+ apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
+ /* enable, mask interrupt */
+ ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
+ ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
+ apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+ /* read it once to get cached counter value initialized */
+ apbt_read_clocksource(&clocksource_apbt);
+}
+
+static irqreturn_t apbt_interrupt_handler(int irq, void *data)
+{
+ struct apbt_dev *dev = (struct apbt_dev *)data;
+ struct clock_event_device *aevt = &dev->evt;
+
+ if (!aevt->event_handler) {
+ printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
+ dev->num);
+ return IRQ_NONE;
+ }
+ aevt->event_handler(aevt);
+ return IRQ_HANDLED;
+}
+
+static void apbt_restart_clocksource(struct clocksource *cs)
+{
+ apbt_start_counter(phy_cs_timer_id);
+}
+
+/* Setup IRQ routing via IOAPIC */
+#ifdef CONFIG_SMP
+static void apbt_setup_irq(struct apbt_dev *adev)
+{
+ struct irq_chip *chip;
+ struct irq_desc *desc;
+
+ /* timer0 irq has been setup early */
+ if (adev->irq == 0)
+ return;
+ desc = irq_to_desc(adev->irq);
+ chip = get_irq_chip(adev->irq);
+ disable_irq(adev->irq);
+ desc->status |= IRQ_MOVE_PCNTXT;
+ irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
+ /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */
+ set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
+ enable_irq(adev->irq);
+ if (system_state == SYSTEM_BOOTING)
+ if (request_irq(adev->irq, apbt_interrupt_handler,
+ IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
+ adev->name, adev)) {
+ printk(KERN_ERR "Failed request IRQ for APBT%d\n",
+ adev->num);
+ }
+}
+#endif
+
+static void apbt_enable_int(int n)
+{
+ unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
+ /* clear pending intr */
+ apbt_readl(n, APBTMR_N_EOI);
+ ctrl &= ~APBTMR_CONTROL_INT;
+ apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+}
+
+static void apbt_disable_int(int n)
+{
+ unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
+
+ ctrl |= APBTMR_CONTROL_INT;
+ apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+}
+
+
+static int __init apbt_clockevent_register(void)
+{
+ struct sfi_timer_table_entry *mtmr;
+ struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev);
+
+ mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
+ if (mtmr == NULL) {
+ printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
+ APBT_CLOCKEVENT0_NUM);
+ return -ENODEV;
+ }
+
+ /*
+ * We need to calculate the scaled math multiplication factor for
+ * nanosecond to apbt tick conversion.
+ * mult = (nsec/cycle)*2^APBT_SHIFT
+ */
+ apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
+ , NSEC_PER_SEC, APBT_SHIFT);
+
+ /* Calculate the min / max delta */
+ apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
+ &apbt_clockevent);
+ apbt_clockevent.min_delta_ns = clockevent_delta2ns(
+ APBT_MIN_DELTA_USEC*apbt_freq,
+ &apbt_clockevent);
+ /*
+ * Start apbt with the boot cpu mask and make it
+ * global if not used for per cpu timer.
+ */
+ apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
+ adev->num = smp_processor_id();
+ memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
+
+ if (disable_apbt_percpu) {
+ apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
+ global_clock_event = &adev->evt;
+ printk(KERN_DEBUG "%s clockevent registered as global\n",
+ global_clock_event->name);
+ }
+
+ if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler,
+ IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
+ apbt_clockevent.name, adev)) {
+ printk(KERN_ERR "Failed request IRQ for APBT%d\n",
+ apbt_clockevent.irq);
+ }
+
+ clockevents_register_device(&adev->evt);
+ /* Start APBT 0 interrupts */
+ apbt_enable_int(APBT_CLOCKEVENT0_NUM);
+
+ sfi_free_mtmr(mtmr);
+ return 0;
+}
+
+#ifdef CONFIG_SMP
+/* Should be called with per cpu */
+void apbt_setup_secondary_clock(void)
+{
+ struct apbt_dev *adev;
+ struct clock_event_device *aevt;
+ int cpu;
+
+ /* Don't register boot CPU clockevent */
+ cpu = smp_processor_id();
+ if (cpu == boot_cpu_id)
+ return;
+ /*
+ * We need to calculate the scaled math multiplication factor for
+ * nanosecond to apbt tick conversion.
+ * mult = (nsec/cycle)*2^APBT_SHIFT
+ */
+ printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
+ adev = &per_cpu(cpu_apbt_dev, cpu);
+ aevt = &adev->evt;
+
+ memcpy(aevt, &apbt_clockevent, sizeof(*aevt));
+ aevt->cpumask = cpumask_of(cpu);
+ aevt->name = adev->name;
+ aevt->mode = CLOCK_EVT_MODE_UNUSED;
+
+ printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n",
+ cpu, aevt->name, *(u32 *)aevt->cpumask);
+
+ apbt_setup_irq(adev);
+
+ clockevents_register_device(aevt);
+
+ apbt_enable_int(cpu);
+
+ return;
+}
+
+/*
+ * this notify handler process CPU hotplug events. in case of S0i3, nonboot
+ * cpus are disabled/enabled frequently, for performance reasons, we keep the
+ * per cpu timer irq registered so that we do need to do free_irq/request_irq.
+ *
+ * TODO: it might be more reliable to directly disable percpu clockevent device
+ * without the notifier chain. currently, cpu 0 may get interrupts from other
+ * cpu timers during the offline process due to the ordering of notification.
+ * the extra interrupt is harmless.
+ */
+static int apbt_cpuhp_notify(struct notifier_block *n,
+ unsigned long action, void *hcpu)
+{
+ unsigned long cpu = (unsigned long)hcpu;
+ struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
+
+ switch (action & 0xf) {
+ case CPU_DEAD:
+ apbt_disable_int(cpu);
+ if (system_state == SYSTEM_RUNNING)
+ pr_debug("skipping APBT CPU %lu offline\n", cpu);
+ else if (adev) {
+ pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
+ free_irq(adev->irq, adev);
+ }
+ break;
+ default:
+ pr_debug(KERN_INFO "APBT notified %lu, no action\n", action);
+ }
+ return NOTIFY_OK;
+}
+
+static __init int apbt_late_init(void)
+{
+ if (disable_apbt_percpu)
+ return 0;
+ /* This notifier should be called after workqueue is ready */
+ hotcpu_notifier(apbt_cpuhp_notify, -20);
+ return 0;
+}
+fs_initcall(apbt_late_init);
+#else
+
+void apbt_setup_secondary_clock(void) {}
+
+#endif /* CONFIG_SMP */
+
+static void apbt_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ unsigned long ctrl;
+ uint64_t delta;
+ int timer_num;
+ struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
+
+ timer_num = adev->num;
+ pr_debug("%s CPU %d timer %d mode=%d\n",
+ __func__, first_cpu(*evt->cpumask), timer_num, mode);
+
+ switch (mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
+ delta >>= apbt_clockevent.shift;
+ ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+ ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ /*
+ * DW APB p. 46, have to disable timer before load counter,
+ * may cause sync problem.
+ */
+ ctrl &= ~APBTMR_CONTROL_ENABLE;
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ udelay(1);
+ pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
+ apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
+ ctrl |= APBTMR_CONTROL_ENABLE;
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ break;
+ /* APB timer does not have one-shot mode, use free running mode */
+ case CLOCK_EVT_MODE_ONESHOT:
+ ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+ /*
+ * set free running mode, this mode will let timer reload max
+ * timeout which will give time (3min on 25MHz clock) to rearm
+ * the next event, therefore emulate the one-shot mode.
+ */
+ ctrl &= ~APBTMR_CONTROL_ENABLE;
+ ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
+
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ /* write again to set free running mode */
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+
+ /*
+ * DW APB p. 46, load counter with all 1s before starting free
+ * running mode.
+ */
+ apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
+ ctrl &= ~APBTMR_CONTROL_INT;
+ ctrl |= APBTMR_CONTROL_ENABLE;
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ break;
+
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ apbt_disable_int(timer_num);
+ ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+ ctrl &= ~APBTMR_CONTROL_ENABLE;
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ break;
+
+ case CLOCK_EVT_MODE_RESUME:
+ apbt_enable_int(timer_num);
+ break;
+ }
+}
+
+static int apbt_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ unsigned long ctrl;
+ int timer_num;
+
+ struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
+
+ timer_num = adev->num;
+ /* Disable timer */
+ ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+ ctrl &= ~APBTMR_CONTROL_ENABLE;
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ /* write new count */
+ apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
+ ctrl |= APBTMR_CONTROL_ENABLE;
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ return 0;
+}
+
+/*
+ * APB timer clock is not in sync with pclk on Langwell, which translates to
+ * unreliable read value caused by sampling error. the error does not add up
+ * overtime and only happens when sampling a 0 as a 1 by mistake. so the time
+ * would go backwards. the following code is trying to prevent time traveling
+ * backwards. little bit paranoid.
+ */
+static cycle_t apbt_read_clocksource(struct clocksource *cs)
+{
+ unsigned long t0, t1, t2;
+ static unsigned long last_read;
+
+bad_count:
+ t1 = apbt_readl(phy_cs_timer_id,
+ APBTMR_N_CURRENT_VALUE);
+ t2 = apbt_readl(phy_cs_timer_id,
+ APBTMR_N_CURRENT_VALUE);
+ if (unlikely(t1 < t2)) {
+ pr_debug("APBT: read current count error %lx:%lx:%lx\n",
+ t1, t2, t2 - t1);
+ goto bad_count;
+ }
+ /*
+ * check against cached last read, makes sure time does not go back.
+ * it could be a normal rollover but we will do tripple check anyway
+ */
+ if (unlikely(t2 > last_read)) {
+ /* check if we have a normal rollover */
+ unsigned long raw_intr_status =
+ apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
+ /*
+ * cs timer interrupt is masked but raw intr bit is set if
+ * rollover occurs. then we read EOI reg to clear it.
+ */
+ if (raw_intr_status & (1 << phy_cs_timer_id)) {
+ apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
+ goto out;
+ }
+ pr_debug("APB CS going back %lx:%lx:%lx ",
+ t2, last_read, t2 - last_read);
+bad_count_x3:
+ pr_debug(KERN_INFO "tripple check enforced\n");
+ t0 = apbt_readl(phy_cs_timer_id,
+ APBTMR_N_CURRENT_VALUE);
+ udelay(1);
+ t1 = apbt_readl(phy_cs_timer_id,
+ APBTMR_N_CURRENT_VALUE);
+ udelay(1);
+ t2 = apbt_readl(phy_cs_timer_id,
+ APBTMR_N_CURRENT_VALUE);
+ if ((t2 > t1) || (t1 > t0)) {
+ printk(KERN_ERR "Error: APB CS tripple check failed\n");
+ goto bad_count_x3;
+ }
+ }
+out:
+ last_read = t2;
+ return (cycle_t)~t2;
+}
+
+static int apbt_clocksource_register(void)
+{
+ u64 start, now;
+ cycle_t t1;
+
+ /* Start the counter, use timer 2 as source, timer 0/1 for event */
+ apbt_start_counter(phy_cs_timer_id);
+
+ /* Verify whether apbt counter works */
+ t1 = apbt_read_clocksource(&clocksource_apbt);
+ rdtscll(start);
+
+ /*
+ * We don't know the TSC frequency yet, but waiting for
+ * 200000 TSC cycles is safe:
+ * 4 GHz == 50us
+ * 1 GHz == 200us
+ */
+ do {
+ rep_nop();
+ rdtscll(now);
+ } while ((now - start) < 200000UL);
+
+ /* APBT is the only always on clocksource, it has to work! */
+ if (t1 == apbt_read_clocksource(&clocksource_apbt))
+ panic("APBT counter not counting. APBT disabled\n");
+
+ /*
+ * initialize and register APBT clocksource
+ * convert that to ns/clock cycle
+ * mult = (ns/c) * 2^APBT_SHIFT
+ */
+ clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
+ (unsigned long) apbt_freq, APBT_SHIFT);
+ clocksource_register(&clocksource_apbt);
+
+ return 0;
+}
+
+/*
+ * Early setup the APBT timer, only use timer 0 for booting then switch to
+ * per CPU timer if possible.
+ * returns 1 if per cpu apbt is setup
+ * returns 0 if no per cpu apbt is chosen
+ * panic if set up failed, this is the only platform timer on Moorestown.
+ */
+void __init apbt_time_init(void)
+{
+#ifdef CONFIG_SMP
+ int i;
+ struct sfi_timer_table_entry *p_mtmr;
+ unsigned int percpu_timer;
+ struct apbt_dev *adev;
+#endif
+
+ if (apb_timer_block_enabled)
+ return;
+ apbt_set_mapping();
+ if (apbt_virt_address) {
+ pr_debug("Found APBT version 0x%lx\n",\
+ apbt_readl_reg(APBTMRS_COMP_VERSION));
+ } else
+ goto out_noapbt;
+ /*
+ * Read the frequency and check for a sane value, for ESL model
+ * we extend the possible clock range to allow time scaling.
+ */
+
+ if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
+ pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq);
+ goto out_noapbt;
+ }
+ if (apbt_clocksource_register()) {
+ pr_debug("APBT has failed to register clocksource\n");
+ goto out_noapbt;
+ }
+ if (!apbt_clockevent_register())
+ apb_timer_block_enabled = 1;
+ else {
+ pr_debug("APBT has failed to register clockevent\n");
+ goto out_noapbt;
+ }
+#ifdef CONFIG_SMP
+ /* kernel cmdline disable apb timer, so we will use lapic timers */
+ if (disable_apbt_percpu) {
+ printk(KERN_INFO "apbt: disabled per cpu timer\n");
+ return;
+ }
+ pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
+ if (num_possible_cpus() <= sfi_mtimer_num) {
+ percpu_timer = 1;
+ apbt_num_timers_used = num_possible_cpus();
+ } else {
+ percpu_timer = 0;
+ apbt_num_timers_used = 1;
+ adev = &per_cpu(cpu_apbt_dev, 0);
+ adev->flags &= ~APBT_DEV_USED;
+ }
+ pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
+
+ /* here we set up per CPU timer data structure */
+ apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
+ GFP_KERNEL);
+ if (!apbt_devs) {
+ printk(KERN_ERR "Failed to allocate APB timer devices\n");
+ return;
+ }
+ for (i = 0; i < apbt_num_timers_used; i++) {
+ adev = &per_cpu(cpu_apbt_dev, i);
+ adev->num = i;
+ adev->cpu = i;
+ p_mtmr = sfi_get_mtmr(i);
+ if (p_mtmr) {
+ adev->tick = p_mtmr->freq_hz;
+ adev->irq = p_mtmr->irq;
+ } else
+ printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
+ adev->count = 0;
+ sprintf(adev->name, "apbt%d", i);
+ }
+#endif
+
+ return;
+
+out_noapbt:
+ apbt_clear_mapping();
+ apb_timer_block_enabled = 0;
+ panic("failed to enable APB timer\n");
+}
+
+static inline void apbt_disable(int n)
+{
+ if (is_apbt_capable()) {
+ unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
+ ctrl &= ~APBTMR_CONTROL_ENABLE;
+ apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+ }
+}
+
+/* called before apb_timer_enable, use early map */
+unsigned long apbt_quick_calibrate()
+{
+ int i, scale;
+ u64 old, new;
+ cycle_t t1, t2;
+ unsigned long khz = 0;
+ u32 loop, shift;
+
+ apbt_set_mapping();
+ apbt_start_counter(phy_cs_timer_id);
+
+ /* check if the timer can count down, otherwise return */
+ old = apbt_read_clocksource(&clocksource_apbt);
+ i = 10000;
+ while (--i) {
+ if (old != apbt_read_clocksource(&clocksource_apbt))
+ break;
+ }
+ if (!i)
+ goto failed;
+
+ /* count 16 ms */
+ loop = (apbt_freq * 1000) << 4;
+
+ /* restart the timer to ensure it won't get to 0 in the calibration */
+ apbt_start_counter(phy_cs_timer_id);
+
+ old = apbt_read_clocksource(&clocksource_apbt);
+ old += loop;
+
+ t1 = __native_read_tsc();
+
+ do {
+ new = apbt_read_clocksource(&clocksource_apbt);
+ } while (new < old);
+
+ t2 = __native_read_tsc();
+
+ shift = 5;
+ if (unlikely(loop >> shift == 0)) {
+ printk(KERN_INFO
+ "APBT TSC calibration failed, not enough resolution\n");
+ return 0;
+ }
+ scale = (int)div_u64((t2 - t1), loop >> shift);
+ khz = (scale * apbt_freq * 1000) >> shift;
+ printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
+ return khz;
+failed:
+ return 0;
+}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 3704997e8b2..b5d8b0bcf23 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -393,6 +393,7 @@ void __init gart_iommu_hole_init(void)
for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
int bus;
int dev_base, dev_limit;
+ u32 ctl;
bus = bus_dev_ranges[i].bus;
dev_base = bus_dev_ranges[i].dev_base;
@@ -406,7 +407,19 @@ void __init gart_iommu_hole_init(void)
gart_iommu_aperture = 1;
x86_init.iommu.iommu_init = gart_iommu_init;
- aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
+ ctl = read_pci_config(bus, slot, 3,
+ AMD64_GARTAPERTURECTL);
+
+ /*
+ * Before we do anything else disable the GART. It may
+ * still be enabled if we boot into a crash-kernel here.
+ * Reconfiguring the GART while it is enabled could have
+ * unknown side-effects.
+ */
+ ctl &= ~GARTEN;
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+
+ aper_order = (ctl >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order;
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
aper_base <<= 25;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index aa57c079c98..e5a4a1e0161 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -61,12 +61,6 @@ unsigned int boot_cpu_physical_apicid = -1U;
/*
* The highest APIC ID seen during enumeration.
- *
- * On AMD, this determines the messaging protocol we can use: if all APIC IDs
- * are in the 0 ... 7 range, then we can use logical addressing which
- * has some performance advantages (better broadcasting).
- *
- * If there's an APIC ID above 8, we use physical addressing.
*/
unsigned int max_physical_apicid;
@@ -587,7 +581,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
res = (((u64)(*deltatsc)) * pm_100ms);
do_div(res, deltapm);
apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
- "PM-Timer: %lu (%ld) \n",
+ "PM-Timer: %lu (%ld)\n",
(unsigned long)res, *deltatsc);
*deltatsc = (long)res;
}
@@ -1396,7 +1390,7 @@ void __init enable_IR_x2apic(void)
}
local_irq_save(flags);
- mask_8259A();
+ legacy_pic->mask_all();
mask_IO_APIC_setup(ioapic_entries);
if (dmar_table_init_ret)
@@ -1428,7 +1422,7 @@ void __init enable_IR_x2apic(void)
nox2apic:
if (!ret) /* IR enabling failed */
restore_IO_APIC_setup(ioapic_entries);
- unmask_8259A();
+ legacy_pic->restore_mask();
local_irq_restore(flags);
out:
@@ -1646,8 +1640,8 @@ int __init APIC_init_uniprocessor(void)
}
#endif
+#ifndef CONFIG_SMP
enable_IR_x2apic();
-#ifdef CONFIG_X86_64
default_setup_apic_routing();
#endif
@@ -1897,18 +1891,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
if (apicid > max_physical_apicid)
max_physical_apicid = apicid;
-#ifdef CONFIG_X86_32
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (num_processors > 8)
- def_to_bigsmp = 1;
- break;
- case X86_VENDOR_AMD:
- if (max_physical_apicid >= 8)
- def_to_bigsmp = 1;
- }
-#endif
-
#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
@@ -2038,7 +2020,7 @@ static int lapic_resume(struct sys_device *dev)
}
mask_IO_APIC_setup(ioapic_entries);
- mask_8259A();
+ legacy_pic->mask_all();
}
if (x2apic_mode)
@@ -2082,7 +2064,7 @@ static int lapic_resume(struct sys_device *dev)
if (intr_remapping_enabled) {
reenable_intr_remapping(x2apic_mode);
- unmask_8259A();
+ legacy_pic->restore_mask();
restore_IO_APIC_setup(ioapic_entries);
free_ioapic_entries(ioapic_entries);
}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index eacbd2b31d2..09d3b17ce0c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -223,7 +223,7 @@ struct apic apic_flat = {
};
/*
- * Physflat mode is used when there are more than 8 CPUs on a AMD system.
+ * Physflat mode is used when there are more than 8 CPUs on a system.
* We cannot use logical delivery in this case because the mask
* overflows, so use physical mode.
*/
@@ -240,6 +240,11 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
printk(KERN_DEBUG "system APIC only can use physical flat");
return 1;
}
+
+ if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
+ printk(KERN_DEBUG "IBM Summit detected, will use apic physical");
+ return 1;
+ }
#endif
return 0;
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index dd2b5f26464..03ba1b895f5 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -42,6 +42,7 @@
#include <linux/errno.h>
#include <linux/acpi.h>
#include <linux/init.h>
+#include <linux/gfp.h>
#include <linux/nmi.h>
#include <linux/smp.h>
#include <linux/io.h>
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index de00c4619a5..127b8718abf 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -36,6 +36,7 @@
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/jiffies.h> /* time_after() */
+#include <linux/slab.h>
#ifdef CONFIG_ACPI
#include <acpi/acpi_bus.h>
#endif
@@ -73,8 +74,8 @@
*/
int sis_apic_bug = -1;
-static DEFINE_SPINLOCK(ioapic_lock);
-static DEFINE_SPINLOCK(vector_lock);
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
+static DEFINE_RAW_SPINLOCK(vector_lock);
/*
* # of IRQ routing registers
@@ -94,8 +95,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
-/* Number of legacy interrupts */
-static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
/* GSI interrupts */
static int nr_irqs_gsi = NR_IRQS_LEGACY;
@@ -140,33 +139,10 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
#ifdef CONFIG_SPARSE_IRQ
-static struct irq_cfg irq_cfgx[] = {
+static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
#else
-static struct irq_cfg irq_cfgx[NR_IRQS] = {
+static struct irq_cfg irq_cfgx[NR_IRQS];
#endif
- [0] = { .vector = IRQ0_VECTOR, },
- [1] = { .vector = IRQ1_VECTOR, },
- [2] = { .vector = IRQ2_VECTOR, },
- [3] = { .vector = IRQ3_VECTOR, },
- [4] = { .vector = IRQ4_VECTOR, },
- [5] = { .vector = IRQ5_VECTOR, },
- [6] = { .vector = IRQ6_VECTOR, },
- [7] = { .vector = IRQ7_VECTOR, },
- [8] = { .vector = IRQ8_VECTOR, },
- [9] = { .vector = IRQ9_VECTOR, },
- [10] = { .vector = IRQ10_VECTOR, },
- [11] = { .vector = IRQ11_VECTOR, },
- [12] = { .vector = IRQ12_VECTOR, },
- [13] = { .vector = IRQ13_VECTOR, },
- [14] = { .vector = IRQ14_VECTOR, },
- [15] = { .vector = IRQ15_VECTOR, },
-};
-
-void __init io_apic_disable_legacy(void)
-{
- nr_legacy_irqs = 0;
- nr_irqs_gsi = 0;
-}
int __init arch_early_irq_init(void)
{
@@ -176,6 +152,11 @@ int __init arch_early_irq_init(void)
int node;
int i;
+ if (!legacy_pic->nr_legacy_irqs) {
+ nr_irqs_gsi = 0;
+ io_apic_irqs = ~0UL;
+ }
+
cfg = irq_cfgx;
count = ARRAY_SIZE(irq_cfgx);
node= cpu_to_node(boot_cpu_id);
@@ -185,8 +166,14 @@ int __init arch_early_irq_init(void)
desc->chip_data = &cfg[i];
zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
- if (i < nr_legacy_irqs)
- cpumask_setall(cfg[i].domain);
+ /*
+ * For legacy IRQ's, start with assigning irq0 to irq15 to
+ * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
+ */
+ if (i < legacy_pic->nr_legacy_irqs) {
+ cfg[i].vector = IRQ0_VECTOR + i;
+ cpumask_set_cpu(0, cfg[i].domain);
+ }
}
return 0;
@@ -406,7 +393,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
struct irq_pin_list *entry;
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
for_each_irq_pin(entry, cfg->irq_2_pin) {
unsigned int reg;
int pin;
@@ -415,11 +402,11 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
reg = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
if (reg & IO_APIC_REDIR_REMOTE_IRR) {
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return true;
}
}
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return false;
}
@@ -433,10 +420,10 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
{
union entry_union eu;
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return eu.entry;
}
@@ -459,9 +446,9 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
__ioapic_write_entry(apic, pin, e);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -474,10 +461,10 @@ static void ioapic_mask_entry(int apic, int pin)
unsigned long flags;
union entry_union eu = { .entry.mask = 1 };
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -604,9 +591,9 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
BUG_ON(!cfg);
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
__mask_IO_APIC_irq(cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
@@ -614,9 +601,9 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
struct irq_cfg *cfg = desc->chip_data;
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
__unmask_IO_APIC_irq(cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void mask_IO_APIC_irq(unsigned int irq)
@@ -865,7 +852,7 @@ static int __init find_isa_irq_apic(int irq, int type)
*/
static int EISA_ELCR(unsigned int irq)
{
- if (irq < nr_legacy_irqs) {
+ if (irq < legacy_pic->nr_legacy_irqs) {
unsigned int port = 0x4d0 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
@@ -1140,12 +1127,12 @@ void lock_vector_lock(void)
/* Used to the online set of cpus does not change
* during assign_irq_vector.
*/
- spin_lock(&vector_lock);
+ raw_spin_lock(&vector_lock);
}
void unlock_vector_lock(void)
{
- spin_unlock(&vector_lock);
+ raw_spin_unlock(&vector_lock);
}
static int
@@ -1162,7 +1149,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
- static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+ static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
+ static int current_offset = VECTOR_OFFSET_START % 8;
unsigned int old_vector;
int cpu, err;
cpumask_var_t tmp_mask;
@@ -1198,7 +1186,7 @@ next:
if (vector >= first_system_vector) {
/* If out of vectors on large boxen, must share them. */
offset = (offset + 1) % 8;
- vector = FIRST_DEVICE_VECTOR + offset;
+ vector = FIRST_EXTERNAL_VECTOR + offset;
}
if (unlikely(current_vector == vector))
continue;
@@ -1232,9 +1220,9 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
int err;
unsigned long flags;
- spin_lock_irqsave(&vector_lock, flags);
+ raw_spin_lock_irqsave(&vector_lock, flags);
err = __assign_irq_vector(irq, cfg, mask);
- spin_unlock_irqrestore(&vector_lock, flags);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
return err;
}
@@ -1268,14 +1256,27 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
void __setup_vector_irq(int cpu)
{
/* Initialize vector_irq on a new cpu */
- /* This function must be called with vector_lock held */
int irq, vector;
struct irq_cfg *cfg;
struct irq_desc *desc;
+ /*
+ * vector_lock will make sure that we don't run into irq vector
+ * assignments that might be happening on another cpu in parallel,
+ * while we setup our initial vector to irq mappings.
+ */
+ raw_spin_lock(&vector_lock);
/* Mark the inuse vectors */
for_each_irq_desc(irq, desc) {
cfg = desc->chip_data;
+
+ /*
+ * If it is a legacy IRQ handled by the legacy PIC, this cpu
+ * will be part of the irq_cfg's domain.
+ */
+ if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq))
+ cpumask_set_cpu(cpu, cfg->domain);
+
if (!cpumask_test_cpu(cpu, cfg->domain))
continue;
vector = cfg->vector;
@@ -1291,6 +1292,7 @@ void __setup_vector_irq(int cpu)
if (!cpumask_test_cpu(cpu, cfg->domain))
per_cpu(vector_irq, cpu)[vector] = -1;
}
+ raw_spin_unlock(&vector_lock);
}
static struct irq_chip ioapic_chip;
@@ -1440,6 +1442,14 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
cfg = desc->chip_data;
+ /*
+ * For legacy irqs, cfg->domain starts with cpu 0 for legacy
+ * controllers like 8259. Now that IO-APIC can handle this irq, update
+ * the cfg->domain.
+ */
+ if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
+ apic->vector_allocation_domain(0, cfg->domain);
+
if (assign_irq_vector(irq, cfg, apic->target_cpus()))
return;
@@ -1461,8 +1471,8 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
}
ioapic_register_intr(irq, desc, trigger);
- if (irq < nr_legacy_irqs)
- disable_8259A_irq(irq);
+ if (irq < legacy_pic->nr_legacy_irqs)
+ legacy_pic->chip->mask(irq);
ioapic_write_entry(apic_id, pin, entry);
}
@@ -1473,7 +1483,7 @@ static struct {
static void __init setup_IO_APIC_irqs(void)
{
- int apic_id = 0, pin, idx, irq;
+ int apic_id, pin, idx, irq;
int notcon = 0;
struct irq_desc *desc;
struct irq_cfg *cfg;
@@ -1481,14 +1491,7 @@ static void __init setup_IO_APIC_irqs(void)
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
-#ifdef CONFIG_ACPI
- if (!acpi_disabled && acpi_ioapic) {
- apic_id = mp_find_ioapic(0);
- if (apic_id < 0)
- apic_id = 0;
- }
-#endif
-
+ for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
idx = find_irq_entry(apic_id, pin, mp_INT);
if (idx == -1) {
@@ -1510,6 +1513,9 @@ static void __init setup_IO_APIC_irqs(void)
irq = pin_2_irq(idx, apic_id, pin);
+ if ((apic_id > 0) && (irq > 16))
+ continue;
+
/*
* Skip the timer IRQ if there's a quirk handler
* installed and if it returns 1:
@@ -1539,6 +1545,56 @@ static void __init setup_IO_APIC_irqs(void)
}
/*
+ * for the gsit that is not in first ioapic
+ * but could not use acpi_register_gsi()
+ * like some special sci in IBM x3330
+ */
+void setup_IO_APIC_irq_extra(u32 gsi)
+{
+ int apic_id = 0, pin, idx, irq;
+ int node = cpu_to_node(boot_cpu_id);
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+
+ /*
+ * Convert 'gsi' to 'ioapic.pin'.
+ */
+ apic_id = mp_find_ioapic(gsi);
+ if (apic_id < 0)
+ return;
+
+ pin = mp_find_ioapic_pin(apic_id, gsi);
+ idx = find_irq_entry(apic_id, pin, mp_INT);
+ if (idx == -1)
+ return;
+
+ irq = pin_2_irq(idx, apic_id, pin);
+#ifdef CONFIG_SPARSE_IRQ
+ desc = irq_to_desc(irq);
+ if (desc)
+ return;
+#endif
+ desc = irq_to_desc_alloc_node(irq, node);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+ return;
+ }
+
+ cfg = desc->chip_data;
+ add_pin_to_irq_node(cfg, node, apic_id, pin);
+
+ if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
+ pr_debug("Pin %d-%d already programmed\n",
+ mp_ioapics[apic_id].apicid, pin);
+ return;
+ }
+ set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
+
+ setup_IO_APIC_irq(apic_id, pin, irq, desc,
+ irq_trigger(idx), irq_polarity(idx));
+}
+
+/*
* Set up the timer pin, possibly with the 8259A-master behind.
*/
static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
@@ -1601,14 +1657,14 @@ __apicdebuginit(void) print_IO_APIC(void)
for (apic = 0; apic < nr_ioapics; apic++) {
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
reg_01.raw = io_apic_read(apic, 1);
if (reg_01.bits.version >= 0x10)
reg_02.raw = io_apic_read(apic, 2);
if (reg_01.bits.version >= 0x20)
reg_03.raw = io_apic_read(apic, 3);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
@@ -1647,7 +1703,7 @@ __apicdebuginit(void) print_IO_APIC(void)
printk(KERN_DEBUG ".... IRQ redirection table:\n");
printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
- " Stat Dmod Deli Vect: \n");
+ " Stat Dmod Deli Vect:\n");
for (i = 0; i <= reg_01.bits.entries; i++) {
struct IO_APIC_route_entry entry;
@@ -1825,12 +1881,12 @@ __apicdebuginit(void) print_PIC(void)
unsigned int v;
unsigned long flags;
- if (!nr_legacy_irqs)
+ if (!legacy_pic->nr_legacy_irqs)
return;
printk(KERN_DEBUG "\nprinting PIC contents\n");
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
v = inb(0xa1) << 8 | inb(0x21);
printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
@@ -1844,7 +1900,7 @@ __apicdebuginit(void) print_PIC(void)
outb(0x0a,0xa0);
outb(0x0a,0x20);
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
@@ -1903,13 +1959,13 @@ void __init enable_IO_APIC(void)
* The number of IO-APIC IRQ registers (== #pins):
*/
for (apic = 0; apic < nr_ioapics; apic++) {
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_01.raw = io_apic_read(apic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
nr_ioapic_registers[apic] = reg_01.bits.entries+1;
}
- if (!nr_legacy_irqs)
+ if (!legacy_pic->nr_legacy_irqs)
return;
for(apic = 0; apic < nr_ioapics; apic++) {
@@ -1966,7 +2022,7 @@ void disable_IO_APIC(void)
*/
clear_IO_APIC();
- if (!nr_legacy_irqs)
+ if (!legacy_pic->nr_legacy_irqs)
return;
/*
@@ -2045,9 +2101,9 @@ void __init setup_ioapic_ids_from_mpc(void)
for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
/* Read the register 0 value */
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic_id, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
old_id = mp_ioapics[apic_id].apicid;
@@ -2106,16 +2162,16 @@ void __init setup_ioapic_ids_from_mpc(void)
mp_ioapics[apic_id].apicid);
reg_00.bits.ID = mp_ioapics[apic_id].apicid;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic_id, 0, reg_00.raw);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
/*
* Sanity check
*/
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic_id, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
printk("could not set ID!\n");
else
@@ -2198,15 +2254,15 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
unsigned long flags;
struct irq_cfg *cfg;
- spin_lock_irqsave(&ioapic_lock, flags);
- if (irq < nr_legacy_irqs) {
- disable_8259A_irq(irq);
- if (i8259A_irq_pending(irq))
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ if (irq < legacy_pic->nr_legacy_irqs) {
+ legacy_pic->chip->mask(irq);
+ if (legacy_pic->irq_pending(irq))
was_pending = 1;
}
cfg = irq_cfg(irq);
__unmask_IO_APIC_irq(cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return was_pending;
}
@@ -2217,9 +2273,9 @@ static int ioapic_retrigger_irq(unsigned int irq)
struct irq_cfg *cfg = irq_cfg(irq);
unsigned long flags;
- spin_lock_irqsave(&vector_lock, flags);
+ raw_spin_lock_irqsave(&vector_lock, flags);
apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
- spin_unlock_irqrestore(&vector_lock, flags);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
return 1;
}
@@ -2312,14 +2368,14 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
irq = desc->irq;
cfg = desc->chip_data;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
ret = set_desc_affinity(desc, mask, &dest);
if (!ret) {
/* Only the high 8 bits are valid. */
dest = SET_APIC_LOGICAL_ID(dest);
__target_IO_APIC_irq(irq, dest, cfg);
}
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return ret;
}
@@ -2434,6 +2490,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
cfg = irq_cfg(irq);
raw_spin_lock(&desc->lock);
+ /*
+ * Check if the irq migration is in progress. If so, we
+ * haven't received the cleanup request yet for this irq.
+ */
+ if (cfg->move_in_progress)
+ goto unlock;
+
if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
goto unlock;
@@ -2547,9 +2610,9 @@ static void eoi_ioapic_irq(struct irq_desc *desc)
irq = desc->irq;
cfg = desc->chip_data;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
__eoi_ioapic_irq(irq, cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void ack_apic_level(unsigned int irq)
@@ -2727,8 +2790,8 @@ static inline void init_IO_APIC_traps(void)
* so default to an old-fashioned 8259
* interrupt if we can..
*/
- if (irq < nr_legacy_irqs)
- make_8259A_irq(irq);
+ if (irq < legacy_pic->nr_legacy_irqs)
+ legacy_pic->make_irq(irq);
else
/* Strange. Oh, well.. */
desc->chip = &no_irq_chip;
@@ -2885,7 +2948,7 @@ static inline void __init check_timer(void)
/*
* get/set the timer IRQ vector:
*/
- disable_8259A_irq(0);
+ legacy_pic->chip->mask(0);
assign_irq_vector(0, cfg, apic->target_cpus());
/*
@@ -2898,7 +2961,7 @@ static inline void __init check_timer(void)
* automatically.
*/
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
- init_8259A(1);
+ legacy_pic->init(1);
#ifdef CONFIG_X86_32
{
unsigned int ver;
@@ -2957,7 +3020,7 @@ static inline void __init check_timer(void)
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
setup_nmi();
- enable_8259A_irq(0);
+ legacy_pic->chip->unmask(0);
}
if (disable_timer_pin_1 > 0)
clear_IO_APIC_pin(0, pin1);
@@ -2980,14 +3043,14 @@ static inline void __init check_timer(void)
*/
replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
- enable_8259A_irq(0);
+ legacy_pic->chip->unmask(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
+ legacy_pic->chip->mask(0);
setup_nmi();
- enable_8259A_irq(0);
+ legacy_pic->chip->unmask(0);
}
goto out;
}
@@ -2995,7 +3058,7 @@ static inline void __init check_timer(void)
* Cleanup, just in case ...
*/
local_irq_disable();
- disable_8259A_irq(0);
+ legacy_pic->chip->mask(0);
clear_IO_APIC_pin(apic2, pin2);
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
}
@@ -3014,22 +3077,22 @@ static inline void __init check_timer(void)
lapic_register_intr(0, desc);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
- enable_8259A_irq(0);
+ legacy_pic->chip->unmask(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
local_irq_disable();
- disable_8259A_irq(0);
+ legacy_pic->chip->mask(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as ExtINT IRQ...\n");
- init_8259A(0);
- make_8259A_irq(0);
+ legacy_pic->init(0);
+ legacy_pic->make_irq(0);
apic_write(APIC_LVT0, APIC_DM_EXTINT);
unlock_ExtINT_logic();
@@ -3071,7 +3134,7 @@ void __init setup_IO_APIC(void)
/*
* calling enable_IO_APIC() is moved to setup_local_APIC for BP
*/
- io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
+ io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
/*
@@ -3082,7 +3145,7 @@ void __init setup_IO_APIC(void)
sync_Arb_IDs();
setup_IO_APIC_irqs();
init_IO_APIC_traps();
- if (nr_legacy_irqs)
+ if (legacy_pic->nr_legacy_irqs)
check_timer();
}
@@ -3131,13 +3194,13 @@ static int ioapic_resume(struct sys_device *dev)
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
reg_00.bits.ID = mp_ioapics[dev->id].apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
ioapic_write_entry(dev->id, i, entry[i]);
@@ -3200,7 +3263,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
if (irq_want < nr_irqs_gsi)
irq_want = nr_irqs_gsi;
- spin_lock_irqsave(&vector_lock, flags);
+ raw_spin_lock_irqsave(&vector_lock, flags);
for (new = irq_want; new < nr_irqs; new++) {
desc_new = irq_to_desc_alloc_node(new, node);
if (!desc_new) {
@@ -3219,14 +3282,11 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
irq = new;
break;
}
- spin_unlock_irqrestore(&vector_lock, flags);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ if (irq > 0)
+ dynamic_irq_init_keep_chip_data(irq);
- if (irq > 0) {
- dynamic_irq_init(irq);
- /* restore it, in case dynamic_irq_init clear it */
- if (desc_new)
- desc_new->chip_data = cfg_new;
- }
return irq;
}
@@ -3248,20 +3308,13 @@ int create_irq(void)
void destroy_irq(unsigned int irq)
{
unsigned long flags;
- struct irq_cfg *cfg;
- struct irq_desc *desc;
- /* store it, in case dynamic_irq_cleanup clear it */
- desc = irq_to_desc(irq);
- cfg = desc->chip_data;
- dynamic_irq_cleanup(irq);
- /* connect back irq_cfg */
- desc->chip_data = cfg;
+ dynamic_irq_cleanup_keep_chip_data(irq);
free_irte(irq);
- spin_lock_irqsave(&vector_lock, flags);
- __clear_irq_vector(irq, cfg);
- spin_unlock_irqrestore(&vector_lock, flags);
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ __clear_irq_vector(irq, get_irq_chip_data(irq));
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
}
/*
@@ -3798,9 +3851,9 @@ int __init io_apic_get_redir_entries (int ioapic)
union IO_APIC_reg_01 reg_01;
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return reg_01.bits.entries;
}
@@ -3883,7 +3936,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
/*
* IRQs < 16 are already in the irq_2_pin[] map
*/
- if (irq >= nr_legacy_irqs) {
+ if (irq >= legacy_pic->nr_legacy_irqs) {
cfg = desc->chip_data;
if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
printk(KERN_INFO "can not add pin %d for irq %d\n",
@@ -3962,9 +4015,9 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
if (physids_empty(apic_id_map))
apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(ioapic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
if (apic_id >= get_physical_broadcast()) {
printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
@@ -3998,10 +4051,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
if (reg_00.bits.ID != apic_id) {
reg_00.bits.ID = apic_id;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(ioapic, 0, reg_00.raw);
reg_00.raw = io_apic_read(ioapic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
/* Sanity check */
if (reg_00.bits.ID != apic_id) {
@@ -4022,9 +4075,9 @@ int __init io_apic_get_version(int ioapic)
union IO_APIC_reg_01 reg_01;
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return reg_01.bits.version;
}
@@ -4056,27 +4109,23 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
#ifdef CONFIG_SMP
void __init setup_ioapic_dest(void)
{
- int pin, ioapic = 0, irq, irq_entry;
+ int pin, ioapic, irq, irq_entry;
struct irq_desc *desc;
const struct cpumask *mask;
if (skip_ioapic_setup == 1)
return;
-#ifdef CONFIG_ACPI
- if (!acpi_disabled && acpi_ioapic) {
- ioapic = mp_find_ioapic(0);
- if (ioapic < 0)
- ioapic = 0;
- }
-#endif
-
+ for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
irq_entry = find_irq_entry(ioapic, pin, mp_INT);
if (irq_entry == -1)
continue;
irq = pin_2_irq(irq_entry, ioapic, pin);
+ if ((ioapic > 0) && (irq > 16))
+ continue;
+
desc = irq_to_desc(irq);
/*
@@ -4261,3 +4310,24 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
nr_ioapics++;
}
+
+/* Enable IOAPIC early just for system timer */
+void __init pre_init_apic_IRQ0(void)
+{
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
+
+ printk(KERN_INFO "Early APIC setup for system timer0\n");
+#ifndef CONFIG_SMP
+ phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+#endif
+ desc = irq_to_desc_alloc_node(0, 0);
+
+ setup_local_APIC();
+
+ cfg = irq_cfg(0);
+ add_pin_to_irq_node(cfg, 0, 0, 0);
+ set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
+
+ setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
+}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 0159a69396c..1edaf15c0b8 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -18,6 +18,7 @@
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/sysdev.h>
#include <linux/sysctl.h>
#include <linux/percpu.h>
@@ -177,7 +178,7 @@ int __init check_nmi_watchdog(void)
error:
if (nmi_watchdog == NMI_IO_APIC) {
if (!timer_through_8259)
- disable_8259A_irq(0);
+ legacy_pic->chip->mask(0);
on_each_cpu(__acpi_nmi_disable, NULL, 1);
}
@@ -416,13 +417,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
/* We can be called before check_nmi_watchdog, hence NULL check. */
if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- static DEFINE_SPINLOCK(lock); /* Serialise the printks */
+ static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
- spin_lock(&lock);
+ raw_spin_lock(&lock);
printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
show_regs(regs);
dump_stack();
- spin_unlock(&lock);
+ raw_spin_unlock(&lock);
cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
rc = 1;
@@ -438,8 +439,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
* Ayiee, looks like this CPU is stuck ...
* wait a few IRQs (5 seconds) before doing the oops ...
*/
- __this_cpu_inc(per_cpu_var(alert_counter));
- if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz)
+ __this_cpu_inc(alert_counter);
+ if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
/*
* die_nmi will return ONLY if NOTIFY_STOP happens..
*/
@@ -447,7 +448,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
regs, panic_on_timeout);
} else {
__get_cpu_var(last_irq_sum) = sum;
- __this_cpu_write(per_cpu_var(alert_counter), 0);
+ __this_cpu_write(alert_counter, 0);
}
/* see if the nmi watchdog went off */
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 98c4665f251..3e28401f161 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -225,7 +225,7 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
mpc_record = 0;
printk(KERN_INFO
- "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
+ "Found an OEM MPC table at %8p - parsing it...\n", oemtable);
if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
printk(KERN_WARNING
@@ -277,6 +277,7 @@ static __init void early_check_numaq(void)
x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
x86_init.timers.tsc_pre_init = numaq_tsc_init;
+ x86_init.pci.init = pci_numaq_init;
}
}
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 1a6559f6768..99d2fe01608 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -52,7 +52,32 @@ static int __init print_ipi_mode(void)
}
late_initcall(print_ipi_mode);
-void default_setup_apic_routing(void)
+void __init default_setup_apic_routing(void)
+{
+ int version = apic_version[boot_cpu_physical_apicid];
+
+ if (num_possible_cpus() > 8) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (!APIC_XAPIC(version)) {
+ def_to_bigsmp = 0;
+ break;
+ }
+ /* If P4 and above fall through */
+ case X86_VENDOR_AMD:
+ def_to_bigsmp = 1;
+ }
+ }
+
+#ifdef CONFIG_X86_BIGSMP
+ generic_bigsmp_probe();
+#endif
+
+ if (apic->setup_apic_routing)
+ apic->setup_apic_routing();
+}
+
+static void setup_apic_flat_routing(void)
{
#ifdef CONFIG_X86_IO_APIC
printk(KERN_INFO
@@ -103,7 +128,7 @@ struct apic apic_default = {
.init_apic_ldr = default_init_apic_ldr,
.ioapic_phys_id_map = default_ioapic_phys_id_map,
- .setup_apic_routing = default_setup_apic_routing,
+ .setup_apic_routing = setup_apic_flat_routing,
.multi_timer_check = NULL,
.apicid_to_node = default_apicid_to_node,
.cpu_to_logical_apicid = default_cpu_to_logical_apicid,
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index c4cbd3080c1..83e9be4778e 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -67,17 +67,8 @@ void __init default_setup_apic_routing(void)
}
#endif
- if (apic == &apic_flat) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (num_processors > 8)
- apic = &apic_physflat;
- break;
- case X86_VENDOR_AMD:
- if (max_physical_apicid >= 8)
- apic = &apic_physflat;
- }
- }
+ if (apic == &apic_flat && num_possible_cpus() > 8)
+ apic = &apic_physflat;
printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d56b0efb205..c085d52dbaf 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
*
* SGI UV APIC functions (note: not an Intel compatible APIC)
*
- * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved.
*/
#include <linux/cpumask.h>
#include <linux/hardirq.h>
@@ -17,9 +17,12 @@
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/timer.h>
+#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/kdebug.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
@@ -34,8 +37,13 @@
DEFINE_PER_CPU(int, x2apic_extra_bits);
+#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args)
+
static enum uv_system_type uv_system_type;
static u64 gru_start_paddr, gru_end_paddr;
+int uv_min_hub_revision_id;
+EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
+static DEFINE_SPINLOCK(uv_nmi_lock);
static inline bool is_GRU_range(u64 start, u64 end)
{
@@ -55,20 +63,28 @@ static int early_get_nodeid(void)
mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
node_id.v = *mmr;
early_iounmap(mmr, sizeof(*mmr));
+
+ /* Currently, all blades have same revision number */
+ uv_min_hub_revision_id = node_id.s.revision;
+
return node_id.s.node_id;
}
static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
+ int nodeid;
+
if (!strcmp(oem_id, "SGI")) {
+ nodeid = early_get_nodeid();
x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
+ x86_platform.nmi_init = uv_nmi_init;
if (!strcmp(oem_table_id, "UVL"))
uv_system_type = UV_LEGACY_APIC;
else if (!strcmp(oem_table_id, "UVX"))
uv_system_type = UV_X2APIC;
else if (!strcmp(oem_table_id, "UVH")) {
__get_cpu_var(x2apic_extra_bits) =
- early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1);
+ nodeid << (UV_APIC_PNODE_SHIFT - 1);
uv_system_type = UV_NON_UNIQUE_APIC;
return 1;
}
@@ -105,11 +121,9 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
unsigned long sn_rtc_cycles_per_second;
EXPORT_SYMBOL(sn_rtc_cycles_per_second);
-/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
-
static const struct cpumask *uv_target_cpus(void)
{
- return cpumask_of(0);
+ return cpu_online_mask;
}
static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
@@ -374,13 +388,13 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
enum map_type {map_wb, map_uc};
-static __init void map_high(char *id, unsigned long base, int shift,
- int max_pnode, enum map_type map_type)
+static __init void map_high(char *id, unsigned long base, int pshift,
+ int bshift, int max_pnode, enum map_type map_type)
{
unsigned long bytes, paddr;
- paddr = base << shift;
- bytes = (1UL << shift) * (max_pnode + 1);
+ paddr = base << pshift;
+ bytes = (1UL << bshift) * (max_pnode + 1);
printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
paddr + bytes);
if (map_type == map_uc)
@@ -396,7 +410,7 @@ static __init void map_gru_high(int max_pnode)
gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
if (gru.s.enable) {
- map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
+ map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb);
gru_start_paddr = ((u64)gru.s.base << shift);
gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
@@ -410,7 +424,7 @@ static __init void map_mmr_high(int max_pnode)
mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
if (mmr.s.enable)
- map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
+ map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc);
}
static __init void map_mmioh_high(int max_pnode)
@@ -420,7 +434,8 @@ static __init void map_mmioh_high(int max_pnode)
mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
if (mmioh.s.enable)
- map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
+ map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io,
+ max_pnode, map_uc);
}
static __init void map_low_mmrs(void)
@@ -472,7 +487,7 @@ static void uv_heartbeat(unsigned long ignored)
static void __cpuinit uv_heartbeat_enable(int cpu)
{
- if (!uv_cpu_hub_info(cpu)->scir.enabled) {
+ while (!uv_cpu_hub_info(cpu)->scir.enabled) {
struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
@@ -480,11 +495,10 @@ static void __cpuinit uv_heartbeat_enable(int cpu)
timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
add_timer_on(timer, cpu);
uv_cpu_hub_info(cpu)->scir.enabled = 1;
- }
- /* check boot cpu */
- if (!uv_cpu_hub_info(0)->scir.enabled)
- uv_heartbeat_enable(0);
+ /* also ensure that boot cpu is enabled */
+ cpu = 0;
+ }
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -543,6 +557,30 @@ late_initcall(uv_init_heartbeat);
#endif /* !CONFIG_HOTPLUG_CPU */
+/* Direct Legacy VGA I/O traffic to designated IOH */
+int uv_set_vga_state(struct pci_dev *pdev, bool decode,
+ unsigned int command_bits, bool change_bridge)
+{
+ int domain, bus, rc;
+
+ PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n",
+ pdev->devfn, decode, command_bits, change_bridge);
+
+ if (!change_bridge)
+ return 0;
+
+ if ((command_bits & PCI_COMMAND_IO) == 0)
+ return 0;
+
+ domain = pci_domain_nr(pdev->bus);
+ bus = pdev->bus->number;
+
+ rc = uv_bios_set_legacy_vga_target(decode, domain, bus);
+ PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc);
+
+ return rc;
+}
+
/*
* Called on each cpu to initialize the per_cpu UV data area.
* FIXME: hotplug not supported yet
@@ -559,6 +597,46 @@ void __cpuinit uv_cpu_init(void)
set_x2apic_extra_bits(uv_hub_info->pnode);
}
+/*
+ * When NMI is received, print a stack trace.
+ */
+int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
+{
+ if (reason != DIE_NMI_IPI)
+ return NOTIFY_OK;
+ /*
+ * Use a lock so only one cpu prints at a time
+ * to prevent intermixed output.
+ */
+ spin_lock(&uv_nmi_lock);
+ pr_info("NMI stack dump cpu %u:\n", smp_processor_id());
+ dump_stack();
+ spin_unlock(&uv_nmi_lock);
+
+ return NOTIFY_STOP;
+}
+
+static struct notifier_block uv_dump_stack_nmi_nb = {
+ .notifier_call = uv_handle_nmi
+};
+
+void uv_register_nmi_notifier(void)
+{
+ if (register_die_notifier(&uv_dump_stack_nmi_nb))
+ printk(KERN_WARNING "UV NMI handler failed to register\n");
+}
+
+void uv_nmi_init(void)
+{
+ unsigned int value;
+
+ /*
+ * Unmask NMI on all cpus
+ */
+ value = apic_read(APIC_LVT1) | APIC_DM_NMI;
+ value &= ~APIC_LVT_MASKED;
+ apic_write(APIC_LVT1, value);
+}
void __init uv_system_init(void)
{
@@ -624,13 +702,15 @@ void __init uv_system_init(void)
}
uv_bios_init();
- uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
- &sn_coherency_id, &sn_region_size);
+ uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id,
+ &sn_region_size, &system_serial_number);
uv_rtc_init();
for_each_present_cpu(cpu) {
+ int apicid = per_cpu(x86_cpu_to_apicid, cpu);
+
nid = cpu_to_node(cpu);
- pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
+ pnode = uv_apicid_to_pnode(apicid);
blade = boot_pnode_to_blade(pnode);
lcpu = uv_blade_info[blade].nr_possible_cpus;
uv_blade_info[blade].nr_possible_cpus++;
@@ -651,15 +731,13 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
- uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
+ uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
uv_node_to_blade[nid] = blade;
uv_cpu_to_blade[cpu] = blade;
max_pnode = max(pnode, max_pnode);
- printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, "
- "lcpu %d, blade %d\n",
- cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
- lcpu, blade);
+ printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
+ cpu, apicid, pnode, nid, lcpu, blade);
}
/* Add blade/pnode info for nodes without cpus */
@@ -680,5 +758,9 @@ void __init uv_system_init(void)
uv_cpu_init();
uv_scir_register_cpu_notifier();
+ uv_register_nmi_notifier();
proc_mkdir("sgi_uv", NULL);
+
+ /* register Legacy VGA I/O redirection handler */
+ pci_register_set_vga_state(uv_set_vga_state);
}
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index b5b6b23bce5..031aa887b0e 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1992,8 +1992,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
apm_info.disabled = 1;
printk(KERN_INFO "%s machine detected. "
"Disabling APM.\n", d->ident);
- printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n");
- printk(KERN_INFO "download from support.intel.com \n");
+ printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n");
+ printk(KERN_INFO "download from support.intel.com\n");
}
return 0;
}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index b0206a211b0..8bc57baaa9a 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -15,8 +15,8 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
- * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
- * Copyright (c) Russ Anderson
+ * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Russ Anderson <rja@sgi.com>
*/
#include <linux/efi.h>
@@ -30,6 +30,7 @@ static struct uv_systab uv_systab;
s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
{
struct uv_systab *tab = &uv_systab;
+ s64 ret;
if (!tab->function)
/*
@@ -37,9 +38,11 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
*/
return BIOS_STATUS_UNIMPLEMENTED;
- return efi_call6((void *)__va(tab->function),
- (u64)which, a1, a2, a3, a4, a5);
+ ret = efi_call6((void *)__va(tab->function), (u64)which,
+ a1, a2, a3, a4, a5);
+ return ret;
}
+EXPORT_SYMBOL_GPL(uv_bios_call);
s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
u64 a4, u64 a5)
@@ -73,11 +76,14 @@ long sn_coherency_id;
EXPORT_SYMBOL_GPL(sn_coherency_id);
long sn_region_size;
EXPORT_SYMBOL_GPL(sn_region_size);
+long system_serial_number;
+EXPORT_SYMBOL_GPL(system_serial_number);
int uv_type;
+EXPORT_SYMBOL_GPL(uv_type);
s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
- long *region)
+ long *region, long *ssn)
{
s64 ret;
u64 v0, v1;
@@ -97,8 +103,11 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
*coher = part.coherence_id;
if (region)
*region = part.region_size;
+ if (ssn)
+ *ssn = v1;
return ret;
}
+EXPORT_SYMBOL_GPL(uv_bios_get_sn_info);
int
uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
@@ -154,6 +163,25 @@ s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
}
EXPORT_SYMBOL_GPL(uv_bios_freq_base);
+/*
+ * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target
+ * @decode: true to enable target, false to disable target
+ * @domain: PCI domain number
+ * @bus: PCI bus number
+ *
+ * Returns:
+ * 0: Success
+ * -EINVAL: Invalid domain or bus number
+ * -ENOSYS: Capability not available
+ * -EBUSY: Legacy VGA I/O cannot be retargeted at this time
+ */
+int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
+{
+ return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET,
+ (u64)decode, (u64)domain, (u64)bus, 0, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
+
#ifdef CONFIG_EFI
void uv_bios_init(void)
@@ -185,4 +213,3 @@ void uv_bios_init(void)
void uv_bios_init(void) { }
#endif
-
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 30f25a75fe2..5de7f4c5697 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -5,7 +5,6 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/string.h>
-#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/acpi.h>
#include <asm/io.h>
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 1d2cb383410..c202b62f367 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -19,8 +19,6 @@ obj-y += vmware.o hypervisor.o sched.o
obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
obj-$(CONFIG_X86_64) += bugs_64.o
-obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
-
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 468489b57aa..97ad79cdf68 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -32,6 +32,10 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
+ { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
+ { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
+ { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
+ { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
{ 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
deleted file mode 100644
index b368cd86299..00000000000
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ /dev/null
@@ -1,688 +0,0 @@
-/*
- * CPU x86 architecture debug code
- *
- * Copyright(C) 2009 Jaswinder Singh Rajput
- *
- * For licencing details see kernel-base/COPYING
- */
-
-#include <linux/interrupt.h>
-#include <linux/compiler.h>
-#include <linux/seq_file.h>
-#include <linux/debugfs.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/signal.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/smp.h>
-
-#include <asm/cpu_debug.h>
-#include <asm/paravirt.h>
-#include <asm/system.h>
-#include <asm/traps.h>
-#include <asm/apic.h>
-#include <asm/desc.h>
-
-static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpud_arr);
-static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], cpud_priv_arr);
-static DEFINE_PER_CPU(int, cpud_priv_count);
-
-static DEFINE_MUTEX(cpu_debug_lock);
-
-static struct dentry *cpu_debugfs_dir;
-
-static struct cpu_debug_base cpu_base[] = {
- { "mc", CPU_MC, 0 },
- { "monitor", CPU_MONITOR, 0 },
- { "time", CPU_TIME, 0 },
- { "pmc", CPU_PMC, 1 },
- { "platform", CPU_PLATFORM, 0 },
- { "apic", CPU_APIC, 0 },
- { "poweron", CPU_POWERON, 0 },
- { "control", CPU_CONTROL, 0 },
- { "features", CPU_FEATURES, 0 },
- { "lastbranch", CPU_LBRANCH, 0 },
- { "bios", CPU_BIOS, 0 },
- { "freq", CPU_FREQ, 0 },
- { "mtrr", CPU_MTRR, 0 },
- { "perf", CPU_PERF, 0 },
- { "cache", CPU_CACHE, 0 },
- { "sysenter", CPU_SYSENTER, 0 },
- { "therm", CPU_THERM, 0 },
- { "misc", CPU_MISC, 0 },
- { "debug", CPU_DEBUG, 0 },
- { "pat", CPU_PAT, 0 },
- { "vmx", CPU_VMX, 0 },
- { "call", CPU_CALL, 0 },
- { "base", CPU_BASE, 0 },
- { "ver", CPU_VER, 0 },
- { "conf", CPU_CONF, 0 },
- { "smm", CPU_SMM, 0 },
- { "svm", CPU_SVM, 0 },
- { "osvm", CPU_OSVM, 0 },
- { "tss", CPU_TSS, 0 },
- { "cr", CPU_CR, 0 },
- { "dt", CPU_DT, 0 },
- { "registers", CPU_REG_ALL, 0 },
-};
-
-static struct cpu_file_base cpu_file[] = {
- { "index", CPU_REG_ALL, 0 },
- { "value", CPU_REG_ALL, 1 },
-};
-
-/* CPU Registers Range */
-static struct cpu_debug_range cpu_reg_range[] = {
- { 0x00000000, 0x00000001, CPU_MC, },
- { 0x00000006, 0x00000007, CPU_MONITOR, },
- { 0x00000010, 0x00000010, CPU_TIME, },
- { 0x00000011, 0x00000013, CPU_PMC, },
- { 0x00000017, 0x00000017, CPU_PLATFORM, },
- { 0x0000001B, 0x0000001B, CPU_APIC, },
- { 0x0000002A, 0x0000002B, CPU_POWERON, },
- { 0x0000002C, 0x0000002C, CPU_FREQ, },
- { 0x0000003A, 0x0000003A, CPU_CONTROL, },
- { 0x00000040, 0x00000047, CPU_LBRANCH, },
- { 0x00000060, 0x00000067, CPU_LBRANCH, },
- { 0x00000079, 0x00000079, CPU_BIOS, },
- { 0x00000088, 0x0000008A, CPU_CACHE, },
- { 0x0000008B, 0x0000008B, CPU_BIOS, },
- { 0x0000009B, 0x0000009B, CPU_MONITOR, },
- { 0x000000C1, 0x000000C4, CPU_PMC, },
- { 0x000000CD, 0x000000CD, CPU_FREQ, },
- { 0x000000E7, 0x000000E8, CPU_PERF, },
- { 0x000000FE, 0x000000FE, CPU_MTRR, },
-
- { 0x00000116, 0x0000011E, CPU_CACHE, },
- { 0x00000174, 0x00000176, CPU_SYSENTER, },
- { 0x00000179, 0x0000017B, CPU_MC, },
- { 0x00000186, 0x00000189, CPU_PMC, },
- { 0x00000198, 0x00000199, CPU_PERF, },
- { 0x0000019A, 0x0000019A, CPU_TIME, },
- { 0x0000019B, 0x0000019D, CPU_THERM, },
- { 0x000001A0, 0x000001A0, CPU_MISC, },
- { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
- { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
- { 0x000001D9, 0x000001D9, CPU_DEBUG, },
- { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
-
- { 0x00000200, 0x0000020F, CPU_MTRR, },
- { 0x00000250, 0x00000250, CPU_MTRR, },
- { 0x00000258, 0x00000259, CPU_MTRR, },
- { 0x00000268, 0x0000026F, CPU_MTRR, },
- { 0x00000277, 0x00000277, CPU_PAT, },
- { 0x000002FF, 0x000002FF, CPU_MTRR, },
-
- { 0x00000300, 0x00000311, CPU_PMC, },
- { 0x00000345, 0x00000345, CPU_PMC, },
- { 0x00000360, 0x00000371, CPU_PMC, },
- { 0x0000038D, 0x00000390, CPU_PMC, },
- { 0x000003A0, 0x000003BE, CPU_PMC, },
- { 0x000003C0, 0x000003CD, CPU_PMC, },
- { 0x000003E0, 0x000003E1, CPU_PMC, },
- { 0x000003F0, 0x000003F2, CPU_PMC, },
-
- { 0x00000400, 0x00000417, CPU_MC, },
- { 0x00000480, 0x0000048B, CPU_VMX, },
-
- { 0x00000600, 0x00000600, CPU_DEBUG, },
- { 0x00000680, 0x0000068F, CPU_LBRANCH, },
- { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
-
- { 0x000107CC, 0x000107D3, CPU_PMC, },
-
- { 0xC0000080, 0xC0000080, CPU_FEATURES, },
- { 0xC0000081, 0xC0000084, CPU_CALL, },
- { 0xC0000100, 0xC0000102, CPU_BASE, },
- { 0xC0000103, 0xC0000103, CPU_TIME, },
-
- { 0xC0010000, 0xC0010007, CPU_PMC, },
- { 0xC0010010, 0xC0010010, CPU_CONF, },
- { 0xC0010015, 0xC0010015, CPU_CONF, },
- { 0xC0010016, 0xC001001A, CPU_MTRR, },
- { 0xC001001D, 0xC001001D, CPU_MTRR, },
- { 0xC001001F, 0xC001001F, CPU_CONF, },
- { 0xC0010030, 0xC0010035, CPU_BIOS, },
- { 0xC0010044, 0xC0010048, CPU_MC, },
- { 0xC0010050, 0xC0010056, CPU_SMM, },
- { 0xC0010058, 0xC0010058, CPU_CONF, },
- { 0xC0010060, 0xC0010060, CPU_CACHE, },
- { 0xC0010061, 0xC0010068, CPU_SMM, },
- { 0xC0010069, 0xC001006B, CPU_SMM, },
- { 0xC0010070, 0xC0010071, CPU_SMM, },
- { 0xC0010111, 0xC0010113, CPU_SMM, },
- { 0xC0010114, 0xC0010118, CPU_SVM, },
- { 0xC0010140, 0xC0010141, CPU_OSVM, },
- { 0xC0011022, 0xC0011023, CPU_CONF, },
-};
-
-static int is_typeflag_valid(unsigned cpu, unsigned flag)
-{
- int i;
-
- /* Standard Registers should be always valid */
- if (flag >= CPU_TSS)
- return 1;
-
- for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
- if (cpu_reg_range[i].flag == flag)
- return 1;
- }
-
- /* Invalid */
- return 0;
-}
-
-static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
- int index, unsigned flag)
-{
- if (cpu_reg_range[index].flag == flag) {
- *min = cpu_reg_range[index].min;
- *max = cpu_reg_range[index].max;
- } else
- *max = 0;
-
- return *max;
-}
-
-/* This function can also be called with seq = NULL for printk */
-static void print_cpu_data(struct seq_file *seq, unsigned type,
- u32 low, u32 high)
-{
- struct cpu_private *priv;
- u64 val = high;
-
- if (seq) {
- priv = seq->private;
- if (priv->file) {
- val = (val << 32) | low;
- seq_printf(seq, "0x%llx\n", val);
- } else
- seq_printf(seq, " %08x: %08x_%08x\n",
- type, high, low);
- } else
- printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
-}
-
-/* This function can also be called with seq = NULL for printk */
-static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
-{
- unsigned msr, msr_min, msr_max;
- struct cpu_private *priv;
- u32 low, high;
- int i;
-
- if (seq) {
- priv = seq->private;
- if (priv->file) {
- if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
- &low, &high))
- print_cpu_data(seq, priv->reg, low, high);
- return;
- }
- }
-
- for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
- if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
- continue;
-
- for (msr = msr_min; msr <= msr_max; msr++) {
- if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
- continue;
- print_cpu_data(seq, msr, low, high);
- }
- }
-}
-
-static void print_tss(void *arg)
-{
- struct pt_regs *regs = task_pt_regs(current);
- struct seq_file *seq = arg;
- unsigned int seg;
-
- seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
- seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
- seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
- seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
-
- seq_printf(seq, " RSI\t: %016lx\n", regs->si);
- seq_printf(seq, " RDI\t: %016lx\n", regs->di);
- seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
- seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
-
-#ifdef CONFIG_X86_64
- seq_printf(seq, " R08\t: %016lx\n", regs->r8);
- seq_printf(seq, " R09\t: %016lx\n", regs->r9);
- seq_printf(seq, " R10\t: %016lx\n", regs->r10);
- seq_printf(seq, " R11\t: %016lx\n", regs->r11);
- seq_printf(seq, " R12\t: %016lx\n", regs->r12);
- seq_printf(seq, " R13\t: %016lx\n", regs->r13);
- seq_printf(seq, " R14\t: %016lx\n", regs->r14);
- seq_printf(seq, " R15\t: %016lx\n", regs->r15);
-#endif
-
- asm("movl %%cs,%0" : "=r" (seg));
- seq_printf(seq, " CS\t: %04x\n", seg);
- asm("movl %%ds,%0" : "=r" (seg));
- seq_printf(seq, " DS\t: %04x\n", seg);
- seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
- asm("movl %%es,%0" : "=r" (seg));
- seq_printf(seq, " ES\t: %04x\n", seg);
- asm("movl %%fs,%0" : "=r" (seg));
- seq_printf(seq, " FS\t: %04x\n", seg);
- asm("movl %%gs,%0" : "=r" (seg));
- seq_printf(seq, " GS\t: %04x\n", seg);
-
- seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
-
- seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
-}
-
-static void print_cr(void *arg)
-{
- struct seq_file *seq = arg;
-
- seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
- seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
- seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
- seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
-#ifdef CONFIG_X86_64
- seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
-#endif
-}
-
-static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
-{
- seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
-}
-
-static void print_dt(void *seq)
-{
- struct desc_ptr dt;
- unsigned long ldt;
-
- /* IDT */
- store_idt((struct desc_ptr *)&dt);
- print_desc_ptr("IDT", seq, dt);
-
- /* GDT */
- store_gdt((struct desc_ptr *)&dt);
- print_desc_ptr("GDT", seq, dt);
-
- /* LDT */
- store_ldt(ldt);
- seq_printf(seq, " LDT\t: %016lx\n", ldt);
-
- /* TR */
- store_tr(ldt);
- seq_printf(seq, " TR\t: %016lx\n", ldt);
-}
-
-static void print_dr(void *arg)
-{
- struct seq_file *seq = arg;
- unsigned long dr;
- int i;
-
- for (i = 0; i < 8; i++) {
- /* Ignore db4, db5 */
- if ((i == 4) || (i == 5))
- continue;
- get_debugreg(dr, i);
- seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
- }
-
- seq_printf(seq, "\n MSR\t:\n");
-}
-
-static void print_apic(void *arg)
-{
- struct seq_file *seq = arg;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(seq, " LAPIC\t:\n");
- seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
- seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
- seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
- seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
- seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
- seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
- seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
- seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
- seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
- seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
- seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
- seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
- seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
- seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
- seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
- seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
- seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
- seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
- seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
- seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
- seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
- if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
- unsigned int i, v, maxeilvt;
-
- v = apic_read(APIC_EFEAT);
- maxeilvt = (v >> 16) & 0xff;
- seq_printf(seq, " EFEAT\t\t: %08x\n", v);
- seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
-
- for (i = 0; i < maxeilvt; i++) {
- v = apic_read(APIC_EILVTn(i));
- seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
- }
- }
-#endif /* CONFIG_X86_LOCAL_APIC */
- seq_printf(seq, "\n MSR\t:\n");
-}
-
-static int cpu_seq_show(struct seq_file *seq, void *v)
-{
- struct cpu_private *priv = seq->private;
-
- if (priv == NULL)
- return -EINVAL;
-
- switch (cpu_base[priv->type].flag) {
- case CPU_TSS:
- smp_call_function_single(priv->cpu, print_tss, seq, 1);
- break;
- case CPU_CR:
- smp_call_function_single(priv->cpu, print_cr, seq, 1);
- break;
- case CPU_DT:
- smp_call_function_single(priv->cpu, print_dt, seq, 1);
- break;
- case CPU_DEBUG:
- if (priv->file == CPU_INDEX_BIT)
- smp_call_function_single(priv->cpu, print_dr, seq, 1);
- print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
- break;
- case CPU_APIC:
- if (priv->file == CPU_INDEX_BIT)
- smp_call_function_single(priv->cpu, print_apic, seq, 1);
- print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
- break;
-
- default:
- print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
- break;
- }
- seq_printf(seq, "\n");
-
- return 0;
-}
-
-static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
-{
- if (*pos == 0) /* One time is enough ;-) */
- return seq;
-
- return NULL;
-}
-
-static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- (*pos)++;
-
- return cpu_seq_start(seq, pos);
-}
-
-static void cpu_seq_stop(struct seq_file *seq, void *v)
-{
-}
-
-static const struct seq_operations cpu_seq_ops = {
- .start = cpu_seq_start,
- .next = cpu_seq_next,
- .stop = cpu_seq_stop,
- .show = cpu_seq_show,
-};
-
-static int cpu_seq_open(struct inode *inode, struct file *file)
-{
- struct cpu_private *priv = inode->i_private;
- struct seq_file *seq;
- int err;
-
- err = seq_open(file, &cpu_seq_ops);
- if (!err) {
- seq = file->private_data;
- seq->private = priv;
- }
-
- return err;
-}
-
-static int write_msr(struct cpu_private *priv, u64 val)
-{
- u32 low, high;
-
- high = (val >> 32) & 0xffffffff;
- low = val & 0xffffffff;
-
- if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
- return 0;
-
- return -EPERM;
-}
-
-static int write_cpu_register(struct cpu_private *priv, const char *buf)
-{
- int ret = -EPERM;
- u64 val;
-
- ret = strict_strtoull(buf, 0, &val);
- if (ret < 0)
- return ret;
-
- /* Supporting only MSRs */
- if (priv->type < CPU_TSS_BIT)
- return write_msr(priv, val);
-
- return ret;
-}
-
-static ssize_t cpu_write(struct file *file, const char __user *ubuf,
- size_t count, loff_t *off)
-{
- struct seq_file *seq = file->private_data;
- struct cpu_private *priv = seq->private;
- char buf[19];
-
- if ((priv == NULL) || (count >= sizeof(buf)))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, count))
- return -EFAULT;
-
- buf[count] = 0;
-
- if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
- if (!write_cpu_register(priv, buf))
- return count;
-
- return -EACCES;
-}
-
-static const struct file_operations cpu_fops = {
- .owner = THIS_MODULE,
- .open = cpu_seq_open,
- .read = seq_read,
- .write = cpu_write,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
- unsigned file, struct dentry *dentry)
-{
- struct cpu_private *priv = NULL;
-
- /* Already intialized */
- if (file == CPU_INDEX_BIT)
- if (per_cpu(cpud_arr[type].init, cpu))
- return 0;
-
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
- if (priv == NULL)
- return -ENOMEM;
-
- priv->cpu = cpu;
- priv->type = type;
- priv->reg = reg;
- priv->file = file;
- mutex_lock(&cpu_debug_lock);
- per_cpu(cpud_priv_arr[type], cpu) = priv;
- per_cpu(cpud_priv_count, cpu)++;
- mutex_unlock(&cpu_debug_lock);
-
- if (file)
- debugfs_create_file(cpu_file[file].name, S_IRUGO,
- dentry, (void *)priv, &cpu_fops);
- else {
- debugfs_create_file(cpu_base[type].name, S_IRUGO,
- per_cpu(cpud_arr[type].dentry, cpu),
- (void *)priv, &cpu_fops);
- mutex_lock(&cpu_debug_lock);
- per_cpu(cpud_arr[type].init, cpu) = 1;
- mutex_unlock(&cpu_debug_lock);
- }
-
- return 0;
-}
-
-static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
- struct dentry *dentry)
-{
- unsigned file;
- int err = 0;
-
- for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
- err = cpu_create_file(cpu, type, reg, file, dentry);
- if (err)
- return err;
- }
-
- return err;
-}
-
-static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
-{
- struct dentry *cpu_dentry = NULL;
- unsigned reg, reg_min, reg_max;
- int i, err = 0;
- char reg_dir[12];
- u32 low, high;
-
- for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
- if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
- cpu_base[type].flag))
- continue;
-
- for (reg = reg_min; reg <= reg_max; reg++) {
- if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
- continue;
-
- sprintf(reg_dir, "0x%x", reg);
- cpu_dentry = debugfs_create_dir(reg_dir, dentry);
- err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
- if (err)
- return err;
- }
- }
-
- return err;
-}
-
-static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
-{
- struct dentry *cpu_dentry = NULL;
- unsigned type;
- int err = 0;
-
- for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
- if (!is_typeflag_valid(cpu, cpu_base[type].flag))
- continue;
- cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
- per_cpu(cpud_arr[type].dentry, cpu) = cpu_dentry;
-
- if (type < CPU_TSS_BIT)
- err = cpu_init_msr(cpu, type, cpu_dentry);
- else
- err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
- cpu_dentry);
- if (err)
- return err;
- }
-
- return err;
-}
-
-static int cpu_init_cpu(void)
-{
- struct dentry *cpu_dentry = NULL;
- struct cpuinfo_x86 *cpui;
- char cpu_dir[12];
- unsigned cpu;
- int err = 0;
-
- for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
- cpui = &cpu_data(cpu);
- if (!cpu_has(cpui, X86_FEATURE_MSR))
- continue;
-
- sprintf(cpu_dir, "cpu%d", cpu);
- cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
- err = cpu_init_allreg(cpu, cpu_dentry);
-
- pr_info("cpu%d(%d) debug files %d\n",
- cpu, nr_cpu_ids, per_cpu(cpud_priv_count, cpu));
- if (per_cpu(cpud_priv_count, cpu) > MAX_CPU_FILES) {
- pr_err("Register files count %d exceeds limit %d\n",
- per_cpu(cpud_priv_count, cpu), MAX_CPU_FILES);
- per_cpu(cpud_priv_count, cpu) = MAX_CPU_FILES;
- err = -ENFILE;
- }
- if (err)
- return err;
- }
-
- return err;
-}
-
-static int __init cpu_debug_init(void)
-{
- cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
-
- return cpu_init_cpu();
-}
-
-static void __exit cpu_debug_exit(void)
-{
- int i, cpu;
-
- if (cpu_debugfs_dir)
- debugfs_remove_recursive(cpu_debugfs_dir);
-
- for (cpu = 0; cpu < nr_cpu_ids; cpu++)
- for (i = 0; i < per_cpu(cpud_priv_count, cpu); i++)
- kfree(per_cpu(cpud_priv_arr[i], cpu));
-}
-
-module_init(cpu_debug_init);
-module_exit(cpu_debug_exit);
-
-MODULE_AUTHOR("Jaswinder Singh Rajput");
-MODULE_DESCRIPTION("CPU Debug module");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index f138c6c389b..870e6cc6ad2 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -10,6 +10,20 @@ if CPU_FREQ
comment "CPUFreq processor drivers"
+config X86_PCC_CPUFREQ
+ tristate "Processor Clocking Control interface driver"
+ depends on ACPI && ACPI_PROCESSOR
+ help
+ This driver adds support for the PCC interface.
+
+ For details, take a look at:
+ <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
+
+ To compile this driver as a module, choose M here: the
+ module will be called pcc-cpufreq.
+
+ If in doubt, say N.
+
config X86_ACPI_CPUFREQ
tristate "ACPI Processor P-States driver"
select CPU_FREQ_TABLE
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 509296df294..1840c0a5170 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -4,6 +4,7 @@
obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
+obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 1b1920fa7c8..459168083b7 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
#include <linux/cpufreq.h>
#include <linux/compiler.h>
#include <linux/dmi.h>
+#include <linux/slab.h>
#include <trace/events/power.h>
#include <linux/acpi.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index 006b278b0d5..c587db472a7 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -20,7 +20,6 @@
#include <linux/module.h>
#include <linux/init.h>
-#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/cpufreq.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index ac27ec2264d..16e3483be9e 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -80,6 +80,7 @@
#include <linux/cpufreq.h>
#include <linux/pci.h>
#include <linux/errno.h>
+#include <linux/slab.h>
#include <asm/processor-cyrix.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index da5f70fcb76..e7b559d74c5 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -9,7 +9,6 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
-#include <linux/slab.h>
#include <linux/cpufreq.h>
#include <linux/timex.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 86961519372..7b8a8ba67b0 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -25,7 +25,6 @@
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/cpufreq.h>
-#include <linux/slab.h>
#include <linux/cpumask.h>
#include <linux/timex.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
new file mode 100644
index 00000000000..ce7cde713e7
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -0,0 +1,621 @@
+/*
+ * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
+ *
+ * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
+ * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
+ * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
+ * INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+#include <linux/cpufreq.h>
+#include <linux/compiler.h>
+#include <linux/slab.h>
+
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+
+#include <acpi/processor.h>
+
+#define PCC_VERSION "1.00.00"
+#define POLL_LOOPS 300
+
+#define CMD_COMPLETE 0x1
+#define CMD_GET_FREQ 0x0
+#define CMD_SET_FREQ 0x1
+
+#define BUF_SZ 4
+
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+ "pcc-cpufreq", msg)
+
+struct pcc_register_resource {
+ u8 descriptor;
+ u16 length;
+ u8 space_id;
+ u8 bit_width;
+ u8 bit_offset;
+ u8 access_size;
+ u64 address;
+} __attribute__ ((packed));
+
+struct pcc_memory_resource {
+ u8 descriptor;
+ u16 length;
+ u8 space_id;
+ u8 resource_usage;
+ u8 type_specific;
+ u64 granularity;
+ u64 minimum;
+ u64 maximum;
+ u64 translation_offset;
+ u64 address_length;
+} __attribute__ ((packed));
+
+static struct cpufreq_driver pcc_cpufreq_driver;
+
+struct pcc_header {
+ u32 signature;
+ u16 length;
+ u8 major;
+ u8 minor;
+ u32 features;
+ u16 command;
+ u16 status;
+ u32 latency;
+ u32 minimum_time;
+ u32 maximum_time;
+ u32 nominal;
+ u32 throttled_frequency;
+ u32 minimum_frequency;
+};
+
+static void __iomem *pcch_virt_addr;
+static struct pcc_header __iomem *pcch_hdr;
+
+static DEFINE_SPINLOCK(pcc_lock);
+
+static struct acpi_generic_address doorbell;
+
+static u64 doorbell_preserve;
+static u64 doorbell_write;
+
+static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
+ 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
+
+struct pcc_cpu {
+ u32 input_offset;
+ u32 output_offset;
+};
+
+static struct pcc_cpu *pcc_cpu_info;
+
+static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
+{
+ cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
+ policy->cpuinfo.max_freq);
+ return 0;
+}
+
+static inline void pcc_cmd(void)
+{
+ u64 doorbell_value;
+ int i;
+
+ acpi_read(&doorbell_value, &doorbell);
+ acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
+ &doorbell);
+
+ for (i = 0; i < POLL_LOOPS; i++) {
+ if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
+ break;
+ }
+}
+
+static inline void pcc_clear_mapping(void)
+{
+ if (pcch_virt_addr)
+ iounmap(pcch_virt_addr);
+ pcch_virt_addr = NULL;
+}
+
+static unsigned int pcc_get_freq(unsigned int cpu)
+{
+ struct pcc_cpu *pcc_cpu_data;
+ unsigned int curr_freq;
+ unsigned int freq_limit;
+ u16 status;
+ u32 input_buffer;
+ u32 output_buffer;
+
+ spin_lock(&pcc_lock);
+
+ dprintk("get: get_freq for CPU %d\n", cpu);
+ pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
+
+ input_buffer = 0x1;
+ iowrite32(input_buffer,
+ (pcch_virt_addr + pcc_cpu_data->input_offset));
+ iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
+
+ pcc_cmd();
+
+ output_buffer =
+ ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
+
+ /* Clear the input buffer - we are done with the current command */
+ memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
+
+ status = ioread16(&pcch_hdr->status);
+ if (status != CMD_COMPLETE) {
+ dprintk("get: FAILED: for CPU %d, status is %d\n",
+ cpu, status);
+ goto cmd_incomplete;
+ }
+ iowrite16(0, &pcch_hdr->status);
+ curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
+ / 100) * 1000);
+
+ dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
+ "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
+ cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
+ output_buffer, curr_freq);
+
+ freq_limit = (output_buffer >> 8) & 0xff;
+ if (freq_limit != 0xff) {
+ dprintk("get: frequency for cpu %d is being temporarily"
+ " capped at %d\n", cpu, curr_freq);
+ }
+
+ spin_unlock(&pcc_lock);
+ return curr_freq;
+
+cmd_incomplete:
+ iowrite16(0, &pcch_hdr->status);
+ spin_unlock(&pcc_lock);
+ return -EINVAL;
+}
+
+static int pcc_cpufreq_target(struct cpufreq_policy *policy,
+ unsigned int target_freq,
+ unsigned int relation)
+{
+ struct pcc_cpu *pcc_cpu_data;
+ struct cpufreq_freqs freqs;
+ u16 status;
+ u32 input_buffer;
+ int cpu;
+
+ spin_lock(&pcc_lock);
+ cpu = policy->cpu;
+ pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
+
+ dprintk("target: CPU %d should go to target freq: %d "
+ "(virtual) input_offset is 0x%x\n",
+ cpu, target_freq,
+ (pcch_virt_addr + pcc_cpu_data->input_offset));
+
+ freqs.new = target_freq;
+ freqs.cpu = cpu;
+ cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+
+ input_buffer = 0x1 | (((target_freq * 100)
+ / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
+ iowrite32(input_buffer,
+ (pcch_virt_addr + pcc_cpu_data->input_offset));
+ iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
+
+ pcc_cmd();
+
+ /* Clear the input buffer - we are done with the current command */
+ memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
+
+ status = ioread16(&pcch_hdr->status);
+ if (status != CMD_COMPLETE) {
+ dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
+ cpu, status);
+ goto cmd_incomplete;
+ }
+ iowrite16(0, &pcch_hdr->status);
+
+ cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+ dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
+ spin_unlock(&pcc_lock);
+
+ return 0;
+
+cmd_incomplete:
+ iowrite16(0, &pcch_hdr->status);
+ spin_unlock(&pcc_lock);
+ return -EINVAL;
+}
+
+static int pcc_get_offset(int cpu)
+{
+ acpi_status status;
+ struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
+ union acpi_object *pccp, *offset;
+ struct pcc_cpu *pcc_cpu_data;
+ struct acpi_processor *pr;
+ int ret = 0;
+
+ pr = per_cpu(processors, cpu);
+ pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
+
+ status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ pccp = buffer.pointer;
+ if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
+ ret = -ENODEV;
+ goto out_free;
+ };
+
+ offset = &(pccp->package.elements[0]);
+ if (!offset || offset->type != ACPI_TYPE_INTEGER) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ pcc_cpu_data->input_offset = offset->integer.value;
+
+ offset = &(pccp->package.elements[1]);
+ if (!offset || offset->type != ACPI_TYPE_INTEGER) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ pcc_cpu_data->output_offset = offset->integer.value;
+
+ memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
+ memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
+
+ dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
+ "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
+ cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
+out_free:
+ kfree(buffer.pointer);
+ return ret;
+}
+
+static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
+{
+ acpi_status status;
+ struct acpi_object_list input;
+ struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
+ union acpi_object in_params[4];
+ union acpi_object *out_obj;
+ u32 capabilities[2];
+ u32 errors;
+ u32 supported;
+ int ret = 0;
+
+ input.count = 4;
+ input.pointer = in_params;
+ input.count = 4;
+ input.pointer = in_params;
+ in_params[0].type = ACPI_TYPE_BUFFER;
+ in_params[0].buffer.length = 16;
+ in_params[0].buffer.pointer = OSC_UUID;
+ in_params[1].type = ACPI_TYPE_INTEGER;
+ in_params[1].integer.value = 1;
+ in_params[2].type = ACPI_TYPE_INTEGER;
+ in_params[2].integer.value = 2;
+ in_params[3].type = ACPI_TYPE_BUFFER;
+ in_params[3].buffer.length = 8;
+ in_params[3].buffer.pointer = (u8 *)&capabilities;
+
+ capabilities[0] = OSC_QUERY_ENABLE;
+ capabilities[1] = 0x1;
+
+ status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ if (!output.length)
+ return -ENODEV;
+
+ out_obj = output.pointer;
+ if (out_obj->type != ACPI_TYPE_BUFFER) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
+ if (errors) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ supported = *((u32 *)(out_obj->buffer.pointer + 4));
+ if (!(supported & 0x1)) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ kfree(output.pointer);
+ capabilities[0] = 0x0;
+ capabilities[1] = 0x1;
+
+ status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ if (!output.length)
+ return -ENODEV;
+
+ out_obj = output.pointer;
+ if (out_obj->type != ACPI_TYPE_BUFFER) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
+ if (errors) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ supported = *((u32 *)(out_obj->buffer.pointer + 4));
+ if (!(supported & 0x1)) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+out_free:
+ kfree(output.pointer);
+ return ret;
+}
+
+static int __init pcc_cpufreq_probe(void)
+{
+ acpi_status status;
+ struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
+ struct pcc_memory_resource *mem_resource;
+ struct pcc_register_resource *reg_resource;
+ union acpi_object *out_obj, *member;
+ acpi_handle handle, osc_handle;
+ int ret = 0;
+
+ status = acpi_get_handle(NULL, "\\_SB", &handle);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ status = acpi_get_handle(handle, "_OSC", &osc_handle);
+ if (ACPI_SUCCESS(status)) {
+ ret = pcc_cpufreq_do_osc(&osc_handle);
+ if (ret)
+ dprintk("probe: _OSC evaluation did not succeed\n");
+ /* Firmware's use of _OSC is optional */
+ ret = 0;
+ }
+
+ status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ out_obj = output.pointer;
+ if (out_obj->type != ACPI_TYPE_PACKAGE) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ member = &out_obj->package.elements[0];
+ if (member->type != ACPI_TYPE_BUFFER) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
+
+ dprintk("probe: mem_resource descriptor: 0x%x,"
+ " length: %d, space_id: %d, resource_usage: %d,"
+ " type_specific: %d, granularity: 0x%llx,"
+ " minimum: 0x%llx, maximum: 0x%llx,"
+ " translation_offset: 0x%llx, address_length: 0x%llx\n",
+ mem_resource->descriptor, mem_resource->length,
+ mem_resource->space_id, mem_resource->resource_usage,
+ mem_resource->type_specific, mem_resource->granularity,
+ mem_resource->minimum, mem_resource->maximum,
+ mem_resource->translation_offset,
+ mem_resource->address_length);
+
+ if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
+ mem_resource->address_length);
+ if (pcch_virt_addr == NULL) {
+ dprintk("probe: could not map shared mem region\n");
+ goto out_free;
+ }
+ pcch_hdr = pcch_virt_addr;
+
+ dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
+ dprintk("probe: PCCH header is at physical address: 0x%llx,"
+ " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
+ " supported features: 0x%x, command field: 0x%x,"
+ " status field: 0x%x, nominal latency: %d us\n",
+ mem_resource->minimum, ioread32(&pcch_hdr->signature),
+ ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
+ ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
+ ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
+ ioread32(&pcch_hdr->latency));
+
+ dprintk("probe: min time between commands: %d us,"
+ " max time between commands: %d us,"
+ " nominal CPU frequency: %d MHz,"
+ " minimum CPU frequency: %d MHz,"
+ " minimum CPU frequency without throttling: %d MHz\n",
+ ioread32(&pcch_hdr->minimum_time),
+ ioread32(&pcch_hdr->maximum_time),
+ ioread32(&pcch_hdr->nominal),
+ ioread32(&pcch_hdr->throttled_frequency),
+ ioread32(&pcch_hdr->minimum_frequency));
+
+ member = &out_obj->package.elements[1];
+ if (member->type != ACPI_TYPE_BUFFER) {
+ ret = -ENODEV;
+ goto pcch_free;
+ }
+
+ reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
+
+ doorbell.space_id = reg_resource->space_id;
+ doorbell.bit_width = reg_resource->bit_width;
+ doorbell.bit_offset = reg_resource->bit_offset;
+ doorbell.access_width = 64;
+ doorbell.address = reg_resource->address;
+
+ dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
+ "bit_offset is %d, access_width is %d, address is 0x%llx\n",
+ doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
+ doorbell.access_width, reg_resource->address);
+
+ member = &out_obj->package.elements[2];
+ if (member->type != ACPI_TYPE_INTEGER) {
+ ret = -ENODEV;
+ goto pcch_free;
+ }
+
+ doorbell_preserve = member->integer.value;
+
+ member = &out_obj->package.elements[3];
+ if (member->type != ACPI_TYPE_INTEGER) {
+ ret = -ENODEV;
+ goto pcch_free;
+ }
+
+ doorbell_write = member->integer.value;
+
+ dprintk("probe: doorbell_preserve: 0x%llx,"
+ " doorbell_write: 0x%llx\n",
+ doorbell_preserve, doorbell_write);
+
+ pcc_cpu_info = alloc_percpu(struct pcc_cpu);
+ if (!pcc_cpu_info) {
+ ret = -ENOMEM;
+ goto pcch_free;
+ }
+
+ printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
+ " limits: %d MHz, %d MHz\n", PCC_VERSION,
+ ioread32(&pcch_hdr->minimum_frequency),
+ ioread32(&pcch_hdr->nominal));
+ kfree(output.pointer);
+ return ret;
+pcch_free:
+ pcc_clear_mapping();
+out_free:
+ kfree(output.pointer);
+ return ret;
+}
+
+static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+ unsigned int cpu = policy->cpu;
+ unsigned int result = 0;
+
+ if (!pcch_virt_addr) {
+ result = -1;
+ goto pcch_null;
+ }
+
+ result = pcc_get_offset(cpu);
+ if (result) {
+ dprintk("init: PCCP evaluation failed\n");
+ goto free;
+ }
+
+ policy->max = policy->cpuinfo.max_freq =
+ ioread32(&pcch_hdr->nominal) * 1000;
+ policy->min = policy->cpuinfo.min_freq =
+ ioread32(&pcch_hdr->minimum_frequency) * 1000;
+ policy->cur = pcc_get_freq(cpu);
+
+ dprintk("init: policy->max is %d, policy->min is %d\n",
+ policy->max, policy->min);
+
+ return 0;
+free:
+ pcc_clear_mapping();
+ free_percpu(pcc_cpu_info);
+pcch_null:
+ return result;
+}
+
+static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+ return 0;
+}
+
+static struct cpufreq_driver pcc_cpufreq_driver = {
+ .flags = CPUFREQ_CONST_LOOPS,
+ .get = pcc_get_freq,
+ .verify = pcc_cpufreq_verify,
+ .target = pcc_cpufreq_target,
+ .init = pcc_cpufreq_cpu_init,
+ .exit = pcc_cpufreq_cpu_exit,
+ .name = "pcc-cpufreq",
+ .owner = THIS_MODULE,
+};
+
+static int __init pcc_cpufreq_init(void)
+{
+ int ret;
+
+ if (acpi_disabled)
+ return 0;
+
+ ret = pcc_cpufreq_probe();
+ if (ret) {
+ dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
+ return ret;
+ }
+
+ ret = cpufreq_register_driver(&pcc_cpufreq_driver);
+
+ return ret;
+}
+
+static void __exit pcc_cpufreq_exit(void)
+{
+ cpufreq_unregister_driver(&pcc_cpufreq_driver);
+
+ pcc_clear_mapping();
+
+ free_percpu(pcc_cpu_info);
+}
+
+MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
+MODULE_VERSION(PCC_VERSION);
+MODULE_DESCRIPTION("Processor Clocking Control interface driver");
+MODULE_LICENSE("GPL");
+
+late_initcall(pcc_cpufreq_init);
+module_exit(pcc_cpufreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index cb01dac267d..b3379d6a5c5 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -13,7 +13,6 @@
#include <linux/init.h>
#include <linux/cpufreq.h>
#include <linux/ioport.h>
-#include <linux/slab.h>
#include <linux/timex.h>
#include <linux/io.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index f125e5c551c..d360b56e982 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -806,7 +806,7 @@ static int find_psb_table(struct powernow_k8_data *data)
static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
unsigned int index)
{
- acpi_integer control;
+ u64 control;
if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
return;
@@ -824,7 +824,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
{
struct cpufreq_frequency_table *powernow_table;
int ret_val = -ENODEV;
- acpi_integer control, status;
+ u64 control, status;
if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
dprintk("register performance failed: bad ACPI data\n");
@@ -948,7 +948,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
u32 fid;
u32 vid;
u32 freq, index;
- acpi_integer status, control;
+ u64 status, control;
if (data->exttype) {
status = data->acpi_data.states[i].status;
@@ -1356,6 +1356,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
kfree(data->powernow_table);
kfree(data);
+ per_cpu(powernow_data, pol->cpu) = NULL;
return 0;
}
@@ -1375,7 +1376,7 @@ static unsigned int powernowk8_get(unsigned int cpu)
int err;
if (!data)
- return -EINVAL;
+ return 0;
smp_call_function_single(cpu, query_values_on_cpu, &err, true);
if (err)
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 8d672ef162c..9b1ff37de46 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -20,6 +20,7 @@
#include <linux/sched.h> /* current */
#include <linux/delay.h>
#include <linux/compiler.h>
+#include <linux/gfp.h>
#include <asm/msr.h>
#include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 2ce8e0b5cc5..561758e9518 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -23,7 +23,6 @@
#include <linux/init.h>
#include <linux/cpufreq.h>
#include <linux/pci.h>
-#include <linux/slab.h>
#include <linux/sched.h>
#include "speedstep-lib.h"
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index ad0083abfa2..a94ec6be69f 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -13,7 +13,6 @@
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/cpufreq.h>
-#include <linux/slab.h>
#include <asm/msr.h>
#include <asm/tsc.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index 04d73c114e4..8abd869baab 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -17,7 +17,6 @@
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/cpufreq.h>
-#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/io.h>
#include <asm/ist.h>
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 879666f4d87..7e1cca13af3 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -70,7 +70,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
- sched_clock_stable = 1;
+ if (!check_tsc_unstable())
+ sched_clock_stable = 1;
}
/*
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index c2b722d5a72..b3eeb66c0a5 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -18,6 +18,7 @@
#include <asm/processor.h>
#include <linux/smp.h>
#include <asm/k8.h>
+#include <asm/smp.h>
#define LVL_1_INST 1
#define LVL_1_DATA 2
@@ -152,7 +153,8 @@ struct _cpuid4_info {
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
unsigned long size;
- unsigned long can_disable;
+ bool can_disable;
+ unsigned int l3_indices;
DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
};
@@ -162,7 +164,8 @@ struct _cpuid4_info_regs {
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
unsigned long size;
- unsigned long can_disable;
+ bool can_disable;
+ unsigned int l3_indices;
};
unsigned short num_cache_leaves;
@@ -292,6 +295,36 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
(ebx->split.ways_of_associativity + 1) - 1;
}
+struct _cache_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct _cpuid4_info *, char *);
+ ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
+};
+
+#ifdef CONFIG_CPU_SUP_AMD
+static unsigned int __cpuinit amd_calc_l3_indices(void)
+{
+ /*
+ * We're called over smp_call_function_single() and therefore
+ * are on the correct cpu.
+ */
+ int cpu = smp_processor_id();
+ int node = cpu_to_node(cpu);
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+ unsigned int sc0, sc1, sc2, sc3;
+ u32 val = 0;
+
+ pci_read_config_dword(dev, 0x1C4, &val);
+
+ /* calculate subcache sizes */
+ sc0 = !(val & BIT(0));
+ sc1 = !(val & BIT(4));
+ sc2 = !(val & BIT(8)) + !(val & BIT(9));
+ sc3 = !(val & BIT(12)) + !(val & BIT(13));
+
+ return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+}
+
static void __cpuinit
amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
{
@@ -301,12 +334,103 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
if (boot_cpu_data.x86 == 0x11)
return;
- /* see erratum #382 */
- if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
+ /* see errata #382 and #388 */
+ if ((boot_cpu_data.x86 == 0x10) &&
+ ((boot_cpu_data.x86_model < 0x8) ||
+ (boot_cpu_data.x86_mask < 0x1)))
return;
- this_leaf->can_disable = 1;
+ this_leaf->can_disable = true;
+ this_leaf->l3_indices = amd_calc_l3_indices();
+}
+
+static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+ unsigned int index)
+{
+ int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ int node = amd_get_nb_id(cpu);
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+ unsigned int reg = 0;
+
+ if (!this_leaf->can_disable)
+ return -EINVAL;
+
+ if (!dev)
+ return -EINVAL;
+
+ pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+ return sprintf(buf, "0x%08x\n", reg);
+}
+
+#define SHOW_CACHE_DISABLE(index) \
+static ssize_t \
+show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
+{ \
+ return show_cache_disable(this_leaf, buf, index); \
+}
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
+
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+ const char *buf, size_t count, unsigned int index)
+{
+ int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ int node = amd_get_nb_id(cpu);
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+ unsigned long val = 0;
+
+#define SUBCACHE_MASK (3UL << 20)
+#define SUBCACHE_INDEX 0xfff
+
+ if (!this_leaf->can_disable)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!dev)
+ return -EINVAL;
+
+ if (strict_strtoul(buf, 10, &val) < 0)
+ return -EINVAL;
+
+ /* do not allow writes outside of allowed bits */
+ if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
+ ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
+ return -EINVAL;
+
+ val |= BIT(30);
+ pci_write_config_dword(dev, 0x1BC + index * 4, val);
+ /*
+ * We need to WBINVD on a core on the node containing the L3 cache which
+ * indices we disable therefore a simple wbinvd() is not sufficient.
+ */
+ wbinvd_on_cpu(cpu);
+ pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
+ return count;
+}
+
+#define STORE_CACHE_DISABLE(index) \
+static ssize_t \
+store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
+ const char *buf, size_t count) \
+{ \
+ return store_cache_disable(this_leaf, buf, count, index); \
}
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
+
+static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
+ show_cache_disable_0, store_cache_disable_0);
+static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
+ show_cache_disable_1, store_cache_disable_1);
+
+#else /* CONFIG_CPU_SUP_AMD */
+static void __cpuinit
+amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+{
+};
+#endif /* CONFIG_CPU_SUP_AMD */
static int
__cpuinit cpuid4_cache_lookup_regs(int index,
@@ -713,82 +837,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
#define to_object(k) container_of(k, struct _index_kobject, kobj)
#define to_attr(a) container_of(a, struct _cache_attr, attr)
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int index)
-{
- int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- int node = cpu_to_node(cpu);
- struct pci_dev *dev = node_to_k8_nb_misc(node);
- unsigned int reg = 0;
-
- if (!this_leaf->can_disable)
- return -EINVAL;
-
- if (!dev)
- return -EINVAL;
-
- pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
- return sprintf(buf, "%x\n", reg);
-}
-
-#define SHOW_CACHE_DISABLE(index) \
-static ssize_t \
-show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
-{ \
- return show_cache_disable(this_leaf, buf, index); \
-}
-SHOW_CACHE_DISABLE(0)
-SHOW_CACHE_DISABLE(1)
-
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
- const char *buf, size_t count, unsigned int index)
-{
- int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- int node = cpu_to_node(cpu);
- struct pci_dev *dev = node_to_k8_nb_misc(node);
- unsigned long val = 0;
- unsigned int scrubber = 0;
-
- if (!this_leaf->can_disable)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!dev)
- return -EINVAL;
-
- if (strict_strtoul(buf, 10, &val) < 0)
- return -EINVAL;
-
- val |= 0xc0000000;
-
- pci_read_config_dword(dev, 0x58, &scrubber);
- scrubber &= ~0x1f000000;
- pci_write_config_dword(dev, 0x58, scrubber);
-
- pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
- wbinvd();
- pci_write_config_dword(dev, 0x1BC + index * 4, val);
- return count;
-}
-
-#define STORE_CACHE_DISABLE(index) \
-static ssize_t \
-store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
- const char *buf, size_t count) \
-{ \
- return store_cache_disable(this_leaf, buf, count, index); \
-}
-STORE_CACHE_DISABLE(0)
-STORE_CACHE_DISABLE(1)
-
-struct _cache_attr {
- struct attribute attr;
- ssize_t (*show)(struct _cpuid4_info *, char *);
- ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
-};
-
#define define_one_ro(_name) \
static struct _cache_attr _name = \
__ATTR(_name, 0444, show_##_name, NULL)
@@ -803,23 +851,28 @@ define_one_ro(size);
define_one_ro(shared_cpu_map);
define_one_ro(shared_cpu_list);
-static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
- show_cache_disable_0, store_cache_disable_0);
-static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
- show_cache_disable_1, store_cache_disable_1);
+#define DEFAULT_SYSFS_CACHE_ATTRS \
+ &type.attr, \
+ &level.attr, \
+ &coherency_line_size.attr, \
+ &physical_line_partition.attr, \
+ &ways_of_associativity.attr, \
+ &number_of_sets.attr, \
+ &size.attr, \
+ &shared_cpu_map.attr, \
+ &shared_cpu_list.attr
static struct attribute *default_attrs[] = {
- &type.attr,
- &level.attr,
- &coherency_line_size.attr,
- &physical_line_partition.attr,
- &ways_of_associativity.attr,
- &number_of_sets.attr,
- &size.attr,
- &shared_cpu_map.attr,
- &shared_cpu_list.attr,
+ DEFAULT_SYSFS_CACHE_ATTRS,
+ NULL
+};
+
+static struct attribute *default_l3_attrs[] = {
+ DEFAULT_SYSFS_CACHE_ATTRS,
+#ifdef CONFIG_CPU_SUP_AMD
&cache_disable_0.attr,
&cache_disable_1.attr,
+#endif
NULL
};
@@ -850,7 +903,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
return ret;
}
-static struct sysfs_ops sysfs_ops = {
+static const struct sysfs_ops sysfs_ops = {
.show = show,
.store = store,
};
@@ -910,6 +963,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
unsigned int cpu = sys_dev->id;
unsigned long i, j;
struct _index_kobject *this_object;
+ struct _cpuid4_info *this_leaf;
int retval;
retval = cpuid4_cache_sysfs_init(cpu);
@@ -928,6 +982,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
this_object = INDEX_KOBJECT_PTR(cpu, i);
this_object->cpu = cpu;
this_object->index = i;
+
+ this_leaf = CPUID4_INFO_IDX(cpu, i);
+
+ if (this_leaf->can_disable)
+ ktype_cache.default_attrs = default_l3_attrs;
+ else
+ ktype_cache.default_attrs = default_attrs;
+
retval = kobject_init_and_add(&(this_object->kobj),
&ktype_cache,
per_cpu(ici_cache_kobject, cpu),
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 73734baa50f..e7dbde7bfed 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -22,6 +22,7 @@
#include <linux/kdebug.h>
#include <linux/cpu.h>
#include <linux/sched.h>
+#include <linux/gfp.h>
#include <asm/mce.h>
#include <asm/apic.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a8aacd4b513..7a355ddcc64 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -26,6 +26,7 @@
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
+#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
@@ -46,6 +47,13 @@
#include "mce-internal.h"
+static DEFINE_MUTEX(mce_read_mutex);
+
+#define rcu_dereference_check_mce(p) \
+ rcu_dereference_check((p), \
+ rcu_read_lock_sched_held() || \
+ lockdep_is_held(&mce_read_mutex))
+
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
@@ -158,7 +166,7 @@ void mce_log(struct mce *mce)
mce->finished = 0;
wmb();
for (;;) {
- entry = rcu_dereference(mcelog.next);
+ entry = rcu_dereference_check_mce(mcelog.next);
for (;;) {
/*
* When the buffer fills up discard new entries.
@@ -531,7 +539,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
struct mce m;
int i;
- __get_cpu_var(mce_poll_count)++;
+ percpu_inc(mce_poll_count);
mce_setup(&m);
@@ -926,7 +934,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
atomic_inc(&mce_entry);
- __get_cpu_var(mce_exception_count)++;
+ percpu_inc(mce_exception_count);
if (notify_die(DIE_NMI, "machine check", regs, error_code,
18, SIGKILL) == NOTIFY_STOP)
@@ -1485,8 +1493,6 @@ static void collect_tscs(void *data)
rdtscll(cpu_tsc[smp_processor_id()]);
}
-static DEFINE_MUTEX(mce_read_mutex);
-
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
loff_t *off)
{
@@ -1500,7 +1506,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
return -ENOMEM;
mutex_lock(&mce_read_mutex);
- next = rcu_dereference(mcelog.next);
+ next = rcu_dereference_check_mce(mcelog.next);
/* Only supports full reads right now */
if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
@@ -1565,7 +1571,7 @@ timeout:
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &mce_wait, wait);
- if (rcu_dereference(mcelog.next))
+ if (rcu_dereference_check_mce(mcelog.next))
return POLLIN | POLLRDNORM;
return 0;
}
@@ -2044,6 +2050,7 @@ static __init void mce_init_banks(void)
struct mce_bank *b = &mce_banks[i];
struct sysdev_attribute *a = &b->attr;
+ sysfs_attr_init(&a->attr);
a->attr.name = b->attrname;
snprintf(b->attrname, ATTR_LEN, "bank%d", i);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 83a3d1f4efc..224392d8fe8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -21,6 +21,7 @@
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
+#include <linux/slab.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/smp.h>
@@ -388,7 +389,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
return ret;
}
-static struct sysfs_ops threshold_ops = {
+static const struct sysfs_ops threshold_ops = {
.show = show,
.store = store,
};
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 7c785634af2..62b48e40920 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -5,6 +5,7 @@
* Author: Andi Kleen
*/
+#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
@@ -95,7 +96,7 @@ static void cmci_discover(int banks, int boot)
/* Already owned by someone else? */
if (val & CMCI_EN) {
- if (test_and_clear_bit(i, owned) || boot)
+ if (test_and_clear_bit(i, owned) && !boot)
print_update("SHD", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
continue;
@@ -107,7 +108,7 @@ static void cmci_discover(int banks, int boot)
/* Did the enable bit stick? -- the bank supports CMCI */
if (val & CMCI_EN) {
- if (!test_and_set_bit(i, owned) || boot)
+ if (!test_and_set_bit(i, owned) && !boot)
print_update("CMCI", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
} else {
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index f4361b56f8e..ad9e5ed8118 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
-obj-y := main.o if.o generic.o state.o cleanup.o
+obj-y := main.o if.o generic.o cleanup.o
obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 33af14110df..92ba9cd31c9 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
return 0;
}
-static struct mtrr_ops amd_mtrr_ops = {
+static const struct mtrr_ops amd_mtrr_ops = {
.vendor = X86_VENDOR_AMD,
.set = amd_set_mtrr,
.get = amd_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index de89f14eff3..316fe3e60a9 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
return 0;
}
-static struct mtrr_ops centaur_mtrr_ops = {
+static const struct mtrr_ops centaur_mtrr_ops = {
.vendor = X86_VENDOR_CENTAUR,
.set = centaur_set_mcr,
.get = centaur_get_mcr,
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 09b1698e046..06130b52f01 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -22,10 +22,10 @@
#include <linux/pci.h>
#include <linux/smp.h>
#include <linux/cpu.h>
-#include <linux/sort.h>
#include <linux/mutex.h>
#include <linux/uaccess.h>
#include <linux/kvm_para.h>
+#include <linux/range.h>
#include <asm/processor.h>
#include <asm/e820.h>
@@ -34,11 +34,6 @@
#include "mtrr.h"
-struct res_range {
- unsigned long start;
- unsigned long end;
-};
-
struct var_mtrr_range_state {
unsigned long base_pfn;
unsigned long size_pfn;
@@ -56,7 +51,7 @@ struct var_mtrr_state {
/* Should be related to MTRR_VAR_RANGES nums */
#define RANGE_NUM 256
-static struct res_range __initdata range[RANGE_NUM];
+static struct range __initdata range[RANGE_NUM];
static int __initdata nr_range;
static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
@@ -64,152 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
static int __initdata debug_print;
#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
-
-static int __init
-add_range(struct res_range *range, int nr_range,
- unsigned long start, unsigned long end)
-{
- /* Out of slots: */
- if (nr_range >= RANGE_NUM)
- return nr_range;
-
- range[nr_range].start = start;
- range[nr_range].end = end;
-
- nr_range++;
-
- return nr_range;
-}
-
-static int __init
-add_range_with_merge(struct res_range *range, int nr_range,
- unsigned long start, unsigned long end)
-{
- int i;
-
- /* Try to merge it with old one: */
- for (i = 0; i < nr_range; i++) {
- unsigned long final_start, final_end;
- unsigned long common_start, common_end;
-
- if (!range[i].end)
- continue;
-
- common_start = max(range[i].start, start);
- common_end = min(range[i].end, end);
- if (common_start > common_end + 1)
- continue;
-
- final_start = min(range[i].start, start);
- final_end = max(range[i].end, end);
-
- range[i].start = final_start;
- range[i].end = final_end;
- return nr_range;
- }
-
- /* Need to add it: */
- return add_range(range, nr_range, start, end);
-}
-
-static void __init
-subtract_range(struct res_range *range, unsigned long start, unsigned long end)
-{
- int i, j;
-
- for (j = 0; j < RANGE_NUM; j++) {
- if (!range[j].end)
- continue;
-
- if (start <= range[j].start && end >= range[j].end) {
- range[j].start = 0;
- range[j].end = 0;
- continue;
- }
-
- if (start <= range[j].start && end < range[j].end &&
- range[j].start < end + 1) {
- range[j].start = end + 1;
- continue;
- }
-
-
- if (start > range[j].start && end >= range[j].end &&
- range[j].end > start - 1) {
- range[j].end = start - 1;
- continue;
- }
-
- if (start > range[j].start && end < range[j].end) {
- /* Find the new spare: */
- for (i = 0; i < RANGE_NUM; i++) {
- if (range[i].end == 0)
- break;
- }
- if (i < RANGE_NUM) {
- range[i].end = range[j].end;
- range[i].start = end + 1;
- } else {
- printk(KERN_ERR "run of slot in ranges\n");
- }
- range[j].end = start - 1;
- continue;
- }
- }
-}
-
-static int __init cmp_range(const void *x1, const void *x2)
-{
- const struct res_range *r1 = x1;
- const struct res_range *r2 = x2;
- long start1, start2;
-
- start1 = r1->start;
- start2 = r2->start;
-
- return start1 - start2;
-}
-
-static int __init clean_sort_range(struct res_range *range, int az)
-{
- int i, j, k = az - 1, nr_range = 0;
-
- for (i = 0; i < k; i++) {
- if (range[i].end)
- continue;
- for (j = k; j > i; j--) {
- if (range[j].end) {
- k = j;
- break;
- }
- }
- if (j == i)
- break;
- range[i].start = range[k].start;
- range[i].end = range[k].end;
- range[k].start = 0;
- range[k].end = 0;
- k--;
- }
- /* count it */
- for (i = 0; i < az; i++) {
- if (!range[i].end) {
- nr_range = i;
- break;
- }
- }
-
- /* sort them */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
-
- return nr_range;
-}
-
#define BIOS_BUG_MSG KERN_WARNING \
"WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
static int __init
-x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+x86_get_mtrr_mem_range(struct range *range, int nr_range,
unsigned long extra_remove_base,
unsigned long extra_remove_size)
{
@@ -223,14 +77,14 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
continue;
base = range_state[i].base_pfn;
size = range_state[i].size_pfn;
- nr_range = add_range_with_merge(range, nr_range, base,
- base + size - 1);
+ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
+ base, base + size);
}
if (debug_print) {
printk(KERN_DEBUG "After WB checking\n");
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
+ printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
}
/* Take out UC ranges: */
@@ -252,19 +106,19 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
size -= (1<<(20-PAGE_SHIFT)) - base;
base = 1<<(20-PAGE_SHIFT);
}
- subtract_range(range, base, base + size - 1);
+ subtract_range(range, RANGE_NUM, base, base + size);
}
if (extra_remove_size)
- subtract_range(range, extra_remove_base,
- extra_remove_base + extra_remove_size - 1);
+ subtract_range(range, RANGE_NUM, extra_remove_base,
+ extra_remove_base + extra_remove_size);
if (debug_print) {
printk(KERN_DEBUG "After UC checking\n");
for (i = 0; i < RANGE_NUM; i++) {
if (!range[i].end)
continue;
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
+ printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
}
}
@@ -273,26 +127,22 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
if (debug_print) {
printk(KERN_DEBUG "After sorting\n");
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
+ printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
}
- /* clear those is not used */
- for (i = nr_range; i < RANGE_NUM; i++)
- memset(&range[i], 0, sizeof(range[i]));
-
return nr_range;
}
#ifdef CONFIG_MTRR_SANITIZER
-static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+static unsigned long __init sum_ranges(struct range *range, int nr_range)
{
unsigned long sum = 0;
int i;
for (i = 0; i < nr_range; i++)
- sum += range[i].end + 1 - range[i].start;
+ sum += range[i].end - range[i].start;
return sum;
}
@@ -621,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg)
early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
static int __init
-x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+x86_setup_var_mtrrs(struct range *range, int nr_range,
u64 chunk_size, u64 gran_size)
{
struct var_mtrr_state var_state;
@@ -639,7 +489,7 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
/* Write the range: */
for (i = 0; i < nr_range; i++) {
set_var_mtrr_range(&var_state, range[i].start,
- range[i].end - range[i].start + 1);
+ range[i].end - range[i].start);
}
/* Write the last range: */
@@ -742,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
unsigned long x_remove_base,
unsigned long x_remove_size, int i)
{
- static struct res_range range_new[RANGE_NUM];
+ static struct range range_new[RANGE_NUM];
unsigned long range_sums_new;
static int nr_range_new;
int num_reg;
@@ -869,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits)
* [0, 1M) should always be covered by var mtrr with WB
* and fixed mtrrs should take effect before var mtrr for it:
*/
- nr_range = add_range_with_merge(range, nr_range, 0,
- (1ULL<<(20 - PAGE_SHIFT)) - 1);
+ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
+ 1ULL<<(20 - PAGE_SHIFT));
/* Sort the ranges: */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+ sort_range(range, nr_range);
range_sums = sum_ranges(range, nr_range);
printk(KERN_INFO "total RAM covered: %ldM\n",
@@ -1089,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
nr_range = 0;
if (mtrr_tom2) {
range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
- range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
- if (highest_pfn < range[nr_range].end + 1)
- highest_pfn = range[nr_range].end + 1;
+ range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
+ if (highest_pfn < range[nr_range].end)
+ highest_pfn = range[nr_range].end;
nr_range++;
}
nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
@@ -1103,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
/* Check the holes: */
for (i = 0; i < nr_range - 1; i++) {
- if (range[i].end + 1 < range[i+1].start)
- total_trim_size += real_trim_memory(range[i].end + 1,
+ if (range[i].end < range[i+1].start)
+ total_trim_size += real_trim_memory(range[i].end,
range[i+1].start);
}
/* Check the top: */
i = nr_range - 1;
- if (range[i].end + 1 < end_pfn)
- total_trim_size += real_trim_memory(range[i].end + 1,
+ if (range[i].end < end_pfn)
+ total_trim_size += real_trim_memory(range[i].end,
end_pfn);
if (total_trim_size) {
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 228d982ce09..68a3343e579 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -265,7 +265,7 @@ static void cyrix_set_all(void)
post_set();
}
-static struct mtrr_ops cyrix_mtrr_ops = {
+static const struct mtrr_ops cyrix_mtrr_ops = {
.vendor = X86_VENDOR_CYRIX,
.set_all = cyrix_set_all,
.set = cyrix_set_arr,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 55da0c5f68d..fd31a441c61 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -6,7 +6,6 @@
#include <linux/module.h>
#include <linux/init.h>
-#include <linux/slab.h>
#include <linux/io.h>
#include <linux/mm.h>
@@ -464,7 +463,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
tmp |= ~((1<<(hi - 1)) - 1);
if (tmp != mask_lo) {
- WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
+ printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
mask_lo = tmp;
}
}
@@ -570,7 +569,7 @@ static unsigned long set_mtrr_state(void)
static unsigned long cr4;
-static DEFINE_SPINLOCK(set_atomicity_lock);
+static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
/*
* Since we are disabling the cache don't allow any interrupts,
@@ -590,7 +589,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
* changes to the way the kernel boots
*/
- spin_lock(&set_atomicity_lock);
+ raw_spin_lock(&set_atomicity_lock);
/* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
cr0 = read_cr0() | X86_CR0_CD;
@@ -627,7 +626,7 @@ static void post_set(void) __releases(set_atomicity_lock)
/* Restore value of CR4 */
if (cpu_has_pge)
write_cr4(cr4);
- spin_unlock(&set_atomicity_lock);
+ raw_spin_unlock(&set_atomicity_lock);
}
static void generic_set_all(void)
@@ -752,7 +751,7 @@ int positive_have_wrcomb(void)
/*
* Generic structure...
*/
-struct mtrr_ops generic_mtrr_ops = {
+const struct mtrr_ops generic_mtrr_ops = {
.use_intel_if = 1,
.set_all = generic_set_all,
.get = generic_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index e006e56f699..79289632cb2 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -5,6 +5,7 @@
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/string.h>
+#include <linux/slab.h>
#include <linux/init.h>
#define LINE_SIZE 80
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 84e83de5457..79556bd9b60 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -60,14 +60,14 @@ static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
static bool mtrr_aps_delayed_init;
-static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
+static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
-struct mtrr_ops *mtrr_if;
+const struct mtrr_ops *mtrr_if;
static void set_mtrr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
-void set_mtrr_ops(struct mtrr_ops *ops)
+void set_mtrr_ops(const struct mtrr_ops *ops)
{
if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
mtrr_ops[ops->vendor] = ops;
@@ -145,6 +145,7 @@ struct set_mtrr_data {
/**
* ipi_handler - Synchronisation handler. Executed by "other" CPUs.
+ * @info: pointer to mtrr configuration data
*
* Returns nothing.
*/
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index a501dee9a87..df5e41f31a2 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -32,7 +32,7 @@ extern int generic_get_free_region(unsigned long base, unsigned long size,
extern int generic_validate_add_page(unsigned long base, unsigned long size,
unsigned int type);
-extern struct mtrr_ops generic_mtrr_ops;
+extern const struct mtrr_ops generic_mtrr_ops;
extern int positive_have_wrcomb(void);
@@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
void get_mtrr_state(void);
-extern void set_mtrr_ops(struct mtrr_ops *ops);
+extern void set_mtrr_ops(const struct mtrr_ops *ops);
extern u64 size_or_mask, size_and_mask;
-extern struct mtrr_ops *mtrr_if;
+extern const struct mtrr_ops *mtrr_if;
#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
deleted file mode 100644
index dfc80b4e6b0..00000000000
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ /dev/null
@@ -1,94 +0,0 @@
-#include <linux/init.h>
-#include <linux/io.h>
-#include <linux/mm.h>
-
-#include <asm/processor-cyrix.h>
-#include <asm/processor-flags.h>
-#include <asm/mtrr.h>
-#include <asm/msr.h>
-
-#include "mtrr.h"
-
-/* Put the processor into a state where MTRRs can be safely set */
-void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
-{
- unsigned int cr0;
-
- /* Disable interrupts locally */
- local_irq_save(ctxt->flags);
-
- if (use_intel() || is_cpu(CYRIX)) {
-
- /* Save value of CR4 and clear Page Global Enable (bit 7) */
- if (cpu_has_pge) {
- ctxt->cr4val = read_cr4();
- write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
- }
-
- /*
- * Disable and flush caches. Note that wbinvd flushes the TLBs
- * as a side-effect
- */
- cr0 = read_cr0() | X86_CR0_CD;
- wbinvd();
- write_cr0(cr0);
- wbinvd();
-
- if (use_intel()) {
- /* Save MTRR state */
- rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
- } else {
- /*
- * Cyrix ARRs -
- * everything else were excluded at the top
- */
- ctxt->ccr3 = getCx86(CX86_CCR3);
- }
- }
-}
-
-void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
-{
- if (use_intel()) {
- /* Disable MTRRs, and set the default type to uncached */
- mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
- ctxt->deftype_hi);
- } else {
- if (is_cpu(CYRIX)) {
- /* Cyrix ARRs - everything else were excluded at the top */
- setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
- }
- }
-}
-
-/* Restore the processor after a set_mtrr_prepare */
-void set_mtrr_done(struct set_mtrr_context *ctxt)
-{
- if (use_intel() || is_cpu(CYRIX)) {
-
- /* Flush caches and TLBs */
- wbinvd();
-
- /* Restore MTRRdefType */
- if (use_intel()) {
- /* Intel (P6) standard MTRRs */
- mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo,
- ctxt->deftype_hi);
- } else {
- /*
- * Cyrix ARRs -
- * everything else was excluded at the top
- */
- setCx86(CX86_CCR3, ctxt->ccr3);
- }
-
- /* Enable caches */
- write_cr0(read_cr0() & 0xbfffffff);
-
- /* Restore value of CR4 */
- if (cpu_has_pge)
- write_cr4(ctxt->cr4val);
- }
- /* Re-enable interrupts locally (if enabled previously) */
- local_irq_restore(ctxt->flags);
-}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c223b7e895d..db5bdc8addf 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -7,6 +7,7 @@
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
* Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
* Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ * Copyright (C) 2009 Google, Inc., Stephane Eranian
*
* For licencing details see kernel-base/COPYING
*/
@@ -20,12 +21,15 @@
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
+#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/cpu.h>
+#include <linux/bitops.h>
#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
+#include <asm/compat.h>
static u64 perf_event_mask __read_mostly;
@@ -68,26 +72,59 @@ struct debug_store {
u64 pebs_event_reset[MAX_PEBS_EVENTS];
};
+struct event_constraint {
+ union {
+ unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ u64 idxmsk64;
+ };
+ u64 code;
+ u64 cmask;
+ int weight;
+};
+
+struct amd_nb {
+ int nb_id; /* NorthBridge id */
+ int refcnt; /* reference count */
+ struct perf_event *owners[X86_PMC_IDX_MAX];
+ struct event_constraint event_constraints[X86_PMC_IDX_MAX];
+};
+
struct cpu_hw_events {
- struct perf_event *events[X86_PMC_IDX_MAX];
- unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long interrupts;
int enabled;
struct debug_store *ds;
-};
-struct event_constraint {
- unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- int code;
+ int n_events;
+ int n_added;
+ int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
+ u64 tags[X86_PMC_IDX_MAX];
+ struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+ struct amd_nb *amd_nb;
};
-#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
-#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 }
+#define __EVENT_CONSTRAINT(c, n, m, w) {\
+ { .idxmsk64 = (n) }, \
+ .code = (c), \
+ .cmask = (m), \
+ .weight = (w), \
+}
+
+#define EVENT_CONSTRAINT(c, n, m) \
+ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
-#define for_each_event_constraint(e, c) \
- for ((e) = (c); (e)->idxmsk[0]; (e)++)
+#define INTEL_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
+#define FIXED_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
+
+#define EVENT_CONSTRAINT_END \
+ EVENT_CONSTRAINT(0, 0, 0)
+
+#define for_each_event_constraint(e, c) \
+ for ((e) = (c); (e)->cmask; (e)++)
/*
* struct x86_pmu - generic x86 pmu
@@ -98,8 +135,8 @@ struct x86_pmu {
int (*handle_irq)(struct pt_regs *);
void (*disable_all)(void);
void (*enable_all)(void);
- void (*enable)(struct hw_perf_event *, int);
- void (*disable)(struct hw_perf_event *, int);
+ void (*enable)(struct perf_event *);
+ void (*disable)(struct perf_event *);
unsigned eventsel;
unsigned perfctr;
u64 (*event_map)(int);
@@ -114,121 +151,28 @@ struct x86_pmu {
u64 intel_ctrl;
void (*enable_bts)(u64 config);
void (*disable_bts)(void);
- int (*get_event_idx)(struct cpu_hw_events *cpuc,
- struct hw_perf_event *hwc);
-};
-static struct x86_pmu x86_pmu __read_mostly;
+ struct event_constraint *
+ (*get_event_constraints)(struct cpu_hw_events *cpuc,
+ struct perf_event *event);
-static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
- .enabled = 1,
-};
+ void (*put_event_constraints)(struct cpu_hw_events *cpuc,
+ struct perf_event *event);
+ struct event_constraint *event_constraints;
-static const struct event_constraint *event_constraints;
-
-/*
- * Not sure about some of these
- */
-static const u64 p6_perfmon_event_map[] =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
- [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
-};
-
-static u64 p6_pmu_event_map(int hw_event)
-{
- return p6_perfmon_event_map[hw_event];
-}
-
-/*
- * Event setting that is specified not to count anything.
- * We use this to effectively disable a counter.
- *
- * L2_RQSTS with 0 MESI unit mask.
- */
-#define P6_NOP_EVENT 0x0000002EULL
-
-static u64 p6_pmu_raw_event(u64 hw_event)
-{
-#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
-#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
-#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
-#define P6_EVNTSEL_INV_MASK 0x00800000ULL
-#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
-
-#define P6_EVNTSEL_MASK \
- (P6_EVNTSEL_EVENT_MASK | \
- P6_EVNTSEL_UNIT_MASK | \
- P6_EVNTSEL_EDGE_MASK | \
- P6_EVNTSEL_INV_MASK | \
- P6_EVNTSEL_REG_MASK)
-
- return hw_event & P6_EVNTSEL_MASK;
-}
-
-static const struct event_constraint intel_p6_event_constraints[] =
-{
- EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
- EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
- EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
- EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
- EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
- EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
- EVENT_CONSTRAINT_END
-};
-
-/*
- * Intel PerfMon v3. Used on Core2 and later.
- */
-static const u64 intel_perfmon_event_map[] =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
- [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
+ int (*cpu_prepare)(int cpu);
+ void (*cpu_starting)(int cpu);
+ void (*cpu_dying)(int cpu);
+ void (*cpu_dead)(int cpu);
};
-static const struct event_constraint intel_core_event_constraints[] =
-{
- EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
- EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
- EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
- EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
- EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
- EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
- EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
- EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
- EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
- EVENT_CONSTRAINT_END
-};
+static struct x86_pmu x86_pmu __read_mostly;
-static const struct event_constraint intel_nehalem_event_constraints[] =
-{
- EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
- EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
- EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
- EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
- EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
- EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
- EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
- EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
- EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
- EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
- EVENT_CONSTRAINT_END
+static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
+ .enabled = 1,
};
-static u64 intel_pmu_event_map(int hw_event)
-{
- return intel_perfmon_event_map[hw_event];
-}
+static int x86_perf_event_set_period(struct perf_event *event);
/*
* Generalized hw caching related hw_event table, filled
@@ -245,435 +189,18 @@ static u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX];
-static __initconst u64 nehalem_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
- [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
- [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
- [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
- [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
- [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
- [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
- [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static __initconst u64 core2_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
- [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
- [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
- [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
- [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
- [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static __initconst u64 atom_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
- [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
- [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
- [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static u64 intel_pmu_raw_event(u64 hw_event)
-{
-#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
-#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
-#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
-#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
-#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
-
-#define CORE_EVNTSEL_MASK \
- (CORE_EVNTSEL_EVENT_MASK | \
- CORE_EVNTSEL_UNIT_MASK | \
- CORE_EVNTSEL_EDGE_MASK | \
- CORE_EVNTSEL_INV_MASK | \
- CORE_EVNTSEL_REG_MASK)
-
- return hw_event & CORE_EVNTSEL_MASK;
-}
-
-static __initconst u64 amd_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
- [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
- [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
- [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
- [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
- [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
- [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
- [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-/*
- * AMD Performance Monitor K7 and later.
- */
-static const u64 amd_perfmon_event_map[] =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
-};
-
-static u64 amd_pmu_event_map(int hw_event)
-{
- return amd_perfmon_event_map[hw_event];
-}
-
-static u64 amd_pmu_raw_event(u64 hw_event)
-{
-#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
-#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
-#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
-#define K7_EVNTSEL_INV_MASK 0x000800000ULL
-#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
-
-#define K7_EVNTSEL_MASK \
- (K7_EVNTSEL_EVENT_MASK | \
- K7_EVNTSEL_UNIT_MASK | \
- K7_EVNTSEL_EDGE_MASK | \
- K7_EVNTSEL_INV_MASK | \
- K7_EVNTSEL_REG_MASK)
-
- return hw_event & K7_EVNTSEL_MASK;
-}
-
/*
* Propagate event elapsed time into the generic event.
* Can only be executed on the CPU where the event is active.
* Returns the delta events processed.
*/
static u64
-x86_perf_event_update(struct perf_event *event,
- struct hw_perf_event *hwc, int idx)
+x86_perf_event_update(struct perf_event *event)
{
+ struct hw_perf_event *hwc = &event->hw;
int shift = 64 - x86_pmu.event_bits;
u64 prev_raw_count, new_raw_count;
+ int idx = hwc->idx;
s64 delta;
if (idx == X86_PMC_IDX_FIXED_BTS)
@@ -773,7 +300,7 @@ static inline bool bts_available(void)
return x86_pmu.enable_bts != NULL;
}
-static inline void init_debug_store_on_cpu(int cpu)
+static void init_debug_store_on_cpu(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -785,7 +312,7 @@ static inline void init_debug_store_on_cpu(int cpu)
(u32)((u64)(unsigned long)ds >> 32));
}
-static inline void fini_debug_store_on_cpu(int cpu)
+static void fini_debug_store_on_cpu(int cpu)
{
if (!per_cpu(cpu_hw_events, cpu).ds)
return;
@@ -914,42 +441,6 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
return 0;
}
-static void intel_pmu_enable_bts(u64 config)
-{
- unsigned long debugctlmsr;
-
- debugctlmsr = get_debugctlmsr();
-
- debugctlmsr |= X86_DEBUGCTL_TR;
- debugctlmsr |= X86_DEBUGCTL_BTS;
- debugctlmsr |= X86_DEBUGCTL_BTINT;
-
- if (!(config & ARCH_PERFMON_EVENTSEL_OS))
- debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
-
- if (!(config & ARCH_PERFMON_EVENTSEL_USR))
- debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
-
- update_debugctlmsr(debugctlmsr);
-}
-
-static void intel_pmu_disable_bts(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- unsigned long debugctlmsr;
-
- if (!cpuc->ds)
- return;
-
- debugctlmsr = get_debugctlmsr();
-
- debugctlmsr &=
- ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
- X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
-
- update_debugctlmsr(debugctlmsr);
-}
-
/*
* Setup the hardware configuration for a given attr_type
*/
@@ -988,6 +479,8 @@ static int __hw_perf_event_init(struct perf_event *event)
hwc->config = ARCH_PERFMON_EVENTSEL_INT;
hwc->idx = -1;
+ hwc->last_cpu = -1;
+ hwc->last_tag = ~0ULL;
/*
* Count user and OS events unless requested not to.
@@ -1017,6 +510,9 @@ static int __hw_perf_event_init(struct perf_event *event)
*/
if (attr->type == PERF_TYPE_RAW) {
hwc->config |= x86_pmu.raw_event(attr->config);
+ if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
+ perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
return 0;
}
@@ -1056,216 +552,314 @@ static int __hw_perf_event_init(struct perf_event *event)
return 0;
}
-static void p6_pmu_disable_all(void)
+static void x86_pmu_disable_all(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- u64 val;
-
- if (!cpuc->enabled)
- return;
+ int idx;
- cpuc->enabled = 0;
- barrier();
+ for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ u64 val;
- /* p6 only has one enable register */
- rdmsrl(MSR_P6_EVNTSEL0, val);
- val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsrl(MSR_P6_EVNTSEL0, val);
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ rdmsrl(x86_pmu.eventsel + idx, val);
+ if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
+ continue;
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(x86_pmu.eventsel + idx, val);
+ }
}
-static void intel_pmu_disable_all(void)
+void hw_perf_disable(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ if (!x86_pmu_initialized())
+ return;
+
if (!cpuc->enabled)
return;
+ cpuc->n_added = 0;
cpuc->enabled = 0;
barrier();
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-
- if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
- intel_pmu_disable_bts();
+ x86_pmu.disable_all();
}
-static void amd_pmu_disable_all(void)
+static void x86_pmu_enable_all(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int idx;
- if (!cpuc->enabled)
- return;
-
- cpuc->enabled = 0;
- /*
- * ensure we write the disable before we start disabling the
- * events proper, so that amd_pmu_enable_event() does the
- * right thing.
- */
- barrier();
-
for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ struct perf_event *event = cpuc->events[idx];
u64 val;
if (!test_bit(idx, cpuc->active_mask))
continue;
- rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
- if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
- continue;
- val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+
+ val = event->hw.config;
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(x86_pmu.eventsel + idx, val);
}
}
-void hw_perf_disable(void)
+static const struct pmu pmu;
+
+static inline int is_x86_event(struct perf_event *event)
{
- if (!x86_pmu_initialized())
- return;
- return x86_pmu.disable_all();
+ return event->pmu == &pmu;
}
-static void p6_pmu_enable_all(void)
+static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- unsigned long val;
+ struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
+ unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ int i, j, w, wmax, num = 0;
+ struct hw_perf_event *hwc;
- if (cpuc->enabled)
- return;
+ bitmap_zero(used_mask, X86_PMC_IDX_MAX);
- cpuc->enabled = 1;
- barrier();
+ for (i = 0; i < n; i++) {
+ c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
+ constraints[i] = c;
+ }
- /* p6 only has one enable register */
- rdmsrl(MSR_P6_EVNTSEL0, val);
- val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsrl(MSR_P6_EVNTSEL0, val);
-}
+ /*
+ * fastpath, try to reuse previous register
+ */
+ for (i = 0; i < n; i++) {
+ hwc = &cpuc->event_list[i]->hw;
+ c = constraints[i];
-static void intel_pmu_enable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ /* never assigned */
+ if (hwc->idx == -1)
+ break;
- if (cpuc->enabled)
- return;
+ /* constraint still honored */
+ if (!test_bit(hwc->idx, c->idxmsk))
+ break;
- cpuc->enabled = 1;
- barrier();
+ /* not already used */
+ if (test_bit(hwc->idx, used_mask))
+ break;
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+ __set_bit(hwc->idx, used_mask);
+ if (assign)
+ assign[i] = hwc->idx;
+ }
+ if (i == n)
+ goto done;
- if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
- struct perf_event *event =
- cpuc->events[X86_PMC_IDX_FIXED_BTS];
+ /*
+ * begin slow path
+ */
- if (WARN_ON_ONCE(!event))
- return;
+ bitmap_zero(used_mask, X86_PMC_IDX_MAX);
- intel_pmu_enable_bts(event->hw.config);
- }
-}
+ /*
+ * weight = number of possible counters
+ *
+ * 1 = most constrained, only works on one counter
+ * wmax = least constrained, works on any counter
+ *
+ * assign events to counters starting with most
+ * constrained events.
+ */
+ wmax = x86_pmu.num_events;
-static void amd_pmu_enable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int idx;
+ /*
+ * when fixed event counters are present,
+ * wmax is incremented by 1 to account
+ * for one more choice
+ */
+ if (x86_pmu.num_events_fixed)
+ wmax++;
- if (cpuc->enabled)
- return;
+ for (w = 1, num = n; num && w <= wmax; w++) {
+ /* for each event */
+ for (i = 0; num && i < n; i++) {
+ c = constraints[i];
+ hwc = &cpuc->event_list[i]->hw;
- cpuc->enabled = 1;
- barrier();
+ if (c->weight != w)
+ continue;
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
- struct perf_event *event = cpuc->events[idx];
- u64 val;
+ for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
+ if (!test_bit(j, used_mask))
+ break;
+ }
- if (!test_bit(idx, cpuc->active_mask))
- continue;
+ if (j == X86_PMC_IDX_MAX)
+ break;
- val = event->hw.config;
- val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ __set_bit(j, used_mask);
+
+ if (assign)
+ assign[i] = j;
+ num--;
+ }
}
+done:
+ /*
+ * scheduling failed or is just a simulation,
+ * free resources if necessary
+ */
+ if (!assign || num) {
+ for (i = 0; i < n; i++) {
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
+ }
+ }
+ return num ? -ENOSPC : 0;
}
-void hw_perf_enable(void)
+/*
+ * dogrp: true if must collect siblings events (group)
+ * returns total number of events and error code
+ */
+static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
- if (!x86_pmu_initialized())
- return;
- x86_pmu.enable_all();
-}
+ struct perf_event *event;
+ int n, max_count;
-static inline u64 intel_pmu_get_status(void)
-{
- u64 status;
+ max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
- rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+ /* current number of events already accepted */
+ n = cpuc->n_events;
- return status;
-}
+ if (is_x86_event(leader)) {
+ if (n >= max_count)
+ return -ENOSPC;
+ cpuc->event_list[n] = leader;
+ n++;
+ }
+ if (!dogrp)
+ return n;
-static inline void intel_pmu_ack_status(u64 ack)
-{
- wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
-}
+ list_for_each_entry(event, &leader->sibling_list, group_entry) {
+ if (!is_x86_event(event) ||
+ event->state <= PERF_EVENT_STATE_OFF)
+ continue;
-static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)
-{
- (void)checking_wrmsrl(hwc->config_base + idx,
- hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
-}
+ if (n >= max_count)
+ return -ENOSPC;
-static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx)
-{
- (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
+ cpuc->event_list[n] = event;
+ n++;
+ }
+ return n;
}
-static inline void
-intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx)
+static inline void x86_assign_hw_event(struct perf_event *event,
+ struct cpu_hw_events *cpuc, int i)
{
- int idx = __idx - X86_PMC_IDX_FIXED;
- u64 ctrl_val, mask;
+ struct hw_perf_event *hwc = &event->hw;
- mask = 0xfULL << (idx * 4);
+ hwc->idx = cpuc->assign[i];
+ hwc->last_cpu = smp_processor_id();
+ hwc->last_tag = ++cpuc->tags[i];
- rdmsrl(hwc->config_base, ctrl_val);
- ctrl_val &= ~mask;
- (void)checking_wrmsrl(hwc->config_base, ctrl_val);
+ if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
+ hwc->config_base = 0;
+ hwc->event_base = 0;
+ } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
+ hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ /*
+ * We set it so that event_base + idx in wrmsr/rdmsr maps to
+ * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+ */
+ hwc->event_base =
+ MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
+ } else {
+ hwc->config_base = x86_pmu.eventsel;
+ hwc->event_base = x86_pmu.perfctr;
+ }
}
-static inline void
-p6_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+static inline int match_prev_assignment(struct hw_perf_event *hwc,
+ struct cpu_hw_events *cpuc,
+ int i)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- u64 val = P6_NOP_EVENT;
-
- if (cpuc->enabled)
- val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-
- (void)checking_wrmsrl(hwc->config_base + idx, val);
+ return hwc->idx == cpuc->assign[i] &&
+ hwc->last_cpu == smp_processor_id() &&
+ hwc->last_tag == cpuc->tags[i];
}
-static inline void
-intel_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+static int x86_pmu_start(struct perf_event *event);
+static void x86_pmu_stop(struct perf_event *event);
+
+void hw_perf_enable(void)
{
- if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
- intel_pmu_disable_bts();
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
+ int i;
+
+ if (!x86_pmu_initialized())
return;
- }
- if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
- intel_pmu_disable_fixed(hwc, idx);
+ if (cpuc->enabled)
return;
+
+ if (cpuc->n_added) {
+ int n_running = cpuc->n_events - cpuc->n_added;
+ /*
+ * apply assignment obtained either from
+ * hw_perf_group_sched_in() or x86_pmu_enable()
+ *
+ * step1: save events moving to new counters
+ * step2: reprogram moved events into new counters
+ */
+ for (i = 0; i < n_running; i++) {
+ event = cpuc->event_list[i];
+ hwc = &event->hw;
+
+ /*
+ * we can avoid reprogramming counter if:
+ * - assigned same counter as last time
+ * - running on same CPU as last time
+ * - no other event has used the counter since
+ */
+ if (hwc->idx == -1 ||
+ match_prev_assignment(hwc, cpuc, i))
+ continue;
+
+ x86_pmu_stop(event);
+ }
+
+ for (i = 0; i < cpuc->n_events; i++) {
+ event = cpuc->event_list[i];
+ hwc = &event->hw;
+
+ if (!match_prev_assignment(hwc, cpuc, i))
+ x86_assign_hw_event(event, cpuc, i);
+ else if (i < n_running)
+ continue;
+
+ x86_pmu_start(event);
+ }
+ cpuc->n_added = 0;
+ perf_events_lapic_init();
}
- x86_pmu_disable_event(hwc, idx);
+ cpuc->enabled = 1;
+ barrier();
+
+ x86_pmu.enable_all();
}
-static inline void
-amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
{
- x86_pmu_disable_event(hwc, idx);
+ (void)checking_wrmsrl(hwc->config_base + hwc->idx,
+ hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+static inline void x86_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config);
}
static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -1275,12 +869,12 @@ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
* To be called with the event disabled in hw:
*/
static int
-x86_perf_event_set_period(struct perf_event *event,
- struct hw_perf_event *hwc, int idx)
+x86_perf_event_set_period(struct perf_event *event)
{
+ struct hw_perf_event *hwc = &event->hw;
s64 left = atomic64_read(&hwc->period_left);
s64 period = hwc->sample_period;
- int err, ret = 0;
+ int err, ret = 0, idx = hwc->idx;
if (idx == X86_PMC_IDX_FIXED_BTS)
return 0;
@@ -1326,212 +920,63 @@ x86_perf_event_set_period(struct perf_event *event,
return ret;
}
-static inline void
-intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
-{
- int idx = __idx - X86_PMC_IDX_FIXED;
- u64 ctrl_val, bits, mask;
- int err;
-
- /*
- * Enable IRQ generation (0x8),
- * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
- * if requested:
- */
- bits = 0x8ULL;
- if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
- bits |= 0x2;
- if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
- bits |= 0x1;
- bits <<= (idx * 4);
- mask = 0xfULL << (idx * 4);
-
- rdmsrl(hwc->config_base, ctrl_val);
- ctrl_val &= ~mask;
- ctrl_val |= bits;
- err = checking_wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
+static void x86_pmu_enable_event(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- u64 val;
-
- val = hwc->config;
if (cpuc->enabled)
- val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-
- (void)checking_wrmsrl(hwc->config_base + idx, val);
+ __x86_pmu_enable_event(&event->hw);
}
-
-static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
-{
- if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
- if (!__get_cpu_var(cpu_hw_events).enabled)
- return;
-
- intel_pmu_enable_bts(hwc->config);
- return;
- }
-
- if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
- intel_pmu_enable_fixed(hwc, idx);
- return;
- }
-
- x86_pmu_enable_event(hwc, idx);
-}
-
-static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
+/*
+ * activate a single event
+ *
+ * The event is added to the group of enabled events
+ * but only if it can be scehduled with existing events.
+ *
+ * Called with PMU disabled. If successful and return value 1,
+ * then guaranteed to call perf_enable() and hw_perf_enable()
+ */
+static int x86_pmu_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc;
+ int assign[X86_PMC_IDX_MAX];
+ int n, n0, ret;
- if (cpuc->enabled)
- x86_pmu_enable_event(hwc, idx);
-}
-
-static int fixed_mode_idx(struct hw_perf_event *hwc)
-{
- unsigned int hw_event;
-
- hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
-
- if (unlikely((hw_event ==
- x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
- (hwc->sample_period == 1)))
- return X86_PMC_IDX_FIXED_BTS;
+ hwc = &event->hw;
- if (!x86_pmu.num_events_fixed)
- return -1;
+ n0 = cpuc->n_events;
+ n = collect_events(cpuc, event, false);
+ if (n < 0)
+ return n;
+ ret = x86_schedule_events(cpuc, n, assign);
+ if (ret)
+ return ret;
/*
- * fixed counters do not take all possible filters
+ * copy new assignment, now we know it is possible
+ * will be used by hw_perf_enable()
*/
- if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
- return -1;
-
- if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
- return X86_PMC_IDX_FIXED_INSTRUCTIONS;
- if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
- return X86_PMC_IDX_FIXED_CPU_CYCLES;
- if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
- return X86_PMC_IDX_FIXED_BUS_CYCLES;
-
- return -1;
-}
-
-/*
- * generic counter allocator: get next free counter
- */
-static int
-gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
-{
- int idx;
-
- idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
- return idx == x86_pmu.num_events ? -1 : idx;
-}
+ memcpy(cpuc->assign, assign, n*sizeof(int));
-/*
- * intel-specific counter allocator: check event constraints
- */
-static int
-intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
-{
- const struct event_constraint *event_constraint;
- int i, code;
+ cpuc->n_events = n;
+ cpuc->n_added += n - n0;
- if (!event_constraints)
- goto skip;
-
- code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
-
- for_each_event_constraint(event_constraint, event_constraints) {
- if (code == event_constraint->code) {
- for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
- if (!test_and_set_bit(i, cpuc->used_mask))
- return i;
- }
- return -1;
- }
- }
-skip:
- return gen_get_event_idx(cpuc, hwc);
-}
-
-static int
-x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
-{
- int idx;
-
- idx = fixed_mode_idx(hwc);
- if (idx == X86_PMC_IDX_FIXED_BTS) {
- /* BTS is already occupied. */
- if (test_and_set_bit(idx, cpuc->used_mask))
- return -EAGAIN;
-
- hwc->config_base = 0;
- hwc->event_base = 0;
- hwc->idx = idx;
- } else if (idx >= 0) {
- /*
- * Try to get the fixed event, if that is already taken
- * then try to get a generic event:
- */
- if (test_and_set_bit(idx, cpuc->used_mask))
- goto try_generic;
-
- hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
- /*
- * We set it so that event_base + idx in wrmsr/rdmsr maps to
- * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
- */
- hwc->event_base =
- MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
- hwc->idx = idx;
- } else {
- idx = hwc->idx;
- /* Try to get the previous generic event again */
- if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
-try_generic:
- idx = x86_pmu.get_event_idx(cpuc, hwc);
- if (idx == -1)
- return -EAGAIN;
-
- set_bit(idx, cpuc->used_mask);
- hwc->idx = idx;
- }
- hwc->config_base = x86_pmu.eventsel;
- hwc->event_base = x86_pmu.perfctr;
- }
-
- return idx;
+ return 0;
}
-/*
- * Find a PMC slot for the freshly enabled / scheduled in event:
- */
-static int x86_pmu_enable(struct perf_event *event)
+static int x86_pmu_start(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc = &event->hw;
- int idx;
-
- idx = x86_schedule_event(cpuc, hwc);
- if (idx < 0)
- return idx;
-
- perf_events_lapic_init();
+ int idx = event->hw.idx;
- x86_pmu.disable(hwc, idx);
+ if (idx == -1)
+ return -EAGAIN;
+ x86_perf_event_set_period(event);
cpuc->events[idx] = event;
- set_bit(idx, cpuc->active_mask);
-
- x86_perf_event_set_period(event, hwc, idx);
- x86_pmu.enable(hwc, idx);
-
+ __set_bit(idx, cpuc->active_mask);
+ x86_pmu.enable(event);
perf_event_update_userpage(event);
return 0;
@@ -1539,14 +984,8 @@ static int x86_pmu_enable(struct perf_event *event)
static void x86_pmu_unthrottle(struct perf_event *event)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc = &event->hw;
-
- if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
- cpuc->events[hwc->idx] != event))
- return;
-
- x86_pmu.enable(hwc, hwc->idx);
+ int ret = x86_pmu_start(event);
+ WARN_ON_ONCE(ret);
}
void perf_event_print_debug(void)
@@ -1576,7 +1015,7 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
}
- pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
+ pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
for (idx = 0; idx < x86_pmu.num_events; idx++) {
rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
@@ -1600,257 +1039,50 @@ void perf_event_print_debug(void)
local_irq_restore(flags);
}
-static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)
-{
- struct debug_store *ds = cpuc->ds;
- struct bts_record {
- u64 from;
- u64 to;
- u64 flags;
- };
- struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
- struct bts_record *at, *top;
- struct perf_output_handle handle;
- struct perf_event_header header;
- struct perf_sample_data data;
- struct pt_regs regs;
-
- if (!event)
- return;
-
- if (!ds)
- return;
-
- at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
- top = (struct bts_record *)(unsigned long)ds->bts_index;
-
- if (top <= at)
- return;
-
- ds->bts_index = ds->bts_buffer_base;
-
-
- data.period = event->hw.last_period;
- data.addr = 0;
- data.raw = NULL;
- regs.ip = 0;
-
- /*
- * Prepare a generic sample, i.e. fill in the invariant fields.
- * We will overwrite the from and to address before we output
- * the sample.
- */
- perf_prepare_sample(&header, &data, event, &regs);
-
- if (perf_output_begin(&handle, event,
- header.size * (top - at), 1, 1))
- return;
-
- for (; at < top; at++) {
- data.ip = at->from;
- data.addr = at->to;
-
- perf_output_sample(&handle, &header, &data, event);
- }
-
- perf_output_end(&handle);
-
- /* There's new data available. */
- event->hw.interrupts++;
- event->pending_kill = POLL_IN;
-}
-
-static void x86_pmu_disable(struct perf_event *event)
+static void x86_pmu_stop(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
- /*
- * Must be done before we disable, otherwise the nmi handler
- * could reenable again:
- */
- clear_bit(idx, cpuc->active_mask);
- x86_pmu.disable(hwc, idx);
+ if (!__test_and_clear_bit(idx, cpuc->active_mask))
+ return;
- /*
- * Make sure the cleared pointer becomes visible before we
- * (potentially) free the event:
- */
- barrier();
+ x86_pmu.disable(event);
/*
* Drain the remaining delta count out of a event
* that we are disabling:
*/
- x86_perf_event_update(event, hwc, idx);
-
- /* Drain the remaining BTS records. */
- if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
- intel_pmu_drain_bts_buffer(cpuc);
+ x86_perf_event_update(event);
cpuc->events[idx] = NULL;
- clear_bit(idx, cpuc->used_mask);
-
- perf_event_update_userpage(event);
-}
-
-/*
- * Save and restart an expired event. Called by NMI contexts,
- * so it has to be careful about preempting normal event ops:
- */
-static int intel_pmu_save_and_restart(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- int idx = hwc->idx;
- int ret;
-
- x86_perf_event_update(event, hwc, idx);
- ret = x86_perf_event_set_period(event, hwc, idx);
-
- if (event->state == PERF_EVENT_STATE_ACTIVE)
- intel_pmu_enable_event(hwc, idx);
-
- return ret;
-}
-
-static void intel_pmu_reset(void)
-{
- struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
- unsigned long flags;
- int idx;
-
- if (!x86_pmu.num_events)
- return;
-
- local_irq_save(flags);
-
- printk("clearing PMU state on CPU#%d\n", smp_processor_id());
-
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
- checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
- checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
- }
- for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
- checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
- }
- if (ds)
- ds->bts_index = ds->bts_buffer_base;
-
- local_irq_restore(flags);
-}
-
-static int p6_pmu_handle_irq(struct pt_regs *regs)
-{
- struct perf_sample_data data;
- struct cpu_hw_events *cpuc;
- struct perf_event *event;
- struct hw_perf_event *hwc;
- int idx, handled = 0;
- u64 val;
-
- data.addr = 0;
- data.raw = NULL;
-
- cpuc = &__get_cpu_var(cpu_hw_events);
-
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
- if (!test_bit(idx, cpuc->active_mask))
- continue;
-
- event = cpuc->events[idx];
- hwc = &event->hw;
-
- val = x86_perf_event_update(event, hwc, idx);
- if (val & (1ULL << (x86_pmu.event_bits - 1)))
- continue;
-
- /*
- * event overflow
- */
- handled = 1;
- data.period = event->hw.last_period;
-
- if (!x86_perf_event_set_period(event, hwc, idx))
- continue;
-
- if (perf_event_overflow(event, 1, &data, regs))
- p6_pmu_disable_event(hwc, idx);
- }
-
- if (handled)
- inc_irq_stat(apic_perf_irqs);
-
- return handled;
}
-/*
- * This handler is triggered by the local APIC, so the APIC IRQ handling
- * rules apply:
- */
-static int intel_pmu_handle_irq(struct pt_regs *regs)
+static void x86_pmu_disable(struct perf_event *event)
{
- struct perf_sample_data data;
- struct cpu_hw_events *cpuc;
- int bit, loops;
- u64 ack, status;
-
- data.addr = 0;
- data.raw = NULL;
-
- cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int i;
- perf_disable();
- intel_pmu_drain_bts_buffer(cpuc);
- status = intel_pmu_get_status();
- if (!status) {
- perf_enable();
- return 0;
- }
+ x86_pmu_stop(event);
- loops = 0;
-again:
- if (++loops > 100) {
- WARN_ONCE(1, "perfevents: irq loop stuck!\n");
- perf_event_print_debug();
- intel_pmu_reset();
- perf_enable();
- return 1;
- }
-
- inc_irq_stat(apic_perf_irqs);
- ack = status;
- for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
- struct perf_event *event = cpuc->events[bit];
+ for (i = 0; i < cpuc->n_events; i++) {
+ if (event == cpuc->event_list[i]) {
- clear_bit(bit, (unsigned long *) &status);
- if (!test_bit(bit, cpuc->active_mask))
- continue;
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(cpuc, event);
- if (!intel_pmu_save_and_restart(event))
- continue;
+ while (++i < cpuc->n_events)
+ cpuc->event_list[i-1] = cpuc->event_list[i];
- data.period = event->hw.last_period;
-
- if (perf_event_overflow(event, 1, &data, regs))
- intel_pmu_disable_event(&event->hw, bit);
+ --cpuc->n_events;
+ break;
+ }
}
-
- intel_pmu_ack_status(ack);
-
- /*
- * Repeat if there is more work to be done:
- */
- status = intel_pmu_get_status();
- if (status)
- goto again;
-
- perf_enable();
-
- return 1;
+ perf_event_update_userpage(event);
}
-static int amd_pmu_handle_irq(struct pt_regs *regs)
+static int x86_pmu_handle_irq(struct pt_regs *regs)
{
struct perf_sample_data data;
struct cpu_hw_events *cpuc;
@@ -1859,8 +1091,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
- data.addr = 0;
- data.raw = NULL;
+ perf_sample_data_init(&data, 0);
cpuc = &__get_cpu_var(cpu_hw_events);
@@ -1871,7 +1102,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
event = cpuc->events[idx];
hwc = &event->hw;
- val = x86_perf_event_update(event, hwc, idx);
+ val = x86_perf_event_update(event);
if (val & (1ULL << (x86_pmu.event_bits - 1)))
continue;
@@ -1881,11 +1112,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
handled = 1;
data.period = event->hw.last_period;
- if (!x86_perf_event_set_period(event, hwc, idx))
+ if (!x86_perf_event_set_period(event))
continue;
if (perf_event_overflow(event, 1, &data, regs))
- amd_pmu_disable_event(hwc, idx);
+ x86_pmu_stop(event);
}
if (handled)
@@ -1968,193 +1199,171 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
.priority = 1
};
-static __initconst struct x86_pmu p6_pmu = {
- .name = "p6",
- .handle_irq = p6_pmu_handle_irq,
- .disable_all = p6_pmu_disable_all,
- .enable_all = p6_pmu_enable_all,
- .enable = p6_pmu_enable_event,
- .disable = p6_pmu_disable_event,
- .eventsel = MSR_P6_EVNTSEL0,
- .perfctr = MSR_P6_PERFCTR0,
- .event_map = p6_pmu_event_map,
- .raw_event = p6_pmu_raw_event,
- .max_events = ARRAY_SIZE(p6_perfmon_event_map),
- .apic = 1,
- .max_period = (1ULL << 31) - 1,
- .version = 0,
- .num_events = 2,
- /*
- * Events have 40 bits implemented. However they are designed such
- * that bits [32-39] are sign extensions of bit 31. As such the
- * effective width of a event for P6-like PMU is 32 bits only.
- *
- * See IA-32 Intel Architecture Software developer manual Vol 3B
- */
- .event_bits = 32,
- .event_mask = (1ULL << 32) - 1,
- .get_event_idx = intel_get_event_idx,
-};
+static struct event_constraint unconstrained;
+static struct event_constraint emptyconstraint;
-static __initconst struct x86_pmu intel_pmu = {
- .name = "Intel",
- .handle_irq = intel_pmu_handle_irq,
- .disable_all = intel_pmu_disable_all,
- .enable_all = intel_pmu_enable_all,
- .enable = intel_pmu_enable_event,
- .disable = intel_pmu_disable_event,
- .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
- .event_map = intel_pmu_event_map,
- .raw_event = intel_pmu_raw_event,
- .max_events = ARRAY_SIZE(intel_perfmon_event_map),
- .apic = 1,
- /*
- * Intel PMCs cannot be accessed sanely above 32 bit width,
- * so we install an artificial 1<<31 period regardless of
- * the generic event period:
- */
- .max_period = (1ULL << 31) - 1,
- .enable_bts = intel_pmu_enable_bts,
- .disable_bts = intel_pmu_disable_bts,
- .get_event_idx = intel_get_event_idx,
-};
+static struct event_constraint *
+x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+ struct event_constraint *c;
-static __initconst struct x86_pmu amd_pmu = {
- .name = "AMD",
- .handle_irq = amd_pmu_handle_irq,
- .disable_all = amd_pmu_disable_all,
- .enable_all = amd_pmu_enable_all,
- .enable = amd_pmu_enable_event,
- .disable = amd_pmu_disable_event,
- .eventsel = MSR_K7_EVNTSEL0,
- .perfctr = MSR_K7_PERFCTR0,
- .event_map = amd_pmu_event_map,
- .raw_event = amd_pmu_raw_event,
- .max_events = ARRAY_SIZE(amd_perfmon_event_map),
- .num_events = 4,
- .event_bits = 48,
- .event_mask = (1ULL << 48) - 1,
- .apic = 1,
- /* use highest bit to detect overflow */
- .max_period = (1ULL << 47) - 1,
- .get_event_idx = gen_get_event_idx,
-};
+ if (x86_pmu.event_constraints) {
+ for_each_event_constraint(c, x86_pmu.event_constraints) {
+ if ((event->hw.config & c->cmask) == c->code)
+ return c;
+ }
+ }
+
+ return &unconstrained;
+}
-static __init int p6_pmu_init(void)
+static int x86_event_sched_in(struct perf_event *event,
+ struct perf_cpu_context *cpuctx)
{
- switch (boot_cpu_data.x86_model) {
- case 1:
- case 3: /* Pentium Pro */
- case 5:
- case 6: /* Pentium II */
- case 7:
- case 8:
- case 11: /* Pentium III */
- event_constraints = intel_p6_event_constraints;
- break;
- case 9:
- case 13:
- /* Pentium M */
- event_constraints = intel_p6_event_constraints;
- break;
- default:
- pr_cont("unsupported p6 CPU model %d ",
- boot_cpu_data.x86_model);
- return -ENODEV;
- }
+ int ret = 0;
- x86_pmu = p6_pmu;
+ event->state = PERF_EVENT_STATE_ACTIVE;
+ event->oncpu = smp_processor_id();
+ event->tstamp_running += event->ctx->time - event->tstamp_stopped;
- return 0;
+ if (!is_x86_event(event))
+ ret = event->pmu->enable(event);
+
+ if (!ret && !is_software_event(event))
+ cpuctx->active_oncpu++;
+
+ if (!ret && event->attr.exclusive)
+ cpuctx->exclusive = 1;
+
+ return ret;
}
-static __init int intel_pmu_init(void)
+static void x86_event_sched_out(struct perf_event *event,
+ struct perf_cpu_context *cpuctx)
{
- union cpuid10_edx edx;
- union cpuid10_eax eax;
- unsigned int unused;
- unsigned int ebx;
- int version;
-
- if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- /* check for P6 processor family */
- if (boot_cpu_data.x86 == 6) {
- return p6_pmu_init();
- } else {
- return -ENODEV;
- }
- }
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ event->oncpu = -1;
- /*
- * Check whether the Architectural PerfMon supports
- * Branch Misses Retired hw_event or not.
- */
- cpuid(10, &eax.full, &ebx, &unused, &edx.full);
- if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
- return -ENODEV;
+ if (!is_x86_event(event))
+ event->pmu->disable(event);
- version = eax.split.version_id;
- if (version < 2)
- return -ENODEV;
+ event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
- x86_pmu = intel_pmu;
- x86_pmu.version = version;
- x86_pmu.num_events = eax.split.num_events;
- x86_pmu.event_bits = eax.split.bit_width;
- x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
+ if (!is_software_event(event))
+ cpuctx->active_oncpu--;
+ if (event->attr.exclusive || !cpuctx->active_oncpu)
+ cpuctx->exclusive = 0;
+}
+
+/*
+ * Called to enable a whole group of events.
+ * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
+ * Assumes the caller has disabled interrupts and has
+ * frozen the PMU with hw_perf_save_disable.
+ *
+ * called with PMU disabled. If successful and return value 1,
+ * then guaranteed to call perf_enable() and hw_perf_enable()
+ */
+int hw_perf_group_sched_in(struct perf_event *leader,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct perf_event *sub;
+ int assign[X86_PMC_IDX_MAX];
+ int n0, n1, ret;
+
+ /* n0 = total number of events */
+ n0 = collect_events(cpuc, leader, true);
+ if (n0 < 0)
+ return n0;
+
+ ret = x86_schedule_events(cpuc, n0, assign);
+ if (ret)
+ return ret;
+
+ ret = x86_event_sched_in(leader, cpuctx);
+ if (ret)
+ return ret;
+
+ n1 = 1;
+ list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ if (sub->state > PERF_EVENT_STATE_OFF) {
+ ret = x86_event_sched_in(sub, cpuctx);
+ if (ret)
+ goto undo;
+ ++n1;
+ }
+ }
/*
- * Quirk: v2 perfmon does not report fixed-purpose events, so
- * assume at least 3 events:
+ * copy new assignment, now we know it is possible
+ * will be used by hw_perf_enable()
*/
- x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
+ memcpy(cpuc->assign, assign, n0*sizeof(int));
+
+ cpuc->n_events = n0;
+ cpuc->n_added += n1;
+ ctx->nr_active += n1;
/*
- * Install the hw-cache-events table:
+ * 1 means successful and events are active
+ * This is not quite true because we defer
+ * actual activation until hw_perf_enable() but
+ * this way we* ensure caller won't try to enable
+ * individual events
*/
- switch (boot_cpu_data.x86_model) {
- case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
- case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
- case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
- case 29: /* six-core 45 nm xeon "Dunnington" */
- memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- pr_cont("Core2 events, ");
- event_constraints = intel_core_event_constraints;
- break;
- default:
- case 26:
- memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
+ return 1;
+undo:
+ x86_event_sched_out(leader, cpuctx);
+ n0 = 1;
+ list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ if (sub->state == PERF_EVENT_STATE_ACTIVE) {
+ x86_event_sched_out(sub, cpuctx);
+ if (++n0 == n1)
+ break;
+ }
+ }
+ return ret;
+}
+
+#include "perf_event_amd.c"
+#include "perf_event_p6.c"
+#include "perf_event_intel.c"
- event_constraints = intel_nehalem_event_constraints;
- pr_cont("Nehalem/Corei7 events, ");
+static int __cpuinit
+x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+ int ret = NOTIFY_OK;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ if (x86_pmu.cpu_prepare)
+ ret = x86_pmu.cpu_prepare(cpu);
break;
- case 28:
- memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
- pr_cont("Atom events, ");
+ case CPU_STARTING:
+ if (x86_pmu.cpu_starting)
+ x86_pmu.cpu_starting(cpu);
break;
- }
- return 0;
-}
-static __init int amd_pmu_init(void)
-{
- /* Performance-monitoring supported from K7 and later: */
- if (boot_cpu_data.x86 < 6)
- return -ENODEV;
+ case CPU_DYING:
+ if (x86_pmu.cpu_dying)
+ x86_pmu.cpu_dying(cpu);
+ break;
- x86_pmu = amd_pmu;
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+ if (x86_pmu.cpu_dead)
+ x86_pmu.cpu_dead(cpu);
+ break;
- /* Events are common for all AMDs */
- memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
+ default:
+ break;
+ }
- return 0;
+ return ret;
}
static void __init pmu_check_apic(void)
@@ -2169,6 +1378,7 @@ static void __init pmu_check_apic(void)
void __init init_hw_perf_events(void)
{
+ struct event_constraint *c;
int err;
pr_info("Performance Events: ");
@@ -2213,6 +1423,20 @@ void __init init_hw_perf_events(void)
perf_events_lapic_init();
register_die_notifier(&perf_event_nmi_notifier);
+ unconstrained = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
+ 0, x86_pmu.num_events);
+
+ if (x86_pmu.event_constraints) {
+ for_each_event_constraint(c, x86_pmu.event_constraints) {
+ if (c->cmask != INTEL_ARCH_FIXED_MASK)
+ continue;
+
+ c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
+ c->weight += x86_pmu.num_events;
+ }
+ }
+
pr_info("... version: %d\n", x86_pmu.version);
pr_info("... bit width: %d\n", x86_pmu.event_bits);
pr_info("... generic registers: %d\n", x86_pmu.num_events);
@@ -2220,60 +1444,91 @@ void __init init_hw_perf_events(void)
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed);
pr_info("... event mask: %016Lx\n", perf_event_mask);
+
+ perf_cpu_notifier(x86_pmu_notifier);
}
static inline void x86_pmu_read(struct perf_event *event)
{
- x86_perf_event_update(event, &event->hw, event->hw.idx);
+ x86_perf_event_update(event);
}
static const struct pmu pmu = {
.enable = x86_pmu_enable,
.disable = x86_pmu_disable,
+ .start = x86_pmu_start,
+ .stop = x86_pmu_stop,
.read = x86_pmu_read,
.unthrottle = x86_pmu_unthrottle,
};
-static int
-validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
-{
- struct hw_perf_event fake_event = event->hw;
-
- if (event->pmu && event->pmu != &pmu)
- return 0;
-
- return x86_schedule_event(cpuc, &fake_event) >= 0;
-}
-
+/*
+ * validate a single event group
+ *
+ * validation include:
+ * - check events are compatible which each other
+ * - events do not compete for the same counter
+ * - number of events <= number of counters
+ *
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
static int validate_group(struct perf_event *event)
{
- struct perf_event *sibling, *leader = event->group_leader;
- struct cpu_hw_events fake_pmu;
+ struct perf_event *leader = event->group_leader;
+ struct cpu_hw_events *fake_cpuc;
+ int ret, n;
- memset(&fake_pmu, 0, sizeof(fake_pmu));
+ ret = -ENOMEM;
+ fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
+ if (!fake_cpuc)
+ goto out;
- if (!validate_event(&fake_pmu, leader))
- return -ENOSPC;
+ /*
+ * the event is not yet connected with its
+ * siblings therefore we must first collect
+ * existing siblings, then add the new event
+ * before we can simulate the scheduling
+ */
+ ret = -ENOSPC;
+ n = collect_events(fake_cpuc, leader, true);
+ if (n < 0)
+ goto out_free;
- list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
- if (!validate_event(&fake_pmu, sibling))
- return -ENOSPC;
- }
+ fake_cpuc->n_events = n;
+ n = collect_events(fake_cpuc, event, false);
+ if (n < 0)
+ goto out_free;
- if (!validate_event(&fake_pmu, event))
- return -ENOSPC;
+ fake_cpuc->n_events = n;
- return 0;
+ ret = x86_schedule_events(fake_cpuc, n, NULL);
+
+out_free:
+ kfree(fake_cpuc);
+out:
+ return ret;
}
const struct pmu *hw_perf_event_init(struct perf_event *event)
{
+ const struct pmu *tmp;
int err;
err = __hw_perf_event_init(event);
if (!err) {
+ /*
+ * we temporarily connect event to its pmu
+ * such that validate_group() can classify
+ * it as an x86 event using is_x86_event()
+ */
+ tmp = event->pmu;
+ event->pmu = &pmu;
+
if (event->group_leader != event)
err = validate_group(event);
+
+ event->pmu = tmp;
}
if (err) {
if (event->destroy)
@@ -2297,7 +1552,6 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
-static DEFINE_PER_CPU(int, in_ignored_frame);
static void
@@ -2313,10 +1567,6 @@ static void backtrace_warning(void *data, char *msg)
static int backtrace_stack(void *data, char *name)
{
- per_cpu(in_ignored_frame, smp_processor_id()) =
- x86_is_stack_id(NMI_STACK, name) ||
- x86_is_stack_id(DEBUG_STACK, name);
-
return 0;
}
@@ -2324,9 +1574,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
{
struct perf_callchain_entry *entry = data;
- if (per_cpu(in_ignored_frame, smp_processor_id()))
- return;
-
if (reliable)
callchain_store(entry, addr);
}
@@ -2347,7 +1594,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
callchain_store(entry, PERF_CONTEXT_KERNEL);
callchain_store(entry, regs->ip);
- dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
+ dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
}
/*
@@ -2385,14 +1632,42 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
return len;
}
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+#ifdef CONFIG_COMPAT
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
- unsigned long bytes;
+ /* 32-bit process in 64-bit kernel. */
+ struct stack_frame_ia32 frame;
+ const void __user *fp;
+
+ if (!test_thread_flag(TIF_IA32))
+ return 0;
+
+ fp = compat_ptr(regs->bp);
+ while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ unsigned long bytes;
+ frame.next_frame = 0;
+ frame.return_address = 0;
- bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
+ bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+ if (bytes != sizeof(frame))
+ break;
+
+ if (fp < compat_ptr(regs->sp))
+ break;
- return bytes == sizeof(*frame);
+ callchain_store(entry, frame.return_address);
+ fp = compat_ptr(frame.next_frame);
+ }
+ return 1;
+}
+#else
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+ return 0;
}
+#endif
static void
perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -2408,11 +1683,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
callchain_store(entry, PERF_CONTEXT_USER);
callchain_store(entry, regs->ip);
+ if (perf_callchain_user32(regs, entry))
+ return;
+
while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ unsigned long bytes;
frame.next_frame = NULL;
frame.return_address = 0;
- if (!copy_stack_frame(fp, &frame))
+ bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+ if (bytes != sizeof(frame))
break;
if ((unsigned long)fp < regs->sp)
@@ -2433,9 +1713,6 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
is_user = user_mode(regs);
- if (!current || current->pid == 0)
- return;
-
if (is_user && current->state != TASK_RUNNING)
return;
@@ -2462,7 +1739,14 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
return entry;
}
-void hw_perf_event_setup_online(int cpu)
+void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
{
- init_debug_store_on_cpu(cpu);
+ regs->ip = ip;
+ /*
+ * perf_arch_fetch_caller_regs adds another call, we need to increment
+ * the skip level
+ */
+ regs->bp = rewind_frame_pointer(skip + 1);
+ regs->cs = __KERNEL_CS;
+ local_save_flags(regs->flags);
}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
new file mode 100644
index 00000000000..db6f7d4056e
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -0,0 +1,422 @@
+#ifdef CONFIG_CPU_SUP_AMD
+
+static DEFINE_RAW_SPINLOCK(amd_nb_lock);
+
+static __initconst u64 amd_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
+ [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
+ [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
+ [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
+ [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
+ [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
+ [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
+ [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const u64 amd_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+};
+
+static u64 amd_pmu_event_map(int hw_event)
+{
+ return amd_perfmon_event_map[hw_event];
+}
+
+static u64 amd_pmu_raw_event(u64 hw_event)
+{
+#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL
+#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
+#define K7_EVNTSEL_INV_MASK 0x000800000ULL
+#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
+
+#define K7_EVNTSEL_MASK \
+ (K7_EVNTSEL_EVENT_MASK | \
+ K7_EVNTSEL_UNIT_MASK | \
+ K7_EVNTSEL_EDGE_MASK | \
+ K7_EVNTSEL_INV_MASK | \
+ K7_EVNTSEL_REG_MASK)
+
+ return hw_event & K7_EVNTSEL_MASK;
+}
+
+/*
+ * AMD64 events are detected based on their event codes.
+ */
+static inline int amd_is_nb_event(struct hw_perf_event *hwc)
+{
+ return (hwc->config & 0xe0) == 0xe0;
+}
+
+static inline int amd_has_nb(struct cpu_hw_events *cpuc)
+{
+ struct amd_nb *nb = cpuc->amd_nb;
+
+ return nb && nb->nb_id != -1;
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct amd_nb *nb = cpuc->amd_nb;
+ int i;
+
+ /*
+ * only care about NB events
+ */
+ if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
+ return;
+
+ /*
+ * need to scan whole list because event may not have
+ * been assigned during scheduling
+ *
+ * no race condition possible because event can only
+ * be removed on one CPU at a time AND PMU is disabled
+ * when we come here
+ */
+ for (i = 0; i < x86_pmu.num_events; i++) {
+ if (nb->owners[i] == event) {
+ cmpxchg(nb->owners+i, event, NULL);
+ break;
+ }
+ }
+}
+
+ /*
+ * AMD64 NorthBridge events need special treatment because
+ * counter access needs to be synchronized across all cores
+ * of a package. Refer to BKDG section 3.12
+ *
+ * NB events are events measuring L3 cache, Hypertransport
+ * traffic. They are identified by an event code >= 0xe00.
+ * They measure events on the NorthBride which is shared
+ * by all cores on a package. NB events are counted on a
+ * shared set of counters. When a NB event is programmed
+ * in a counter, the data actually comes from a shared
+ * counter. Thus, access to those counters needs to be
+ * synchronized.
+ *
+ * We implement the synchronization such that no two cores
+ * can be measuring NB events using the same counters. Thus,
+ * we maintain a per-NB allocation table. The available slot
+ * is propagated using the event_constraint structure.
+ *
+ * We provide only one choice for each NB event based on
+ * the fact that only NB events have restrictions. Consequently,
+ * if a counter is available, there is a guarantee the NB event
+ * will be assigned to it. If no slot is available, an empty
+ * constraint is returned and scheduling will eventually fail
+ * for this event.
+ *
+ * Note that all cores attached the same NB compete for the same
+ * counters to host NB events, this is why we use atomic ops. Some
+ * multi-chip CPUs may have more than one NB.
+ *
+ * Given that resources are allocated (cmpxchg), they must be
+ * eventually freed for others to use. This is accomplished by
+ * calling amd_put_event_constraints().
+ *
+ * Non NB events are not impacted by this restriction.
+ */
+static struct event_constraint *
+amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct amd_nb *nb = cpuc->amd_nb;
+ struct perf_event *old = NULL;
+ int max = x86_pmu.num_events;
+ int i, j, k = -1;
+
+ /*
+ * if not NB event or no NB, then no constraints
+ */
+ if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
+ return &unconstrained;
+
+ /*
+ * detect if already present, if so reuse
+ *
+ * cannot merge with actual allocation
+ * because of possible holes
+ *
+ * event can already be present yet not assigned (in hwc->idx)
+ * because of successive calls to x86_schedule_events() from
+ * hw_perf_group_sched_in() without hw_perf_enable()
+ */
+ for (i = 0; i < max; i++) {
+ /*
+ * keep track of first free slot
+ */
+ if (k == -1 && !nb->owners[i])
+ k = i;
+
+ /* already present, reuse */
+ if (nb->owners[i] == event)
+ goto done;
+ }
+ /*
+ * not present, so grab a new slot
+ * starting either at:
+ */
+ if (hwc->idx != -1) {
+ /* previous assignment */
+ i = hwc->idx;
+ } else if (k != -1) {
+ /* start from free slot found */
+ i = k;
+ } else {
+ /*
+ * event not found, no slot found in
+ * first pass, try again from the
+ * beginning
+ */
+ i = 0;
+ }
+ j = i;
+ do {
+ old = cmpxchg(nb->owners+i, NULL, event);
+ if (!old)
+ break;
+ if (++i == max)
+ i = 0;
+ } while (i != j);
+done:
+ if (!old)
+ return &nb->event_constraints[i];
+
+ return &emptyconstraint;
+}
+
+static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
+{
+ struct amd_nb *nb;
+ int i;
+
+ nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL);
+ if (!nb)
+ return NULL;
+
+ memset(nb, 0, sizeof(*nb));
+ nb->nb_id = nb_id;
+
+ /*
+ * initialize all possible NB constraints
+ */
+ for (i = 0; i < x86_pmu.num_events; i++) {
+ __set_bit(i, nb->event_constraints[i].idxmsk);
+ nb->event_constraints[i].weight = 1;
+ }
+ return nb;
+}
+
+static int amd_pmu_cpu_prepare(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+ WARN_ON_ONCE(cpuc->amd_nb);
+
+ if (boot_cpu_data.x86_max_cores < 2)
+ return NOTIFY_OK;
+
+ cpuc->amd_nb = amd_alloc_nb(cpu, -1);
+ if (!cpuc->amd_nb)
+ return NOTIFY_BAD;
+
+ return NOTIFY_OK;
+}
+
+static void amd_pmu_cpu_starting(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ struct amd_nb *nb;
+ int i, nb_id;
+
+ if (boot_cpu_data.x86_max_cores < 2)
+ return;
+
+ nb_id = amd_get_nb_id(cpu);
+ WARN_ON_ONCE(nb_id == BAD_APICID);
+
+ raw_spin_lock(&amd_nb_lock);
+
+ for_each_online_cpu(i) {
+ nb = per_cpu(cpu_hw_events, i).amd_nb;
+ if (WARN_ON_ONCE(!nb))
+ continue;
+
+ if (nb->nb_id == nb_id) {
+ kfree(cpuc->amd_nb);
+ cpuc->amd_nb = nb;
+ break;
+ }
+ }
+
+ cpuc->amd_nb->nb_id = nb_id;
+ cpuc->amd_nb->refcnt++;
+
+ raw_spin_unlock(&amd_nb_lock);
+}
+
+static void amd_pmu_cpu_dead(int cpu)
+{
+ struct cpu_hw_events *cpuhw;
+
+ if (boot_cpu_data.x86_max_cores < 2)
+ return;
+
+ cpuhw = &per_cpu(cpu_hw_events, cpu);
+
+ raw_spin_lock(&amd_nb_lock);
+
+ if (cpuhw->amd_nb) {
+ struct amd_nb *nb = cpuhw->amd_nb;
+
+ if (nb->nb_id == -1 || --nb->refcnt == 0)
+ kfree(nb);
+
+ cpuhw->amd_nb = NULL;
+ }
+
+ raw_spin_unlock(&amd_nb_lock);
+}
+
+static __initconst struct x86_pmu amd_pmu = {
+ .name = "AMD",
+ .handle_irq = x86_pmu_handle_irq,
+ .disable_all = x86_pmu_disable_all,
+ .enable_all = x86_pmu_enable_all,
+ .enable = x86_pmu_enable_event,
+ .disable = x86_pmu_disable_event,
+ .eventsel = MSR_K7_EVNTSEL0,
+ .perfctr = MSR_K7_PERFCTR0,
+ .event_map = amd_pmu_event_map,
+ .raw_event = amd_pmu_raw_event,
+ .max_events = ARRAY_SIZE(amd_perfmon_event_map),
+ .num_events = 4,
+ .event_bits = 48,
+ .event_mask = (1ULL << 48) - 1,
+ .apic = 1,
+ /* use highest bit to detect overflow */
+ .max_period = (1ULL << 47) - 1,
+ .get_event_constraints = amd_get_event_constraints,
+ .put_event_constraints = amd_put_event_constraints,
+
+ .cpu_prepare = amd_pmu_cpu_prepare,
+ .cpu_starting = amd_pmu_cpu_starting,
+ .cpu_dead = amd_pmu_cpu_dead,
+};
+
+static __init int amd_pmu_init(void)
+{
+ /* Performance-monitoring supported from K7 and later: */
+ if (boot_cpu_data.x86 < 6)
+ return -ENODEV;
+
+ x86_pmu = amd_pmu;
+
+ /* Events are common for all AMDs */
+ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ return 0;
+}
+
+#else /* CONFIG_CPU_SUP_AMD */
+
+static int amd_pmu_init(void)
+{
+ return 0;
+}
+
+#endif
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
new file mode 100644
index 00000000000..9c794ac8783
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -0,0 +1,980 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+
+/*
+ * Intel PerfMon, used on Core and later.
+ */
+static const u64 intel_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
+};
+
+static struct event_constraint intel_core_event_constraints[] =
+{
+ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
+ INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_core2_event_constraints[] =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ /*
+ * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
+ * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
+ * ratio between these counters.
+ */
+ /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
+ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
+ INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
+ INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
+ INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
+ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_nehalem_event_constraints[] =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
+ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
+ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
+ INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
+ INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
+ INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
+ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_westmere_event_constraints[] =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
+ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_gen_event_constraints[] =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ EVENT_CONSTRAINT_END
+};
+
+static u64 intel_pmu_event_map(int hw_event)
+{
+ return intel_perfmon_event_map[hw_event];
+}
+
+static __initconst u64 westmere_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
+ [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
+ [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
+ [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
+ [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
+ [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static __initconst u64 nehalem_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
+ [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
+ [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
+ [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static __initconst u64 core2_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static __initconst u64 atom_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static u64 intel_pmu_raw_event(u64 hw_event)
+{
+#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
+#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
+#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
+#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
+#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
+
+#define CORE_EVNTSEL_MASK \
+ (INTEL_ARCH_EVTSEL_MASK | \
+ INTEL_ARCH_UNIT_MASK | \
+ INTEL_ARCH_EDGE_MASK | \
+ INTEL_ARCH_INV_MASK | \
+ INTEL_ARCH_CNT_MASK)
+
+ return hw_event & CORE_EVNTSEL_MASK;
+}
+
+static void intel_pmu_enable_bts(u64 config)
+{
+ unsigned long debugctlmsr;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr |= X86_DEBUGCTL_TR;
+ debugctlmsr |= X86_DEBUGCTL_BTS;
+ debugctlmsr |= X86_DEBUGCTL_BTINT;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+ debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+ debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
+
+ update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_disable_bts(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ unsigned long debugctlmsr;
+
+ if (!cpuc->ds)
+ return;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr &=
+ ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
+ X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
+
+ update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+ if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ intel_pmu_disable_bts();
+}
+
+static void intel_pmu_enable_all(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+ if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+ struct perf_event *event =
+ cpuc->events[X86_PMC_IDX_FIXED_BTS];
+
+ if (WARN_ON_ONCE(!event))
+ return;
+
+ intel_pmu_enable_bts(event->hw.config);
+ }
+}
+
+static inline u64 intel_pmu_get_status(void)
+{
+ u64 status;
+
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+ return status;
+}
+
+static inline void intel_pmu_ack_status(u64 ack)
+{
+ wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline void
+intel_pmu_disable_fixed(struct hw_perf_event *hwc)
+{
+ int idx = hwc->idx - X86_PMC_IDX_FIXED;
+ u64 ctrl_val, mask;
+
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrl(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ (void)checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void intel_pmu_drain_bts_buffer(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct bts_record {
+ u64 from;
+ u64 to;
+ u64 flags;
+ };
+ struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
+ struct bts_record *at, *top;
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+ struct perf_sample_data data;
+ struct pt_regs regs;
+
+ if (!event)
+ return;
+
+ if (!ds)
+ return;
+
+ at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+ top = (struct bts_record *)(unsigned long)ds->bts_index;
+
+ if (top <= at)
+ return;
+
+ ds->bts_index = ds->bts_buffer_base;
+
+ perf_sample_data_init(&data, 0);
+
+ data.period = event->hw.last_period;
+ regs.ip = 0;
+
+ /*
+ * Prepare a generic sample, i.e. fill in the invariant fields.
+ * We will overwrite the from and to address before we output
+ * the sample.
+ */
+ perf_prepare_sample(&header, &data, event, &regs);
+
+ if (perf_output_begin(&handle, event,
+ header.size * (top - at), 1, 1))
+ return;
+
+ for (; at < top; at++) {
+ data.ip = at->from;
+ data.addr = at->to;
+
+ perf_output_sample(&handle, &header, &data, event);
+ }
+
+ perf_output_end(&handle);
+
+ /* There's new data available. */
+ event->hw.interrupts++;
+ event->pending_kill = POLL_IN;
+}
+
+static inline void
+intel_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
+ intel_pmu_disable_bts();
+ intel_pmu_drain_bts_buffer();
+ return;
+ }
+
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+ intel_pmu_disable_fixed(hwc);
+ return;
+ }
+
+ x86_pmu_disable_event(event);
+}
+
+static inline void
+intel_pmu_enable_fixed(struct hw_perf_event *hwc)
+{
+ int idx = hwc->idx - X86_PMC_IDX_FIXED;
+ u64 ctrl_val, bits, mask;
+ int err;
+
+ /*
+ * Enable IRQ generation (0x8),
+ * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+ * if requested:
+ */
+ bits = 0x8ULL;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+ bits |= 0x2;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+ bits |= 0x1;
+
+ /*
+ * ANY bit is supported in v3 and up
+ */
+ if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
+ bits |= 0x4;
+
+ bits <<= (idx * 4);
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrl(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ ctrl_val |= bits;
+ err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void intel_pmu_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
+ if (!__get_cpu_var(cpu_hw_events).enabled)
+ return;
+
+ intel_pmu_enable_bts(hwc->config);
+ return;
+ }
+
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+ intel_pmu_enable_fixed(hwc);
+ return;
+ }
+
+ __x86_pmu_enable_event(hwc);
+}
+
+/*
+ * Save and restart an expired event. Called by NMI contexts,
+ * so it has to be careful about preempting normal event ops:
+ */
+static int intel_pmu_save_and_restart(struct perf_event *event)
+{
+ x86_perf_event_update(event);
+ return x86_perf_event_set_period(event);
+}
+
+static void intel_pmu_reset(void)
+{
+ struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
+ unsigned long flags;
+ int idx;
+
+ if (!x86_pmu.num_events)
+ return;
+
+ local_irq_save(flags);
+
+ printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+ for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
+ checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
+ }
+ for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+ checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+ }
+ if (ds)
+ ds->bts_index = ds->bts_buffer_base;
+
+ local_irq_restore(flags);
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_events *cpuc;
+ int bit, loops;
+ u64 ack, status;
+
+ perf_sample_data_init(&data, 0);
+
+ cpuc = &__get_cpu_var(cpu_hw_events);
+
+ intel_pmu_disable_all();
+ intel_pmu_drain_bts_buffer();
+ status = intel_pmu_get_status();
+ if (!status) {
+ intel_pmu_enable_all();
+ return 0;
+ }
+
+ loops = 0;
+again:
+ if (++loops > 100) {
+ WARN_ONCE(1, "perfevents: irq loop stuck!\n");
+ perf_event_print_debug();
+ intel_pmu_reset();
+ goto done;
+ }
+
+ inc_irq_stat(apic_perf_irqs);
+ ack = status;
+ for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+ struct perf_event *event = cpuc->events[bit];
+
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+
+ if (!intel_pmu_save_and_restart(event))
+ continue;
+
+ data.period = event->hw.last_period;
+
+ if (perf_event_overflow(event, 1, &data, regs))
+ x86_pmu_stop(event);
+ }
+
+ intel_pmu_ack_status(ack);
+
+ /*
+ * Repeat if there is more work to be done:
+ */
+ status = intel_pmu_get_status();
+ if (status)
+ goto again;
+
+done:
+ intel_pmu_enable_all();
+ return 1;
+}
+
+static struct event_constraint bts_constraint =
+ EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
+
+static struct event_constraint *
+intel_special_constraints(struct perf_event *event)
+{
+ unsigned int hw_event;
+
+ hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
+
+ if (unlikely((hw_event ==
+ x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
+ (event->hw.sample_period == 1))) {
+
+ return &bts_constraint;
+ }
+ return NULL;
+}
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = intel_special_constraints(event);
+ if (c)
+ return c;
+
+ return x86_get_event_constraints(cpuc, event);
+}
+
+static __initconst struct x86_pmu core_pmu = {
+ .name = "core",
+ .handle_irq = x86_pmu_handle_irq,
+ .disable_all = x86_pmu_disable_all,
+ .enable_all = x86_pmu_enable_all,
+ .enable = x86_pmu_enable_event,
+ .disable = x86_pmu_disable_event,
+ .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .event_map = intel_pmu_event_map,
+ .raw_event = intel_pmu_raw_event,
+ .max_events = ARRAY_SIZE(intel_perfmon_event_map),
+ .apic = 1,
+ /*
+ * Intel PMCs cannot be accessed sanely above 32 bit width,
+ * so we install an artificial 1<<31 period regardless of
+ * the generic event period:
+ */
+ .max_period = (1ULL << 31) - 1,
+ .get_event_constraints = intel_get_event_constraints,
+ .event_constraints = intel_core_event_constraints,
+};
+
+static __initconst struct x86_pmu intel_pmu = {
+ .name = "Intel",
+ .handle_irq = intel_pmu_handle_irq,
+ .disable_all = intel_pmu_disable_all,
+ .enable_all = intel_pmu_enable_all,
+ .enable = intel_pmu_enable_event,
+ .disable = intel_pmu_disable_event,
+ .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .event_map = intel_pmu_event_map,
+ .raw_event = intel_pmu_raw_event,
+ .max_events = ARRAY_SIZE(intel_perfmon_event_map),
+ .apic = 1,
+ /*
+ * Intel PMCs cannot be accessed sanely above 32 bit width,
+ * so we install an artificial 1<<31 period regardless of
+ * the generic event period:
+ */
+ .max_period = (1ULL << 31) - 1,
+ .enable_bts = intel_pmu_enable_bts,
+ .disable_bts = intel_pmu_disable_bts,
+ .get_event_constraints = intel_get_event_constraints,
+
+ .cpu_starting = init_debug_store_on_cpu,
+ .cpu_dying = fini_debug_store_on_cpu,
+};
+
+static __init int intel_pmu_init(void)
+{
+ union cpuid10_edx edx;
+ union cpuid10_eax eax;
+ unsigned int unused;
+ unsigned int ebx;
+ int version;
+
+ if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ /* check for P6 processor family */
+ if (boot_cpu_data.x86 == 6) {
+ return p6_pmu_init();
+ } else {
+ return -ENODEV;
+ }
+ }
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * Branch Misses Retired hw_event or not.
+ */
+ cpuid(10, &eax.full, &ebx, &unused, &edx.full);
+ if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+ return -ENODEV;
+
+ version = eax.split.version_id;
+ if (version < 2)
+ x86_pmu = core_pmu;
+ else
+ x86_pmu = intel_pmu;
+
+ x86_pmu.version = version;
+ x86_pmu.num_events = eax.split.num_events;
+ x86_pmu.event_bits = eax.split.bit_width;
+ x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
+
+ /*
+ * Quirk: v2 perfmon does not report fixed-purpose events, so
+ * assume at least 3 events:
+ */
+ if (version > 1)
+ x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
+
+ /*
+ * Install the hw-cache-events table:
+ */
+ switch (boot_cpu_data.x86_model) {
+ case 14: /* 65 nm core solo/duo, "Yonah" */
+ pr_cont("Core events, ");
+ break;
+
+ case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+ case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+ case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+ case 29: /* six-core 45 nm xeon "Dunnington" */
+ memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ x86_pmu.event_constraints = intel_core2_event_constraints;
+ pr_cont("Core2 events, ");
+ break;
+
+ case 26: /* 45 nm nehalem, "Bloomfield" */
+ case 30: /* 45 nm nehalem, "Lynnfield" */
+ case 46: /* 45 nm nehalem-ex, "Beckton" */
+ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ x86_pmu.event_constraints = intel_nehalem_event_constraints;
+ pr_cont("Nehalem/Corei7 events, ");
+ break;
+ case 28: /* Atom */
+ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ x86_pmu.event_constraints = intel_gen_event_constraints;
+ pr_cont("Atom events, ");
+ break;
+
+ case 37: /* 32 nm nehalem, "Clarkdale" */
+ case 44: /* 32 nm nehalem, "Gulftown" */
+ memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ x86_pmu.event_constraints = intel_westmere_event_constraints;
+ pr_cont("Westmere events, ");
+ break;
+
+ default:
+ /*
+ * default constraints for v2 and up
+ */
+ x86_pmu.event_constraints = intel_gen_event_constraints;
+ pr_cont("generic architected perfmon, ");
+ }
+ return 0;
+}
+
+#else /* CONFIG_CPU_SUP_INTEL */
+
+static int intel_pmu_init(void)
+{
+ return 0;
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
new file mode 100644
index 00000000000..a330485d14d
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -0,0 +1,159 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+
+/*
+ * Not sure about some of these
+ */
+static const u64 p6_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
+};
+
+static u64 p6_pmu_event_map(int hw_event)
+{
+ return p6_perfmon_event_map[hw_event];
+}
+
+/*
+ * Event setting that is specified not to count anything.
+ * We use this to effectively disable a counter.
+ *
+ * L2_RQSTS with 0 MESI unit mask.
+ */
+#define P6_NOP_EVENT 0x0000002EULL
+
+static u64 p6_pmu_raw_event(u64 hw_event)
+{
+#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
+#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
+#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
+#define P6_EVNTSEL_INV_MASK 0x00800000ULL
+#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
+
+#define P6_EVNTSEL_MASK \
+ (P6_EVNTSEL_EVENT_MASK | \
+ P6_EVNTSEL_UNIT_MASK | \
+ P6_EVNTSEL_EDGE_MASK | \
+ P6_EVNTSEL_INV_MASK | \
+ P6_EVNTSEL_REG_MASK)
+
+ return hw_event & P6_EVNTSEL_MASK;
+}
+
+static struct event_constraint p6_event_constraints[] =
+{
+ INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
+ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
+ INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
+ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ EVENT_CONSTRAINT_END
+};
+
+static void p6_pmu_disable_all(void)
+{
+ u64 val;
+
+ /* p6 only has one enable register */
+ rdmsrl(MSR_P6_EVNTSEL0, val);
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static void p6_pmu_enable_all(void)
+{
+ unsigned long val;
+
+ /* p6 only has one enable register */
+ rdmsrl(MSR_P6_EVNTSEL0, val);
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static inline void
+p6_pmu_disable_event(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val = P6_NOP_EVENT;
+
+ if (cpuc->enabled)
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+
+ (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
+}
+
+static void p6_pmu_enable_event(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val;
+
+ val = hwc->config;
+ if (cpuc->enabled)
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+
+ (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
+}
+
+static __initconst struct x86_pmu p6_pmu = {
+ .name = "p6",
+ .handle_irq = x86_pmu_handle_irq,
+ .disable_all = p6_pmu_disable_all,
+ .enable_all = p6_pmu_enable_all,
+ .enable = p6_pmu_enable_event,
+ .disable = p6_pmu_disable_event,
+ .eventsel = MSR_P6_EVNTSEL0,
+ .perfctr = MSR_P6_PERFCTR0,
+ .event_map = p6_pmu_event_map,
+ .raw_event = p6_pmu_raw_event,
+ .max_events = ARRAY_SIZE(p6_perfmon_event_map),
+ .apic = 1,
+ .max_period = (1ULL << 31) - 1,
+ .version = 0,
+ .num_events = 2,
+ /*
+ * Events have 40 bits implemented. However they are designed such
+ * that bits [32-39] are sign extensions of bit 31. As such the
+ * effective width of a event for P6-like PMU is 32 bits only.
+ *
+ * See IA-32 Intel Architecture Software developer manual Vol 3B
+ */
+ .event_bits = 32,
+ .event_mask = (1ULL << 32) - 1,
+ .get_event_constraints = x86_get_event_constraints,
+ .event_constraints = p6_event_constraints,
+};
+
+static __init int p6_pmu_init(void)
+{
+ switch (boot_cpu_data.x86_model) {
+ case 1:
+ case 3: /* Pentium Pro */
+ case 5:
+ case 6: /* Pentium II */
+ case 7:
+ case 8:
+ case 11: /* Pentium III */
+ case 9:
+ case 13:
+ /* Pentium M */
+ break;
+ default:
+ pr_cont("unsupported p6 CPU model %d ",
+ boot_cpu_data.x86_model);
+ return -ENODEV;
+ }
+
+ x86_pmu = p6_pmu;
+
+ return 0;
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 898df9719af..fb329e9f849 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -115,17 +115,6 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
return !test_bit(counter, perfctr_nmi_owner);
}
-
-/* checks the an msr for availability */
-int avail_to_resrv_perfctr_nmi(unsigned int msr)
-{
- unsigned int counter;
-
- counter = nmi_perfctr_msr_to_bit(msr);
- BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
- return !test_bit(counter, perfctr_nmi_owner);
-}
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
int reserve_perfctr_nmi(unsigned int msr)
@@ -691,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
cpu_nmi_set_wd_enabled();
apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
wrmsr(evntsel_msr, evntsel, 0);
intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
return 1;
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1cbed97b59c..dfdb4dba232 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -22,6 +22,7 @@
*/
#include <linux/dmi.h>
+#include <linux/module.h>
#include <asm/div64.h>
#include <asm/vmware.h>
#include <asm/x86_init.h>
@@ -101,6 +102,7 @@ int vmware_platform(void)
return 0;
}
+EXPORT_SYMBOL(vmware_platform);
/*
* VMware hypervisor takes care of exporting a reliable TSC to the guest.
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index cb27fd6136c..8b862d5900f 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -40,6 +40,7 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/uaccess.h>
+#include <linux/gfp.h>
#include <asm/processor.h>
#include <asm/msr.h>
@@ -229,7 +230,7 @@ static void __exit cpuid_exit(void)
for_each_online_cpu(cpu)
cpuid_device_destroy(cpu);
class_destroy(cpuid_class);
- unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
+ __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index a4849c10a77..ebd4c51d096 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,7 +27,6 @@
#include <asm/cpu.h>
#include <asm/reboot.h>
#include <asm/virtext.h>
-#include <asm/x86_init.h>
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
@@ -103,10 +102,5 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
-
-#ifdef CONFIG_X86_64
- x86_platform.iommu_shutdown();
-#endif
-
crash_save_cpu(regs, safe_smp_processor_id());
}
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index cd97ce18c29..67414550c3c 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -5,6 +5,7 @@
* Copyright (C) IBM Corporation, 2004. All rights reserved
*/
+#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/crash_dump.h>
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index c56bc287303..6d817554780 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -123,13 +123,15 @@ print_context_stack_bp(struct thread_info *tinfo,
while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) {
unsigned long addr = *ret_addr;
- if (__kernel_text_address(addr)) {
- ops->address(data, addr, 1);
- frame = frame->next_frame;
- ret_addr = &frame->return_address;
- print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
- }
+ if (!__kernel_text_address(addr))
+ break;
+
+ ops->address(data, addr, 1);
+ frame = frame->next_frame;
+ ret_addr = &frame->return_address;
+ print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
}
+
return (unsigned long)frame;
}
EXPORT_SYMBOL_GPL(print_context_stack_bp);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index 4fd1420faff..e1a93be4fd4 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -14,6 +14,8 @@
#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
#endif
+#include <linux/uaccess.h>
+
extern void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp, char *log_lvl);
@@ -29,4 +31,26 @@ struct stack_frame {
struct stack_frame *next_frame;
unsigned long return_address;
};
+
+struct stack_frame_ia32 {
+ u32 next_frame;
+ u32 return_address;
+};
+
+static inline unsigned long rewind_frame_pointer(int n)
+{
+ struct stack_frame *frame;
+
+ get_bp(frame);
+
+#ifdef CONFIG_FRAME_POINTER
+ while (n--) {
+ if (probe_kernel_address(&frame->next_frame, frame))
+ break;
+ }
#endif
+
+ return (unsigned long)frame;
+}
+
+#endif /* DUMPSTACK_H */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index ae775ca47b2..11540a189d9 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -18,11 +18,6 @@
#include "dumpstack.h"
-/* Just a stub for now */
-int x86_is_stack_id(int id, char *name)
-{
- return 0;
-}
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 0ad9597073f..272c9f1f05f 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -33,11 +33,6 @@ static char x86_stack_ids[][8] = {
#endif
};
-int x86_is_stack_id(int id, char *name)
-{
- return x86_stack_ids[id - 1] == name;
-}
-
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
unsigned *usedp, char **idp)
{
@@ -125,9 +120,15 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
{
#ifdef CONFIG_FRAME_POINTER
struct stack_frame *frame = (struct stack_frame *)bp;
+ unsigned long next;
- if (!in_irq_stack(stack, irq_stack, irq_stack_end))
- return (unsigned long)frame->next_frame;
+ if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
+ if (!probe_kernel_address(&frame->next_frame, next))
+ return next;
+ else
+ WARN_ONCE(1, "Perf: bad frame pointer = %p in "
+ "callchain\n", &frame->next_frame);
+ }
#endif
return bp;
}
@@ -207,7 +208,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
if (ops->stack(data, "IRQ") < 0)
break;
- bp = print_context_stack(tinfo, stack, bp,
+ bp = ops->walk_stack(tinfo, stack, bp,
ops, data, irq_stack_end, &graph);
/*
* We link to the next stack (which would be
@@ -228,7 +229,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
/*
* This handles the process stack:
*/
- bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
+ bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph);
put_cpu();
}
EXPORT_SYMBOL(dump_trace);
@@ -291,6 +292,7 @@ void show_registers(struct pt_regs *regs)
sp = regs->sp;
printk("CPU %d ", cpu);
+ print_modules();
__show_regs(regs, 1);
printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
cur->comm, cur->pid, task_thread_info(cur), cur);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 05ed7ab2ca4..7bca3c6a02f 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -12,21 +12,13 @@
#include <linux/types.h>
#include <linux/init.h>
#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/firmware-map.h>
-#include <asm/pgtable.h>
-#include <asm/page.h>
#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/setup.h>
-#include <asm/trampoline.h>
/*
* The e820 map is the map that gets modified e.g. with command line parameters
@@ -517,31 +509,55 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
int checktype)
{
int i;
+ u64 end;
u64 real_removed_size = 0;
if (size > (ULLONG_MAX - start))
size = ULLONG_MAX - start;
+ end = start + size;
+ printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
+ (unsigned long long) start,
+ (unsigned long long) end);
+ if (checktype)
+ e820_print_type(old_type);
+ printk(KERN_CONT "\n");
+
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
u64 final_start, final_end;
+ u64 ei_end;
if (checktype && ei->type != old_type)
continue;
+
+ ei_end = ei->addr + ei->size;
/* totally covered? */
- if (ei->addr >= start &&
- (ei->addr + ei->size) <= (start + size)) {
+ if (ei->addr >= start && ei_end <= end) {
real_removed_size += ei->size;
memset(ei, 0, sizeof(struct e820entry));
continue;
}
+
+ /* new range is totally covered? */
+ if (ei->addr < start && ei_end > end) {
+ e820_add_region(end, ei_end - end, ei->type);
+ ei->size = start - ei->addr;
+ real_removed_size += size;
+ continue;
+ }
+
/* partially covered */
final_start = max(start, ei->addr);
- final_end = min(start + size, ei->addr + ei->size);
+ final_end = min(end, ei_end);
if (final_start >= final_end)
continue;
real_removed_size += final_end - final_start;
+ /*
+ * left range could be head or tail, so need to update
+ * size at first.
+ */
ei->size -= final_end - final_start;
if (ei->addr < final_start)
continue;
@@ -722,319 +738,44 @@ core_initcall(e820_mark_nvs_memory);
#endif
/*
- * Early reserved memory areas.
- */
-#define MAX_EARLY_RES 32
-
-struct early_res {
- u64 start, end;
- char name[16];
- char overlap_ok;
-};
-static struct early_res early_res[MAX_EARLY_RES] __initdata = {
- { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */
-#ifdef CONFIG_X86_32
- /*
- * But first pinch a few for the stack/trampoline stuff
- * FIXME: Don't need the extra page at 4K, but need to fix
- * trampoline before removing it. (see the GDT stuff)
- */
- { PAGE_SIZE, PAGE_SIZE, "EX TRAMPOLINE", 1 },
-#endif
-
- {}
-};
-
-static int __init find_overlapped_early(u64 start, u64 end)
-{
- int i;
- struct early_res *r;
-
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
- if (end > r->start && start < r->end)
- break;
- }
-
- return i;
-}
-
-/*
- * Drop the i-th range from the early reservation map,
- * by copying any higher ranges down one over it, and
- * clearing what had been the last slot.
- */
-static void __init drop_range(int i)
-{
- int j;
-
- for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
- ;
-
- memmove(&early_res[i], &early_res[i + 1],
- (j - 1 - i) * sizeof(struct early_res));
-
- early_res[j - 1].end = 0;
-}
-
-/*
- * Split any existing ranges that:
- * 1) are marked 'overlap_ok', and
- * 2) overlap with the stated range [start, end)
- * into whatever portion (if any) of the existing range is entirely
- * below or entirely above the stated range. Drop the portion
- * of the existing range that overlaps with the stated range,
- * which will allow the caller of this routine to then add that
- * stated range without conflicting with any existing range.
+ * Find a free area with specified alignment in a specific range.
*/
-static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
+u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
{
int i;
- struct early_res *r;
- u64 lower_start, lower_end;
- u64 upper_start, upper_end;
- char name[16];
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 addr;
+ u64 ei_start, ei_last;
- /* Continue past non-overlapping ranges */
- if (end <= r->start || start >= r->end)
+ if (ei->type != E820_RAM)
continue;
- /*
- * Leave non-ok overlaps as is; let caller
- * panic "Overlapping early reservations"
- * when it hits this overlap.
- */
- if (!r->overlap_ok)
- return;
-
- /*
- * We have an ok overlap. We will drop it from the early
- * reservation map, and add back in any non-overlapping
- * portions (lower or upper) as separate, overlap_ok,
- * non-overlapping ranges.
- */
-
- /* 1. Note any non-overlapping (lower or upper) ranges. */
- strncpy(name, r->name, sizeof(name) - 1);
-
- lower_start = lower_end = 0;
- upper_start = upper_end = 0;
- if (r->start < start) {
- lower_start = r->start;
- lower_end = start;
- }
- if (r->end > end) {
- upper_start = end;
- upper_end = r->end;
- }
-
- /* 2. Drop the original ok overlapping range */
- drop_range(i);
-
- i--; /* resume for-loop on copied down entry */
-
- /* 3. Add back in any non-overlapping ranges. */
- if (lower_end)
- reserve_early_overlap_ok(lower_start, lower_end, name);
- if (upper_end)
- reserve_early_overlap_ok(upper_start, upper_end, name);
- }
-}
-
-static void __init __reserve_early(u64 start, u64 end, char *name,
- int overlap_ok)
-{
- int i;
- struct early_res *r;
-
- i = find_overlapped_early(start, end);
- if (i >= MAX_EARLY_RES)
- panic("Too many early reservations");
- r = &early_res[i];
- if (r->end)
- panic("Overlapping early reservations "
- "%llx-%llx %s to %llx-%llx %s\n",
- start, end - 1, name?name:"", r->start,
- r->end - 1, r->name);
- r->start = start;
- r->end = end;
- r->overlap_ok = overlap_ok;
- if (name)
- strncpy(r->name, name, sizeof(r->name) - 1);
-}
-
-/*
- * A few early reservtations come here.
- *
- * The 'overlap_ok' in the name of this routine does -not- mean it
- * is ok for these reservations to overlap an earlier reservation.
- * Rather it means that it is ok for subsequent reservations to
- * overlap this one.
- *
- * Use this entry point to reserve early ranges when you are doing
- * so out of "Paranoia", reserving perhaps more memory than you need,
- * just in case, and don't mind a subsequent overlapping reservation
- * that is known to be needed.
- *
- * The drop_overlaps_that_are_ok() call here isn't really needed.
- * It would be needed if we had two colliding 'overlap_ok'
- * reservations, so that the second such would not panic on the
- * overlap with the first. We don't have any such as of this
- * writing, but might as well tolerate such if it happens in
- * the future.
- */
-void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
-{
- drop_overlaps_that_are_ok(start, end);
- __reserve_early(start, end, name, 1);
-}
-
-/*
- * Most early reservations come here.
- *
- * We first have drop_overlaps_that_are_ok() drop any pre-existing
- * 'overlap_ok' ranges, so that we can then reserve this memory
- * range without risk of panic'ing on an overlapping overlap_ok
- * early reservation.
- */
-void __init reserve_early(u64 start, u64 end, char *name)
-{
- if (start >= end)
- return;
-
- drop_overlaps_that_are_ok(start, end);
- __reserve_early(start, end, name, 0);
-}
-
-void __init free_early(u64 start, u64 end)
-{
- struct early_res *r;
- int i;
-
- i = find_overlapped_early(start, end);
- r = &early_res[i];
- if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
- panic("free_early on not reserved area: %llx-%llx!",
- start, end - 1);
-
- drop_range(i);
-}
-
-void __init early_res_to_bootmem(u64 start, u64 end)
-{
- int i, count;
- u64 final_start, final_end;
-
- count = 0;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
- count++;
-
- printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
- count, start, end);
- for (i = 0; i < count; i++) {
- struct early_res *r = &early_res[i];
- printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
- r->start, r->end, r->name);
- final_start = max(start, r->start);
- final_end = min(end, r->end);
- if (final_start >= final_end) {
- printk(KERN_CONT "\n");
- continue;
- }
- printk(KERN_CONT " ==> [%010llx - %010llx]\n",
- final_start, final_end);
- reserve_bootmem_generic(final_start, final_end - final_start,
- BOOTMEM_DEFAULT);
- }
-}
+ ei_last = ei->addr + ei->size;
+ ei_start = ei->addr;
+ addr = find_early_area(ei_start, ei_last, start, end,
+ size, align);
-/* Check for already reserved areas */
-static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
-{
- int i;
- u64 addr = *addrp;
- int changed = 0;
- struct early_res *r;
-again:
- i = find_overlapped_early(addr, addr + size);
- r = &early_res[i];
- if (i < MAX_EARLY_RES && r->end) {
- *addrp = addr = round_up(r->end, align);
- changed = 1;
- goto again;
+ if (addr != -1ULL)
+ return addr;
}
- return changed;
+ return -1ULL;
}
-/* Check for already reserved areas */
-static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
+u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
{
- int i;
- u64 addr = *addrp, last;
- u64 size = *sizep;
- int changed = 0;
-again:
- last = addr + size;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- if (last > r->start && addr < r->start) {
- size = r->start - addr;
- changed = 1;
- goto again;
- }
- if (last > r->end && addr < r->end) {
- addr = round_up(r->end, align);
- size = last - addr;
- changed = 1;
- goto again;
- }
- if (last <= r->end && addr >= r->start) {
- (*sizep)++;
- return 0;
- }
- }
- if (changed) {
- *addrp = addr;
- *sizep = size;
- }
- return changed;
+ return find_e820_area(start, end, size, align);
}
-/*
- * Find a free area with specified alignment in a specific range.
- */
-u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
+u64 __init get_max_mapped(void)
{
- int i;
+ u64 end = max_pfn_mapped;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 addr, last;
- u64 ei_last;
+ end <<= PAGE_SHIFT;
- if (ei->type != E820_RAM)
- continue;
- addr = round_up(ei->addr, align);
- ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- while (bad_addr(&addr, size, align) && addr+size <= ei_last)
- ;
- last = addr + size;
- if (last > ei_last)
- continue;
- if (last > end)
- continue;
- return addr;
- }
- return -1ULL;
+ return end;
}
-
/*
* Find next free range after *start
*/
@@ -1044,25 +785,19 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
- u64 addr, last;
- u64 ei_last;
+ u64 addr;
+ u64 ei_start, ei_last;
if (ei->type != E820_RAM)
continue;
- addr = round_up(ei->addr, align);
+
ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- *sizep = ei_last - addr;
- while (bad_addr_size(&addr, sizep, align) &&
- addr + *sizep <= ei_last)
- ;
- last = addr + *sizep;
- if (last > ei_last)
- continue;
- return addr;
+ ei_start = ei->addr;
+ addr = find_early_area_size(ei_start, ei_last, start,
+ sizep, align);
+
+ if (addr != -1ULL)
+ return addr;
}
return -1ULL;
@@ -1421,6 +1156,8 @@ void __init e820_reserve_resources_late(void)
end = MAX_RESOURCE_SIZE;
if (start >= end)
continue;
+ printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
+ start, end);
reserve_region_with_split(&iomem_resource, start, end,
"RAM buffer");
}
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index cdcfb122f25..c2fa9b8b497 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -362,7 +362,7 @@ void __init efi_init(void)
printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
early_iounmap(tmp, 2);
- printk(KERN_INFO "EFI v%u.%.02u by %s \n",
+ printk(KERN_INFO "EFI v%u.%.02u by %s\n",
efi.systab->hdr.revision >> 16,
efi.systab->hdr.revision & 0xffff, vendor);
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 30968924543..cd37469b54e 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -30,14 +30,32 @@
#ifdef CONFIG_DYNAMIC_FTRACE
+/*
+ * modifying_code is set to notify NMIs that they need to use
+ * memory barriers when entering or exiting. But we don't want
+ * to burden NMIs with unnecessary memory barriers when code
+ * modification is not being done (which is most of the time).
+ *
+ * A mutex is already held when ftrace_arch_code_modify_prepare
+ * and post_process are called. No locks need to be taken here.
+ *
+ * Stop machine will make sure currently running NMIs are done
+ * and new NMIs will see the updated variable before we need
+ * to worry about NMIs doing memory barriers.
+ */
+static int modifying_code __read_mostly;
+static DEFINE_PER_CPU(int, save_modifying_code);
+
int ftrace_arch_code_modify_prepare(void)
{
set_kernel_text_rw();
+ modifying_code = 1;
return 0;
}
int ftrace_arch_code_modify_post_process(void)
{
+ modifying_code = 0;
set_kernel_text_ro();
return 0;
}
@@ -149,6 +167,11 @@ static void ftrace_mod_code(void)
void ftrace_nmi_enter(void)
{
+ __get_cpu_var(save_modifying_code) = modifying_code;
+
+ if (!__get_cpu_var(save_modifying_code))
+ return;
+
if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
smp_rmb();
ftrace_mod_code();
@@ -160,6 +183,9 @@ void ftrace_nmi_enter(void)
void ftrace_nmi_exit(void)
{
+ if (!__get_cpu_var(save_modifying_code))
+ return;
+
/* Finish all executions before clearing nmi_running */
smp_mb();
atomic_dec(&nmi_running);
@@ -484,13 +510,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
}
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-
-#ifdef CONFIG_FTRACE_SYSCALLS
-
-extern unsigned long *sys_call_table;
-
-unsigned long __init arch_syscall_addr(int nr)
-{
- return (unsigned long)(&sys_call_table)[nr];
-}
-#endif
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 5051b94c906..b2e24603739 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/start_kernel.h>
+#include <linux/mm.h>
#include <asm/setup.h>
#include <asm/sections.h>
@@ -29,14 +30,25 @@ static void __init i386_default_early_setup(void)
void __init i386_start_kernel(void)
{
+#ifdef CONFIG_X86_TRAMPOLINE
+ /*
+ * But first pinch a few for the stack/trampoline stuff
+ * FIXME: Don't need the extra page at 4K, but need to fix
+ * trampoline before removing it. (see the GDT stuff)
+ */
+ reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
+ "EX TRAMPOLINE");
+#endif
+
reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+ /* Assume only end is not page aligned */
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
- u64 ramdisk_end = ramdisk_image + ramdisk_size;
+ u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
}
#endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b5a9896ca1e..7147143fd61 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -103,9 +103,10 @@ void __init x86_64_start_reservations(char *real_mode_data)
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+ /* Assume only end is not page aligned */
unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
+ unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
}
#endif
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 7fd318bac59..37c3d4b17d8 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -442,8 +442,8 @@ is386: movl $2,%ecx # set MP
*/
cmpb $0,ready
jne 1f
- movl $per_cpu__gdt_page,%eax
- movl $per_cpu__stack_canary,%ecx
+ movl $gdt_page,%eax
+ movl $stack_canary,%ecx
movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
shrl $16, %ecx
movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
@@ -706,7 +706,7 @@ idt_descr:
.word 0 # 32 bit align gdt_desc.address
ENTRY(early_gdt_descr)
.word GDT_ENTRIES*8-1
- .long per_cpu__gdt_page /* Overwritten for secondary CPUs */
+ .long gdt_page /* Overwritten for secondary CPUs */
/*
* The boot_gdt must mirror the equivalent in setup.S and is
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 2d8b5035371..3d1e6f16b7a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -27,7 +27,7 @@
#define GET_CR2_INTO_RCX movq %cr2, %rcx
#endif
-/* we are not able to switch in one step to the final KERNEL ADRESS SPACE
+/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
* because we need identity-mapped pages.
*
*/
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ba6e6588460..23b4ecdffa9 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -4,6 +4,7 @@
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
+#include <linux/slab.h>
#include <linux/hpet.h>
#include <linux/init.h>
#include <linux/cpu.h>
@@ -34,6 +35,8 @@
*/
unsigned long hpet_address;
u8 hpet_blockid; /* OS timer block num */
+u8 hpet_msi_disable;
+
#ifdef CONFIG_PCI_MSI
static unsigned long hpet_num_timers;
#endif
@@ -264,7 +267,7 @@ static void hpet_resume_device(void)
force_hpet_resume();
}
-static void hpet_resume_counter(void)
+static void hpet_resume_counter(struct clocksource *cs)
{
hpet_resume_device();
hpet_restart_counter();
@@ -397,9 +400,15 @@ static int hpet_next_event(unsigned long delta,
* then we might have a real hardware problem. We can not do
* much about it here, but at least alert the user/admin with
* a prominent warning.
+ * An erratum on some chipsets (ICH9,..), results in comparator read
+ * immediately following a write returning old value. Workaround
+ * for this is to read this value second time, when first
+ * read returns old value.
*/
- WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
+ if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) {
+ WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
KERN_WARNING "hpet: compare register read back failed.\n");
+ }
return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
}
@@ -596,6 +605,9 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
unsigned int num_timers_used = 0;
int i;
+ if (hpet_msi_disable)
+ return;
+
if (boot_cpu_has(X86_FEATURE_ARAT))
return;
id = hpet_readl(HPET_ID);
@@ -928,6 +940,9 @@ static __init int hpet_late_init(void)
hpet_reserve_platform_timers(hpet_readl(HPET_ID));
hpet_print_config();
+ if (hpet_msi_disable)
+ return 0;
+
if (boot_cpu_has(X86_FEATURE_ARAT))
return 0;
@@ -1135,6 +1150,7 @@ int hpet_set_periodic_freq(unsigned long freq)
do_div(clc, freq);
clc >>= hpet_clockevent.shift;
hpet_pie_delta = clc;
+ hpet_pie_limit = 0;
}
return 1;
}
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 05d5fec64a9..d6cc065f519 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -212,25 +212,6 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
}
-/*
- * Store a breakpoint's encoded address, length, and type.
- */
-static int arch_store_info(struct perf_event *bp)
-{
- struct arch_hw_breakpoint *info = counter_arch_bp(bp);
- /*
- * For kernel-addresses, either the address or symbol name can be
- * specified.
- */
- if (info->name)
- info->address = (unsigned long)
- kallsyms_lookup_name(info->name);
- if (info->address)
- return 0;
-
- return -EINVAL;
-}
-
int arch_bp_generic_fields(int x86_len, int x86_type,
int *gen_len, int *gen_type)
{
@@ -362,10 +343,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
return ret;
}
- ret = arch_store_info(bp);
-
- if (ret < 0)
- return ret;
/*
* Check that the low-order bits of the address are appropriate
* for the alignment implied by len.
@@ -502,8 +479,6 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
rcu_read_lock();
bp = per_cpu(bp_per_reg[i], cpu);
- if (bp)
- rc = NOTIFY_DONE;
/*
* Reset the 'i'th TRAP bit in dr6 to denote completion of
* exception handling
@@ -522,7 +497,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
rcu_read_unlock();
}
- if (dr6 & (~DR_TRAP_BITS))
+ /*
+ * Further processing in do_debug() is needed for a) user-space
+ * breakpoints (to generate signals) and b) when the system has
+ * taken exception due to multiple causes
+ */
+ if ((current->thread.debugreg6 & DR_TRAP_BITS) ||
+ (dr6 & (~DR_TRAP_BITS)))
rc = NOTIFY_DONE;
set_debugreg(dr7, 7);
@@ -547,8 +528,3 @@ void hw_breakpoint_pmu_read(struct perf_event *bp)
{
/* TODO */
}
-
-void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
-{
- /* TODO */
-}
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index f2f8540a7f3..54c31c28548 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/regset.h>
#include <linux/sched.h>
+#include <linux/slab.h>
#include <asm/sigcontext.h>
#include <asm/processor.h>
@@ -164,6 +165,11 @@ int init_fpu(struct task_struct *tsk)
return 0;
}
+/*
+ * The xstateregs_active() routine is the same as the fpregs_active() routine,
+ * as the "regset->n" for the xstate regset will be updated based on the feature
+ * capabilites supported by the xsave.
+ */
int fpregs_active(struct task_struct *target, const struct user_regset *regset)
{
return tsk_used_math(target) ? regset->n : 0;
@@ -204,8 +210,6 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
- set_stopped_child_used_math(target);
-
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
&target->thread.xstate->fxsave, 0, -1);
@@ -224,6 +228,68 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
return ret;
}
+int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{
+ int ret;
+
+ if (!cpu_has_xsave)
+ return -ENODEV;
+
+ ret = init_fpu(target);
+ if (ret)
+ return ret;
+
+ /*
+ * Copy the 48bytes defined by the software first into the xstate
+ * memory layout in the thread struct, so that we can copy the entire
+ * xstateregs to the user using one user_regset_copyout().
+ */
+ memcpy(&target->thread.xstate->fxsave.sw_reserved,
+ xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
+
+ /*
+ * Copy the xstate memory layout.
+ */
+ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ &target->thread.xstate->xsave, 0, -1);
+ return ret;
+}
+
+int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ int ret;
+ struct xsave_hdr_struct *xsave_hdr;
+
+ if (!cpu_has_xsave)
+ return -ENODEV;
+
+ ret = init_fpu(target);
+ if (ret)
+ return ret;
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+ &target->thread.xstate->xsave, 0, -1);
+
+ /*
+ * mxcsr reserved bits must be masked to zero for security reasons.
+ */
+ target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+
+ xsave_hdr = &target->thread.xstate->xsave.xsave_hdr;
+
+ xsave_hdr->xstate_bv &= pcntxt_mask;
+ /*
+ * These bits must be zero.
+ */
+ xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+
+ return ret;
+}
+
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
/*
@@ -404,8 +470,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
- set_stopped_child_used_math(target);
-
if (!HAVE_HWFP)
return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index df89102bef8..7c9f02c130f 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -5,7 +5,6 @@
#include <linux/ioport.h>
#include <linux/interrupt.h>
#include <linux/timex.h>
-#include <linux/slab.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/kernel_stat.h>
@@ -32,8 +31,14 @@
*/
static int i8259A_auto_eoi;
-DEFINE_SPINLOCK(i8259A_lock);
+DEFINE_RAW_SPINLOCK(i8259A_lock);
static void mask_and_ack_8259A(unsigned int);
+static void mask_8259A(void);
+static void unmask_8259A(void);
+static void disable_8259A_irq(unsigned int irq);
+static void enable_8259A_irq(unsigned int irq);
+static void init_8259A(int auto_eoi);
+static int i8259A_irq_pending(unsigned int irq);
struct irq_chip i8259A_chip = {
.name = "XT-PIC",
@@ -63,51 +68,51 @@ unsigned int cached_irq_mask = 0xffff;
*/
unsigned long io_apic_irqs;
-void disable_8259A_irq(unsigned int irq)
+static void disable_8259A_irq(unsigned int irq)
{
unsigned int mask = 1 << irq;
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
cached_irq_mask |= mask;
if (irq & 8)
outb(cached_slave_mask, PIC_SLAVE_IMR);
else
outb(cached_master_mask, PIC_MASTER_IMR);
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-void enable_8259A_irq(unsigned int irq)
+static void enable_8259A_irq(unsigned int irq)
{
unsigned int mask = ~(1 << irq);
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
cached_irq_mask &= mask;
if (irq & 8)
outb(cached_slave_mask, PIC_SLAVE_IMR);
else
outb(cached_master_mask, PIC_MASTER_IMR);
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-int i8259A_irq_pending(unsigned int irq)
+static int i8259A_irq_pending(unsigned int irq)
{
unsigned int mask = 1<<irq;
unsigned long flags;
int ret;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
if (irq < 8)
ret = inb(PIC_MASTER_CMD) & mask;
else
ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
return ret;
}
-void make_8259A_irq(unsigned int irq)
+static void make_8259A_irq(unsigned int irq)
{
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
@@ -150,7 +155,7 @@ static void mask_and_ack_8259A(unsigned int irq)
unsigned int irqmask = 1 << irq;
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
/*
* Lightweight spurious IRQ detection. We do not want
* to overdo spurious IRQ handling - it's usually a sign
@@ -183,7 +188,7 @@ handle_real_irq:
outb(cached_master_mask, PIC_MASTER_IMR);
outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
}
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
return;
spurious_8259A_irq:
@@ -281,37 +286,37 @@ static int __init i8259A_init_sysfs(void)
device_initcall(i8259A_init_sysfs);
-void mask_8259A(void)
+static void mask_8259A(void)
{
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-void unmask_8259A(void)
+static void unmask_8259A(void)
{
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-void init_8259A(int auto_eoi)
+static void init_8259A(int auto_eoi)
{
unsigned long flags;
i8259A_auto_eoi = auto_eoi;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
@@ -356,5 +361,49 @@ void init_8259A(int auto_eoi)
outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
+
+/*
+ * make i8259 a driver so that we can select pic functions at run time. the goal
+ * is to make x86 binary compatible among pc compatible and non-pc compatible
+ * platforms, such as x86 MID.
+ */
+
+static void legacy_pic_noop(void) { };
+static void legacy_pic_uint_noop(unsigned int unused) { };
+static void legacy_pic_int_noop(int unused) { };
+
+static struct irq_chip dummy_pic_chip = {
+ .name = "dummy pic",
+ .mask = legacy_pic_uint_noop,
+ .unmask = legacy_pic_uint_noop,
+ .disable = legacy_pic_uint_noop,
+ .mask_ack = legacy_pic_uint_noop,
+};
+static int legacy_pic_irq_pending_noop(unsigned int irq)
+{
+ return 0;
+}
+
+struct legacy_pic null_legacy_pic = {
+ .nr_legacy_irqs = 0,
+ .chip = &dummy_pic_chip,
+ .mask_all = legacy_pic_noop,
+ .restore_mask = legacy_pic_noop,
+ .init = legacy_pic_int_noop,
+ .irq_pending = legacy_pic_irq_pending_noop,
+ .make_irq = legacy_pic_uint_noop,
+};
+
+struct legacy_pic default_legacy_pic = {
+ .nr_legacy_irqs = NR_IRQS_LEGACY,
+ .chip = &i8259A_chip,
+ .mask_all = mask_8259A,
+ .restore_mask = unmask_8259A,
+ .init = init_8259A,
+ .irq_pending = i8259A_irq_pending,
+ .make_irq = make_8259A_irq,
+};
+
+struct legacy_pic *legacy_pic = &default_legacy_pic;
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index d5932226614..0ed2d300cd4 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -5,7 +5,6 @@
#include <linux/ioport.h>
#include <linux/interrupt.h>
#include <linux/timex.h>
-#include <linux/slab.h>
#include <linux/random.h>
#include <linux/kprobes.h>
#include <linux/init.h>
@@ -84,24 +83,7 @@ static struct irqaction irq2 = {
};
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... IRQ0_VECTOR - 1] = -1,
- [IRQ0_VECTOR] = 0,
- [IRQ1_VECTOR] = 1,
- [IRQ2_VECTOR] = 2,
- [IRQ3_VECTOR] = 3,
- [IRQ4_VECTOR] = 4,
- [IRQ5_VECTOR] = 5,
- [IRQ6_VECTOR] = 6,
- [IRQ7_VECTOR] = 7,
- [IRQ8_VECTOR] = 8,
- [IRQ9_VECTOR] = 9,
- [IRQ10_VECTOR] = 10,
- [IRQ11_VECTOR] = 11,
- [IRQ12_VECTOR] = 12,
- [IRQ13_VECTOR] = 13,
- [IRQ14_VECTOR] = 14,
- [IRQ15_VECTOR] = 15,
- [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+ [0 ... NR_VECTORS - 1] = -1,
};
int vector_used_by_percpu_irq(unsigned int vector)
@@ -123,12 +105,12 @@ void __init init_ISA_irqs(void)
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
init_bsp_APIC();
#endif
- init_8259A(0);
+ legacy_pic->init(0);
/*
* 16 old-style INTA-cycle interrupts:
*/
- for (i = 0; i < NR_IRQS_LEGACY; i++) {
+ for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
struct irq_desc *desc = irq_to_desc(i);
desc->status = IRQ_DISABLED;
@@ -142,9 +124,44 @@ void __init init_ISA_irqs(void)
void __init init_IRQ(void)
{
+ int i;
+
+ /*
+ * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
+ * If these IRQ's are handled by legacy interrupt-controllers like PIC,
+ * then this configuration will likely be static after the boot. If
+ * these IRQ's are handled by more mordern controllers like IO-APIC,
+ * then this vector space can be freed and re-used dynamically as the
+ * irq's migrate etc.
+ */
+ for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
+ per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
+
x86_init.irqs.intr_init();
}
+/*
+ * Setup the vector to irq mappings.
+ */
+void setup_vector_irq(int cpu)
+{
+#ifndef CONFIG_X86_IO_APIC
+ int irq;
+
+ /*
+ * On most of the platforms, legacy PIC delivers the interrupts on the
+ * boot cpu. But there are certain platforms where PIC interrupts are
+ * delivered to multiple cpu's. If the legacy IRQ is handled by the
+ * legacy PIC, for the new cpu that is coming online, setup the static
+ * legacy vector to irq mapping:
+ */
+ for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++)
+ per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
+#endif
+
+ __setup_vector_irq(cpu);
+}
+
static void __init smp_intr_init(void)
{
#ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index cbc4332a77b..0f7bc20cfcd 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -2,8 +2,8 @@
* Shared support code for AMD K8 northbridges and derivates.
* Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
*/
-#include <linux/gfp.h>
#include <linux/types.h>
+#include <linux/slab.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/module.h>
@@ -121,3 +121,17 @@ void k8_flush_garts(void)
}
EXPORT_SYMBOL_GPL(k8_flush_garts);
+static __init int init_k8_nbs(void)
+{
+ int err = 0;
+
+ err = cache_k8_northbridges();
+
+ if (err < 0)
+ printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
+
+ return err;
+}
+
+/* This has to go after the PCI subsystem */
+fs_initcall(init_k8_nbs);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index e444357375c..8afd9f321f1 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -9,6 +9,7 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/init.h>
#include <linux/stat.h>
#include <linux/io.h>
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index dd74fe7273b..b2258ca9100 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -42,6 +42,7 @@
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/nmi.h>
+#include <linux/hw_breakpoint.h>
#include <asm/debugreg.h>
#include <asm/apicdef.h>
@@ -204,40 +205,81 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
static struct hw_breakpoint {
unsigned enabled;
- unsigned type;
- unsigned len;
unsigned long addr;
+ int len;
+ int type;
+ struct perf_event **pev;
} breakinfo[4];
static void kgdb_correct_hw_break(void)
{
- unsigned long dr7;
- int correctit = 0;
- int breakbit;
int breakno;
- get_debugreg(dr7, 7);
for (breakno = 0; breakno < 4; breakno++) {
- breakbit = 2 << (breakno << 1);
- if (!(dr7 & breakbit) && breakinfo[breakno].enabled) {
- correctit = 1;
- dr7 |= breakbit;
- dr7 &= ~(0xf0000 << (breakno << 2));
- dr7 |= ((breakinfo[breakno].len << 2) |
- breakinfo[breakno].type) <<
- ((breakno << 2) + 16);
- set_debugreg(breakinfo[breakno].addr, breakno);
-
- } else {
- if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
- correctit = 1;
- dr7 &= ~breakbit;
- dr7 &= ~(0xf0000 << (breakno << 2));
- }
- }
+ struct perf_event *bp;
+ struct arch_hw_breakpoint *info;
+ int val;
+ int cpu = raw_smp_processor_id();
+ if (!breakinfo[breakno].enabled)
+ continue;
+ bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ info = counter_arch_bp(bp);
+ if (bp->attr.disabled != 1)
+ continue;
+ bp->attr.bp_addr = breakinfo[breakno].addr;
+ bp->attr.bp_len = breakinfo[breakno].len;
+ bp->attr.bp_type = breakinfo[breakno].type;
+ info->address = breakinfo[breakno].addr;
+ info->len = breakinfo[breakno].len;
+ info->type = breakinfo[breakno].type;
+ val = arch_install_hw_breakpoint(bp);
+ if (!val)
+ bp->attr.disabled = 0;
+ }
+ hw_breakpoint_restore();
+}
+
+static int hw_break_reserve_slot(int breakno)
+{
+ int cpu;
+ int cnt = 0;
+ struct perf_event **pevent;
+
+ for_each_online_cpu(cpu) {
+ cnt++;
+ pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ if (dbg_reserve_bp_slot(*pevent))
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ for_each_online_cpu(cpu) {
+ cnt--;
+ if (!cnt)
+ break;
+ pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ dbg_release_bp_slot(*pevent);
}
- if (correctit)
- set_debugreg(dr7, 7);
+ return -1;
+}
+
+static int hw_break_release_slot(int breakno)
+{
+ struct perf_event **pevent;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ if (dbg_release_bp_slot(*pevent))
+ /*
+ * The debugger is responisble for handing the retry on
+ * remove failure.
+ */
+ return -1;
+ }
+ return 0;
}
static int
@@ -251,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
if (i == 4)
return -1;
+ if (hw_break_release_slot(i)) {
+ printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr);
+ return -1;
+ }
breakinfo[i].enabled = 0;
return 0;
@@ -259,15 +305,23 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
static void kgdb_remove_all_hw_break(void)
{
int i;
+ int cpu = raw_smp_processor_id();
+ struct perf_event *bp;
- for (i = 0; i < 4; i++)
- memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint));
+ for (i = 0; i < 4; i++) {
+ if (!breakinfo[i].enabled)
+ continue;
+ bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
+ if (bp->attr.disabled == 1)
+ continue;
+ arch_uninstall_hw_breakpoint(bp);
+ bp->attr.disabled = 1;
+ }
}
static int
kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
{
- unsigned type;
int i;
for (i = 0; i < 4; i++)
@@ -278,27 +332,42 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
switch (bptype) {
case BP_HARDWARE_BREAKPOINT:
- type = 0;
- len = 1;
+ len = 1;
+ breakinfo[i].type = X86_BREAKPOINT_EXECUTE;
break;
case BP_WRITE_WATCHPOINT:
- type = 1;
+ breakinfo[i].type = X86_BREAKPOINT_WRITE;
break;
case BP_ACCESS_WATCHPOINT:
- type = 3;
+ breakinfo[i].type = X86_BREAKPOINT_RW;
break;
default:
return -1;
}
-
- if (len == 1 || len == 2 || len == 4)
- breakinfo[i].len = len - 1;
- else
+ switch (len) {
+ case 1:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_1;
+ break;
+ case 2:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_2;
+ break;
+ case 4:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_4;
+ break;
+#ifdef CONFIG_X86_64
+ case 8:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_8;
+ break;
+#endif
+ default:
return -1;
-
- breakinfo[i].enabled = 1;
+ }
breakinfo[i].addr = addr;
- breakinfo[i].type = type;
+ if (hw_break_reserve_slot(i)) {
+ breakinfo[i].addr = 0;
+ return -1;
+ }
+ breakinfo[i].enabled = 1;
return 0;
}
@@ -313,8 +382,21 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
*/
void kgdb_disable_hw_debug(struct pt_regs *regs)
{
+ int i;
+ int cpu = raw_smp_processor_id();
+ struct perf_event *bp;
+
/* Disable hardware debugging while we are in kgdb: */
set_debugreg(0UL, 7);
+ for (i = 0; i < 4; i++) {
+ if (!breakinfo[i].enabled)
+ continue;
+ bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
+ if (bp->attr.disabled == 1)
+ continue;
+ arch_uninstall_hw_breakpoint(bp);
+ bp->attr.disabled = 1;
+ }
}
/**
@@ -378,7 +460,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
struct pt_regs *linux_regs)
{
unsigned long addr;
- unsigned long dr6;
char *ptr;
int newPC;
@@ -404,20 +485,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
raw_smp_processor_id());
}
- get_debugreg(dr6, 6);
- if (!(dr6 & 0x4000)) {
- int breakno;
-
- for (breakno = 0; breakno < 4; breakno++) {
- if (dr6 & (1 << breakno) &&
- breakinfo[breakno].type == 0) {
- /* Set restore flag: */
- linux_regs->flags |= X86_EFLAGS_RF;
- break;
- }
- }
- }
- set_debugreg(0UL, 6);
kgdb_correct_hw_break();
return 0;
@@ -485,8 +552,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
break;
case DIE_DEBUG:
- if (atomic_read(&kgdb_cpu_doing_single_step) ==
- raw_smp_processor_id()) {
+ if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
if (user_mode(regs))
return single_step_cont(regs, args);
break;
@@ -539,7 +605,42 @@ static struct notifier_block kgdb_notifier = {
*/
int kgdb_arch_init(void)
{
- return register_die_notifier(&kgdb_notifier);
+ int i, cpu;
+ int ret;
+ struct perf_event_attr attr;
+ struct perf_event **pevent;
+
+ ret = register_die_notifier(&kgdb_notifier);
+ if (ret != 0)
+ return ret;
+ /*
+ * Pre-allocate the hw breakpoint structions in the non-atomic
+ * portion of kgdb because this operation requires mutexs to
+ * complete.
+ */
+ hw_breakpoint_init(&attr);
+ attr.bp_addr = (unsigned long)kgdb_arch_init;
+ attr.bp_len = HW_BREAKPOINT_LEN_1;
+ attr.bp_type = HW_BREAKPOINT_W;
+ attr.disabled = 1;
+ for (i = 0; i < 4; i++) {
+ breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
+ if (IS_ERR(breakinfo[i].pev)) {
+ printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n");
+ breakinfo[i].pev = NULL;
+ kgdb_arch_exit();
+ return -1;
+ }
+ for_each_online_cpu(cpu) {
+ pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
+ pevent[0]->hw.sample_period = 1;
+ if (pevent[0]->destroy != NULL) {
+ pevent[0]->destroy = NULL;
+ release_bp_slot(*pevent);
+ }
+ }
+ }
+ return ret;
}
/**
@@ -550,6 +651,13 @@ int kgdb_arch_init(void)
*/
void kgdb_arch_exit(void)
{
+ int i;
+ for (i = 0; i < 4; i++) {
+ if (breakinfo[i].pev) {
+ unregister_wide_hw_breakpoint(breakinfo[i].pev);
+ breakinfo[i].pev = NULL;
+ }
+ }
unregister_die_notifier(&kgdb_notifier);
}
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 5b8c7505b3b..b43bbaebe2c 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -49,6 +49,7 @@
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
+#include <linux/ftrace.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
@@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
};
const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
-/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-static void __kprobes set_jmp_op(void *from, void *to)
+static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
{
- struct __arch_jmp_op {
- char op;
+ struct __arch_relative_insn {
+ u8 op;
s32 raddr;
- } __attribute__((packed)) * jop;
- jop = (struct __arch_jmp_op *)from;
- jop->raddr = (s32)((long)(to) - ((long)(from) + 5));
- jop->op = RELATIVEJUMP_INSTRUCTION;
+ } __attribute__((packed)) *insn;
+
+ insn = (struct __arch_relative_insn *)from;
+ insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
+ insn->op = op;
+}
+
+/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
+static void __kprobes synthesize_reljump(void *from, void *to)
+{
+ __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
}
/*
@@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
/*
* Basically, kp->ainsn.insn has an original instruction.
* However, RIP-relative instruction can not do single-stepping
- * at different place, fix_riprel() tweaks the displacement of
+ * at different place, __copy_instruction() tweaks the displacement of
* that instruction. In that case, we can't recover the instruction
* from the kp->ainsn.insn.
*
@@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
}
/*
- * Adjust the displacement if the instruction uses the %rip-relative
- * addressing mode.
+ * Copy an instruction and adjust the displacement if the instruction
+ * uses the %rip-relative addressing mode.
* If it does, Return the address of the 32-bit displacement word.
* If not, return null.
* Only applicable to 64-bit x86.
*/
-static void __kprobes fix_riprel(struct kprobe *p)
+static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
{
-#ifdef CONFIG_X86_64
struct insn insn;
- kernel_insn_init(&insn, p->ainsn.insn);
+ int ret;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+ kernel_insn_init(&insn, src);
+ if (recover) {
+ insn_get_opcode(&insn);
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+ ret = recover_probed_instruction(buf,
+ (unsigned long)src);
+ if (ret)
+ return 0;
+ kernel_insn_init(&insn, buf);
+ }
+ }
+ insn_get_length(&insn);
+ memcpy(dest, insn.kaddr, insn.length);
+
+#ifdef CONFIG_X86_64
if (insn_rip_relative(&insn)) {
s64 newdisp;
u8 *disp;
+ kernel_insn_init(&insn, dest);
insn_get_displacement(&insn);
/*
* The copied instruction uses the %rip-relative addressing
@@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p)
* extension of the original signed 32-bit displacement would
* have given.
*/
- newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
- (u8 *) p->ainsn.insn;
+ newdisp = (u8 *) src + (s64) insn.displacement.value -
+ (u8 *) dest;
BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
- disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
+ disp = (u8 *) dest + insn_offset_displacement(&insn);
*(s32 *) disp = (s32) newdisp;
}
#endif
+ return insn.length;
}
static void __kprobes arch_copy_kprobe(struct kprobe *p)
{
- memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-
- fix_riprel(p);
+ /*
+ * Copy an instruction without recovering int3, because it will be
+ * put by another subsystem.
+ */
+ __copy_instruction(p->ainsn.insn, p->addr, 0);
if (can_boost(p->addr))
p->ainsn.boostable = 0;
@@ -337,6 +363,9 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
int __kprobes arch_prepare_kprobe(struct kprobe *p)
{
+ if (alternatives_text_reserved(p->addr, p->addr))
+ return -EINVAL;
+
if (!can_probe((unsigned long)p->addr))
return -EILSEQ;
/* insn: must be on special executable page on x86. */
@@ -403,18 +432,6 @@ static void __kprobes restore_btf(void)
update_debugctlmsr(current->thread.debugctlmsr);
}
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
-{
- clear_btf();
- regs->flags |= X86_EFLAGS_TF;
- regs->flags &= ~X86_EFLAGS_IF;
- /* single step inline if the instruction is an int3 */
- if (p->opcode == BREAKPOINT_INSTRUCTION)
- regs->ip = (unsigned long)p->addr;
- else
- regs->ip = (unsigned long)p->ainsn.insn;
-}
-
void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
@@ -426,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
*sara = (unsigned long) &kretprobe_trampoline;
}
+#ifdef CONFIG_OPTPROBES
+static int __kprobes setup_detour_execution(struct kprobe *p,
+ struct pt_regs *regs,
+ int reenter);
+#else
+#define setup_detour_execution(p, regs, reenter) (0)
+#endif
+
static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+ struct kprobe_ctlblk *kcb, int reenter)
{
-#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER)
+ if (setup_detour_execution(p, regs, reenter))
+ return;
+
+#if !defined(CONFIG_PREEMPT)
if (p->ainsn.boostable == 1 && !p->post_handler) {
/* Boost up -- we can execute copied instructions directly */
- reset_current_kprobe();
+ if (!reenter)
+ reset_current_kprobe();
+ /*
+ * Reentering boosted probe doesn't reset current_kprobe,
+ * nor set current_kprobe, because it doesn't use single
+ * stepping.
+ */
regs->ip = (unsigned long)p->ainsn.insn;
preempt_enable_no_resched();
return;
}
#endif
- prepare_singlestep(p, regs);
- kcb->kprobe_status = KPROBE_HIT_SS;
+ if (reenter) {
+ save_previous_kprobe(kcb);
+ set_current_kprobe(p, regs, kcb);
+ kcb->kprobe_status = KPROBE_REENTER;
+ } else
+ kcb->kprobe_status = KPROBE_HIT_SS;
+ /* Prepare real single stepping */
+ clear_btf();
+ regs->flags |= X86_EFLAGS_TF;
+ regs->flags &= ~X86_EFLAGS_IF;
+ /* single step inline if the instruction is an int3 */
+ if (p->opcode == BREAKPOINT_INSTRUCTION)
+ regs->ip = (unsigned long)p->addr;
+ else
+ regs->ip = (unsigned long)p->ainsn.insn;
}
/*
@@ -453,11 +500,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
switch (kcb->kprobe_status) {
case KPROBE_HIT_SSDONE:
case KPROBE_HIT_ACTIVE:
- save_previous_kprobe(kcb);
- set_current_kprobe(p, regs, kcb);
kprobes_inc_nmissed_count(p);
- prepare_singlestep(p, regs);
- kcb->kprobe_status = KPROBE_REENTER;
+ setup_singlestep(p, regs, kcb, 1);
break;
case KPROBE_HIT_SS:
/* A probe has been hit in the codepath leading up to, or just
@@ -532,13 +576,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
* more here.
*/
if (!p->pre_handler || !p->pre_handler(p, regs))
- setup_singlestep(p, regs, kcb);
+ setup_singlestep(p, regs, kcb, 0);
return 1;
}
} else if (kprobe_running()) {
p = __get_cpu_var(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
- setup_singlestep(p, regs, kcb);
+ setup_singlestep(p, regs, kcb, 0);
return 1;
}
} /* else: not a kprobe fault; let the kernel handle it */
@@ -547,6 +591,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
return 0;
}
+#ifdef CONFIG_X86_64
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax. */ \
+ " subq $24, %rsp\n" \
+ " pushq %rdi\n" \
+ " pushq %rsi\n" \
+ " pushq %rdx\n" \
+ " pushq %rcx\n" \
+ " pushq %rax\n" \
+ " pushq %r8\n" \
+ " pushq %r9\n" \
+ " pushq %r10\n" \
+ " pushq %r11\n" \
+ " pushq %rbx\n" \
+ " pushq %rbp\n" \
+ " pushq %r12\n" \
+ " pushq %r13\n" \
+ " pushq %r14\n" \
+ " pushq %r15\n"
+#define RESTORE_REGS_STRING \
+ " popq %r15\n" \
+ " popq %r14\n" \
+ " popq %r13\n" \
+ " popq %r12\n" \
+ " popq %rbp\n" \
+ " popq %rbx\n" \
+ " popq %r11\n" \
+ " popq %r10\n" \
+ " popq %r9\n" \
+ " popq %r8\n" \
+ " popq %rax\n" \
+ " popq %rcx\n" \
+ " popq %rdx\n" \
+ " popq %rsi\n" \
+ " popq %rdi\n" \
+ /* Skip orig_ax, ip, cs */ \
+ " addq $24, %rsp\n"
+#else
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax and gs. */ \
+ " subl $16, %esp\n" \
+ " pushl %fs\n" \
+ " pushl %ds\n" \
+ " pushl %es\n" \
+ " pushl %eax\n" \
+ " pushl %ebp\n" \
+ " pushl %edi\n" \
+ " pushl %esi\n" \
+ " pushl %edx\n" \
+ " pushl %ecx\n" \
+ " pushl %ebx\n"
+#define RESTORE_REGS_STRING \
+ " popl %ebx\n" \
+ " popl %ecx\n" \
+ " popl %edx\n" \
+ " popl %esi\n" \
+ " popl %edi\n" \
+ " popl %ebp\n" \
+ " popl %eax\n" \
+ /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
+ " addl $24, %esp\n"
+#endif
+
/*
* When a retprobed function returns, this code saves registers and
* calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -560,65 +667,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
/* We don't bother saving the ss register */
" pushq %rsp\n"
" pushfq\n"
- /*
- * Skip cs, ip, orig_ax.
- * trampoline_handler() will plug in these values
- */
- " subq $24, %rsp\n"
- " pushq %rdi\n"
- " pushq %rsi\n"
- " pushq %rdx\n"
- " pushq %rcx\n"
- " pushq %rax\n"
- " pushq %r8\n"
- " pushq %r9\n"
- " pushq %r10\n"
- " pushq %r11\n"
- " pushq %rbx\n"
- " pushq %rbp\n"
- " pushq %r12\n"
- " pushq %r13\n"
- " pushq %r14\n"
- " pushq %r15\n"
+ SAVE_REGS_STRING
" movq %rsp, %rdi\n"
" call trampoline_handler\n"
/* Replace saved sp with true return address. */
" movq %rax, 152(%rsp)\n"
- " popq %r15\n"
- " popq %r14\n"
- " popq %r13\n"
- " popq %r12\n"
- " popq %rbp\n"
- " popq %rbx\n"
- " popq %r11\n"
- " popq %r10\n"
- " popq %r9\n"
- " popq %r8\n"
- " popq %rax\n"
- " popq %rcx\n"
- " popq %rdx\n"
- " popq %rsi\n"
- " popq %rdi\n"
- /* Skip orig_ax, ip, cs */
- " addq $24, %rsp\n"
+ RESTORE_REGS_STRING
" popfq\n"
#else
" pushf\n"
- /*
- * Skip cs, ip, orig_ax and gs.
- * trampoline_handler() will plug in these values
- */
- " subl $16, %esp\n"
- " pushl %fs\n"
- " pushl %es\n"
- " pushl %ds\n"
- " pushl %eax\n"
- " pushl %ebp\n"
- " pushl %edi\n"
- " pushl %esi\n"
- " pushl %edx\n"
- " pushl %ecx\n"
- " pushl %ebx\n"
+ SAVE_REGS_STRING
" movl %esp, %eax\n"
" call trampoline_handler\n"
/* Move flags to cs */
@@ -626,15 +684,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
" movl %edx, 52(%esp)\n"
/* Replace saved flags with true return address. */
" movl %eax, 56(%esp)\n"
- " popl %ebx\n"
- " popl %ecx\n"
- " popl %edx\n"
- " popl %esi\n"
- " popl %edi\n"
- " popl %ebp\n"
- " popl %eax\n"
- /* Skip ds, es, fs, gs, orig_ax and ip */
- " addl $24, %esp\n"
+ RESTORE_REGS_STRING
" popf\n"
#endif
" ret\n");
@@ -802,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p,
* These instructions can be executed directly if it
* jumps back to correct address.
*/
- set_jmp_op((void *)regs->ip,
- (void *)orig_ip + (regs->ip - copy_ip));
+ synthesize_reljump((void *)regs->ip,
+ (void *)orig_ip + (regs->ip - copy_ip));
p->ainsn.boostable = 1;
} else {
p->ainsn.boostable = -1;
@@ -1030,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
return 0;
}
+
+#ifdef CONFIG_OPTPROBES
+
+/* Insert a call instruction at address 'from', which calls address 'to'.*/
+static void __kprobes synthesize_relcall(void *from, void *to)
+{
+ __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
+}
+
+/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
+static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
+ unsigned long val)
+{
+#ifdef CONFIG_X86_64
+ *addr++ = 0x48;
+ *addr++ = 0xbf;
+#else
+ *addr++ = 0xb8;
+#endif
+ *(unsigned long *)addr = val;
+}
+
+void __kprobes kprobes_optinsn_template_holder(void)
+{
+ asm volatile (
+ ".global optprobe_template_entry\n"
+ "optprobe_template_entry: \n"
+#ifdef CONFIG_X86_64
+ /* We don't bother saving the ss register */
+ " pushq %rsp\n"
+ " pushfq\n"
+ SAVE_REGS_STRING
+ " movq %rsp, %rsi\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val: \n"
+ ASM_NOP5
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call: \n"
+ ASM_NOP5
+ /* Move flags to rsp */
+ " movq 144(%rsp), %rdx\n"
+ " movq %rdx, 152(%rsp)\n"
+ RESTORE_REGS_STRING
+ /* Skip flags entry */
+ " addq $8, %rsp\n"
+ " popfq\n"
+#else /* CONFIG_X86_32 */
+ " pushf\n"
+ SAVE_REGS_STRING
+ " movl %esp, %edx\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val: \n"
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call: \n"
+ ASM_NOP5
+ RESTORE_REGS_STRING
+ " addl $4, %esp\n" /* skip cs */
+ " popf\n"
+#endif
+ ".global optprobe_template_end\n"
+ "optprobe_template_end: \n");
+}
+
+#define TMPL_MOVE_IDX \
+ ((long)&optprobe_template_val - (long)&optprobe_template_entry)
+#define TMPL_CALL_IDX \
+ ((long)&optprobe_template_call - (long)&optprobe_template_entry)
+#define TMPL_END_IDX \
+ ((long)&optprobe_template_end - (long)&optprobe_template_entry)
+
+#define INT3_SIZE sizeof(kprobe_opcode_t)
+
+/* Optimized kprobe call back function: called from optinsn */
+static void __kprobes optimized_callback(struct optimized_kprobe *op,
+ struct pt_regs *regs)
+{
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ preempt_disable();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(&op->kp);
+ } else {
+ /* Save skipped registers */
+#ifdef CONFIG_X86_64
+ regs->cs = __KERNEL_CS;
+#else
+ regs->cs = __KERNEL_CS | get_kernel_rpl();
+ regs->gs = 0;
+#endif
+ regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
+ regs->orig_ax = ~0UL;
+
+ __get_cpu_var(current_kprobe) = &op->kp;
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ opt_pre_handler(&op->kp, regs);
+ __get_cpu_var(current_kprobe) = NULL;
+ }
+ preempt_enable_no_resched();
+}
+
+static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
+{
+ int len = 0, ret;
+
+ while (len < RELATIVEJUMP_SIZE) {
+ ret = __copy_instruction(dest + len, src + len, 1);
+ if (!ret || !can_boost(dest + len))
+ return -EINVAL;
+ len += ret;
+ }
+ /* Check whether the address range is reserved */
+ if (ftrace_text_reserved(src, src + len - 1) ||
+ alternatives_text_reserved(src, src + len - 1))
+ return -EBUSY;
+
+ return len;
+}
+
+/* Check whether insn is indirect jump */
+static int __kprobes insn_is_indirect_jump(struct insn *insn)
+{
+ return ((insn->opcode.bytes[0] == 0xff &&
+ (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
+ insn->opcode.bytes[0] == 0xea); /* Segment based jump */
+}
+
+/* Check whether insn jumps into specified address range */
+static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
+{
+ unsigned long target = 0;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0xe0: /* loopne */
+ case 0xe1: /* loope */
+ case 0xe2: /* loop */
+ case 0xe3: /* jcxz */
+ case 0xe9: /* near relative jump */
+ case 0xeb: /* short relative jump */
+ break;
+ case 0x0f:
+ if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
+ break;
+ return 0;
+ default:
+ if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
+ break;
+ return 0;
+ }
+ target = (unsigned long)insn->next_byte + insn->immediate.value;
+
+ return (start <= target && target <= start + len);
+}
+
+/* Decode whole function to ensure any instructions don't jump into target */
+static int __kprobes can_optimize(unsigned long paddr)
+{
+ int ret;
+ unsigned long addr, size = 0, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+ /* Dummy buffers for lookup_symbol_attrs */
+ static char __dummy_buf[KSYM_NAME_LEN];
+
+ /* Lookup symbol including addr */
+ if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
+ return 0;
+
+ /* Check there is enough space for a relative jump. */
+ if (size - offset < RELATIVEJUMP_SIZE)
+ return 0;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr - offset + size) { /* Decode until function end */
+ if (search_exception_tables(addr))
+ /*
+ * Since some fixup code will jumps into this function,
+ * we can't optimize kprobe in this function.
+ */
+ return 0;
+ kernel_insn_init(&insn, (void *)addr);
+ insn_get_opcode(&insn);
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+ ret = recover_probed_instruction(buf, addr);
+ if (ret)
+ return 0;
+ kernel_insn_init(&insn, buf);
+ }
+ insn_get_length(&insn);
+ /* Recover address */
+ insn.kaddr = (void *)addr;
+ insn.next_byte = (void *)(addr + insn.length);
+ /* Check any instructions don't jump into target */
+ if (insn_is_indirect_jump(&insn) ||
+ insn_jump_into_range(&insn, paddr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE))
+ return 0;
+ addr += insn.length;
+ }
+
+ return 1;
+}
+
+/* Check optimized_kprobe can actually be optimized. */
+int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+ int i;
+ struct kprobe *p;
+
+ for (i = 1; i < op->optinsn.size; i++) {
+ p = get_kprobe(op->kp.addr + i);
+ if (p && !kprobe_disabled(p))
+ return -EEXIST;
+ }
+
+ return 0;
+}
+
+/* Check the addr is within the optimized instructions. */
+int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
+ unsigned long addr)
+{
+ return ((unsigned long)op->kp.addr <= addr &&
+ (unsigned long)op->kp.addr + op->optinsn.size > addr);
+}
+
+/* Free optimized instruction slot */
+static __kprobes
+void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+ if (op->optinsn.insn) {
+ free_optinsn_slot(op->optinsn.insn, dirty);
+ op->optinsn.insn = NULL;
+ op->optinsn.size = 0;
+ }
+}
+
+void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+ __arch_remove_optimized_kprobe(op, 1);
+}
+
+/*
+ * Copy replacing target instructions
+ * Target instructions MUST be relocatable (checked inside)
+ */
+int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+{
+ u8 *buf;
+ int ret;
+ long rel;
+
+ if (!can_optimize((unsigned long)op->kp.addr))
+ return -EILSEQ;
+
+ op->optinsn.insn = get_optinsn_slot();
+ if (!op->optinsn.insn)
+ return -ENOMEM;
+
+ /*
+ * Verify if the address gap is in 2GB range, because this uses
+ * a relative jump.
+ */
+ rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
+ if (abs(rel) > 0x7fffffff)
+ return -ERANGE;
+
+ buf = (u8 *)op->optinsn.insn;
+
+ /* Copy instructions into the out-of-line buffer */
+ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
+ if (ret < 0) {
+ __arch_remove_optimized_kprobe(op, 0);
+ return ret;
+ }
+ op->optinsn.size = ret;
+
+ /* Copy arch-dep-instance from template */
+ memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
+
+ /* Set probe information */
+ synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
+
+ /* Set probe function call */
+ synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
+
+ /* Set returning jmp instruction at the tail of out-of-line buffer */
+ synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
+ (u8 *)op->kp.addr + op->optinsn.size);
+
+ flush_icache_range((unsigned long) buf,
+ (unsigned long) buf + TMPL_END_IDX +
+ op->optinsn.size + RELATIVEJUMP_SIZE);
+ return 0;
+}
+
+/* Replace a breakpoint (int3) with a relative jump. */
+int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
+{
+ unsigned char jmp_code[RELATIVEJUMP_SIZE];
+ s32 rel = (s32)((long)op->optinsn.insn -
+ ((long)op->kp.addr + RELATIVEJUMP_SIZE));
+
+ /* Backup instructions which will be replaced by jump address */
+ memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE);
+
+ jmp_code[0] = RELATIVEJUMP_OPCODE;
+ *(s32 *)(&jmp_code[1]) = rel;
+
+ /*
+ * text_poke_smp doesn't support NMI/MCE code modifying.
+ * However, since kprobes itself also doesn't support NMI/MCE
+ * code probing, it's not a problem.
+ */
+ text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
+ return 0;
+}
+
+/* Replace a relative jump with a breakpoint (int3). */
+void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+ u8 buf[RELATIVEJUMP_SIZE];
+
+ /* Set int3 to first byte for kprobes */
+ buf[0] = BREAKPOINT_INSTRUCTION;
+ memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+ text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
+}
+
+static int __kprobes setup_detour_execution(struct kprobe *p,
+ struct pt_regs *regs,
+ int reenter)
+{
+ struct optimized_kprobe *op;
+
+ if (p->flags & KPROBE_FLAG_OPTIMIZED) {
+ /* This kprobe is really able to run optimized path. */
+ op = container_of(p, struct optimized_kprobe, kp);
+ /* Detour through copied instructions */
+ regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
+ if (!reenter)
+ reset_current_kprobe();
+ preempt_enable_no_resched();
+ return 1;
+ }
+ return 0;
+}
+#endif
+
int __init arch_init_kprobes(void)
{
return 0;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ec6ef60cbd1..ea697263b37 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -7,6 +7,7 @@
*/
#include <linux/errno.h>
+#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 4a8bb82248a..035c8c52918 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -9,6 +9,7 @@
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
+#include <linux/gfp.h>
#include <linux/reboot.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 845d80ce1ef..63eaf659623 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -42,6 +42,7 @@
#include <linux/kernel.h>
#include <linux/mca.h>
#include <linux/kprobes.h>
+#include <linux/slab.h>
#include <asm/system.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 37542b67c57..e1af7c055c7 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -36,9 +36,6 @@ MODULE_LICENSE("GPL v2");
#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
#define UCODE_UCODE_TYPE 0x00000001
-const struct firmware *firmware;
-static int supported_cpu;
-
struct equiv_cpu_entry {
u32 installed_cpu;
u32 fixed_errata_mask;
@@ -77,12 +74,15 @@ static struct equiv_cpu_entry *equiv_cpu_table;
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
u32 dummy;
- if (!supported_cpu)
- return -1;
-
memset(csig, 0, sizeof(*csig));
+ if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
+ pr_warning("microcode: CPU%d: AMD CPU family 0x%x not "
+ "supported\n", cpu, c->x86);
+ return -1;
+ }
rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev);
return 0;
@@ -294,10 +294,14 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
static enum ucode_state request_microcode_fw(int cpu, struct device *device)
{
+ const char *fw_name = "amd-ucode/microcode_amd.bin";
+ const struct firmware *firmware;
enum ucode_state ret;
- if (firmware == NULL)
+ if (request_firmware(&firmware, fw_name, device)) {
+ printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
return UCODE_NFOUND;
+ }
if (*(u32 *)firmware->data != UCODE_MAGIC) {
pr_err("invalid UCODE_MAGIC (0x%08x)\n",
@@ -307,6 +311,8 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
ret = generic_load_microcode(cpu, firmware->data, firmware->size);
+ release_firmware(firmware);
+
return ret;
}
@@ -325,31 +331,7 @@ static void microcode_fini_cpu_amd(int cpu)
uci->mc = NULL;
}
-void init_microcode_amd(struct device *device)
-{
- const char *fw_name = "amd-ucode/microcode_amd.bin";
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- WARN_ON(c->x86_vendor != X86_VENDOR_AMD);
-
- if (c->x86 < 0x10) {
- pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
- return;
- }
- supported_cpu = 1;
-
- if (request_firmware(&firmware, fw_name, device))
- pr_err("failed to load file %s\n", fw_name);
-}
-
-void fini_microcode_amd(void)
-{
- release_firmware(firmware);
-}
-
static struct microcode_ops microcode_amd_ops = {
- .init = init_microcode_amd,
- .fini = fini_microcode_amd,
.request_microcode_user = request_microcode_user,
.request_microcode_fw = request_microcode_fw,
.collect_cpu_info = collect_cpu_info_amd,
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 0c863243309..cceb5bc3c3c 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -521,9 +521,6 @@ static int __init microcode_init(void)
return PTR_ERR(microcode_pdev);
}
- if (microcode_ops->init)
- microcode_ops->init(&microcode_pdev->dev);
-
get_online_cpus();
mutex_lock(&microcode_mutex);
@@ -566,9 +563,6 @@ static void __exit microcode_exit(void)
platform_device_unregister(microcode_pdev);
- if (microcode_ops->fini)
- microcode_ops->fini();
-
microcode_ops = NULL;
pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index ebd193e476c..85a343e2893 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -328,7 +328,7 @@ static int apply_microcode(int cpu)
cpu_num, mc_intel->hdr.rev);
return -1;
}
- pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n",
+ pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
cpu_num, val[1],
mc_intel->hdr.date & 0xffff,
mc_intel->hdr.date >> 24,
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 712d15fdc41..71825806cd4 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -7,6 +7,8 @@
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/dmi.h>
+#include <linux/range.h>
+
#include <asm/pci-direct.h>
#include <linux/sort.h>
#include <asm/io.h>
@@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
{ 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
};
-struct range {
- u64 start;
- u64 end;
-};
-
static int __cpuinit cmp_range(const void *x1, const void *x2)
{
const struct range *r1 = x1;
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 89f386f044e..e0bc186d750 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -23,6 +23,7 @@
#include <linux/kernel.h>
#include <linux/bug.h>
#include <linux/mm.h>
+#include <linux/gfp.h>
#include <asm/system.h>
#include <asm/page.h>
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 40b54ceb68b..e81030f71a8 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -359,13 +359,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
x86_init.mpparse.mpc_record(1);
}
-#ifdef CONFIG_X86_BIGSMP
- generic_bigsmp_probe();
-#endif
-
- if (apic->setup_apic_routing)
- apic->setup_apic_routing();
-
if (!num_processors)
printk(KERN_ERR "MPTABLE: no processors registered!\n");
return num_processors;
@@ -671,7 +664,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
{
unsigned long size = get_mpc_size(mpf->physptr);
- reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc");
+ reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc");
}
static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -700,7 +693,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
mpf, (u64)virt_to_phys(mpf));
mem = virt_to_phys(mpf);
- reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf");
+ reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf");
if (mpf->physptr)
smp_reserve_memory(mpf);
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 3b7078abc87..0aad8670858 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -10,8 +10,211 @@
* of the License.
*/
#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sfi.h>
+#include <linux/irq.h>
+#include <linux/module.h>
#include <asm/setup.h>
+#include <asm/mpspec_def.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/mrst.h>
+#include <asm/io.h>
+#include <asm/i8259.h>
+#include <asm/apb_timer.h>
+
+static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
+static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
+int sfi_mtimer_num;
+
+struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
+EXPORT_SYMBOL_GPL(sfi_mrtc_array);
+int sfi_mrtc_num;
+
+static inline void assign_to_mp_irq(struct mpc_intsrc *m,
+ struct mpc_intsrc *mp_irq)
+{
+ memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
+}
+
+static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq,
+ struct mpc_intsrc *m)
+{
+ return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
+}
+
+static void save_mp_irq(struct mpc_intsrc *m)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (!mp_irq_cmp(&mp_irqs[i], m))
+ return;
+ }
+
+ assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
+}
+
+/* parse all the mtimer info to a static mtimer array */
+static int __init sfi_parse_mtmr(struct sfi_table_header *table)
+{
+ struct sfi_table_simple *sb;
+ struct sfi_timer_table_entry *pentry;
+ struct mpc_intsrc mp_irq;
+ int totallen;
+
+ sb = (struct sfi_table_simple *)table;
+ if (!sfi_mtimer_num) {
+ sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
+ struct sfi_timer_table_entry);
+ pentry = (struct sfi_timer_table_entry *) sb->pentry;
+ totallen = sfi_mtimer_num * sizeof(*pentry);
+ memcpy(sfi_mtimer_array, pentry, totallen);
+ }
+
+ printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num);
+ pentry = sfi_mtimer_array;
+ for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
+ printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz,"
+ " irq = %d\n", totallen, (u32)pentry->phys_addr,
+ pentry->freq_hz, pentry->irq);
+ if (!pentry->irq)
+ continue;
+ mp_irq.type = MP_IOAPIC;
+ mp_irq.irqtype = mp_INT;
+/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
+ mp_irq.irqflag = 5;
+ mp_irq.srcbus = 0;
+ mp_irq.srcbusirq = pentry->irq; /* IRQ */
+ mp_irq.dstapic = MP_APIC_ALL;
+ mp_irq.dstirq = pentry->irq;
+ save_mp_irq(&mp_irq);
+ }
+
+ return 0;
+}
+
+struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
+{
+ int i;
+ if (hint < sfi_mtimer_num) {
+ if (!sfi_mtimer_usage[hint]) {
+ pr_debug("hint taken for timer %d irq %d\n",\
+ hint, sfi_mtimer_array[hint].irq);
+ sfi_mtimer_usage[hint] = 1;
+ return &sfi_mtimer_array[hint];
+ }
+ }
+ /* take the first timer available */
+ for (i = 0; i < sfi_mtimer_num;) {
+ if (!sfi_mtimer_usage[i]) {
+ sfi_mtimer_usage[i] = 1;
+ return &sfi_mtimer_array[i];
+ }
+ i++;
+ }
+ return NULL;
+}
+
+void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
+{
+ int i;
+ for (i = 0; i < sfi_mtimer_num;) {
+ if (mtmr->irq == sfi_mtimer_array[i].irq) {
+ sfi_mtimer_usage[i] = 0;
+ return;
+ }
+ i++;
+ }
+}
+
+/* parse all the mrtc info to a global mrtc array */
+int __init sfi_parse_mrtc(struct sfi_table_header *table)
+{
+ struct sfi_table_simple *sb;
+ struct sfi_rtc_table_entry *pentry;
+ struct mpc_intsrc mp_irq;
+
+ int totallen;
+
+ sb = (struct sfi_table_simple *)table;
+ if (!sfi_mrtc_num) {
+ sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
+ struct sfi_rtc_table_entry);
+ pentry = (struct sfi_rtc_table_entry *)sb->pentry;
+ totallen = sfi_mrtc_num * sizeof(*pentry);
+ memcpy(sfi_mrtc_array, pentry, totallen);
+ }
+
+ printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num);
+ pentry = sfi_mrtc_array;
+ for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
+ printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n",
+ totallen, (u32)pentry->phys_addr, pentry->irq);
+ mp_irq.type = MP_IOAPIC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = 0;
+ mp_irq.srcbus = 0;
+ mp_irq.srcbusirq = pentry->irq; /* IRQ */
+ mp_irq.dstapic = MP_APIC_ALL;
+ mp_irq.dstirq = pentry->irq;
+ save_mp_irq(&mp_irq);
+ }
+ return 0;
+}
+
+/*
+ * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
+ * APBT but cmdline option can also override it.
+ */
+static void __cpuinit mrst_setup_secondary_clock(void)
+{
+ /* restore default lapic clock if disabled by cmdline */
+ if (disable_apbt_percpu)
+ return setup_secondary_APIC_clock();
+ apbt_setup_secondary_clock();
+}
+
+static unsigned long __init mrst_calibrate_tsc(void)
+{
+ unsigned long flags, fast_calibrate;
+
+ local_irq_save(flags);
+ fast_calibrate = apbt_quick_calibrate();
+ local_irq_restore(flags);
+
+ if (fast_calibrate)
+ return fast_calibrate;
+
+ return 0;
+}
+
+void __init mrst_time_init(void)
+{
+ sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
+ pre_init_apic_IRQ0();
+ apbt_time_init();
+}
+
+void __init mrst_rtc_init(void)
+{
+ sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
+}
+
+/*
+ * if we use per cpu apb timer, the bootclock already setup. if we use lapic
+ * timer and one apbt timer for broadcast, we need to set up lapic boot clock.
+ */
+static void __init mrst_setup_boot_clock(void)
+{
+ pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu);
+ if (disable_apbt_percpu)
+ setup_boot_APIC_clock();
+};
/*
* Moorestown specific x86_init function overrides and early setup
@@ -21,4 +224,17 @@ void __init x86_mrst_early_setup(void)
{
x86_init.resources.probe_roms = x86_init_noop;
x86_init.resources.reserve_resources = x86_init_noop;
+
+ x86_init.timers.timer_init = mrst_time_init;
+ x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock;
+
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+
+ x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock;
+
+ x86_platform.calibrate_tsc = mrst_calibrate_tsc;
+ x86_init.pci.init = pci_mrst_init;
+ x86_init.pci.fixup_irqs = x86_init_noop;
+
+ legacy_pic = &null_legacy_pic;
}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 4bd93c9b2b2..4d4468e9f47 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -37,6 +37,7 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/uaccess.h>
+#include <linux/gfp.h>
#include <asm/processor.h>
#include <asm/msr.h>
@@ -285,7 +286,7 @@ static void __exit msr_exit(void)
for_each_online_cpu(cpu)
msr_device_destroy(cpu);
class_destroy(msr_class);
- unregister_chrdev(MSR_MAJOR, "cpu/msr");
+ __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
unregister_hotcpu_notifier(&msr_class_cpu_notifier);
}
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 9d1d263f786..8297160c41b 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -17,7 +17,9 @@
#include <linux/spinlock.h>
#include <linux/io.h>
#include <linux/string.h>
+
#include <asm/geode.h>
+#include <asm/setup.h>
#include <asm/olpc.h>
#ifdef CONFIG_OPEN_FIRMWARE
@@ -243,9 +245,11 @@ static int __init olpc_init(void)
olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,
(unsigned char *) &olpc_platform_info.ecver, 1);
- /* check to see if the VSA exists */
- if (cs5535_has_vsa2())
- olpc_platform_info.flags |= OLPC_F_VSA;
+#ifdef CONFIG_PCI_OLPC
+ /* If the VSA exists let it emulate PCI, if not emulate in kernel */
+ if (!cs5535_has_vsa2())
+ x86_init.pci.arch_init = pci_olpc_init;
+#endif
printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d1631..1db183ed7c0 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = {
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
-#ifdef CONFIG_HIGHPTE
- .kmap_atomic_pte = kmap_atomic,
-#endif
-
#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
.set_pte_atomic = native_set_pte_atomic,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 2bbde607814..fb99f7edb34 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1309,7 +1309,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
/*
* get_tce_space_from_tar():
* Function for kdump case. Get the tce tables from first kernel
- * by reading the contents of the base adress register of calgary iommu
+ * by reading the contents of the base address register of calgary iommu
*/
static void __init get_tce_space_from_tar(void)
{
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 75e14e21f61..4b7e3d8b01d 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -2,6 +2,7 @@
#include <linux/dma-debug.h>
#include <linux/dmar.h>
#include <linux/bootmem.h>
+#include <linux/gfp.h>
#include <linux/pci.h>
#include <linux/kmemleak.h>
@@ -38,7 +39,7 @@ int iommu_detected __read_mostly = 0;
* This variable becomes 1 if iommu=pt is passed on the kernel command line.
* If this variable is 1, IOMMU implementations do no DMA translation for
* devices and allow every device to access to whole physical memory. This is
- * useful if a user want to use an IOMMU only for KVM device assignment to
+ * useful if a user wants to use an IOMMU only for KVM device assignment to
* guests and not for driver dma translation.
*/
int iommu_pass_through __read_mostly;
@@ -65,7 +66,7 @@ int dma_set_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_mask);
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
static __initdata void *dma32_bootmem_ptr;
static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
@@ -116,14 +117,21 @@ static void __init dma32_free_bootmem(void)
dma32_bootmem_ptr = NULL;
dma32_bootmem_size = 0;
}
+#else
+void __init dma32_reserve_bootmem(void)
+{
+}
+static void __init dma32_free_bootmem(void)
+{
+}
+
#endif
void __init pci_iommu_alloc(void)
{
-#ifdef CONFIG_X86_64
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
-#endif
+
if (pci_swiotlb_detect())
goto out;
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 34de53b46f8..0f7f130caa6 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -29,6 +29,7 @@
#include <linux/iommu-helper.h>
#include <linux/sysdev.h>
#include <linux/io.h>
+#include <linux/gfp.h>
#include <asm/atomic.h>
#include <asm/mtrr.h>
#include <asm/pgtable.h>
@@ -564,6 +565,9 @@ static void enable_gart_translations(void)
enable_gart_translation(dev, __pa(agp_gatt_table));
}
+
+ /* Flush the GART-TLB to remove stale entries */
+ k8_flush_garts();
}
/*
@@ -735,7 +739,7 @@ int __init gart_iommu_init(void)
unsigned long scratch;
long i;
- if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
+ if (num_k8_northbridges == 0)
return 0;
#ifndef CONFIG_AGP_AMD64
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 22be12b60a8..3af4af810c0 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -4,6 +4,7 @@
#include <linux/scatterlist.h>
#include <linux/string.h>
#include <linux/init.h>
+#include <linux/gfp.h>
#include <linux/pci.h>
#include <linux/mm.h>
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index cf1e04b2ad6..28ad9f4d8b9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -110,8 +110,8 @@ void show_regs_common(void)
if (!product)
product = "";
- printk("\n");
- printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n",
+ printk(KERN_CONT "\n");
+ printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n",
current->pid, current->comm, print_tainted(),
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
@@ -122,18 +122,6 @@ void flush_thread(void)
{
struct task_struct *tsk = current;
-#ifdef CONFIG_X86_64
- if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
- clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
- if (test_tsk_thread_flag(tsk, TIF_IA32)) {
- clear_tsk_thread_flag(tsk, TIF_IA32);
- } else {
- set_tsk_thread_flag(tsk, TIF_IA32);
- current_thread_info()->status |= TS_COMPAT;
- }
- }
-#endif
-
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
/*
@@ -295,6 +283,8 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
regs.es = __USER_DS;
regs.fs = __KERNEL_PERCPU;
regs.gs = __KERNEL_STACK_CANARY;
+#else
+ regs.ss = __KERNEL_DS;
#endif
regs.orig_ax = -1;
@@ -536,21 +526,37 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
}
/*
- * Check for AMD CPUs, which have potentially C1E support
+ * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e.
+ * For more information see
+ * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
+ * - Erratum #365 for family 0x11 (not affected because C1e not in use)
*/
static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
{
+ u64 val;
if (c->x86_vendor != X86_VENDOR_AMD)
- return 0;
-
- if (c->x86 < 0x0F)
- return 0;
+ goto no_c1e_idle;
/* Family 0x0f models < rev F do not have C1E */
- if (c->x86 == 0x0f && c->x86_model < 0x40)
- return 0;
+ if (c->x86 == 0x0F && c->x86_model >= 0x40)
+ return 1;
- return 1;
+ if (c->x86 == 0x10) {
+ /*
+ * check OSVW bit for CPUs that are not affected
+ * by erratum #400
+ */
+ rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
+ if (val >= 2) {
+ rdmsrl(MSR_AMD64_OSVW_STATUS, val);
+ if (!(val & BIT(1)))
+ goto no_c1e_idle;
+ }
+ return 1;
+ }
+
+no_c1e_idle:
+ return 0;
}
static cpumask_var_t c1e_mask;
@@ -617,7 +623,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
if (pm_idle == poll_idle && smp_num_siblings > 1) {
- printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
+ printk_once(KERN_WARNING "WARNING: polling idle and HT enabled,"
" performance may degrade.\n");
}
#endif
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index fe6a34e42bd..f6c62667e30 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -139,16 +139,16 @@ void __show_regs(struct pt_regs *regs, int all)
show_regs_common();
- printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
+ printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
(u16)regs->cs, regs->ip, regs->flags,
smp_processor_id());
print_symbol("EIP is at %s\n", regs->ip);
- printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+ printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
- printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
+ printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
regs->si, regs->di, regs->bp, sp);
- printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
+ printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
(u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
if (!all)
@@ -158,19 +158,19 @@ void __show_regs(struct pt_regs *regs, int all)
cr2 = read_cr2();
cr3 = read_cr3();
cr4 = read_cr4_safe();
- printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
+ printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4);
get_debugreg(d0, 0);
get_debugreg(d1, 1);
get_debugreg(d2, 2);
get_debugreg(d3, 3);
- printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
+ printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
d0, d1, d2, d3);
get_debugreg(d6, 6);
get_debugreg(d7, 7);
- printk("DR6: %08lx DR7: %08lx\n",
+ printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
d6, d7);
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 418f860880a..dc9690b4c4c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -161,19 +161,19 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int ds, cs, es;
show_regs_common();
- printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
+ printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
printk_address(regs->ip, 1);
- printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
+ printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
regs->sp, regs->flags);
- printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
+ printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->ax, regs->bx, regs->cx);
- printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
+ printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
regs->dx, regs->si, regs->di);
- printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
+ printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
regs->bp, regs->r8, regs->r9);
- printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
+ printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
regs->r10, regs->r11, regs->r12);
- printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
+ printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
regs->r13, regs->r14, regs->r15);
asm("movl %%ds,%0" : "=r" (ds));
@@ -194,21 +194,21 @@ void __show_regs(struct pt_regs *regs, int all)
cr3 = read_cr3();
cr4 = read_cr4();
- printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+ printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
fs, fsindex, gs, gsindex, shadowgs);
- printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
+ printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
es, cr0);
- printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
+ printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
cr4);
get_debugreg(d0, 0);
get_debugreg(d1, 1);
get_debugreg(d2, 2);
- printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
+ printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
get_debugreg(d3, 3);
get_debugreg(d6, 6);
get_debugreg(d7, 7);
- printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+ printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void release_thread(struct task_struct *dead_task)
@@ -515,6 +515,18 @@ void set_personality_64bit(void)
current->personality &= ~READ_IMPLIES_EXEC;
}
+void set_personality_ia32(void)
+{
+ /* inherit personality from parent */
+
+ /* Make sure to be in 32bit mode */
+ set_thread_flag(TIF_IA32);
+ current->personality |= force_personality32;
+
+ /* Prepare the first "return" to user space */
+ current_thread_info()->status |= TS_COMPAT;
+}
+
unsigned long get_wchan(struct task_struct *p)
{
unsigned long stack;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 017d937639f..2e9b55027b7 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -12,6 +12,7 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
+#include <linux/slab.h>
#include <linux/ptrace.h>
#include <linux/regset.h>
#include <linux/tracehook.h>
@@ -48,6 +49,7 @@ enum x86_regset {
REGSET_FP,
REGSET_XFP,
REGSET_IOPERM64 = REGSET_XFP,
+ REGSET_XSTATE,
REGSET_TLS,
REGSET_IOPERM32,
};
@@ -140,30 +142,6 @@ static const int arg_offs_table[] = {
#endif
};
-/**
- * regs_get_argument_nth() - get Nth argument at function call
- * @regs: pt_regs which contains registers at function entry.
- * @n: argument number.
- *
- * regs_get_argument_nth() returns @n th argument of a function call.
- * Since usually the kernel stack will be changed right after function entry,
- * you must use this at function entry. If the @n th entry is NOT in the
- * kernel stack or pt_regs, this returns 0.
- */
-unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
-{
- if (n < ARRAY_SIZE(arg_offs_table))
- return *(unsigned long *)((char *)regs + arg_offs_table[n]);
- else {
- /*
- * The typical case: arg n is on the stack.
- * (Note: stack[0] = return address, so skip it)
- */
- n -= ARRAY_SIZE(arg_offs_table);
- return regs_get_kernel_stack_nth(regs, 1 + n);
- }
-}
-
/*
* does not yet catch signals sent when the child dies.
* in exit.c or in signal.c.
@@ -604,7 +582,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
struct perf_event_attr attr;
/*
- * We shoud have at least an inactive breakpoint at this
+ * We should have at least an inactive breakpoint at this
* slot. It means the user is writing dr7 without having
* written the address register first
*/
@@ -702,7 +680,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
} else if (n == 6) {
val = thread->debugreg6;
} else if (n == 7) {
- val = ptrace_get_dr7(thread->ptrace_bps);
+ val = thread->ptrace_dr7;
}
return val;
}
@@ -778,8 +756,11 @@ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
return rc;
}
/* All that's left is DR7 */
- if (n == 7)
+ if (n == 7) {
rc = ptrace_write_dr7(tsk, val);
+ if (!rc)
+ thread->ptrace_dr7 = val;
+ }
ret_path:
return rc;
@@ -1584,7 +1565,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
#ifdef CONFIG_X86_64
-static const struct user_regset x86_64_regsets[] = {
+static struct user_regset x86_64_regsets[] __read_mostly = {
[REGSET_GENERAL] = {
.core_note_type = NT_PRSTATUS,
.n = sizeof(struct user_regs_struct) / sizeof(long),
@@ -1597,6 +1578,12 @@ static const struct user_regset x86_64_regsets[] = {
.size = sizeof(long), .align = sizeof(long),
.active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
},
+ [REGSET_XSTATE] = {
+ .core_note_type = NT_X86_XSTATE,
+ .size = sizeof(u64), .align = sizeof(u64),
+ .active = xstateregs_active, .get = xstateregs_get,
+ .set = xstateregs_set
+ },
[REGSET_IOPERM64] = {
.core_note_type = NT_386_IOPERM,
.n = IO_BITMAP_LONGS,
@@ -1622,7 +1609,7 @@ static const struct user_regset_view user_x86_64_view = {
#endif /* CONFIG_X86_64 */
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-static const struct user_regset x86_32_regsets[] = {
+static struct user_regset x86_32_regsets[] __read_mostly = {
[REGSET_GENERAL] = {
.core_note_type = NT_PRSTATUS,
.n = sizeof(struct user_regs_struct32) / sizeof(u32),
@@ -1641,6 +1628,12 @@ static const struct user_regset x86_32_regsets[] = {
.size = sizeof(u32), .align = sizeof(u32),
.active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
},
+ [REGSET_XSTATE] = {
+ .core_note_type = NT_X86_XSTATE,
+ .size = sizeof(u64), .align = sizeof(u64),
+ .active = xstateregs_active, .get = xstateregs_get,
+ .set = xstateregs_set
+ },
[REGSET_TLS] = {
.core_note_type = NT_386_TLS,
.n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
@@ -1663,6 +1656,23 @@ static const struct user_regset_view user_x86_32_view = {
};
#endif
+/*
+ * This represents bytes 464..511 in the memory layout exported through
+ * the REGSET_XSTATE interface.
+ */
+u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
+
+void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
+{
+#ifdef CONFIG_X86_64
+ x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
+#endif
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64);
+#endif
+ xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
+}
+
const struct user_regset_view *task_user_regset_view(struct task_struct *task)
{
#ifdef CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 18093d7498f..12e9feaa2f7 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,6 +491,19 @@ void force_hpet_resume(void)
break;
}
}
+
+/*
+ * HPET MSI on some boards (ATI SB700/SB800) has side effect on
+ * floppy DMA. Disable HPET MSI on such platforms.
+ */
+static void force_disable_hpet_msi(struct pci_dev *unused)
+{
+ hpet_msi_disable = 1;
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
+ force_disable_hpet_msi);
+
#endif
#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1545bc0c984..8e1aac86b50 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -203,6 +203,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
},
},
+ { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 760",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
+ DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
+ },
+ },
{ /* Handle problems with rebooting on Dell 2400's */
.callback = set_bios_reboot,
.ident = "Dell PowerEdge 2400",
@@ -452,6 +461,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
},
},
+ { /* Handle problems with rebooting on the iMac9,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac9,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
+ },
+ },
{ }
};
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f7b8b9894b2..c4851eff57b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -55,7 +55,6 @@
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/ptrace.h>
-#include <linux/slab.h>
#include <linux/user.h>
#include <linux/delay.h>
@@ -121,7 +120,9 @@
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
+#ifdef CONFIG_DMI
RESERVE_BRK(dmi_alloc, 65536);
+#endif
unsigned int boot_cpu_id __read_mostly;
@@ -312,16 +313,17 @@ static void __init reserve_brk(void)
#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
static void __init relocate_initrd(void)
{
-
+ /* Assume only end is not page aligned */
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 area_size = PAGE_ALIGN(ramdisk_size);
u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
u64 ramdisk_here;
unsigned long slop, clen, mapaddr;
char *p, *q;
/* We need to move the initrd down into lowmem */
- ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
+ ramdisk_here = find_e820_area(0, end_of_lowmem, area_size,
PAGE_SIZE);
if (ramdisk_here == -1ULL)
@@ -330,7 +332,7 @@ static void __init relocate_initrd(void)
/* Note: this includes all the lowmem currently occupied by
the initrd, we rely on that fact to keep the data intact. */
- reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
+ reserve_early(ramdisk_here, ramdisk_here + area_size,
"NEW RAMDISK");
initrd_start = ramdisk_here + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
@@ -374,9 +376,10 @@ static void __init relocate_initrd(void)
static void __init reserve_initrd(void)
{
+ /* Assume only end is not page aligned */
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
- u64 ramdisk_end = ramdisk_image + ramdisk_size;
+ u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
if (!boot_params.hdr.type_of_loader ||
@@ -604,6 +607,16 @@ static int __init setup_elfcorehdr(char *arg)
early_param("elfcorehdr", setup_elfcorehdr);
#endif
+static __init void reserve_ibft_region(void)
+{
+ unsigned long addr, size = 0;
+
+ addr = find_ibft_region(&size);
+
+ if (size)
+ reserve_early_overlap_ok(addr, addr + size, "ibft");
+}
+
#ifdef CONFIG_X86_RESERVE_LOW_64K
static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
{
@@ -642,23 +655,48 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
},
},
- {
/*
- * AMI BIOS with low memory corruption was found on Intel DG45ID board.
- * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
+ * AMI BIOS with low memory corruption was found on Intel DG45ID and
+ * DG45FC boards.
+ * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
* match only DMI_BOARD_NAME and see if there is more bad products
* with this vendor.
*/
+ {
.callback = dmi_low_memory_corruption,
.ident = "AMI BIOS",
.matches = {
DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
},
},
+ {
+ .callback = dmi_low_memory_corruption,
+ .ident = "AMI BIOS",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
+ },
+ },
#endif
{}
};
+static void __init trim_bios_range(void)
+{
+ /*
+ * A special case is the first 4Kb of memory;
+ * This is a BIOS owned area, not kernel ram, but generally
+ * not listed as such in the E820 table.
+ */
+ e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
+ /*
+ * special case: Some BIOSen report the PC BIOS
+ * area (640->1Mb) as ram even though it is not.
+ * take them out.
+ */
+ e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+}
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -822,7 +860,7 @@ void __init setup_arch(char **cmdline_p)
insert_resource(&iomem_resource, &data_resource);
insert_resource(&iomem_resource, &bss_resource);
-
+ trim_bios_range();
#ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
@@ -881,6 +919,8 @@ void __init setup_arch(char **cmdline_p)
*/
find_smp_config();
+ reserve_ibft_region();
+
reserve_trampoline_memory();
#ifdef CONFIG_ACPI_SLEEP
@@ -942,17 +982,11 @@ void __init setup_arch(char **cmdline_p)
#endif
initmem_init(0, max_pfn, acpi, k8);
-
-#ifdef CONFIG_X86_64
- /*
- * dma32_reserve_bootmem() allocates bootmem which may conflict
- * with the crashkernel command line, so do that after
- * reserve_crashkernel()
- */
- dma32_reserve_bootmem();
+#ifndef CONFIG_NO_BOOTMEM
+ early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
#endif
- reserve_ibft_region();
+ dma32_reserve_bootmem();
#ifdef CONFIG_KVM_CLOCK
kvmclock_init();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 35abcb8b00e..ef6370b00e7 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -137,7 +137,13 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
static void __init pcpu_fc_free(void *ptr, size_t size)
{
+#ifdef CONFIG_NO_BOOTMEM
+ u64 start = __pa(ptr);
+ u64 end = start + size;
+ free_early_partial(start, end);
+#else
free_bootmem(__pa(ptr), size);
+#endif
}
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index ec1de97600e..d801210945d 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -21,6 +21,7 @@
#include <linux/cache.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
+#include <linux/gfp.h>
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 678d0b8c26f..763d815e27a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -48,6 +48,8 @@
#include <linux/err.h>
#include <linux/nmi.h>
#include <linux/tboot.h>
+#include <linux/stackprotector.h>
+#include <linux/gfp.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -67,6 +69,7 @@
#include <linux/mc146818rtc.h>
#include <asm/smpboot_hooks.h>
+#include <asm/i8259.h>
#ifdef CONFIG_X86_32
u8 apicid_2_node[MAX_APICID];
@@ -240,7 +243,10 @@ static void __cpuinit smp_callin(void)
end_local_APIC_setup();
map_cpu_to_logical_apicid();
- notify_cpu_starting(cpuid);
+ /*
+ * Need to setup vector mappings before we enable interrupts.
+ */
+ setup_vector_irq(smp_processor_id());
/*
* Get our bogomips.
*
@@ -257,6 +263,8 @@ static void __cpuinit smp_callin(void)
*/
smp_store_cpu_info(cpuid);
+ notify_cpu_starting(cpuid);
+
/*
* Allow the master to continue.
*/
@@ -286,9 +294,9 @@ notrace static void __cpuinit start_secondary(void *unused)
check_tsc_sync_target();
if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
+ legacy_pic->chip->mask(0);
enable_NMI_through_LVT0();
- enable_8259A_irq(0);
+ legacy_pic->chip->unmask(0);
}
#ifdef CONFIG_X86_32
@@ -315,15 +323,18 @@ notrace static void __cpuinit start_secondary(void *unused)
*/
ipi_call_lock();
lock_vector_lock();
- __setup_vector_irq(smp_processor_id());
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
ipi_call_unlock();
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+ x86_platform.nmi_init();
/* enable local interrupts */
local_irq_enable();
+ /* to prevent fake stack check failure in clock setup */
+ boot_init_stack_canary();
+
x86_cpuinit.setup_percpu_clockev();
wmb();
@@ -1083,9 +1094,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_cpu_sibling_map(0);
enable_IR_x2apic();
-#ifdef CONFIG_X86_64
default_setup_apic_routing();
-#endif
if (smp_sanity_check(max_cpus) < 0) {
printk(KERN_INFO "SMP disabled\n");
@@ -1213,11 +1222,12 @@ __init void prefill_possible_map(void)
total_cpus = max_t(int, possible, num_processors + disabled_cpus);
- if (possible > CONFIG_NR_CPUS) {
+ /* nr_cpu_ids could be reduced via nr_cpus= */
+ if (possible > nr_cpu_ids) {
printk(KERN_WARNING
"%d Processors exceeds NR_CPUS limit of %d\n",
- possible, CONFIG_NR_CPUS);
- possible = CONFIG_NR_CPUS;
+ possible, nr_cpu_ids);
+ possible = nr_cpu_ids;
}
printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index dee1ff7cba5..196552bb412 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -25,191 +25,6 @@
#include <asm/syscalls.h>
/*
- * Perform the select(nd, in, out, ex, tv) and mmap() system
- * calls. Linux/i386 didn't use to be able to handle more than
- * 4 system call parameters, so these system calls used a memory
- * block for parameter passing..
- */
-
-struct mmap_arg_struct {
- unsigned long addr;
- unsigned long len;
- unsigned long prot;
- unsigned long flags;
- unsigned long fd;
- unsigned long offset;
-};
-
-asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
-{
- struct mmap_arg_struct a;
- int err = -EFAULT;
-
- if (copy_from_user(&a, arg, sizeof(a)))
- goto out;
-
- err = -EINVAL;
- if (a.offset & ~PAGE_MASK)
- goto out;
-
- err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags,
- a.fd, a.offset >> PAGE_SHIFT);
-out:
- return err;
-}
-
-
-struct sel_arg_struct {
- unsigned long n;
- fd_set __user *inp, *outp, *exp;
- struct timeval __user *tvp;
-};
-
-asmlinkage int old_select(struct sel_arg_struct __user *arg)
-{
- struct sel_arg_struct a;
-
- if (copy_from_user(&a, arg, sizeof(a)))
- return -EFAULT;
- /* sys_select() does the appropriate kernel locking */
- return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
-}
-
-/*
- * sys_ipc() is the de-multiplexer for the SysV IPC calls..
- *
- * This is really horribly ugly.
- */
-asmlinkage int sys_ipc(uint call, int first, int second,
- int third, void __user *ptr, long fifth)
-{
- int version, ret;
-
- version = call >> 16; /* hack for backward compatibility */
- call &= 0xffff;
-
- switch (call) {
- case SEMOP:
- return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL);
- case SEMTIMEDOP:
- return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
- (const struct timespec __user *)fifth);
-
- case SEMGET:
- return sys_semget(first, second, third);
- case SEMCTL: {
- union semun fourth;
- if (!ptr)
- return -EINVAL;
- if (get_user(fourth.__pad, (void __user * __user *) ptr))
- return -EFAULT;
- return sys_semctl(first, second, third, fourth);
- }
-
- case MSGSND:
- return sys_msgsnd(first, (struct msgbuf __user *) ptr,
- second, third);
- case MSGRCV:
- switch (version) {
- case 0: {
- struct ipc_kludge tmp;
- if (!ptr)
- return -EINVAL;
-
- if (copy_from_user(&tmp,
- (struct ipc_kludge __user *) ptr,
- sizeof(tmp)))
- return -EFAULT;
- return sys_msgrcv(first, tmp.msgp, second,
- tmp.msgtyp, third);
- }
- default:
- return sys_msgrcv(first,
- (struct msgbuf __user *) ptr,
- second, fifth, third);
- }
- case MSGGET:
- return sys_msgget((key_t) first, second);
- case MSGCTL:
- return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);
-
- case SHMAT:
- switch (version) {
- default: {
- ulong raddr;
- ret = do_shmat(first, (char __user *) ptr, second, &raddr);
- if (ret)
- return ret;
- return put_user(raddr, (ulong __user *) third);
- }
- case 1: /* iBCS2 emulator entry point */
- if (!segment_eq(get_fs(), get_ds()))
- return -EINVAL;
- /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
- return do_shmat(first, (char __user *) ptr, second, (ulong *) third);
- }
- case SHMDT:
- return sys_shmdt((char __user *)ptr);
- case SHMGET:
- return sys_shmget(first, second, third);
- case SHMCTL:
- return sys_shmctl(first, second,
- (struct shmid_ds __user *) ptr);
- default:
- return -ENOSYS;
- }
-}
-
-/*
- * Old cruft
- */
-asmlinkage int sys_uname(struct old_utsname __user *name)
-{
- int err;
- if (!name)
- return -EFAULT;
- down_read(&uts_sem);
- err = copy_to_user(name, utsname(), sizeof(*name));
- up_read(&uts_sem);
- return err? -EFAULT:0;
-}
-
-asmlinkage int sys_olduname(struct oldold_utsname __user *name)
-{
- int error;
-
- if (!name)
- return -EFAULT;
- if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
- return -EFAULT;
-
- down_read(&uts_sem);
-
- error = __copy_to_user(&name->sysname, &utsname()->sysname,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->nodename, &utsname()->nodename,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->release, &utsname()->release,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->release + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->version, &utsname()->version,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->version + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->machine, &utsname()->machine,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->machine + __OLD_UTS_LEN);
-
- up_read(&uts_sem);
-
- error = error ? -EFAULT : 0;
-
- return error;
-}
-
-
-/*
* Do a system call from kernel instead of calling sys_execve so we
* end up with proper pt_regs.
*/
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 8aa2057efd1..ff14a5044ce 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -209,15 +209,3 @@ bottomup:
return addr;
}
-
-
-SYSCALL_DEFINE1(uname, struct new_utsname __user *, name)
-{
- int err;
- down_read(&uts_sem);
- err = copy_to_user(name, utsname(), sizeof(*name));
- up_read(&uts_sem);
- if (personality(current->personality) == PER_LINUX32)
- err |= copy_to_user(&name->machine, "i686", 5);
- return err ? -EFAULT : 0;
-}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 15228b5d3eb..8b372934121 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -81,7 +81,7 @@ ENTRY(sys_call_table)
.long sys_settimeofday
.long sys_getgroups16 /* 80 */
.long sys_setgroups16
- .long old_select
+ .long sys_old_select
.long sys_symlink
.long sys_lstat
.long sys_readlink /* 85 */
@@ -89,7 +89,7 @@ ENTRY(sys_call_table)
.long sys_swapon
.long sys_reboot
.long sys_old_readdir
- .long old_mmap /* 90 */
+ .long sys_old_mmap /* 90 */
.long sys_munmap
.long sys_truncate
.long sys_ftruncate
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index be2573448ed..fb5cc5e14cf 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -70,11 +70,11 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
* manually to deassert NMI lines for the watchdog if run
* on an 82489DX-based system.
*/
- spin_lock(&i8259A_lock);
+ raw_spin_lock(&i8259A_lock);
outb(0x0c, PIC_MASTER_OCW3);
/* Ack the IRQ; AEOI will end it automatically. */
inb(PIC_MASTER_POLL);
- spin_unlock(&i8259A_lock);
+ raw_spin_unlock(&i8259A_lock);
}
global_clock_event->event_handler(global_clock_event);
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 364d015efeb..17b03dd3a6b 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -9,6 +9,7 @@
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/kernel.h>
+#include <linux/slab.h>
#include <asm/mmu_context.h>
#include <asm/uv/uv.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 33399176512..1168e445418 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -534,6 +534,9 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
get_debugreg(dr6, 6);
+ /* Filter out all the reserved bits which are preset to 1 */
+ dr6 &= ~DR6_RESERVED;
+
/* Catch kmemcheck conditions first of all! */
if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
return;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 597683aa5ba..9faf91ae184 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -50,7 +50,7 @@ u64 native_sched_clock(void)
* unstable. We do this because unlike Time Of Day,
* the scheduler clock tolerates small errors and it's
* very important for it to be as fast as the platform
- * can achive it. )
+ * can achieve it. )
*/
if (unlikely(tsc_disabled)) {
/* No locking but a rare wrong value is not a big deal: */
@@ -740,7 +740,7 @@ static cycle_t __vsyscall_fn vread_tsc(void)
}
#endif
-static void resume_tsc(void)
+static void resume_tsc(struct clocksource *cs)
{
clocksource_tsc.cycle_last = 0;
}
@@ -806,7 +806,7 @@ static void __init check_system_tsc_reliable(void)
unsigned long res_low, res_high;
rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
- /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */
+ /* Geode_LX - the OLPC CPU has a very reliable TSC */
if (res_low & RTSC_SUSP)
tsc_clocksource_reliable = 1;
#endif
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index ece73d8e324..1d40336b030 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/rbtree.h>
+#include <linux/slab.h>
#include <linux/irq.h>
#include <asm/apic.h>
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
index 36afb98675a..309c70fb775 100644
--- a/arch/x86/kernel/uv_sysfs.c
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -54,19 +54,19 @@ static int __init sgi_uv_sysfs_init(void)
if (!sgi_uv_kobj)
sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
if (!sgi_uv_kobj) {
- printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n");
+ printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n");
return -EINVAL;
}
ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
if (ret) {
- printk(KERN_WARNING "sysfs_create_file partition_id failed \n");
+ printk(KERN_WARNING "sysfs_create_file partition_id failed\n");
return ret;
}
ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
if (ret) {
- printk(KERN_WARNING "sysfs_create_file coherence_id failed \n");
+ printk(KERN_WARNING "sysfs_create_file coherence_id failed\n");
return ret;
}
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 3c84aa001c1..56e421bc379 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -19,6 +19,7 @@
* Copyright (c) Dimitri Sivanich
*/
#include <linux/clockchips.h>
+#include <linux/slab.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
@@ -282,10 +283,21 @@ static int uv_rtc_unset_timer(int cpu, int force)
/*
* Read the RTC.
+ *
+ * Starting with HUB rev 2.0, the UV RTC register is replicated across all
+ * cachelines of it's own page. This allows faster simultaneous reads
+ * from a given socket.
*/
static cycle_t uv_read_rtc(struct clocksource *cs)
{
- return (cycle_t)uv_read_local_mmr(UVH_RTC);
+ unsigned long offset;
+
+ if (uv_get_min_hub_revision_id() == 1)
+ offset = 0;
+ else
+ offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
+
+ return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
}
/*
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 34a279a7471..e680ea52db9 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -49,11 +49,6 @@ extern int no_broadcast;
char visws_board_type = -1;
char visws_board_rev = -1;
-int is_visws_box(void)
-{
- return visws_board_type >= 0;
-}
-
static void __init visws_time_init(void)
{
printk(KERN_INFO "Starting Cobalt Timer system clock\n");
@@ -242,6 +237,8 @@ void __init visws_early_detect(void)
x86_init.irqs.pre_vector_init = visws_pre_intr_init;
x86_init.irqs.trap_init = visws_trap_init;
x86_init.timers.timer_init = visws_time_init;
+ x86_init.pci.init = pci_visws_init;
+ x86_init.pci.init_irq = x86_init_noop;
/*
* Install reboot quirks:
@@ -508,7 +505,7 @@ static struct irq_chip cobalt_irq_type = {
*/
static unsigned int startup_piix4_master_irq(unsigned int irq)
{
- init_8259A(0);
+ legacy_pic->init(0);
return startup_cobalt_irq(irq);
}
@@ -532,9 +529,6 @@ static struct irq_chip piix4_master_irq_type = {
static struct irq_chip piix4_virtual_irq_type = {
.name = "PIIX4-virtual",
- .shutdown = disable_8259A_irq,
- .enable = enable_8259A_irq,
- .disable = disable_8259A_irq,
};
@@ -559,7 +553,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
struct irq_desc *desc;
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
/* Find out what's interrupting in the PIIX4 master 8259 */
outb(0x0c, 0x20); /* OCW3 Poll command */
@@ -596,7 +590,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
outb(0x60 + realirq, 0x20);
}
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
desc = irq_to_desc(realirq);
@@ -609,12 +603,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
handle_IRQ_event(realirq, desc->action);
if (!(desc->status & IRQ_DISABLED))
- enable_8259A_irq(realirq);
+ legacy_pic->chip->unmask(realirq);
return IRQ_HANDLED;
out_unlock:
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
return IRQ_NONE;
}
@@ -628,6 +622,12 @@ static struct irqaction cascade_action = {
.name = "cascade",
};
+static inline void set_piix4_virtual_irq_type(void)
+{
+ piix4_virtual_irq_type.shutdown = i8259A_chip.mask;
+ piix4_virtual_irq_type.enable = i8259A_chip.unmask;
+ piix4_virtual_irq_type.disable = i8259A_chip.mask;
+}
void init_VISWS_APIC_irqs(void)
{
@@ -653,6 +653,7 @@ void init_VISWS_APIC_irqs(void)
desc->chip = &piix4_master_irq_type;
}
else if (i < CO_IRQ_APIC0) {
+ set_piix4_virtual_irq_type();
desc->chip = &piix4_virtual_irq_type;
}
else if (IS_CO_APIC(i)) {
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index d430e4c3019..ce9fbacb752 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -28,11 +28,13 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
+#include <linux/gfp.h>
#include <asm/vmi.h>
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/apicdef.h>
#include <asm/apic.h>
+#include <asm/pgalloc.h>
#include <asm/processor.h>
#include <asm/timer.h>
#include <asm/vmi_time.h>
@@ -266,30 +268,6 @@ static void vmi_nop(void)
{
}
-#ifdef CONFIG_HIGHPTE
-static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
-{
- void *va = kmap_atomic(page, type);
-
- /*
- * Internally, the VMI ROM must map virtual addresses to physical
- * addresses for processing MMU updates. By the time MMU updates
- * are issued, this information is typically already lost.
- * Fortunately, the VMI provides a cache of mapping slots for active
- * page tables.
- *
- * We use slot zero for the linear mapping of physical memory, and
- * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
- *
- * args: SLOT VA COUNT PFN
- */
- BUG_ON(type != KM_PTE0 && type != KM_PTE1);
- vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
-
- return va;
-}
-#endif
-
static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
{
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -640,6 +618,12 @@ static inline int __init activate_vmi(void)
u64 reloc;
const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
+ /*
+ * Prevent page tables from being allocated in highmem, even if
+ * CONFIG_HIGHPTE is enabled.
+ */
+ __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+
if (call_vrom_func(vmi_rom, vmi_init) != 0) {
printk(KERN_ERR "VMI ROM failed to initialize!");
return 0;
@@ -778,10 +762,6 @@ static inline int __init activate_vmi(void)
/* Set linear is needed in all cases */
vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
-#ifdef CONFIG_HIGHPTE
- if (vmi_ops.set_linear_mapping)
- pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
-#endif
/*
* These MUST always be patched. Don't support indirect jumps
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 74c92bb194d..5e1ff66ecd7 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -79,11 +79,7 @@ unsigned long vmi_tsc_khz(void)
static inline unsigned int vmi_get_timer_vector(void)
{
-#ifdef CONFIG_X86_IO_APIC
- return FIRST_DEVICE_VECTOR;
-#else
- return FIRST_EXTERNAL_VECTOR;
-#endif
+ return IRQ0_VECTOR;
}
/** vmi clockchip */
@@ -171,7 +167,7 @@ static int vmi_timer_next_event(unsigned long delta,
{
/* Unfortunately, set_next_event interface only passes relative
* expiry, but we want absolute expiry. It'd be better if were
- * were passed an aboslute expiry, since a bunch of time may
+ * were passed an absolute expiry, since a bunch of time may
* have been stolen between the time the delta is computed and
* when we set the alarm below. */
cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index f92a0da608c..2cc249718c4 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -291,8 +291,8 @@ SECTIONS
.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
__smp_locks = .;
*(.smp_locks)
- __smp_locks_end = .;
. = ALIGN(PAGE_SIZE);
+ __smp_locks_end = .;
}
#ifdef CONFIG_X86_64
@@ -341,7 +341,7 @@ SECTIONS
* Per-cpu symbols which need to be offset from __per_cpu_load
* for the boot processor.
*/
-#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
INIT_PER_CPU(gdt_page);
INIT_PER_CPU(irq_stack_union);
@@ -352,7 +352,7 @@ INIT_PER_CPU(irq_stack_union);
"kernel image bigger than KERNEL_IMAGE_SIZE");
#ifdef CONFIG_SMP
-. = ASSERT((per_cpu__irq_stack_union == 0),
+. = ASSERT((irq_stack_union == 0),
"irq_stack_union is not at start of per-cpu area");
#endif
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 9055e5872ff..1c0c6ab9c60 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -301,7 +301,8 @@ static int __init vsyscall_init(void)
register_sysctl_table(kernel_root_table2);
#endif
on_each_cpu(cpu_vsyscall_init, NULL, 1);
- hotcpu_notifier(cpu_vsyscall_notifier, 0);
+ /* notifier priority > KVM */
+ hotcpu_notifier(cpu_vsyscall_notifier, 30);
return 0;
}
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index ccd179dec36..61a1e8c7e19 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -4,9 +4,11 @@
* For licencing details see kernel-base/COPYING
*/
#include <linux/init.h>
+#include <linux/ioport.h>
#include <asm/bios_ebda.h>
#include <asm/paravirt.h>
+#include <asm/pci_x86.h>
#include <asm/mpspec.h>
#include <asm/setup.h>
#include <asm/apic.h>
@@ -70,16 +72,25 @@ struct x86_init_ops x86_init __initdata = {
.iommu = {
.iommu_init = iommu_init_noop,
},
+
+ .pci = {
+ .init = x86_default_pci_init,
+ .init_irq = x86_default_pci_init_irq,
+ .fixup_irqs = x86_default_pci_fixup_irqs,
+ },
};
struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
.setup_percpu_clockev = setup_secondary_APIC_clock,
};
+static void default_nmi_init(void) { };
+
struct x86_platform_ops x86_platform = {
.calibrate_tsc = native_calibrate_tsc,
.get_wallclock = mach_get_cmos_time,
.set_wallclock = mach_set_rtc_mmss,
.iommu_shutdown = iommu_shutdown_noop,
.is_untracked_pat_range = is_ISA_range,
+ .nmi_init = default_nmi_init
};
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index c5ee17e8c6d..782c3a362ec 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -337,6 +337,7 @@ void __ref xsave_cntxt_init(void)
cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
xstate_size = ebx;
+ update_regset_xstate_info(xstate_size, pcntxt_mask);
prepare_fx_sw_frame();
setup_xstate_init();