summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile8
-rw-r--r--arch/x86/kernel/acpi/boot.c156
-rw-r--r--arch/x86/kernel/acpi/realmode/Makefile2
-rw-r--r--arch/x86/kernel/acpi/realmode/bioscall.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/regs.c1
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/amd_iommu.c500
-rw-r--r--arch/x86/kernel/amd_iommu_init.c273
-rw-r--r--arch/x86/kernel/apic/apic.c318
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c4
-rw-r--r--arch/x86/kernel/apic/es7000_32.c10
-rw-r--r--arch/x86/kernel/apic/io_apic.c908
-rw-r--r--arch/x86/kernel/apic/nmi.c4
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/probe_64.c2
-rw-r--r--arch/x86/kernel/apic/summit_32.c7
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c20
-rw-r--r--arch/x86/kernel/apm_32.c14
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile12
-rw-r--r--arch/x86/kernel/cpu/amd.c12
-rw-r--r--arch/x86/kernel/cpu/common.c32
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c431
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c12
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c57
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c6
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c153
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile10
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c42
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c127
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c218
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c1964
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.h26
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c76
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c1187
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c203
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c74
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c66
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c57
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c86
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c48
-rw-r--r--arch/x86/kernel/cpu/mcheck/p6.c26
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c73
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c17
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c30
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h15
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c6
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c1711
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/ds.c921
-rw-r--r--arch/x86/kernel/ds_selftest.c408
-rw-r--r--arch/x86/kernel/ds_selftest.h15
-rw-r--r--arch/x86/kernel/dumpstack.h1
-rw-r--r--arch/x86/kernel/e820.c46
-rw-r--r--arch/x86/kernel/early-quirks.c2
-rw-r--r--arch/x86/kernel/entry_64.S40
-rw-r--r--arch/x86/kernel/ftrace.c2
-rw-r--r--arch/x86/kernel/head_32.S7
-rw-r--r--arch/x86/kernel/irq.c47
-rw-r--r--arch/x86/kernel/irqinit.c (renamed from arch/x86/kernel/irqinit_32.c)155
-rw-r--r--arch/x86/kernel/irqinit_64.c177
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/kvm.c6
-rw-r--r--arch/x86/kernel/microcode_amd.c70
-rw-r--r--arch/x86/kernel/microcode_core.c329
-rw-r--r--arch/x86/kernel/microcode_intel.c90
-rw-r--r--arch/x86/kernel/module.c (renamed from arch/x86/kernel/module_64.c)82
-rw-r--r--arch/x86/kernel/module_32.c152
-rw-r--r--arch/x86/kernel/mpparse.c34
-rw-r--r--arch/x86/kernel/paravirt.c58
-rw-r--r--arch/x86/kernel/pci-calgary_64.c54
-rw-r--r--arch/x86/kernel/pci-gart_64.c55
-rw-r--r--arch/x86/kernel/pci-swiotlb.c2
-rw-r--r--arch/x86/kernel/process.c20
-rw-r--r--arch/x86/kernel/process_32.c20
-rw-r--r--arch/x86/kernel/process_64.c20
-rw-r--r--arch/x86/kernel/ptrace.c284
-rw-r--r--arch/x86/kernel/quirks.c37
-rw-r--r--arch/x86/kernel/reboot.c17
-rw-r--r--arch/x86/kernel/setup.c55
-rw-r--r--arch/x86/kernel/setup_percpu.c12
-rw-r--r--arch/x86/kernel/signal.c7
-rw-r--r--arch/x86/kernel/smp.c51
-rw-r--r--arch/x86/kernel/smpboot.c24
-rw-r--r--arch/x86/kernel/stacktrace.c2
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--arch/x86/kernel/tlb_uv.c17
-rw-r--r--arch/x86/kernel/traps.c26
-rw-r--r--arch/x86/kernel/tsc.c19
-rw-r--r--arch/x86/kernel/tsc_sync.c14
-rw-r--r--arch/x86/kernel/vm86_32.c13
-rw-r--r--arch/x86/kernel/vmi_32.c20
-rw-r--r--arch/x86/kernel/vmlinux.lds.S432
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S229
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S298
-rw-r--r--arch/x86/kernel/vsyscall_64.c8
106 files changed, 8552 insertions, 4886 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 145cce75cda..f3477bb8456 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -28,7 +28,7 @@ CFLAGS_paravirt.o := $(nostackp)
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
-obj-y += setup.o i8259.o irqinit_$(BITS).o
+obj-y += setup.o i8259.o irqinit.o
obj-$(CONFIG_X86_VISWS) += visws_quirks.o
obj-$(CONFIG_X86_32) += probe_roms_32.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -44,6 +44,7 @@ obj-y += process.o
obj-y += i387.o xsave.o
obj-y += ptrace.o
obj-$(CONFIG_X86_DS) += ds.o
+obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
obj-$(CONFIG_X86_32) += tls.o
obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
@@ -72,7 +73,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_MODULES) += module_$(BITS).o
+obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
obj-$(CONFIG_KGDB) += kgdb.o
@@ -89,7 +90,8 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
-obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 723989d7f80..631086159c5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -33,6 +33,7 @@
#include <linux/irq.h>
#include <linux/bootmem.h>
#include <linux/ioport.h>
+#include <linux/pci.h>
#include <asm/pgtable.h>
#include <asm/io_apic.h>
@@ -522,7 +523,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
* success: return IRQ number (>=0)
* failure: return < 0
*/
-int acpi_register_gsi(u32 gsi, int triggering, int polarity)
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
{
unsigned int irq;
unsigned int plat_gsi = gsi;
@@ -532,14 +533,14 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
* Make sure all (legacy) PCI IRQs are set as level-triggered.
*/
if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
- if (triggering == ACPI_LEVEL_SENSITIVE)
+ if (trigger == ACPI_LEVEL_SENSITIVE)
eisa_set_level_irq(gsi);
}
#endif
#ifdef CONFIG_X86_IO_APIC
if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
- plat_gsi = mp_register_gsi(gsi, triggering, polarity);
+ plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
}
#endif
acpi_gsi_to_irq(plat_gsi, &irq);
@@ -903,10 +904,8 @@ extern int es7000_plat;
#endif
static struct {
- int apic_id;
int gsi_base;
int gsi_end;
- DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
} mp_ioapic_routing[MAX_IO_APICS];
int mp_find_ioapic(int gsi)
@@ -986,16 +985,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
mp_ioapics[idx].apicid = uniq_ioapic_id(id);
-#ifdef CONFIG_X86_32
mp_ioapics[idx].apicver = io_apic_get_version(idx);
-#else
- mp_ioapics[idx].apicver = 0;
-#endif
+
/*
* Build basic GSI lookup table to facilitate gsi->io_apic lookups
* and to prevent reprogramming of IOAPIC pins (PCI GSIs).
*/
- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
mp_ioapic_routing[idx].gsi_base = gsi_base;
mp_ioapic_routing[idx].gsi_end = gsi_base +
io_apic_get_redir_entries(idx);
@@ -1158,26 +1153,52 @@ void __init mp_config_acpi_legacy_irqs(void)
}
}
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
+ int polarity)
{
+#ifdef CONFIG_X86_MPPARSE
+ struct mpc_intsrc mp_irq;
+ struct pci_dev *pdev;
+ unsigned char number;
+ unsigned int devfn;
int ioapic;
- int ioapic_pin;
-#ifdef CONFIG_X86_32
-#define MAX_GSI_NUM 4096
-#define IRQ_COMPRESSION_START 64
+ u8 pin;
- static int pci_irq = IRQ_COMPRESSION_START;
- /*
- * Mapping between Global System Interrupts, which
- * represent all possible interrupts, and IRQs
- * assigned to actual devices.
- */
- static int gsi_to_irq[MAX_GSI_NUM];
-#else
+ if (!acpi_ioapic)
+ return 0;
+ if (!dev)
+ return 0;
+ if (dev->bus != &pci_bus_type)
+ return 0;
+
+ pdev = to_pci_dev(dev);
+ number = pdev->bus->number;
+ devfn = pdev->devfn;
+ pin = pdev->pin;
+ /* print the entry should happen on mptable identically */
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+ (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+ mp_irq.srcbus = number;
+ mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+ ioapic = mp_find_ioapic(gsi);
+ mp_irq.dstapic = mp_ioapics[ioapic].apicid;
+ mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+ save_mp_irq(&mp_irq);
+#endif
+ return 0;
+}
+
+int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+{
+ int ioapic;
+ int ioapic_pin;
+ struct io_apic_irq_attr irq_attr;
if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
return gsi;
-#endif
/* Don't set up the ACPI SCI because it's already set up */
if (acpi_gbl_FADT.sci_interrupt == gsi)
@@ -1196,93 +1217,22 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
gsi = ioapic_renumber_irq(ioapic, gsi);
#endif
- /*
- * Avoid pin reprogramming. PRTs typically include entries
- * with redundant pin->gsi mappings (but unique PCI devices);
- * we only program the IOAPIC on the first.
- */
if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+ "%d-%d\n", mp_ioapics[ioapic].apicid,
ioapic_pin);
return gsi;
}
- if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
- pr_debug("Pin %d-%d already programmed\n",
- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-#ifdef CONFIG_X86_32
- return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-#else
- return gsi;
-#endif
- }
-
- set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-#ifdef CONFIG_X86_32
- /*
- * For GSI >= 64, use IRQ compression
- */
- if ((gsi >= IRQ_COMPRESSION_START)
- && (triggering == ACPI_LEVEL_SENSITIVE)) {
- /*
- * For PCI devices assign IRQs in order, avoiding gaps
- * due to unused I/O APIC pins.
- */
- int irq = gsi;
- if (gsi < MAX_GSI_NUM) {
- /*
- * Retain the VIA chipset work-around (gsi > 15), but
- * avoid a problem where the 8254 timer (IRQ0) is setup
- * via an override (so it's not on pin 0 of the ioapic),
- * and at the same time, the pin 0 interrupt is a PCI
- * type. The gsi > 15 test could cause these two pins
- * to be shared as IRQ0, and they are not shareable.
- * So test for this condition, and if necessary, avoid
- * the pin collision.
- */
- gsi = pci_irq++;
- /*
- * Don't assign IRQ used by ACPI SCI
- */
- if (gsi == acpi_gbl_FADT.sci_interrupt)
- gsi = pci_irq++;
- gsi_to_irq[irq] = gsi;
- } else {
- printk(KERN_ERR "GSI %u is too high\n", gsi);
- return gsi;
- }
- }
-#endif
- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- return gsi;
-}
-int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
- u32 gsi, int triggering, int polarity)
-{
-#ifdef CONFIG_X86_MPPARSE
- struct mpc_intsrc mp_irq;
- int ioapic;
+ if (enable_update_mptable)
+ mp_config_acpi_gsi(dev, gsi, trigger, polarity);
- if (!acpi_ioapic)
- return 0;
+ set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
+ trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+ io_apic_set_pci_routing(dev, gsi, &irq_attr);
- /* print the entry should happen on mptable identically */
- mp_irq.type = MP_INTSRC;
- mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
- (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
- mp_irq.srcbus = number;
- mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
- ioapic = mp_find_ioapic(gsi);
- mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
- mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
-
- save_mp_irq(&mp_irq);
-#endif
- return 0;
+ return gsi;
}
/*
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
index 1c31cc0e9de..167bc16ce0e 100644
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -9,7 +9,7 @@
always := wakeup.bin
targets := wakeup.elf wakeup.lds
-wakeup-y += wakeup.o wakemain.o video-mode.o copy.o
+wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
# The link order of the video-*.o modules can matter. In particular,
# video-vga.o *must* be listed first, followed by video-vesa.o.
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
new file mode 100644
index 00000000000..f51eb0bb56c
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/bioscall.S
@@ -0,0 +1 @@
+#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
new file mode 100644
index 00000000000..6206033ba20
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/regs.c
@@ -0,0 +1 @@
+#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 7c243a2c511..ca93638ba43 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -104,7 +104,7 @@ int acpi_save_state_mem(void)
initial_gs = per_cpu_offset(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;
- saved_magic = 0x123456789abcdef0;
+ saved_magic = 0x123456789abcdef0L;
#endif /* CONFIG_64BIT */
return 0;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99dad5..1c60554537c 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -55,7 +55,16 @@ struct iommu_cmd {
static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
struct unity_map_entry *e);
static struct dma_ops_domain *find_protection_domain(u16 devid);
+static u64* alloc_pte(struct protection_domain *dom,
+ unsigned long address, u64
+ **pte_page, gfp_t gfp);
+static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
+ unsigned long start_page,
+ unsigned int pages);
+#ifndef BUS_NOTIFY_UNBOUND_DRIVER
+#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
+#endif
#ifdef CONFIG_AMD_IOMMU_STATS
@@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
{
struct amd_iommu *iommu;
- list_for_each_entry(iommu, &amd_iommu_list, list)
+ for_each_iommu(iommu)
iommu_poll_events(iommu);
return IRQ_HANDLED;
@@ -440,7 +449,7 @@ static void iommu_flush_domain(u16 domid)
__iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
domid, 1, 1);
- list_for_each_entry(iommu, &amd_iommu_list, list) {
+ for_each_iommu(iommu) {
spin_lock_irqsave(&iommu->lock, flags);
__iommu_queue_command(iommu, &cmd);
__iommu_completion_wait(iommu);
@@ -449,6 +458,35 @@ static void iommu_flush_domain(u16 domid)
}
}
+void amd_iommu_flush_all_domains(void)
+{
+ int i;
+
+ for (i = 1; i < MAX_DOMAIN_ID; ++i) {
+ if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
+ continue;
+ iommu_flush_domain(i);
+ }
+}
+
+void amd_iommu_flush_all_devices(void)
+{
+ struct amd_iommu *iommu;
+ int i;
+
+ for (i = 0; i <= amd_iommu_last_bdf; ++i) {
+ if (amd_iommu_pd_table[i] == NULL)
+ continue;
+
+ iommu = amd_iommu_rlookup_table[i];
+ if (!iommu)
+ continue;
+
+ iommu_queue_inv_dev_entry(iommu, i);
+ iommu_completion_wait(iommu);
+ }
+}
+
/****************************************************************************
*
* The functions below are used the create the page table mappings for
@@ -468,7 +506,7 @@ static int iommu_map_page(struct protection_domain *dom,
unsigned long phys_addr,
int prot)
{
- u64 __pte, *pte, *page;
+ u64 __pte, *pte;
bus_addr = PAGE_ALIGN(bus_addr);
phys_addr = PAGE_ALIGN(phys_addr);
@@ -477,27 +515,7 @@ static int iommu_map_page(struct protection_domain *dom,
if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
return -EINVAL;
- pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
-
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
- *pte = IOMMU_L2_PDE(virt_to_phys(page));
- }
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
-
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
- *pte = IOMMU_L1_PDE(virt_to_phys(page));
- }
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
+ pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
if (IOMMU_PTE_PRESENT(*pte))
return -EBUSY;
@@ -595,7 +613,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
* as allocated in the aperture
*/
if (addr < dma_dom->aperture_size)
- __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap);
+ __set_bit(addr >> PAGE_SHIFT,
+ dma_dom->aperture[0]->bitmap);
}
return 0;
@@ -632,42 +651,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
****************************************************************************/
/*
- * The address allocator core function.
+ * The address allocator core functions.
*
* called with domain->lock held
*/
+
+/*
+ * This function checks if there is a PTE for a given dma address. If
+ * there is one, it returns the pointer to it.
+ */
+static u64* fetch_pte(struct protection_domain *domain,
+ unsigned long address)
+{
+ u64 *pte;
+
+ pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
+
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return NULL;
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return NULL;
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+
+ return pte;
+}
+
+/*
+ * This function is used to add a new aperture range to an existing
+ * aperture in case of dma_ops domain allocation or address allocation
+ * failure.
+ */
+static int alloc_new_range(struct amd_iommu *iommu,
+ struct dma_ops_domain *dma_dom,
+ bool populate, gfp_t gfp)
+{
+ int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
+ int i;
+
+#ifdef CONFIG_IOMMU_STRESS
+ populate = false;
+#endif
+
+ if (index >= APERTURE_MAX_RANGES)
+ return -ENOMEM;
+
+ dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
+ if (!dma_dom->aperture[index])
+ return -ENOMEM;
+
+ dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
+ if (!dma_dom->aperture[index]->bitmap)
+ goto out_free;
+
+ dma_dom->aperture[index]->offset = dma_dom->aperture_size;
+
+ if (populate) {
+ unsigned long address = dma_dom->aperture_size;
+ int i, num_ptes = APERTURE_RANGE_PAGES / 512;
+ u64 *pte, *pte_page;
+
+ for (i = 0; i < num_ptes; ++i) {
+ pte = alloc_pte(&dma_dom->domain, address,
+ &pte_page, gfp);
+ if (!pte)
+ goto out_free;
+
+ dma_dom->aperture[index]->pte_pages[i] = pte_page;
+
+ address += APERTURE_RANGE_SIZE / 64;
+ }
+ }
+
+ dma_dom->aperture_size += APERTURE_RANGE_SIZE;
+
+ /* Intialize the exclusion range if necessary */
+ if (iommu->exclusion_start &&
+ iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
+ iommu->exclusion_start < dma_dom->aperture_size) {
+ unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
+ int pages = iommu_num_pages(iommu->exclusion_start,
+ iommu->exclusion_length,
+ PAGE_SIZE);
+ dma_ops_reserve_addresses(dma_dom, startpage, pages);
+ }
+
+ /*
+ * Check for areas already mapped as present in the new aperture
+ * range and mark those pages as reserved in the allocator. Such
+ * mappings may already exist as a result of requested unity
+ * mappings for devices.
+ */
+ for (i = dma_dom->aperture[index]->offset;
+ i < dma_dom->aperture_size;
+ i += PAGE_SIZE) {
+ u64 *pte = fetch_pte(&dma_dom->domain, i);
+ if (!pte || !IOMMU_PTE_PRESENT(*pte))
+ continue;
+
+ dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
+ }
+
+ return 0;
+
+out_free:
+ free_page((unsigned long)dma_dom->aperture[index]->bitmap);
+
+ kfree(dma_dom->aperture[index]);
+ dma_dom->aperture[index] = NULL;
+
+ return -ENOMEM;
+}
+
+static unsigned long dma_ops_area_alloc(struct device *dev,
+ struct dma_ops_domain *dom,
+ unsigned int pages,
+ unsigned long align_mask,
+ u64 dma_mask,
+ unsigned long start)
+{
+ unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
+ int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
+ int i = start >> APERTURE_RANGE_SHIFT;
+ unsigned long boundary_size;
+ unsigned long address = -1;
+ unsigned long limit;
+
+ next_bit >>= PAGE_SHIFT;
+
+ boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+ PAGE_SIZE) >> PAGE_SHIFT;
+
+ for (;i < max_index; ++i) {
+ unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
+
+ if (dom->aperture[i]->offset >= dma_mask)
+ break;
+
+ limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
+ dma_mask >> PAGE_SHIFT);
+
+ address = iommu_area_alloc(dom->aperture[i]->bitmap,
+ limit, next_bit, pages, 0,
+ boundary_size, align_mask);
+ if (address != -1) {
+ address = dom->aperture[i]->offset +
+ (address << PAGE_SHIFT);
+ dom->next_address = address + (pages << PAGE_SHIFT);
+ break;
+ }
+
+ next_bit = 0;
+ }
+
+ return address;
+}
+
static unsigned long dma_ops_alloc_addresses(struct device *dev,
struct dma_ops_domain *dom,
unsigned int pages,
unsigned long align_mask,
u64 dma_mask)
{
- unsigned long limit;
unsigned long address;
- unsigned long boundary_size;
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- PAGE_SIZE) >> PAGE_SHIFT;
- limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
- dma_mask >> PAGE_SHIFT);
+#ifdef CONFIG_IOMMU_STRESS
+ dom->next_address = 0;
+ dom->need_flush = true;
+#endif
- if (dom->next_bit >= limit) {
- dom->next_bit = 0;
- dom->need_flush = true;
- }
+ address = dma_ops_area_alloc(dev, dom, pages, align_mask,
+ dma_mask, dom->next_address);
- address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
- 0 , boundary_size, align_mask);
if (address == -1) {
- address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
- 0, boundary_size, align_mask);
+ dom->next_address = 0;
+ address = dma_ops_area_alloc(dev, dom, pages, align_mask,
+ dma_mask, 0);
dom->need_flush = true;
}
- if (likely(address != -1)) {
- dom->next_bit = address + pages;
- address <<= PAGE_SHIFT;
- } else
+ if (unlikely(address == -1))
address = bad_dma_address;
WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
@@ -684,11 +852,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
unsigned long address,
unsigned int pages)
{
- address >>= PAGE_SHIFT;
- iommu_area_free(dom->bitmap, address, pages);
+ unsigned i = address >> APERTURE_RANGE_SHIFT;
+ struct aperture_range *range = dom->aperture[i];
- if (address >= dom->next_bit)
+ BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
+
+#ifdef CONFIG_IOMMU_STRESS
+ if (i < 4)
+ return;
+#endif
+
+ if (address >= dom->next_address)
dom->need_flush = true;
+
+ address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
+
+ iommu_area_free(range->bitmap, address, pages);
+
}
/****************************************************************************
@@ -736,12 +916,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
unsigned long start_page,
unsigned int pages)
{
- unsigned int last_page = dom->aperture_size >> PAGE_SHIFT;
+ unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
if (start_page + pages > last_page)
pages = last_page - start_page;
- iommu_area_reserve(dom->bitmap, start_page, pages);
+ for (i = start_page; i < start_page + pages; ++i) {
+ int index = i / APERTURE_RANGE_PAGES;
+ int page = i % APERTURE_RANGE_PAGES;
+ __set_bit(page, dom->aperture[index]->bitmap);
+ }
}
static void free_pagetable(struct protection_domain *domain)
@@ -780,14 +964,19 @@ static void free_pagetable(struct protection_domain *domain)
*/
static void dma_ops_domain_free(struct dma_ops_domain *dom)
{
+ int i;
+
if (!dom)
return;
free_pagetable(&dom->domain);
- kfree(dom->pte_pages);
-
- kfree(dom->bitmap);
+ for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
+ if (!dom->aperture[i])
+ continue;
+ free_page((unsigned long)dom->aperture[i]->bitmap);
+ kfree(dom->aperture[i]);
+ }
kfree(dom);
}
@@ -797,19 +986,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
* It also intializes the page table and the address allocator data
* structures required for the dma_ops interface
*/
-static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
- unsigned order)
+static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
{
struct dma_ops_domain *dma_dom;
- unsigned i, num_pte_pages;
- u64 *l2_pde;
- u64 address;
-
- /*
- * Currently the DMA aperture must be between 32 MB and 1GB in size
- */
- if ((order < 25) || (order > 30))
- return NULL;
dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
if (!dma_dom)
@@ -826,55 +1005,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
dma_dom->domain.priv = dma_dom;
if (!dma_dom->domain.pt_root)
goto free_dma_dom;
- dma_dom->aperture_size = (1ULL << order);
- dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
- GFP_KERNEL);
- if (!dma_dom->bitmap)
- goto free_dma_dom;
- /*
- * mark the first page as allocated so we never return 0 as
- * a valid dma-address. So we can use 0 as error value
- */
- dma_dom->bitmap[0] = 1;
- dma_dom->next_bit = 0;
dma_dom->need_flush = false;
dma_dom->target_dev = 0xffff;
- /* Intialize the exclusion range if necessary */
- if (iommu->exclusion_start &&
- iommu->exclusion_start < dma_dom->aperture_size) {
- unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
- int pages = iommu_num_pages(iommu->exclusion_start,
- iommu->exclusion_length,
- PAGE_SIZE);
- dma_ops_reserve_addresses(dma_dom, startpage, pages);
- }
+ if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
+ goto free_dma_dom;
/*
- * At the last step, build the page tables so we don't need to
- * allocate page table pages in the dma_ops mapping/unmapping
- * path.
+ * mark the first page as allocated so we never return 0 as
+ * a valid dma-address. So we can use 0 as error value
*/
- num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
- dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
- GFP_KERNEL);
- if (!dma_dom->pte_pages)
- goto free_dma_dom;
-
- l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
- if (l2_pde == NULL)
- goto free_dma_dom;
+ dma_dom->aperture[0]->bitmap[0] = 1;
+ dma_dom->next_address = 0;
- dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
-
- for (i = 0; i < num_pte_pages; ++i) {
- dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
- if (!dma_dom->pte_pages[i])
- goto free_dma_dom;
- address = virt_to_phys(dma_dom->pte_pages[i]);
- l2_pde[i] = IOMMU_L1_PDE(address);
- }
return dma_dom;
@@ -983,7 +1127,6 @@ static int device_change_notifier(struct notifier_block *nb,
struct protection_domain *domain;
struct dma_ops_domain *dma_domain;
struct amd_iommu *iommu;
- int order = amd_iommu_aperture_order;
unsigned long flags;
if (devid > amd_iommu_last_bdf)
@@ -1002,17 +1145,7 @@ static int device_change_notifier(struct notifier_block *nb,
"to a non-dma-ops domain\n", dev_name(dev));
switch (action) {
- case BUS_NOTIFY_BOUND_DRIVER:
- if (domain)
- goto out;
- dma_domain = find_protection_domain(devid);
- if (!dma_domain)
- dma_domain = iommu->default_dom;
- attach_device(iommu, &dma_domain->domain, devid);
- printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
- "device %s\n", dma_domain->domain.id, dev_name(dev));
- break;
- case BUS_NOTIFY_UNBIND_DRIVER:
+ case BUS_NOTIFY_UNBOUND_DRIVER:
if (!domain)
goto out;
detach_device(domain, devid);
@@ -1022,7 +1155,7 @@ static int device_change_notifier(struct notifier_block *nb,
dma_domain = find_protection_domain(devid);
if (dma_domain)
goto out;
- dma_domain = dma_ops_domain_alloc(iommu, order);
+ dma_domain = dma_ops_domain_alloc(iommu);
if (!dma_domain)
goto out;
dma_domain->target_dev = devid;
@@ -1133,8 +1266,8 @@ static int get_device_resources(struct device *dev,
dma_dom = (*iommu)->default_dom;
*domain = &dma_dom->domain;
attach_device(*iommu, *domain, *bdf);
- printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
- "device %s\n", (*domain)->id, dev_name(dev));
+ DUMP_printk("Using protection domain %d for device %s\n",
+ (*domain)->id, dev_name(dev));
}
if (domain_for_device(_bdf) == NULL)
@@ -1144,6 +1277,66 @@ static int get_device_resources(struct device *dev,
}
/*
+ * If the pte_page is not yet allocated this function is called
+ */
+static u64* alloc_pte(struct protection_domain *dom,
+ unsigned long address, u64 **pte_page, gfp_t gfp)
+{
+ u64 *pte, *page;
+
+ pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
+
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(gfp);
+ if (!page)
+ return NULL;
+ *pte = IOMMU_L2_PDE(virt_to_phys(page));
+ }
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(gfp);
+ if (!page)
+ return NULL;
+ *pte = IOMMU_L1_PDE(virt_to_phys(page));
+ }
+
+ pte = IOMMU_PTE_PAGE(*pte);
+
+ if (pte_page)
+ *pte_page = pte;
+
+ pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+
+ return pte;
+}
+
+/*
+ * This function fetches the PTE for a given address in the aperture
+ */
+static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
+ unsigned long address)
+{
+ struct aperture_range *aperture;
+ u64 *pte, *pte_page;
+
+ aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
+ if (!aperture)
+ return NULL;
+
+ pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
+ if (!pte) {
+ pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
+ aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
+ } else
+ pte += IOMMU_PTE_L0_INDEX(address);
+
+ return pte;
+}
+
+/*
* This is the generic map function. It maps one 4kb page at paddr to
* the given address in the DMA address space for the domain.
*/
@@ -1159,8 +1352,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
paddr &= PAGE_MASK;
- pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
- pte += IOMMU_PTE_L0_INDEX(address);
+ pte = dma_ops_get_pte(dom, address);
+ if (!pte)
+ return bad_dma_address;
__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
@@ -1185,14 +1379,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
struct dma_ops_domain *dom,
unsigned long address)
{
+ struct aperture_range *aperture;
u64 *pte;
if (address >= dom->aperture_size)
return;
- WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size);
+ aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
+ if (!aperture)
+ return;
+
+ pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
+ if (!pte)
+ return;
- pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
pte += IOMMU_PTE_L0_INDEX(address);
WARN_ON(!*pte);
@@ -1216,7 +1416,7 @@ static dma_addr_t __map_single(struct device *dev,
u64 dma_mask)
{
dma_addr_t offset = paddr & ~PAGE_MASK;
- dma_addr_t address, start;
+ dma_addr_t address, start, ret;
unsigned int pages;
unsigned long align_mask = 0;
int i;
@@ -1232,14 +1432,33 @@ static dma_addr_t __map_single(struct device *dev,
if (align)
align_mask = (1UL << get_order(size)) - 1;
+retry:
address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
dma_mask);
- if (unlikely(address == bad_dma_address))
- goto out;
+ if (unlikely(address == bad_dma_address)) {
+ /*
+ * setting next_address here will let the address
+ * allocator only scan the new allocated range in the
+ * first run. This is a small optimization.
+ */
+ dma_dom->next_address = dma_dom->aperture_size;
+
+ if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
+ goto out;
+
+ /*
+ * aperture was sucessfully enlarged by 128 MB, try
+ * allocation again
+ */
+ goto retry;
+ }
start = address;
for (i = 0; i < pages; ++i) {
- dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+ ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+ if (ret == bad_dma_address)
+ goto out_unmap;
+
paddr += PAGE_SIZE;
start += PAGE_SIZE;
}
@@ -1255,6 +1474,17 @@ static dma_addr_t __map_single(struct device *dev,
out:
return address;
+
+out_unmap:
+
+ for (--i; i >= 0; --i) {
+ start -= PAGE_SIZE;
+ dma_ops_domain_unmap(iommu, dma_dom, start);
+ }
+
+ dma_ops_free_addresses(dma_dom, address, pages);
+
+ return bad_dma_address;
}
/*
@@ -1537,8 +1767,10 @@ static void *alloc_coherent(struct device *dev, size_t size,
*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
size, DMA_BIDIRECTIONAL, true, dma_mask);
- if (*dma_addr == bad_dma_address)
+ if (*dma_addr == bad_dma_address) {
+ spin_unlock_irqrestore(&domain->lock, flags);
goto out_free;
+ }
iommu_completion_wait(iommu);
@@ -1625,7 +1857,6 @@ static void prealloc_protection_domains(void)
struct pci_dev *dev = NULL;
struct dma_ops_domain *dma_dom;
struct amd_iommu *iommu;
- int order = amd_iommu_aperture_order;
u16 devid;
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
@@ -1638,7 +1869,7 @@ static void prealloc_protection_domains(void)
iommu = amd_iommu_rlookup_table[devid];
if (!iommu)
continue;
- dma_dom = dma_ops_domain_alloc(iommu, order);
+ dma_dom = dma_ops_domain_alloc(iommu);
if (!dma_dom)
continue;
init_unity_mappings_for_device(dma_dom, devid);
@@ -1664,7 +1895,6 @@ static struct dma_map_ops amd_iommu_dma_ops = {
int __init amd_iommu_init_dma_ops(void)
{
struct amd_iommu *iommu;
- int order = amd_iommu_aperture_order;
int ret;
/*
@@ -1672,8 +1902,8 @@ int __init amd_iommu_init_dma_ops(void)
* found in the system. Devices not assigned to any other
* protection domain will be assigned to the default one.
*/
- list_for_each_entry(iommu, &amd_iommu_list, list) {
- iommu->default_dom = dma_ops_domain_alloc(iommu, order);
+ for_each_iommu(iommu) {
+ iommu->default_dom = dma_ops_domain_alloc(iommu);
if (iommu->default_dom == NULL)
return -ENOMEM;
iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -1710,7 +1940,7 @@ int __init amd_iommu_init_dma_ops(void)
free_domains:
- list_for_each_entry(iommu, &amd_iommu_list, list) {
+ for_each_iommu(iommu) {
if (iommu->default_dom)
dma_ops_domain_free(iommu->default_dom);
}
@@ -1842,7 +2072,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
old_domain = domain_for_device(devid);
if (old_domain)
- return -EBUSY;
+ detach_device(old_domain, devid);
attach_device(iommu, domain, devid);
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be0902da..238989ec077 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -115,15 +115,21 @@ struct ivmd_header {
u64 range_length;
} __attribute__((packed));
+bool amd_iommu_dump;
+
static int __initdata amd_iommu_detected;
u16 amd_iommu_last_bdf; /* largest PCI device id we have
to handle */
LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
we find in ACPI */
-unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
+#ifdef CONFIG_IOMMU_STRESS
+bool amd_iommu_isolate = false;
+#else
bool amd_iommu_isolate = true; /* if true, device isolation is
enabled */
+#endif
+
bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid)
static inline unsigned long tbl_size(int entry_size)
{
unsigned shift = PAGE_SHIFT +
- get_order(amd_iommu_last_bdf * entry_size);
+ get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
return 1UL << shift;
}
@@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size)
* This function set the exclusion range in the IOMMU. DMA accesses to the
* exclusion range are passed through untranslated
*/
-static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
+static void iommu_set_exclusion_range(struct amd_iommu *iommu)
{
u64 start = iommu->exclusion_start & PAGE_MASK;
u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
@@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
}
/* Generic functions to enable/disable certain features of the IOMMU. */
-static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
{
u32 ctrl;
@@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
}
/* Function to enable the hardware */
-static void __init iommu_enable(struct amd_iommu *iommu)
+static void iommu_enable(struct amd_iommu *iommu)
{
printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
dev_name(&iommu->dev->dev), iommu->cap_ptr);
@@ -252,11 +258,9 @@ static void __init iommu_enable(struct amd_iommu *iommu)
iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
}
-/* Function to enable IOMMU event logging and event interrupts */
-static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
+static void iommu_disable(struct amd_iommu *iommu)
{
- iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
- iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+ iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
}
/*
@@ -413,25 +417,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
{
u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(CMD_BUFFER_SIZE));
- u64 entry;
if (cmd_buf == NULL)
return NULL;
iommu->cmd_buf_size = CMD_BUFFER_SIZE;
- entry = (u64)virt_to_phys(cmd_buf);
+ return cmd_buf;
+}
+
+/*
+ * This function writes the command buffer address to the hardware and
+ * enables it.
+ */
+static void iommu_enable_command_buffer(struct amd_iommu *iommu)
+{
+ u64 entry;
+
+ BUG_ON(iommu->cmd_buf == NULL);
+
+ entry = (u64)virt_to_phys(iommu->cmd_buf);
entry |= MMIO_CMD_SIZE_512;
+
memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
- &entry, sizeof(entry));
+ &entry, sizeof(entry));
/* set head and tail to zero manually */
writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
-
- return cmd_buf;
}
static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -443,20 +458,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu)
/* allocates the memory where the IOMMU will log its events to */
static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
{
- u64 entry;
iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(EVT_BUFFER_SIZE));
if (iommu->evt_buf == NULL)
return NULL;
+ return iommu->evt_buf;
+}
+
+static void iommu_enable_event_buffer(struct amd_iommu *iommu)
+{
+ u64 entry;
+
+ BUG_ON(iommu->evt_buf == NULL);
+
entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+
memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
&entry, sizeof(entry));
- iommu->evt_buf_size = EVT_BUFFER_SIZE;
-
- return iommu->evt_buf;
+ iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
}
static void __init free_event_buffer(struct amd_iommu *iommu)
@@ -596,32 +618,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
p += sizeof(struct ivhd_header);
end += h->length;
+
while (p < end) {
e = (struct ivhd_entry *)p;
switch (e->type) {
case IVHD_DEV_ALL:
+
+ DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
+ " last device %02x:%02x.%x flags: %02x\n",
+ PCI_BUS(iommu->first_device),
+ PCI_SLOT(iommu->first_device),
+ PCI_FUNC(iommu->first_device),
+ PCI_BUS(iommu->last_device),
+ PCI_SLOT(iommu->last_device),
+ PCI_FUNC(iommu->last_device),
+ e->flags);
+
for (dev_i = iommu->first_device;
dev_i <= iommu->last_device; ++dev_i)
set_dev_entry_from_acpi(iommu, dev_i,
e->flags, 0);
break;
case IVHD_DEV_SELECT:
+
+ DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
+ "flags: %02x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags);
+
devid = e->devid;
set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
break;
case IVHD_DEV_SELECT_RANGE_START:
+
+ DUMP_printk(" DEV_SELECT_RANGE_START\t "
+ "devid: %02x:%02x.%x flags: %02x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags);
+
devid_start = e->devid;
flags = e->flags;
ext_flags = 0;
alias = false;
break;
case IVHD_DEV_ALIAS:
+
+ DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
+ "flags: %02x devid_to: %02x:%02x.%x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags,
+ PCI_BUS(e->ext >> 8),
+ PCI_SLOT(e->ext >> 8),
+ PCI_FUNC(e->ext >> 8));
+
devid = e->devid;
devid_to = e->ext >> 8;
- set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+ set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
amd_iommu_alias_table[devid] = devid_to;
break;
case IVHD_DEV_ALIAS_RANGE:
+
+ DUMP_printk(" DEV_ALIAS_RANGE\t\t "
+ "devid: %02x:%02x.%x flags: %02x "
+ "devid_to: %02x:%02x.%x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags,
+ PCI_BUS(e->ext >> 8),
+ PCI_SLOT(e->ext >> 8),
+ PCI_FUNC(e->ext >> 8));
+
devid_start = e->devid;
flags = e->flags;
devid_to = e->ext >> 8;
@@ -629,17 +702,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
alias = true;
break;
case IVHD_DEV_EXT_SELECT:
+
+ DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
+ "flags: %02x ext: %08x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags, e->ext);
+
devid = e->devid;
set_dev_entry_from_acpi(iommu, devid, e->flags,
e->ext);
break;
case IVHD_DEV_EXT_SELECT_RANGE:
+
+ DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
+ "%02x:%02x.%x flags: %02x ext: %08x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags, e->ext);
+
devid_start = e->devid;
flags = e->flags;
ext_flags = e->ext;
alias = false;
break;
case IVHD_DEV_RANGE_END:
+
+ DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid));
+
devid = e->devid;
for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
if (alias)
@@ -679,7 +774,7 @@ static void __init free_iommu_all(void)
{
struct amd_iommu *iommu, *next;
- list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) {
+ for_each_iommu_safe(iommu, next) {
list_del(&iommu->list);
free_iommu_one(iommu);
kfree(iommu);
@@ -710,7 +805,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
if (!iommu->mmio_base)
return -ENOMEM;
- iommu_set_device_table(iommu);
iommu->cmd_buf = alloc_command_buffer(iommu);
if (!iommu->cmd_buf)
return -ENOMEM;
@@ -746,6 +840,15 @@ static int __init init_iommu_all(struct acpi_table_header *table)
h = (struct ivhd_header *)p;
switch (*p) {
case ACPI_IVHD_TYPE:
+
+ DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
+ "seg: %d flags: %01x info %04x\n",
+ PCI_BUS(h->devid), PCI_SLOT(h->devid),
+ PCI_FUNC(h->devid), h->cap_ptr,
+ h->pci_seg, h->flags, h->info);
+ DUMP_printk(" mmio-addr: %016llx\n",
+ h->mmio_phys);
+
iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
if (iommu == NULL)
return -ENOMEM;
@@ -773,56 +876,9 @@ static int __init init_iommu_all(struct acpi_table_header *table)
*
****************************************************************************/
-static int __init iommu_setup_msix(struct amd_iommu *iommu)
-{
- struct amd_iommu *curr;
- struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
- int nvec = 0, i;
-
- list_for_each_entry(curr, &amd_iommu_list, list) {
- if (curr->dev == iommu->dev) {
- entries[nvec].entry = curr->evt_msi_num;
- entries[nvec].vector = 0;
- curr->int_enabled = true;
- nvec++;
- }
- }
-
- if (pci_enable_msix(iommu->dev, entries, nvec)) {
- pci_disable_msix(iommu->dev);
- return 1;
- }
-
- for (i = 0; i < nvec; ++i) {
- int r = request_irq(entries->vector, amd_iommu_int_handler,
- IRQF_SAMPLE_RANDOM,
- "AMD IOMMU",
- NULL);
- if (r)
- goto out_free;
- }
-
- return 0;
-
-out_free:
- for (i -= 1; i >= 0; --i)
- free_irq(entries->vector, NULL);
-
- pci_disable_msix(iommu->dev);
-
- return 1;
-}
-
static int __init iommu_setup_msi(struct amd_iommu *iommu)
{
int r;
- struct amd_iommu *curr;
-
- list_for_each_entry(curr, &amd_iommu_list, list) {
- if (curr->dev == iommu->dev)
- curr->int_enabled = true;
- }
-
if (pci_enable_msi(iommu->dev))
return 1;
@@ -837,17 +893,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
return 1;
}
+ iommu->int_enabled = true;
+ iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+
return 0;
}
-static int __init iommu_init_msi(struct amd_iommu *iommu)
+static int iommu_init_msi(struct amd_iommu *iommu)
{
if (iommu->int_enabled)
return 0;
- if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
- return iommu_setup_msix(iommu);
- else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
+ if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
return iommu_setup_msi(iommu);
return 1;
@@ -899,6 +956,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
static int __init init_unity_map_range(struct ivmd_header *m)
{
struct unity_map_entry *e = 0;
+ char *s;
e = kzalloc(sizeof(*e), GFP_KERNEL);
if (e == NULL)
@@ -906,14 +964,19 @@ static int __init init_unity_map_range(struct ivmd_header *m)
switch (m->type) {
default:
+ kfree(e);
+ return 0;
case ACPI_IVMD_TYPE:
+ s = "IVMD_TYPEi\t\t\t";
e->devid_start = e->devid_end = m->devid;
break;
case ACPI_IVMD_TYPE_ALL:
+ s = "IVMD_TYPE_ALL\t\t";
e->devid_start = 0;
e->devid_end = amd_iommu_last_bdf;
break;
case ACPI_IVMD_TYPE_RANGE:
+ s = "IVMD_TYPE_RANGE\t\t";
e->devid_start = m->devid;
e->devid_end = m->aux;
break;
@@ -922,6 +985,13 @@ static int __init init_unity_map_range(struct ivmd_header *m)
e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
e->prot = m->flags >> 1;
+ DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
+ " range_start: %016llx range_end: %016llx flags: %x\n", s,
+ PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
+ PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
+ PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
+ e->address_start, e->address_end, m->flags);
+
list_add_tail(&e->list, &amd_iommu_unity_map);
return 0;
@@ -967,18 +1037,28 @@ static void init_device_table(void)
* This function finally enables all IOMMUs found in the system after
* they have been initialized
*/
-static void __init enable_iommus(void)
+static void enable_iommus(void)
{
struct amd_iommu *iommu;
- list_for_each_entry(iommu, &amd_iommu_list, list) {
+ for_each_iommu(iommu) {
+ iommu_set_device_table(iommu);
+ iommu_enable_command_buffer(iommu);
+ iommu_enable_event_buffer(iommu);
iommu_set_exclusion_range(iommu);
iommu_init_msi(iommu);
- iommu_enable_event_logging(iommu);
iommu_enable(iommu);
}
}
+static void disable_iommus(void)
+{
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu)
+ iommu_disable(iommu);
+}
+
/*
* Suspend/Resume support
* disable suspend until real resume implemented
@@ -986,12 +1066,31 @@ static void __init enable_iommus(void)
static int amd_iommu_resume(struct sys_device *dev)
{
+ /*
+ * Disable IOMMUs before reprogramming the hardware registers.
+ * IOMMU is still enabled from the resume kernel.
+ */
+ disable_iommus();
+
+ /* re-load the hardware */
+ enable_iommus();
+
+ /*
+ * we have to flush after the IOMMUs are enabled because a
+ * disabled IOMMU will never execute the commands we send
+ */
+ amd_iommu_flush_all_domains();
+ amd_iommu_flush_all_devices();
+
return 0;
}
static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
{
- return -EINVAL;
+ /* disable IOMMUs to go out of the way for BIOS */
+ disable_iommus();
+
+ return 0;
}
static struct sysdev_class amd_iommu_sysdev_class = {
@@ -1137,9 +1236,6 @@ int __init amd_iommu_init(void)
enable_iommus();
- printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
- (1 << (amd_iommu_aperture_order-20)));
-
printk(KERN_INFO "AMD IOMMU: device isolation ");
if (amd_iommu_isolate)
printk("enabled\n");
@@ -1211,6 +1307,13 @@ void __init amd_iommu_detect(void)
*
****************************************************************************/
+static int __init parse_amd_iommu_dump(char *str)
+{
+ amd_iommu_dump = true;
+
+ return 1;
+}
+
static int __init parse_amd_iommu_options(char *str)
{
for (; *str; ++str) {
@@ -1225,15 +1328,5 @@ static int __init parse_amd_iommu_options(char *str)
return 1;
}
-static int __init parse_amd_iommu_size_options(char *str)
-{
- unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
-
- if ((order > 24) && (order < 31))
- amd_iommu_aperture_order = order;
-
- return 1;
-}
-
+__setup("amd_iommu_dump", parse_amd_iommu_dump);
__setup("amd_iommu=", parse_amd_iommu_options);
-__setup("amd_iommu_size=", parse_amd_iommu_size_options);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f2870920f24..8c7c042ecad 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,6 +14,7 @@
* Mikael Pettersson : PM converted to driver model.
*/
+#include <linux/perf_counter.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/acpi_pmtmr.h>
@@ -34,6 +35,7 @@
#include <linux/smp.h>
#include <linux/mm.h>
+#include <asm/perf_counter.h>
#include <asm/pgalloc.h>
#include <asm/atomic.h>
#include <asm/mpspec.h>
@@ -98,6 +100,29 @@ early_param("lapic", parse_lapic);
/* Local APIC was disabled by the BIOS and enabled by the kernel */
static int enabled_via_apicbase;
+/*
+ * Handle interrupt mode configuration register (IMCR).
+ * This register controls whether the interrupt signals
+ * that reach the BSP come from the master PIC or from the
+ * local APIC. Before entering Symmetric I/O Mode, either
+ * the BIOS or the operating system must switch out of
+ * PIC Mode by changing the IMCR.
+ */
+static inline void imcr_pic_to_apic(void)
+{
+ /* select IMCR register */
+ outb(0x70, 0x22);
+ /* NMI and 8259 INTR go through APIC */
+ outb(0x01, 0x23);
+}
+
+static inline void imcr_apic_to_pic(void)
+{
+ /* select IMCR register */
+ outb(0x70, 0x22);
+ /* NMI and 8259 INTR go directly to BSP */
+ outb(0x00, 0x23);
+}
#endif
#ifdef CONFIG_X86_64
@@ -111,13 +136,19 @@ static __init int setup_apicpmtimer(char *s)
__setup("apicpmtimer", setup_apicpmtimer);
#endif
+int x2apic_mode;
#ifdef CONFIG_X86_X2APIC
-int x2apic;
/* x2apic enabled before OS handover */
static int x2apic_preenabled;
static int disable_x2apic;
static __init int setup_nox2apic(char *str)
{
+ if (x2apic_enabled()) {
+ pr_warning("Bios already enabled x2apic, "
+ "can't enforce nox2apic");
+ return 0;
+ }
+
disable_x2apic = 1;
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
return 0;
@@ -209,6 +240,31 @@ static int modern_apic(void)
return lapic_get_version() >= 0x14;
}
+/*
+ * bare function to substitute write operation
+ * and it's _that_ fast :)
+ */
+static void native_apic_write_dummy(u32 reg, u32 v)
+{
+ WARN_ON_ONCE((cpu_has_apic || !disable_apic));
+}
+
+static u32 native_apic_read_dummy(u32 reg)
+{
+ WARN_ON_ONCE((cpu_has_apic && !disable_apic));
+ return 0;
+}
+
+/*
+ * right after this call apic->write/read doesn't do anything
+ * note that there is no restore operation it works one way
+ */
+void apic_disable(void)
+{
+ apic->read = native_apic_read_dummy;
+ apic->write = native_apic_write_dummy;
+}
+
void native_apic_wait_icr_idle(void)
{
while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
@@ -348,7 +404,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
{
- unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
+ unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
unsigned int v = (mask << 16) | (msg_type << 8) | vector;
apic_write(reg, v);
@@ -815,7 +871,7 @@ void clear_local_APIC(void)
u32 v;
/* APIC hasn't been mapped yet */
- if (!x2apic && !apic_phys)
+ if (!x2apic_mode && !apic_phys)
return;
maxlvt = lapic_get_maxlvt();
@@ -843,7 +899,7 @@ void clear_local_APIC(void)
}
/* lets not touch this if we didn't frob it */
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
if (maxlvt >= 5) {
v = apic_read(APIC_LVTTHMR);
apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -1133,6 +1189,7 @@ void __cpuinit setup_local_APIC(void)
apic_write(APIC_ESR, 0);
}
#endif
+ perf_counters_lapic_init();
preempt_disable();
@@ -1287,7 +1344,7 @@ void check_x2apic(void)
{
if (x2apic_enabled()) {
pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
- x2apic_preenabled = x2apic = 1;
+ x2apic_preenabled = x2apic_mode = 1;
}
}
@@ -1295,7 +1352,7 @@ void enable_x2apic(void)
{
int msr, msr2;
- if (!x2apic)
+ if (!x2apic_mode)
return;
rdmsr(MSR_IA32_APICBASE, msr, msr2);
@@ -1304,6 +1361,7 @@ void enable_x2apic(void)
wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
}
}
+#endif /* CONFIG_X86_X2APIC */
void __init enable_IR_x2apic(void)
{
@@ -1312,32 +1370,21 @@ void __init enable_IR_x2apic(void)
unsigned long flags;
struct IO_APIC_route_entry **ioapic_entries = NULL;
- if (!cpu_has_x2apic)
- return;
-
- if (!x2apic_preenabled && disable_x2apic) {
- pr_info("Skipped enabling x2apic and Interrupt-remapping "
- "because of nox2apic\n");
- return;
+ ret = dmar_table_init();
+ if (ret) {
+ pr_debug("dmar_table_init() failed with %d:\n", ret);
+ goto ir_failed;
}
- if (x2apic_preenabled && disable_x2apic)
- panic("Bios already enabled x2apic, can't enforce nox2apic");
-
- if (!x2apic_preenabled && skip_ioapic_setup) {
- pr_info("Skipped enabling x2apic and Interrupt-remapping "
- "because of skipping io-apic setup\n");
- return;
+ if (!intr_remapping_supported()) {
+ pr_debug("intr-remapping not supported\n");
+ goto ir_failed;
}
- ret = dmar_table_init();
- if (ret) {
- pr_info("dmar_table_init() failed with %d:\n", ret);
- if (x2apic_preenabled)
- panic("x2apic enabled by bios. But IR enabling failed");
- else
- pr_info("Not enabling x2apic,Intr-remapping\n");
+ if (!x2apic_preenabled && skip_ioapic_setup) {
+ pr_info("Skipped enabling intr-remap because of skipping "
+ "io-apic setup\n");
return;
}
@@ -1357,19 +1404,16 @@ void __init enable_IR_x2apic(void)
mask_IO_APIC_setup(ioapic_entries);
mask_8259A();
- ret = enable_intr_remapping(EIM_32BIT_APIC_ID);
-
- if (ret && x2apic_preenabled) {
- local_irq_restore(flags);
- panic("x2apic enabled by bios. But IR enabling failed");
- }
-
+ ret = enable_intr_remapping(x2apic_supported());
if (ret)
goto end_restore;
- if (!x2apic) {
- x2apic = 1;
+ pr_info("Enabled Interrupt-remapping\n");
+
+ if (x2apic_supported() && !x2apic_mode) {
+ x2apic_mode = 1;
enable_x2apic();
+ pr_info("Enabled x2apic\n");
}
end_restore:
@@ -1378,37 +1422,34 @@ end_restore:
* IR enabling failed
*/
restore_IO_APIC_setup(ioapic_entries);
- else
- reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries);
unmask_8259A();
local_irq_restore(flags);
end:
- if (!ret) {
- if (!x2apic_preenabled)
- pr_info("Enabled x2apic and interrupt-remapping\n");
- else
- pr_info("Enabled Interrupt-remapping\n");
- } else
- pr_err("Failed to enable Interrupt-remapping and x2apic\n");
if (ioapic_entries)
free_ioapic_entries(ioapic_entries);
+
+ if (!ret)
+ return;
+
+ir_failed:
+ if (x2apic_preenabled)
+ panic("x2apic enabled by bios. But IR enabling failed");
+ else if (cpu_has_x2apic)
+ pr_info("Not enabling x2apic,Intr-remapping\n");
#else
if (!cpu_has_x2apic)
return;
if (x2apic_preenabled)
panic("x2apic enabled prior OS handover,"
- " enable CONFIG_INTR_REMAP");
-
- pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
- " and x2apic\n");
+ " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
#endif
return;
}
-#endif /* CONFIG_X86_X2APIC */
+
#ifdef CONFIG_X86_64
/*
@@ -1425,7 +1466,6 @@ static int __init detect_init_APIC(void)
}
mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
- boot_cpu_physical_apicid = 0;
return 0;
}
#else
@@ -1539,32 +1579,49 @@ void __init early_init_lapic_mapping(void)
*/
void __init init_apic_mappings(void)
{
- if (x2apic) {
+ unsigned int new_apicid;
+
+ if (x2apic_mode) {
boot_cpu_physical_apicid = read_apic_id();
return;
}
- /*
- * If no local APIC can be found then set up a fake all
- * zeroes page to simulate the local APIC and another
- * one for the IO-APIC.
- */
+ /* If no local APIC can be found return early */
if (!smp_found_config && detect_init_APIC()) {
- apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
- apic_phys = __pa(apic_phys);
- } else
+ /* lets NOP'ify apic operations */
+ pr_info("APIC: disable apic facility\n");
+ apic_disable();
+ } else {
apic_phys = mp_lapic_addr;
- set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
- apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
- APIC_BASE, apic_phys);
+ /*
+ * acpi lapic path already maps that address in
+ * acpi_register_lapic_address()
+ */
+ if (!acpi_lapic)
+ set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+
+ apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
+ APIC_BASE, apic_phys);
+ }
/*
* Fetch the APIC ID of the BSP in case we have a
* default configuration (or the MP table is broken).
*/
- if (boot_cpu_physical_apicid == -1U)
- boot_cpu_physical_apicid = read_apic_id();
+ new_apicid = read_apic_id();
+ if (boot_cpu_physical_apicid != new_apicid) {
+ boot_cpu_physical_apicid = new_apicid;
+ /*
+ * yeah -- we lie about apic_version
+ * in case if apic was disabled via boot option
+ * but it's not a problem for SMP compiled kernel
+ * since smp_sanity_check is prepared for such a case
+ * and disable smp mode
+ */
+ apic_version[new_apicid] =
+ GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
}
/*
@@ -1733,8 +1790,7 @@ void __init connect_bsp_APIC(void)
*/
apic_printk(APIC_VERBOSE, "leaving PIC mode, "
"enabling APIC mode.\n");
- outb(0x70, 0x22);
- outb(0x01, 0x23);
+ imcr_pic_to_apic();
}
#endif
if (apic->enable_apic_mode)
@@ -1762,8 +1818,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
*/
apic_printk(APIC_VERBOSE, "disabling APIC mode, "
"entering PIC mode.\n");
- outb(0x70, 0x22);
- outb(0x00, 0x23);
+ imcr_apic_to_pic();
return;
}
#endif
@@ -1962,17 +2017,17 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
if (maxlvt >= 5)
apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
#endif
local_irq_save(flags);
disable_local_APIC();
-#ifdef CONFIG_INTR_REMAP
+
if (intr_remapping_enabled)
disable_intr_remapping();
-#endif
+
local_irq_restore(flags);
return 0;
}
@@ -1982,42 +2037,34 @@ static int lapic_resume(struct sys_device *dev)
unsigned int l, h;
unsigned long flags;
int maxlvt;
-
-#ifdef CONFIG_INTR_REMAP
- int ret;
+ int ret = 0;
struct IO_APIC_route_entry **ioapic_entries = NULL;
if (!apic_pm_state.active)
return 0;
local_irq_save(flags);
- if (x2apic) {
+ if (intr_remapping_enabled) {
ioapic_entries = alloc_ioapic_entries();
if (!ioapic_entries) {
WARN(1, "Alloc ioapic_entries in lapic resume failed.");
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto restore;
}
ret = save_IO_APIC_setup(ioapic_entries);
if (ret) {
WARN(1, "Saving IO-APIC state failed: %d\n", ret);
free_ioapic_entries(ioapic_entries);
- return ret;
+ goto restore;
}
mask_IO_APIC_setup(ioapic_entries);
mask_8259A();
- enable_x2apic();
}
-#else
- if (!apic_pm_state.active)
- return 0;
- local_irq_save(flags);
- if (x2apic)
+ if (x2apic_mode)
enable_x2apic();
-#endif
-
else {
/*
* Make sure the APICBASE points to the right address
@@ -2055,21 +2102,16 @@ static int lapic_resume(struct sys_device *dev)
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
-#ifdef CONFIG_INTR_REMAP
- if (intr_remapping_enabled)
- reenable_intr_remapping(EIM_32BIT_APIC_ID);
-
- if (x2apic) {
+ if (intr_remapping_enabled) {
+ reenable_intr_remapping(x2apic_mode);
unmask_8259A();
restore_IO_APIC_setup(ioapic_entries);
free_ioapic_entries(ioapic_entries);
}
-#endif
-
+restore:
local_irq_restore(flags);
-
- return 0;
+ return ret;
}
/*
@@ -2117,31 +2159,14 @@ static void apic_pm_activate(void) { }
#endif /* CONFIG_PM */
#ifdef CONFIG_X86_64
-/*
- * apic_is_clustered_box() -- Check if we can expect good TSC
- *
- * Thus far, the major user of this is IBM's Summit2 series:
- *
- * Clustered boxes may have unsynced TSC problems if they are
- * multi-chassis. Use available data to take a good guess.
- * If in doubt, go HPET.
- */
-__cpuinit int apic_is_clustered_box(void)
+
+static int __cpuinit apic_cluster_num(void)
{
int i, clusters, zeros;
unsigned id;
u16 *bios_cpu_apicid;
DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
- /*
- * there is not this kind of box with AMD CPU yet.
- * Some AMD box with quadcore cpu and 8 sockets apicid
- * will be [4, 0x23] or [8, 0x27] could be thought to
- * vsmp box still need checking...
- */
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
- return 0;
-
bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
@@ -2177,18 +2202,67 @@ __cpuinit int apic_is_clustered_box(void)
++zeros;
}
- /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
- * not guaranteed to be synced between boards
- */
- if (is_vsmp_box() && clusters > 1)
+ return clusters;
+}
+
+static int __cpuinitdata multi_checked;
+static int __cpuinitdata multi;
+
+static int __cpuinit set_multi(const struct dmi_system_id *d)
+{
+ if (multi)
+ return 0;
+ pr_info("APIC: %s detected, Multi Chassis\n", d->ident);
+ multi = 1;
+ return 0;
+}
+
+static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
+ {
+ .callback = set_multi,
+ .ident = "IBM System Summit2",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"),
+ },
+ },
+ {}
+};
+
+static void __cpuinit dmi_check_multi(void)
+{
+ if (multi_checked)
+ return;
+
+ dmi_check_system(multi_dmi_table);
+ multi_checked = 1;
+}
+
+/*
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis.
+ * Use DMI to check them
+ */
+__cpuinit int apic_is_clustered_box(void)
+{
+ dmi_check_multi();
+ if (multi)
return 1;
+ if (!is_vsmp_box())
+ return 0;
+
/*
- * If clusters > 2, then should be multi-chassis.
- * May have to revisit this when multi-core + hyperthreaded CPUs come
- * out, but AFAIK this will work even for them.
+ * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
+ * not guaranteed to be synced between boards
*/
- return (clusters > 2);
+ if (apic_cluster_num() > 1)
+ return 1;
+
+ return 0;
}
#endif
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 306e5e88fb6..d0c99abc26c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -161,7 +161,7 @@ static int flat_apic_id_registered(void)
static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
{
- return hard_smp_processor_id() >> index_msb;
+ return initial_apic_id >> index_msb;
}
struct apic apic_flat = {
@@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
* regardless of how many processors are present (x86_64 ES7000
* is an example).
*/
- if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
printk(KERN_DEBUG "system APIC only can use physical flat");
return 1;
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 1c11b819f24..69328ac8de9 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi)
return gsi;
}
-static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
+static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
{
unsigned long vect = 0, psaival = 0;
@@ -254,7 +254,7 @@ static int parse_unisys_oem(char *oemptr)
}
#ifdef CONFIG_ACPI
-static int find_unisys_acpi_oem_table(unsigned long *oem_addr)
+static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
{
struct acpi_table_header *header = NULL;
struct es7000_oem_table *table;
@@ -285,7 +285,7 @@ static int find_unisys_acpi_oem_table(unsigned long *oem_addr)
return 0;
}
-static void unmap_unisys_acpi_oem_table(unsigned long oem_addr)
+static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
{
if (!oem_addr)
return;
@@ -306,7 +306,7 @@ static int es7000_check_dsdt(void)
static int es7000_acpi_ret;
/* Hook from generic ACPI tables.c */
-static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
unsigned long oem_addr = 0;
int check_dsdt;
@@ -717,7 +717,7 @@ struct apic apic_es7000_cluster = {
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
};
-struct apic apic_es7000 = {
+struct apic __refdata apic_es7000 = {
.name = "es7000",
.probe = probe_es7000,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 30da617d18e..ef8d9290c7e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -59,6 +59,7 @@
#include <asm/setup.h>
#include <asm/irq_remapping.h>
#include <asm/hpet.h>
+#include <asm/hw_irq.h>
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_irq.h>
@@ -129,12 +130,9 @@ struct irq_pin_list {
struct irq_pin_list *next;
};
-static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+static struct irq_pin_list *get_one_free_irq_2_pin(int node)
{
struct irq_pin_list *pin;
- int node;
-
- node = cpu_to_node(cpu);
pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
@@ -148,9 +146,6 @@ struct irq_cfg {
unsigned move_cleanup_count;
u8 vector;
u8 move_in_progress : 1;
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
- u8 move_desc_pending : 1;
-#endif
};
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -182,16 +177,18 @@ int __init arch_early_irq_init(void)
struct irq_cfg *cfg;
struct irq_desc *desc;
int count;
+ int node;
int i;
cfg = irq_cfgx;
count = ARRAY_SIZE(irq_cfgx);
+ node= cpu_to_node(boot_cpu_id);
for (i = 0; i < count; i++) {
desc = irq_to_desc(i);
desc->chip_data = &cfg[i];
- alloc_bootmem_cpumask_var(&cfg[i].domain);
- alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+ zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
+ zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
if (i < NR_IRQS_LEGACY)
cpumask_setall(cfg[i].domain);
}
@@ -212,12 +209,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
return cfg;
}
-static struct irq_cfg *get_one_free_irq_cfg(int cpu)
+static struct irq_cfg *get_one_free_irq_cfg(int node)
{
struct irq_cfg *cfg;
- int node;
-
- node = cpu_to_node(cpu);
cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
if (cfg) {
@@ -238,13 +232,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
return cfg;
}
-int arch_init_chip_data(struct irq_desc *desc, int cpu)
+int arch_init_chip_data(struct irq_desc *desc, int node)
{
struct irq_cfg *cfg;
cfg = desc->chip_data;
if (!cfg) {
- desc->chip_data = get_one_free_irq_cfg(cpu);
+ desc->chip_data = get_one_free_irq_cfg(node);
if (!desc->chip_data) {
printk(KERN_ERR "can not alloc irq_cfg\n");
BUG_ON(1);
@@ -254,10 +248,9 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu)
return 0;
}
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-
+/* for move_irq_desc */
static void
-init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
{
struct irq_pin_list *old_entry, *head, *tail, *entry;
@@ -266,7 +259,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
if (!old_entry)
return;
- entry = get_one_free_irq_2_pin(cpu);
+ entry = get_one_free_irq_2_pin(node);
if (!entry)
return;
@@ -276,7 +269,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
tail = entry;
old_entry = old_entry->next;
while (old_entry) {
- entry = get_one_free_irq_2_pin(cpu);
+ entry = get_one_free_irq_2_pin(node);
if (!entry) {
entry = head;
while (entry) {
@@ -316,12 +309,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
}
void arch_init_copy_chip_data(struct irq_desc *old_desc,
- struct irq_desc *desc, int cpu)
+ struct irq_desc *desc, int node)
{
struct irq_cfg *cfg;
struct irq_cfg *old_cfg;
- cfg = get_one_free_irq_cfg(cpu);
+ cfg = get_one_free_irq_cfg(node);
if (!cfg)
return;
@@ -332,7 +325,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,
memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
- init_copy_irq_2_pin(old_cfg, cfg, cpu);
+ init_copy_irq_2_pin(old_cfg, cfg, node);
}
static void free_irq_cfg(struct irq_cfg *old_cfg)
@@ -356,19 +349,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
old_desc->chip_data = NULL;
}
}
-
-static void
-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
- struct irq_cfg *cfg = desc->chip_data;
-
- if (!cfg->move_in_progress) {
- /* it means that domain is not changed */
- if (!cpumask_intersects(desc->affinity, mask))
- cfg->move_desc_pending = 1;
- }
-}
-#endif
+/* end for move_irq_desc */
#else
static struct irq_cfg *irq_cfg(unsigned int irq)
@@ -378,13 +359,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
#endif
-#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
-static inline void
-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
-}
-#endif
-
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -518,132 +492,18 @@ static void ioapic_mask_entry(int apic, int pin)
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-#ifdef CONFIG_SMP
-static void send_cleanup_vector(struct irq_cfg *cfg)
-{
- cpumask_var_t cleanup_mask;
-
- if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
- unsigned int i;
- cfg->move_cleanup_count = 0;
- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
- cfg->move_cleanup_count++;
- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
- apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
- } else {
- cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
- cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
- apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- free_cpumask_var(cleanup_mask);
- }
- cfg->move_in_progress = 0;
-}
-
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
-{
- int apic, pin;
- struct irq_pin_list *entry;
- u8 vector = cfg->vector;
-
- entry = cfg->irq_2_pin;
- for (;;) {
- unsigned int reg;
-
- if (!entry)
- break;
-
- apic = entry->apic;
- pin = entry->pin;
- /*
- * With interrupt-remapping, destination information comes
- * from interrupt-remapping table entry.
- */
- if (!irq_remapped(irq))
- io_apic_write(apic, 0x11 + pin*2, dest);
- reg = io_apic_read(apic, 0x10 + pin*2);
- reg &= ~IO_APIC_REDIR_VECTOR_MASK;
- reg |= vector;
- io_apic_modify(apic, 0x10 + pin*2, reg);
- if (!entry->next)
- break;
- entry = entry->next;
- }
-}
-
-static int
-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
-
-/*
- * Either sets desc->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
- * leaves desc->affinity untouched.
- */
-static unsigned int
-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
-{
- struct irq_cfg *cfg;
- unsigned int irq;
-
- if (!cpumask_intersects(mask, cpu_online_mask))
- return BAD_APICID;
-
- irq = desc->irq;
- cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
- return BAD_APICID;
-
- /* check that before desc->addinity get updated */
- set_extra_move_desc(desc, mask);
-
- cpumask_copy(desc->affinity, mask);
-
- return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
-}
-
-static void
-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int dest;
- unsigned int irq;
-
- irq = desc->irq;
- cfg = desc->chip_data;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- dest = set_desc_affinity(desc, mask);
- if (dest != BAD_APICID) {
- /* Only the high 8 bits are valid. */
- dest = SET_APIC_LOGICAL_ID(dest);
- __target_IO_APIC_irq(irq, dest, cfg);
- }
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void
-set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
-{
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
-
- set_ioapic_affinity_irq_desc(desc, mask);
-}
-#endif /* CONFIG_SMP */
-
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
+static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
{
struct irq_pin_list *entry;
entry = cfg->irq_2_pin;
if (!entry) {
- entry = get_one_free_irq_2_pin(cpu);
+ entry = get_one_free_irq_2_pin(node);
if (!entry) {
printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
apic, pin);
@@ -663,7 +523,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
entry = entry->next;
}
- entry->next = get_one_free_irq_2_pin(cpu);
+ entry->next = get_one_free_irq_2_pin(node);
entry = entry->next;
entry->apic = apic;
entry->pin = pin;
@@ -672,7 +532,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
/*
* Reroute an IRQ to a different pin.
*/
-static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
+static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
int oldapic, int oldpin,
int newapic, int newpin)
{
@@ -692,7 +552,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
/* why? call replace before add? */
if (!replaced)
- add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
+ add_pin_to_irq_node(cfg, node, newapic, newpin);
}
static inline void io_apic_modify_irq(struct irq_cfg *cfg,
@@ -850,7 +710,6 @@ static int __init ioapic_pirq_setup(char *str)
__setup("pirq=", ioapic_pirq_setup);
#endif /* CONFIG_X86_32 */
-#ifdef CONFIG_INTR_REMAP
struct IO_APIC_route_entry **alloc_ioapic_entries(void)
{
int apic;
@@ -948,20 +807,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
return 0;
}
-void reinit_intr_remapped_IO_APIC(int intr_remapping,
- struct IO_APIC_route_entry **ioapic_entries)
-
-{
- /*
- * for now plain restore of previous settings.
- * TBD: In the case of OS enabling interrupt-remapping,
- * IO-APIC RTE's need to be setup to point to interrupt-remapping
- * table entries. for now, do a plain restore, and wait for
- * the setup_IO_APIC_irqs() to do proper initialization.
- */
- restore_IO_APIC_setup(ioapic_entries);
-}
-
void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
{
int apic;
@@ -971,7 +816,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
kfree(ioapic_entries);
}
-#endif
/*
* Find the IRQ entry number of a certain pin.
@@ -1032,54 +876,6 @@ static int __init find_isa_irq_apic(int irq, int type)
return -1;
}
-/*
- * Find a specific PCI IRQ entry.
- * Not an __init, possibly needed by modules
- */
-static int pin_2_irq(int idx, int apic, int pin);
-
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
-{
- int apic, i, best_guess = -1;
-
- apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
- bus, slot, pin);
- if (test_bit(bus, mp_bus_not_pci)) {
- apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
- return -1;
- }
- for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].srcbus;
-
- for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
- mp_irqs[i].dstapic == MP_APIC_ALL)
- break;
-
- if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].irqtype &&
- (bus == lbus) &&
- (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
-
- if (!(apic || IO_APIC_IRQ(irq)))
- continue;
-
- if (pin == (mp_irqs[i].srcbusirq & 3))
- return irq;
- /*
- * Use the first all-but-pin matching entry as a
- * best-guess fuzzy result for broken mptables.
- */
- if (best_guess < 0)
- best_guess = irq;
- }
- }
- return best_guess;
-}
-
-EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-
#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
/*
* EISA Edge/Level control register, ELCR
@@ -1298,6 +1094,64 @@ static int pin_2_irq(int idx, int apic, int pin)
return irq;
}
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
+ struct io_apic_irq_attr *irq_attr)
+{
+ int apic, i, best_guess = -1;
+
+ apic_printk(APIC_DEBUG,
+ "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+ bus, slot, pin);
+ if (test_bit(bus, mp_bus_not_pci)) {
+ apic_printk(APIC_VERBOSE,
+ "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ return -1;
+ }
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
+ mp_irqs[i].dstapic == MP_APIC_ALL)
+ break;
+
+ if (!test_bit(lbus, mp_bus_not_pci) &&
+ !mp_irqs[i].irqtype &&
+ (bus == lbus) &&
+ (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
+
+ if (!(apic || IO_APIC_IRQ(irq)))
+ continue;
+
+ if (pin == (mp_irqs[i].srcbusirq & 3)) {
+ set_io_apic_irq_attr(irq_attr, apic,
+ mp_irqs[i].dstirq,
+ irq_trigger(i),
+ irq_polarity(i));
+ return irq;
+ }
+ /*
+ * Use the first all-but-pin matching entry as a
+ * best-guess fuzzy result for broken mptables.
+ */
+ if (best_guess < 0) {
+ set_io_apic_irq_attr(irq_attr, apic,
+ mp_irqs[i].dstirq,
+ irq_trigger(i),
+ irq_polarity(i));
+ best_guess = irq;
+ }
+ }
+ }
+ return best_guess;
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
void lock_vector_lock(void)
{
/* Used to the online set of cpus does not change
@@ -1628,58 +1482,70 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
ioapic_write_entry(apic_id, pin, entry);
}
+static struct {
+ DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
static void __init setup_IO_APIC_irqs(void)
{
- int apic_id, pin, idx, irq;
+ int apic_id = 0, pin, idx, irq;
int notcon = 0;
struct irq_desc *desc;
struct irq_cfg *cfg;
- int cpu = boot_cpu_id;
+ int node = cpu_to_node(boot_cpu_id);
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
- for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
- for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
-
- idx = find_irq_entry(apic_id, pin, mp_INT);
- if (idx == -1) {
- if (!notcon) {
- notcon = 1;
- apic_printk(APIC_VERBOSE,
- KERN_DEBUG " %d-%d",
- mp_ioapics[apic_id].apicid, pin);
- } else
- apic_printk(APIC_VERBOSE, " %d-%d",
- mp_ioapics[apic_id].apicid, pin);
- continue;
- }
- if (notcon) {
- apic_printk(APIC_VERBOSE,
- " (apicid-pin) not connected\n");
- notcon = 0;
- }
+#ifdef CONFIG_ACPI
+ if (!acpi_disabled && acpi_ioapic) {
+ apic_id = mp_find_ioapic(0);
+ if (apic_id < 0)
+ apic_id = 0;
+ }
+#endif
- irq = pin_2_irq(idx, apic_id, pin);
+ for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
+ idx = find_irq_entry(apic_id, pin, mp_INT);
+ if (idx == -1) {
+ if (!notcon) {
+ notcon = 1;
+ apic_printk(APIC_VERBOSE,
+ KERN_DEBUG " %d-%d",
+ mp_ioapics[apic_id].apicid, pin);
+ } else
+ apic_printk(APIC_VERBOSE, " %d-%d",
+ mp_ioapics[apic_id].apicid, pin);
+ continue;
+ }
+ if (notcon) {
+ apic_printk(APIC_VERBOSE,
+ " (apicid-pin) not connected\n");
+ notcon = 0;
+ }
- /*
- * Skip the timer IRQ if there's a quirk handler
- * installed and if it returns 1:
- */
- if (apic->multi_timer_check &&
- apic->multi_timer_check(apic_id, irq))
- continue;
+ irq = pin_2_irq(idx, apic_id, pin);
- desc = irq_to_desc_alloc_cpu(irq, cpu);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc for %d\n", irq);
- continue;
- }
- cfg = desc->chip_data;
- add_pin_to_irq_cpu(cfg, cpu, apic_id, pin);
+ /*
+ * Skip the timer IRQ if there's a quirk handler
+ * installed and if it returns 1:
+ */
+ if (apic->multi_timer_check &&
+ apic->multi_timer_check(apic_id, irq))
+ continue;
- setup_IO_APIC_irq(apic_id, pin, irq, desc,
- irq_trigger(idx), irq_polarity(idx));
+ desc = irq_to_desc_alloc_node(irq, node);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+ continue;
}
+ cfg = desc->chip_data;
+ add_pin_to_irq_node(cfg, node, apic_id, pin);
+ /*
+ * don't mark it in pin_programmed, so later acpi could
+ * set it correctly when irq < 16
+ */
+ setup_IO_APIC_irq(apic_id, pin, irq, desc,
+ irq_trigger(idx), irq_polarity(idx));
}
if (notcon)
@@ -1869,7 +1735,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base)
__apicdebuginit(void) print_local_APIC(void *dummy)
{
- unsigned int v, ver, maxlvt;
+ unsigned int i, v, ver, maxlvt;
u64 icr;
if (apic_verbosity == APIC_QUIET)
@@ -1957,6 +1823,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
v = apic_read(APIC_TDCR);
printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+
+ if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+ v = apic_read(APIC_EFEAT);
+ maxlvt = (v >> 16) & 0xff;
+ printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
+ v = apic_read(APIC_ECTRL);
+ printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
+ for (i = 0; i < maxlvt; i++) {
+ v = apic_read(APIC_EILVTn(i));
+ printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
+ }
+ }
printk("\n");
}
@@ -2005,6 +1883,11 @@ __apicdebuginit(void) print_PIC(void)
__apicdebuginit(int) print_all_ICs(void)
{
print_PIC();
+
+ /* don't print out if apic is not there */
+ if (!cpu_has_apic || disable_apic)
+ return 0;
+
print_all_local_APICs();
print_IO_APIC();
@@ -2360,6 +2243,118 @@ static int ioapic_retrigger_irq(unsigned int irq)
*/
#ifdef CONFIG_SMP
+static void send_cleanup_vector(struct irq_cfg *cfg)
+{
+ cpumask_var_t cleanup_mask;
+
+ if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+ unsigned int i;
+ cfg->move_cleanup_count = 0;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ cfg->move_cleanup_count++;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+ } else {
+ cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+ cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
+ apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ free_cpumask_var(cleanup_mask);
+ }
+ cfg->move_in_progress = 0;
+}
+
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
+{
+ int apic, pin;
+ struct irq_pin_list *entry;
+ u8 vector = cfg->vector;
+
+ entry = cfg->irq_2_pin;
+ for (;;) {
+ unsigned int reg;
+
+ if (!entry)
+ break;
+
+ apic = entry->apic;
+ pin = entry->pin;
+ /*
+ * With interrupt-remapping, destination information comes
+ * from interrupt-remapping table entry.
+ */
+ if (!irq_remapped(irq))
+ io_apic_write(apic, 0x11 + pin*2, dest);
+ reg = io_apic_read(apic, 0x10 + pin*2);
+ reg &= ~IO_APIC_REDIR_VECTOR_MASK;
+ reg |= vector;
+ io_apic_modify(apic, 0x10 + pin*2, reg);
+ if (!entry->next)
+ break;
+ entry = entry->next;
+ }
+}
+
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
+
+/*
+ * Either sets desc->affinity to a valid value, and returns
+ * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
+ * leaves desc->affinity untouched.
+ */
+static unsigned int
+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
+{
+ struct irq_cfg *cfg;
+ unsigned int irq;
+
+ if (!cpumask_intersects(mask, cpu_online_mask))
+ return BAD_APICID;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
+ return BAD_APICID;
+
+ cpumask_copy(desc->affinity, mask);
+
+ return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+}
+
+static int
+set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ unsigned int dest;
+ unsigned int irq;
+ int ret = -1;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ dest = set_desc_affinity(desc, mask);
+ if (dest != BAD_APICID) {
+ /* Only the high 8 bits are valid. */
+ dest = SET_APIC_LOGICAL_ID(dest);
+ __target_IO_APIC_irq(irq, dest, cfg);
+ ret = 0;
+ }
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return ret;
+}
+
+static int
+set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
+{
+ struct irq_desc *desc;
+
+ desc = irq_to_desc(irq);
+
+ return set_ioapic_affinity_irq_desc(desc, mask);
+}
#ifdef CONFIG_INTR_REMAP
@@ -2374,26 +2369,25 @@ static int ioapic_retrigger_irq(unsigned int irq)
* Real vector that is used for interrupting cpu will be coming from
* the interrupt-remapping table entry.
*/
-static void
+static int
migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
struct irte irte;
unsigned int dest;
unsigned int irq;
+ int ret = -1;
if (!cpumask_intersects(mask, cpu_online_mask))
- return;
+ return ret;
irq = desc->irq;
if (get_irte(irq, &irte))
- return;
+ return ret;
cfg = desc->chip_data;
if (assign_irq_vector(irq, cfg, mask))
- return;
-
- set_extra_move_desc(desc, mask);
+ return ret;
dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
@@ -2409,27 +2403,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
send_cleanup_vector(cfg);
cpumask_copy(desc->affinity, mask);
+
+ return 0;
}
/*
* Migrates the IRQ destination in the process context.
*/
-static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
const struct cpumask *mask)
{
- migrate_ioapic_irq_desc(desc, mask);
+ return migrate_ioapic_irq_desc(desc, mask);
}
-static void set_ir_ioapic_affinity_irq(unsigned int irq,
+static int set_ir_ioapic_affinity_irq(unsigned int irq,
const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
- set_ir_ioapic_affinity_irq_desc(desc, mask);
+ return set_ir_ioapic_affinity_irq_desc(desc, mask);
}
#else
-static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
const struct cpumask *mask)
{
+ return 0;
}
#endif
@@ -2491,86 +2488,19 @@ static void irq_complete_move(struct irq_desc **descp)
struct irq_cfg *cfg = desc->chip_data;
unsigned vector, me;
- if (likely(!cfg->move_in_progress)) {
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
- if (likely(!cfg->move_desc_pending))
- return;
-
- /* domain has not changed, but affinity did */
- me = smp_processor_id();
- if (cpumask_test_cpu(me, desc->affinity)) {
- *descp = desc = move_irq_desc(desc, me);
- /* get the new one */
- cfg = desc->chip_data;
- cfg->move_desc_pending = 0;
- }
-#endif
+ if (likely(!cfg->move_in_progress))
return;
- }
vector = ~get_irq_regs()->orig_ax;
me = smp_processor_id();
- if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) {
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
- *descp = desc = move_irq_desc(desc, me);
- /* get the new one */
- cfg = desc->chip_data;
-#endif
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
send_cleanup_vector(cfg);
- }
}
#else
static inline void irq_complete_move(struct irq_desc **descp) {}
#endif
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
-{
- int apic, pin;
- struct irq_pin_list *entry;
-
- entry = cfg->irq_2_pin;
- for (;;) {
-
- if (!entry)
- break;
-
- apic = entry->apic;
- pin = entry->pin;
- io_apic_eoi(apic, pin);
- entry = entry->next;
- }
-}
-
-static void
-eoi_ioapic_irq(struct irq_desc *desc)
-{
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int irq;
-
- irq = desc->irq;
- cfg = desc->chip_data;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- __eoi_ioapic_irq(irq, cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-#ifdef CONFIG_X86_X2APIC
-static void ack_x2apic_level(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- ack_x2APIC_irq();
- eoi_ioapic_irq(desc);
-}
-
-static void ack_x2apic_edge(unsigned int irq)
-{
- ack_x2APIC_irq();
-}
-#endif
-
static void ack_apic_edge(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -2634,9 +2564,6 @@ static void ack_apic_level(unsigned int irq)
*/
ack_APIC_irq();
- if (irq_remapped(irq))
- eoi_ioapic_irq(desc);
-
/* Now we can move and renable the irq */
if (unlikely(do_unmask_irq)) {
/* Only migrate the irq if the ack has been received.
@@ -2683,22 +2610,50 @@ static void ack_apic_level(unsigned int irq)
}
#ifdef CONFIG_INTR_REMAP
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+ int apic, pin;
+ struct irq_pin_list *entry;
+
+ entry = cfg->irq_2_pin;
+ for (;;) {
+
+ if (!entry)
+ break;
+
+ apic = entry->apic;
+ pin = entry->pin;
+ io_apic_eoi(apic, pin);
+ entry = entry->next;
+ }
+}
+
+static void
+eoi_ioapic_irq(struct irq_desc *desc)
+{
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ unsigned int irq;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __eoi_ioapic_irq(irq, cfg);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
static void ir_ack_apic_edge(unsigned int irq)
{
-#ifdef CONFIG_X86_X2APIC
- if (x2apic_enabled())
- return ack_x2apic_edge(irq);
-#endif
- return ack_apic_edge(irq);
+ ack_APIC_irq();
}
static void ir_ack_apic_level(unsigned int irq)
{
-#ifdef CONFIG_X86_X2APIC
- if (x2apic_enabled())
- return ack_x2apic_level(irq);
-#endif
- return ack_apic_level(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ ack_APIC_irq();
+ eoi_ioapic_irq(desc);
}
#endif /* CONFIG_INTR_REMAP */
@@ -2903,7 +2858,7 @@ static inline void __init check_timer(void)
{
struct irq_desc *desc = irq_to_desc(0);
struct irq_cfg *cfg = desc->chip_data;
- int cpu = boot_cpu_id;
+ int node = cpu_to_node(boot_cpu_id);
int apic1, pin1, apic2, pin2;
unsigned long flags;
int no_pin1 = 0;
@@ -2969,7 +2924,7 @@ static inline void __init check_timer(void)
* Ok, does IRQ0 through the IOAPIC work?
*/
if (no_pin1) {
- add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
+ add_pin_to_irq_node(cfg, node, apic1, pin1);
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
} else {
/* for edge trigger, setup_IO_APIC_irq already
@@ -3006,7 +2961,7 @@ static inline void __init check_timer(void)
/*
* legacy devices should be connected to IO APIC #0
*/
- replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
+ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
enable_8259A_irq(0);
if (timer_irq_works()) {
@@ -3218,14 +3173,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY;
/*
* Dynamic irq allocate and deallocation
*/
-unsigned int create_irq_nr(unsigned int irq_want)
+unsigned int create_irq_nr(unsigned int irq_want, int node)
{
/* Allocate an unused irq */
unsigned int irq;
unsigned int new;
unsigned long flags;
struct irq_cfg *cfg_new = NULL;
- int cpu = boot_cpu_id;
struct irq_desc *desc_new = NULL;
irq = 0;
@@ -3234,7 +3188,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
spin_lock_irqsave(&vector_lock, flags);
for (new = irq_want; new < nr_irqs; new++) {
- desc_new = irq_to_desc_alloc_cpu(new, cpu);
+ desc_new = irq_to_desc_alloc_node(new, node);
if (!desc_new) {
printk(KERN_INFO "can not get irq_desc for %d\n", new);
continue;
@@ -3243,6 +3197,9 @@ unsigned int create_irq_nr(unsigned int irq_want)
if (cfg_new->vector != 0)
continue;
+
+ desc_new = move_irq_desc(desc_new, node);
+
if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
irq = new;
break;
@@ -3260,11 +3217,12 @@ unsigned int create_irq_nr(unsigned int irq_want)
int create_irq(void)
{
+ int node = cpu_to_node(boot_cpu_id);
unsigned int irq_want;
int irq;
irq_want = nr_irqs_gsi;
- irq = create_irq_nr(irq_want);
+ irq = create_irq_nr(irq_want, node);
if (irq == 0)
irq = -1;
@@ -3366,7 +3324,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
}
#ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
@@ -3375,7 +3333,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
dest = set_desc_affinity(desc, mask);
if (dest == BAD_APICID)
- return;
+ return -1;
cfg = desc->chip_data;
@@ -3387,13 +3345,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
write_msi_msg_desc(desc, &msg);
+
+ return 0;
}
#ifdef CONFIG_INTR_REMAP
/*
* Migrate the MSI irq to another cpumask. This migration is
* done in the process context using interrupt-remapping hardware.
*/
-static void
+static int
ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -3402,11 +3362,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
struct irte irte;
if (get_irte(irq, &irte))
- return;
+ return -1;
dest = set_desc_affinity(desc, mask);
if (dest == BAD_APICID)
- return;
+ return -1;
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
@@ -3423,6 +3383,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
*/
if (cfg->move_in_progress)
send_cleanup_vector(cfg);
+
+ return 0;
}
#endif
@@ -3518,15 +3480,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
unsigned int irq_want;
struct intel_iommu *iommu = NULL;
int index = 0;
+ int node;
/* x86 doesn't support multiple MSI yet */
if (type == PCI_CAP_ID_MSI && nvec > 1)
return 1;
+ node = dev_to_node(&dev->dev);
irq_want = nr_irqs_gsi;
sub_handle = 0;
list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = create_irq_nr(irq_want);
+ irq = create_irq_nr(irq_want, node);
if (irq == 0)
return -1;
irq_want = irq + 1;
@@ -3576,7 +3540,7 @@ void arch_teardown_msi_irq(unsigned int irq)
#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
#ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
@@ -3585,7 +3549,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
dest = set_desc_affinity(desc, mask);
if (dest == BAD_APICID)
- return;
+ return -1;
cfg = desc->chip_data;
@@ -3597,6 +3561,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
dmar_msi_write(irq, &msg);
+
+ return 0;
}
#endif /* CONFIG_SMP */
@@ -3630,7 +3596,7 @@ int arch_setup_dmar_msi(unsigned int irq)
#ifdef CONFIG_HPET_TIMER
#ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
@@ -3639,7 +3605,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
dest = set_desc_affinity(desc, mask);
if (dest == BAD_APICID)
- return;
+ return -1;
cfg = desc->chip_data;
@@ -3651,6 +3617,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
hpet_msi_write(irq, &msg);
+
+ return 0;
}
#endif /* CONFIG_SMP */
@@ -3707,7 +3675,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
write_ht_irq_msg(irq, &msg);
}
-static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
@@ -3715,11 +3683,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
dest = set_desc_affinity(desc, mask);
if (dest == BAD_APICID)
- return;
+ return -1;
cfg = desc->chip_data;
target_ht_irq(irq, dest, cfg->vector);
+
+ return 0;
}
#endif
@@ -3794,6 +3764,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
unsigned long flags;
int err;
+ BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
cfg = irq_cfg(irq);
err = assign_irq_vector(irq, cfg, eligible_cpu);
@@ -3807,15 +3779,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
mmr_value = 0;
entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
- BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
- entry->vector = cfg->vector;
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->polarity = 0;
- entry->trigger = 0;
- entry->mask = 0;
- entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
+ entry->vector = cfg->vector;
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
+ entry->polarity = 0;
+ entry->trigger = 0;
+ entry->mask = 0;
+ entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
mmr_pnode = uv_blade_to_pnode(mmr_blade);
uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3833,10 +3803,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
struct uv_IO_APIC_route_entry *entry;
int mmr_pnode;
+ BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
mmr_value = 0;
entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
- BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
entry->mask = 1;
mmr_pnode = uv_blade_to_pnode(mmr_blade);
@@ -3900,6 +3870,71 @@ int __init arch_probe_nr_irqs(void)
}
#endif
+static int __io_apic_set_pci_routing(struct device *dev, int irq,
+ struct io_apic_irq_attr *irq_attr)
+{
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+ int node;
+ int ioapic, pin;
+ int trigger, polarity;
+
+ ioapic = irq_attr->ioapic;
+ if (!IO_APIC_IRQ(irq)) {
+ apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+ ioapic);
+ return -EINVAL;
+ }
+
+ if (dev)
+ node = dev_to_node(dev);
+ else
+ node = cpu_to_node(boot_cpu_id);
+
+ desc = irq_to_desc_alloc_node(irq, node);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc %d\n", irq);
+ return 0;
+ }
+
+ pin = irq_attr->ioapic_pin;
+ trigger = irq_attr->trigger;
+ polarity = irq_attr->polarity;
+
+ /*
+ * IRQs < 16 are already in the irq_2_pin[] map
+ */
+ if (irq >= NR_IRQS_LEGACY) {
+ cfg = desc->chip_data;
+ add_pin_to_irq_node(cfg, node, ioapic, pin);
+ }
+
+ setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
+
+ return 0;
+}
+
+int io_apic_set_pci_routing(struct device *dev, int irq,
+ struct io_apic_irq_attr *irq_attr)
+{
+ int ioapic, pin;
+ /*
+ * Avoid pin reprogramming. PRTs typically include entries
+ * with redundant pin->gsi mappings (but unique PCI devices);
+ * we only program the IOAPIC on the first.
+ */
+ ioapic = irq_attr->ioapic;
+ pin = irq_attr->ioapic_pin;
+ if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+ pr_debug("Pin %d-%d already programmed\n",
+ mp_ioapics[ioapic].apicid, pin);
+ return 0;
+ }
+ set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
+
+ return __io_apic_set_pci_routing(dev, irq, irq_attr);
+}
+
/* --------------------------------------------------------------------------
ACPI-based IOAPIC Configuration
-------------------------------------------------------------------------- */
@@ -3980,6 +4015,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
return apic_id;
}
+#endif
int __init io_apic_get_version(int ioapic)
{
@@ -3992,39 +4028,6 @@ int __init io_apic_get_version(int ioapic)
return reg_01.bits.version;
}
-#endif
-
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
-{
- struct irq_desc *desc;
- struct irq_cfg *cfg;
- int cpu = boot_cpu_id;
-
- if (!IO_APIC_IRQ(irq)) {
- apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
- ioapic);
- return -EINVAL;
- }
-
- desc = irq_to_desc_alloc_cpu(irq, cpu);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc %d\n", irq);
- return 0;
- }
-
- /*
- * IRQs < 16 are already in the irq_2_pin[] map
- */
- if (irq >= NR_IRQS_LEGACY) {
- cfg = desc->chip_data;
- add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
- }
-
- setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
-
- return 0;
-}
-
int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
{
@@ -4055,51 +4058,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
#ifdef CONFIG_SMP
void __init setup_ioapic_dest(void)
{
- int pin, ioapic, irq, irq_entry;
+ int pin, ioapic = 0, irq, irq_entry;
struct irq_desc *desc;
- struct irq_cfg *cfg;
const struct cpumask *mask;
if (skip_ioapic_setup == 1)
return;
- for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
- for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
- irq_entry = find_irq_entry(ioapic, pin, mp_INT);
- if (irq_entry == -1)
- continue;
- irq = pin_2_irq(irq_entry, ioapic, pin);
-
- /* setup_IO_APIC_irqs could fail to get vector for some device
- * when you have too many devices, because at that time only boot
- * cpu is online.
- */
- desc = irq_to_desc(irq);
- cfg = desc->chip_data;
- if (!cfg->vector) {
- setup_IO_APIC_irq(ioapic, pin, irq, desc,
- irq_trigger(irq_entry),
- irq_polarity(irq_entry));
- continue;
+#ifdef CONFIG_ACPI
+ if (!acpi_disabled && acpi_ioapic) {
+ ioapic = mp_find_ioapic(0);
+ if (ioapic < 0)
+ ioapic = 0;
+ }
+#endif
- }
+ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+ irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+ if (irq_entry == -1)
+ continue;
+ irq = pin_2_irq(irq_entry, ioapic, pin);
- /*
- * Honour affinities which have been set in early boot
- */
- if (desc->status &
- (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
- mask = desc->affinity;
- else
- mask = apic->target_cpus();
+ desc = irq_to_desc(irq);
- if (intr_remapping_enabled)
- set_ir_ioapic_affinity_irq_desc(desc, mask);
- else
- set_ioapic_affinity_irq_desc(desc, mask);
- }
+ /*
+ * Honour affinities which have been set in early boot
+ */
+ if (desc->status &
+ (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
+ mask = desc->affinity;
+ else
+ mask = apic->target_cpus();
+ if (intr_remapping_enabled)
+ set_ir_ioapic_affinity_irq_desc(desc, mask);
+ else
+ set_ioapic_affinity_irq_desc(desc, mask);
}
+
}
#endif
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index ce4fbfa315a..b3025b43b63 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)
static inline int mce_in_progress(void)
{
-#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#if defined(CONFIG_X86_NEW_MCE)
return atomic_read(&mce_entry) > 0;
#endif
return 0;
@@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data)
}
#endif
-static void report_broken_nmi(int cpu, int *prev_nmi_count)
+static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
{
printk(KERN_CONT "\n");
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 01eda2ac65e..440a8bccd91 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -160,7 +160,6 @@ extern struct apic apic_summit;
extern struct apic apic_bigsmp;
extern struct apic apic_es7000;
extern struct apic apic_es7000_cluster;
-extern struct apic apic_default;
struct apic *apic = &apic_default;
EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 1783652bb0e..bc3e880f9b8 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = {
void __init default_setup_apic_routing(void)
{
#ifdef CONFIG_X86_X2APIC
- if (x2apic && (apic != &apic_x2apic_phys &&
+ if (x2apic_mode && (apic != &apic_x2apic_phys &&
#ifdef CONFIG_X86_UV
apic != &apic_x2apic_uv_x &&
#endif
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9cfe1f415d8..344eee4ac0a 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -173,13 +173,6 @@ static inline int is_WPEG(struct rio_detail *rio){
rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
}
-
-/* In clustered mode, the high nibble of APIC ID is a cluster number.
- * The low nibble is a 4-bit bitmap. */
-#define XAPIC_DEST_CPUS_SHIFT 4
-#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
-#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
-
#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
static const struct cpumask *summit_target_cpus(void)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 4a903e2f0d1..8e4cbb255c3 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -10,7 +10,7 @@
#include <asm/apic.h>
#include <asm/ipi.h>
-DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 2bda6935297..ef0ae207a7c 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -105,7 +105,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
cpumask_set_cpu(cpu, retmask);
}
-static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
{
#ifdef CONFIG_SMP
unsigned long val;
@@ -562,7 +562,7 @@ void __init uv_system_init(void)
union uvh_node_id_u node_id;
unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
- int max_pnode = 0;
+ int gnode_extra, max_pnode = 0;
unsigned long mmr_base, present, paddr;
unsigned short pnode_mask;
@@ -574,6 +574,13 @@ void __init uv_system_init(void)
mmr_base =
uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
~UV_MMR_ENABLE;
+ pnode_mask = (1 << n_val) - 1;
+ node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+ gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
+ gnode_upper = ((unsigned long)gnode_extra << m_val);
+ printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
+ n_val, m_val, gnode_upper, gnode_extra);
+
printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
@@ -583,15 +590,18 @@ void __init uv_system_init(void)
bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
uv_blade_info = kmalloc(bytes, GFP_KERNEL);
+ BUG_ON(!uv_blade_info);
get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
+ BUG_ON(!uv_node_to_blade);
memset(uv_node_to_blade, 255, bytes);
bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
+ BUG_ON(!uv_cpu_to_blade);
memset(uv_cpu_to_blade, 255, bytes);
blade = 0;
@@ -607,11 +617,6 @@ void __init uv_system_init(void)
}
}
- pnode_mask = (1 << n_val) - 1;
- node_id.v = uv_read_local_mmr(UVH_NODE_ID);
- gnode_upper = (((unsigned long)node_id.s.node_id) &
- ~((1 << n_val) - 1)) << m_val;
-
uv_bios_init();
uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
&sn_coherency_id, &sn_region_size);
@@ -634,6 +639,7 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
+ uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 49e0939bac4..79302e9a33a 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1233,9 +1233,9 @@ static int suspend(int vetoable)
int err;
struct apm_user *as;
- device_suspend(PMSG_SUSPEND);
+ dpm_suspend_start(PMSG_SUSPEND);
- device_power_down(PMSG_SUSPEND);
+ dpm_suspend_noirq(PMSG_SUSPEND);
local_irq_disable();
sysdev_suspend(PMSG_SUSPEND);
@@ -1259,9 +1259,9 @@ static int suspend(int vetoable)
sysdev_resume();
local_irq_enable();
- device_power_up(PMSG_RESUME);
+ dpm_resume_noirq(PMSG_RESUME);
- device_resume(PMSG_RESUME);
+ dpm_resume_end(PMSG_RESUME);
queue_event(APM_NORMAL_RESUME, NULL);
spin_lock(&user_list_lock);
for (as = user_list; as != NULL; as = as->next) {
@@ -1277,7 +1277,7 @@ static void standby(void)
{
int err;
- device_power_down(PMSG_SUSPEND);
+ dpm_suspend_noirq(PMSG_SUSPEND);
local_irq_disable();
sysdev_suspend(PMSG_SUSPEND);
@@ -1291,7 +1291,7 @@ static void standby(void)
sysdev_resume();
local_irq_enable();
- device_power_up(PMSG_RESUME);
+ dpm_resume_noirq(PMSG_RESUME);
}
static apm_event_t get_event(void)
@@ -1376,7 +1376,7 @@ static void check_events(void)
ignore_bounce = 1;
if ((event != APM_NORMAL_RESUME)
|| (ignore_normal_resume == 0)) {
- device_resume(PMSG_RESUME);
+ dpm_resume_end(PMSG_RESUME);
queue_event(event, NULL);
}
ignore_normal_resume = 0;
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 5a6aa1c1162..dfdbf640389 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -126,6 +126,7 @@ void foo(void)
#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+ OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
BLANK();
@@ -146,4 +147,5 @@ void foo(void)
OFFSET(BP_loadflags, boot_params, hdr.loadflags);
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
+ OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72f062fb4b..898ecc47e12 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -125,6 +125,7 @@ int main(void)
OFFSET(BP_loadflags, boot_params, hdr.loadflags);
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
+ OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
BLANK();
DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4e242f9a06e..3efcb2b96a1 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
#
-# Makefile for x86-compatible CPU details and quirks
+# Makefile for x86-compatible CPU details, features and quirks
#
# Don't trace early stages of a secondary CPU boot
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
-obj-$(CONFIG_X86_MCE) += mcheck/
-obj-$(CONFIG_MTRR) += mtrr/
-obj-$(CONFIG_CPU_FREQ) += cpufreq/
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
-obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+obj-$(CONFIG_X86_MCE) += mcheck/
+obj-$(CONFIG_MTRR) += mtrr/
+obj-$(CONFIG_CPU_FREQ) += cpufreq/
+
+obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
quiet_cmd_mkcapflags = MKCAP $@
cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7e4a459daa6..e5b27d8f1b4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -6,6 +6,7 @@
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
+#include <asm/pci-direct.h>
#ifdef CONFIG_X86_64
# include <asm/numa_64.h>
@@ -272,7 +273,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
int cpu = smp_processor_id();
int node;
- unsigned apicid = hard_smp_processor_id();
+ unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
node = c->phys_proc_id;
if (apicid_to_node[apicid] != NUMA_NO_NODE)
@@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
(c->x86_model == 8 && c->x86_mask >= 8))
set_cpu_cap(c, X86_FEATURE_K6_MTRR);
#endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
+ /* check CPU config space for extended APIC ID */
+ if (c->x86 >= 0xf) {
+ unsigned int val;
+ val = read_pci_config(0, 24, 0, 0x68);
+ if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
+ set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+ }
+#endif
}
static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c1caefc82e6..3ffdcfa9abd 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
#include <linux/io.h>
#include <asm/stackprotector.h>
+#include <asm/perf_counter.h>
#include <asm/mmu_context.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
@@ -114,6 +115,13 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+static int __init x86_xsave_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+ return 1;
+}
+__setup("noxsave", x86_xsave_setup);
+
#ifdef CONFIG_X86_32
static int cachesize_override __cpuinitdata = -1;
static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -292,7 +300,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
return NULL; /* Not found */
}
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
void load_percpu_segment(int cpu)
{
@@ -761,6 +770,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_identify)
this_cpu->c_identify(c);
+ /* Clear/Set all flags overriden by options, after probe */
+ for (i = 0; i < NCAPINTS; i++) {
+ c->x86_capability[i] &= ~cpu_caps_cleared[i];
+ c->x86_capability[i] |= cpu_caps_set[i];
+ }
+
#ifdef CONFIG_X86_64
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
#endif
@@ -806,6 +821,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
#endif
init_hypervisor(c);
+
+ /*
+ * Clear/Set all flags overriden by options, need do it
+ * before following smp all cpus cap AND.
+ */
+ for (i = 0; i < NCAPINTS; i++) {
+ c->x86_capability[i] &= ~cpu_caps_cleared[i];
+ c->x86_capability[i] |= cpu_caps_set[i];
+ }
+
/*
* On SMP, boot_cpu_data holds the common feature set between
* all CPUs; so make sure that we indicate which features are
@@ -818,10 +843,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
}
- /* Clear all flags overriden by options */
- for (i = 0; i < NCAPINTS; i++)
- c->x86_capability[i] &= ~cleared_cpu_caps[i];
-
#ifdef CONFIG_X86_MCE
/* Init Machine Check Exception if available. */
mcheck_init(c);
@@ -854,6 +875,7 @@ void __init identify_boot_cpu(void)
#else
vgetcpu_set_mode();
#endif
+ init_hw_perf_counters();
}
void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 46e29ab96c6..6b2a52dd040 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -32,9 +32,7 @@
static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
-static DEFINE_PER_CPU(unsigned, cpu_modelflag);
static DEFINE_PER_CPU(int, cpu_priv_count);
-static DEFINE_PER_CPU(unsigned, cpu_model);
static DEFINE_MUTEX(cpu_debug_lock);
@@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = {
{ "value", CPU_REG_ALL, 1 },
};
-/* Intel Registers Range */
-static struct cpu_debug_range cpu_intel_range[] = {
- { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL },
- { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE },
- { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL },
- { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM },
- { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE },
- { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE },
-
- { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE },
- { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON },
- { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON },
- { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE },
-
- { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE },
- { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT },
- { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT },
- { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM },
-
- { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE },
- { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 },
- { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE },
- { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON },
-
- { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT },
- { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT },
- { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT },
- { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE },
-
- { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 },
- { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 },
- { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX },
- { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 },
- { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT },
-
- { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE },
- { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE },
- { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE },
- { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT },
- { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE },
- { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE },
- { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE },
- { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE },
-
- { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT },
- { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON },
- { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE },
- { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON },
- { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE },
- { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 },
- { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE },
- { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 },
-
- { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE },
- { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE },
- { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE },
- { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE },
- { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE },
- { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE },
-
- { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON },
- { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE },
- { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON },
- { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT },
- { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON },
- { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT },
- { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON },
- { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON },
- { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON },
- { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON },
- { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE },
- { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON },
-
- { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE },
- { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON },
- { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE },
- { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON },
- { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE },
- { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON },
- { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE },
- { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON },
- { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE },
- { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE },
- { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
-
- { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
- { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
- { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
-
- { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
-
- { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
- { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
- { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
- { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
+/* CPU Registers Range */
+static struct cpu_debug_range cpu_reg_range[] = {
+ { 0x00000000, 0x00000001, CPU_MC, },
+ { 0x00000006, 0x00000007, CPU_MONITOR, },
+ { 0x00000010, 0x00000010, CPU_TIME, },
+ { 0x00000011, 0x00000013, CPU_PMC, },
+ { 0x00000017, 0x00000017, CPU_PLATFORM, },
+ { 0x0000001B, 0x0000001B, CPU_APIC, },
+ { 0x0000002A, 0x0000002B, CPU_POWERON, },
+ { 0x0000002C, 0x0000002C, CPU_FREQ, },
+ { 0x0000003A, 0x0000003A, CPU_CONTROL, },
+ { 0x00000040, 0x00000047, CPU_LBRANCH, },
+ { 0x00000060, 0x00000067, CPU_LBRANCH, },
+ { 0x00000079, 0x00000079, CPU_BIOS, },
+ { 0x00000088, 0x0000008A, CPU_CACHE, },
+ { 0x0000008B, 0x0000008B, CPU_BIOS, },
+ { 0x0000009B, 0x0000009B, CPU_MONITOR, },
+ { 0x000000C1, 0x000000C4, CPU_PMC, },
+ { 0x000000CD, 0x000000CD, CPU_FREQ, },
+ { 0x000000E7, 0x000000E8, CPU_PERF, },
+ { 0x000000FE, 0x000000FE, CPU_MTRR, },
+
+ { 0x00000116, 0x0000011E, CPU_CACHE, },
+ { 0x00000174, 0x00000176, CPU_SYSENTER, },
+ { 0x00000179, 0x0000017B, CPU_MC, },
+ { 0x00000186, 0x00000189, CPU_PMC, },
+ { 0x00000198, 0x00000199, CPU_PERF, },
+ { 0x0000019A, 0x0000019A, CPU_TIME, },
+ { 0x0000019B, 0x0000019D, CPU_THERM, },
+ { 0x000001A0, 0x000001A0, CPU_MISC, },
+ { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
+ { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
+ { 0x000001D9, 0x000001D9, CPU_DEBUG, },
+ { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
+
+ { 0x00000200, 0x0000020F, CPU_MTRR, },
+ { 0x00000250, 0x00000250, CPU_MTRR, },
+ { 0x00000258, 0x00000259, CPU_MTRR, },
+ { 0x00000268, 0x0000026F, CPU_MTRR, },
+ { 0x00000277, 0x00000277, CPU_PAT, },
+ { 0x000002FF, 0x000002FF, CPU_MTRR, },
+
+ { 0x00000300, 0x00000311, CPU_PMC, },
+ { 0x00000345, 0x00000345, CPU_PMC, },
+ { 0x00000360, 0x00000371, CPU_PMC, },
+ { 0x0000038D, 0x00000390, CPU_PMC, },
+ { 0x000003A0, 0x000003BE, CPU_PMC, },
+ { 0x000003C0, 0x000003CD, CPU_PMC, },
+ { 0x000003E0, 0x000003E1, CPU_PMC, },
+ { 0x000003F0, 0x000003F2, CPU_PMC, },
+
+ { 0x00000400, 0x00000417, CPU_MC, },
+ { 0x00000480, 0x0000048B, CPU_VMX, },
+
+ { 0x00000600, 0x00000600, CPU_DEBUG, },
+ { 0x00000680, 0x0000068F, CPU_LBRANCH, },
+ { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
+
+ { 0x000107CC, 0x000107D3, CPU_PMC, },
+
+ { 0xC0000080, 0xC0000080, CPU_FEATURES, },
+ { 0xC0000081, 0xC0000084, CPU_CALL, },
+ { 0xC0000100, 0xC0000102, CPU_BASE, },
+ { 0xC0000103, 0xC0000103, CPU_TIME, },
+
+ { 0xC0010000, 0xC0010007, CPU_PMC, },
+ { 0xC0010010, 0xC0010010, CPU_CONF, },
+ { 0xC0010015, 0xC0010015, CPU_CONF, },
+ { 0xC0010016, 0xC001001A, CPU_MTRR, },
+ { 0xC001001D, 0xC001001D, CPU_MTRR, },
+ { 0xC001001F, 0xC001001F, CPU_CONF, },
+ { 0xC0010030, 0xC0010035, CPU_BIOS, },
+ { 0xC0010044, 0xC0010048, CPU_MC, },
+ { 0xC0010050, 0xC0010056, CPU_SMM, },
+ { 0xC0010058, 0xC0010058, CPU_CONF, },
+ { 0xC0010060, 0xC0010060, CPU_CACHE, },
+ { 0xC0010061, 0xC0010068, CPU_SMM, },
+ { 0xC0010069, 0xC001006B, CPU_SMM, },
+ { 0xC0010070, 0xC0010071, CPU_SMM, },
+ { 0xC0010111, 0xC0010113, CPU_SMM, },
+ { 0xC0010114, 0xC0010118, CPU_SVM, },
+ { 0xC0010140, 0xC0010141, CPU_OSVM, },
+ { 0xC0011022, 0xC0011023, CPU_CONF, },
};
-/* AMD Registers Range */
-static struct cpu_debug_range cpu_amd_range[] = {
- { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
- { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
- { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
- { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
- { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
- { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
-
- { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
- { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
- { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
- { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
-
- { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
- { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
- { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
- { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
- { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
- { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
-
- { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
-
- { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
- { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
- { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
- { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
-
- { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
- { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
- { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
- { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
- { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
- { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
- { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
- { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
- { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
- { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
- { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
- { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
- { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
- { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
- { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
- { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
- { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
- { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
-};
-
-
-/* Intel */
-static int get_intel_modelflag(unsigned model)
-{
- int flag;
-
- switch (model) {
- case 0x0501:
- case 0x0502:
- case 0x0504:
- flag = CPU_INTEL_PENTIUM;
- break;
- case 0x0601:
- case 0x0603:
- case 0x0605:
- case 0x0607:
- case 0x0608:
- case 0x060A:
- case 0x060B:
- flag = CPU_INTEL_P6;
- break;
- case 0x0609:
- case 0x060D:
- flag = CPU_INTEL_PENTIUM_M;
- break;
- case 0x060E:
- flag = CPU_INTEL_CORE;
- break;
- case 0x060F:
- case 0x0617:
- flag = CPU_INTEL_CORE2;
- break;
- case 0x061C:
- flag = CPU_INTEL_ATOM;
- break;
- case 0x0F00:
- case 0x0F01:
- case 0x0F02:
- case 0x0F03:
- case 0x0F04:
- flag = CPU_INTEL_XEON_P4;
- break;
- case 0x0F06:
- flag = CPU_INTEL_XEON_MP;
- break;
- default:
- flag = CPU_NONE;
- break;
- }
-
- return flag;
-}
-
-/* AMD */
-static int get_amd_modelflag(unsigned model)
-{
- int flag;
-
- switch (model >> 8) {
- case 0x6:
- flag = CPU_AMD_K6;
- break;
- case 0x7:
- flag = CPU_AMD_K7;
- break;
- case 0x8:
- flag = CPU_AMD_K8;
- break;
- case 0xf:
- flag = CPU_AMD_0F;
- break;
- case 0x10:
- flag = CPU_AMD_10;
- break;
- case 0x11:
- flag = CPU_AMD_11;
- break;
- default:
- flag = CPU_NONE;
- break;
- }
-
- return flag;
-}
-
-static int get_cpu_modelflag(unsigned cpu)
-{
- int flag;
-
- flag = per_cpu(cpu_model, cpu);
-
- switch (flag >> 16) {
- case X86_VENDOR_INTEL:
- flag = get_intel_modelflag(flag);
- break;
- case X86_VENDOR_AMD:
- flag = get_amd_modelflag(flag & 0xffff);
- break;
- default:
- flag = CPU_NONE;
- break;
- }
-
- return flag;
-}
-
-static int get_cpu_range_count(unsigned cpu)
-{
- int index;
-
- switch (per_cpu(cpu_model, cpu) >> 16) {
- case X86_VENDOR_INTEL:
- index = ARRAY_SIZE(cpu_intel_range);
- break;
- case X86_VENDOR_AMD:
- index = ARRAY_SIZE(cpu_amd_range);
- break;
- default:
- index = 0;
- break;
- }
-
- return index;
-}
-
static int is_typeflag_valid(unsigned cpu, unsigned flag)
{
- unsigned vendor, modelflag;
- int i, index;
+ int i;
/* Standard Registers should be always valid */
if (flag >= CPU_TSS)
return 1;
- modelflag = per_cpu(cpu_modelflag, cpu);
- vendor = per_cpu(cpu_model, cpu) >> 16;
- index = get_cpu_range_count(cpu);
-
- for (i = 0; i < index; i++) {
- switch (vendor) {
- case X86_VENDOR_INTEL:
- if ((cpu_intel_range[i].model & modelflag) &&
- (cpu_intel_range[i].flag & flag))
- return 1;
- break;
- case X86_VENDOR_AMD:
- if ((cpu_amd_range[i].model & modelflag) &&
- (cpu_amd_range[i].flag & flag))
- return 1;
- break;
- }
+ for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
+ if (cpu_reg_range[i].flag == flag)
+ return 1;
}
/* Invalid */
@@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)
static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
int index, unsigned flag)
{
- unsigned modelflag;
-
- modelflag = per_cpu(cpu_modelflag, cpu);
- *max = 0;
- switch (per_cpu(cpu_model, cpu) >> 16) {
- case X86_VENDOR_INTEL:
- if ((cpu_intel_range[index].model & modelflag) &&
- (cpu_intel_range[index].flag & flag)) {
- *min = cpu_intel_range[index].min;
- *max = cpu_intel_range[index].max;
- }
- break;
- case X86_VENDOR_AMD:
- if ((cpu_amd_range[index].model & modelflag) &&
- (cpu_amd_range[index].flag & flag)) {
- *min = cpu_amd_range[index].min;
- *max = cpu_amd_range[index].max;
- }
- break;
- }
+ if (cpu_reg_range[index].flag == flag) {
+ *min = cpu_reg_range[index].min;
+ *max = cpu_reg_range[index].max;
+ } else
+ *max = 0;
return *max;
}
@@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
unsigned msr, msr_min, msr_max;
struct cpu_private *priv;
u32 low, high;
- int i, range;
+ int i;
if (seq) {
priv = seq->private;
@@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
}
}
- range = get_cpu_range_count(cpu);
-
- for (i = 0; i < range; i++) {
+ for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
continue;
@@ -588,8 +369,20 @@ static void print_apic(void *arg)
seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
-#endif /* CONFIG_X86_LOCAL_APIC */
+ if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+ unsigned int i, v, maxeilvt;
+
+ v = apic_read(APIC_EFEAT);
+ maxeilvt = (v >> 16) & 0xff;
+ seq_printf(seq, " EFEAT\t\t: %08x\n", v);
+ seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
+ for (i = 0; i < maxeilvt; i++) {
+ v = apic_read(APIC_EILVTn(i));
+ seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
+ }
+ }
+#endif /* CONFIG_X86_LOCAL_APIC */
seq_printf(seq, "\n MSR\t:\n");
}
@@ -788,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
{
struct dentry *cpu_dentry = NULL;
unsigned reg, reg_min, reg_max;
- int i, range, err = 0;
+ int i, err = 0;
char reg_dir[12];
u32 low, high;
- range = get_cpu_range_count(cpu);
-
- for (i = 0; i < range; i++) {
+ for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
cpu_base[type].flag))
continue;
@@ -850,10 +641,6 @@ static int cpu_init_cpu(void)
cpui = &cpu_data(cpu);
if (!cpu_has(cpui, X86_FEATURE_MSR))
continue;
- per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
- (cpui->x86 << 8) |
- (cpui->x86_model));
- per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
sprintf(cpu_dir, "cpu%d", cpu);
cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index 52c83987547..f138c6c389b 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -220,11 +220,14 @@ config X86_LONGHAUL
If in doubt, say N.
config X86_E_POWERSAVER
- tristate "VIA C7 Enhanced PowerSaver"
+ tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
select CPU_FREQ_TABLE
- depends on X86_32
+ depends on X86_32 && EXPERIMENTAL
help
- This adds the CPUFreq driver for VIA C7 processors.
+ This adds the CPUFreq driver for VIA C7 processors. However, this driver
+ does not have any safeguards to prevent operating the CPU out of spec
+ and is thus considered dangerous. Please use the regular ACPI cpufreq
+ driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
If in doubt, say N.
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 208ecf6643d..ae9b503220c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid)
{
struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
- if (cpu->x86_vendor != X86_VENDOR_INTEL ||
- !cpu_has(cpu, X86_FEATURE_EST))
- return 0;
-
- return 1;
+ return cpu_has(cpu, X86_FEATURE_EST);
}
static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
@@ -550,7 +546,7 @@ static int __init acpi_cpufreq_early_init(void)
return -ENOMEM;
}
for_each_possible_cpu(i) {
- if (!alloc_cpumask_var_node(
+ if (!zalloc_cpumask_var_node(
&per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
GFP_KERNEL, cpu_to_node(i))) {
@@ -693,8 +689,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
policy->cpuinfo.transition_latency > 20 * 1000) {
policy->cpuinfo.transition_latency = 20 * 1000;
- printk_once(KERN_INFO "Capping off P-state tranision"
- " latency at 20 uS\n");
+ printk_once(KERN_INFO
+ "P-state transition latency capped at 20 uS\n");
}
/* table init */
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 6ac55bd341a..86961519372 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -168,6 +168,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
case 0x0E: /* Core */
case 0x0F: /* Core Duo */
case 0x16: /* Celeron Core */
+ case 0x1C: /* Atom */
p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
case 0x0D: /* Pentium M (Dothan) */
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 3c28ccd4974..d47c775eb0a 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -168,10 +168,12 @@ static int check_powernow(void)
return 1;
}
+#ifdef CONFIG_X86_POWERNOW_K7_ACPI
static void invalidate_entry(unsigned int entry)
{
powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
}
+#endif
static int get_ranges(unsigned char *pst)
{
@@ -320,7 +322,7 @@ static int powernow_acpi_init(void)
goto err0;
}
- if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
+ if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
GFP_KERNEL)) {
retval = -ENOMEM;
goto err05;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 4709ead2db5..cf52215d9eb 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -649,6 +649,20 @@ static void print_basics(struct powernow_k8_data *data)
data->batps);
}
+static u32 freq_from_fid_did(u32 fid, u32 did)
+{
+ u32 mhz = 0;
+
+ if (boot_cpu_data.x86 == 0x10)
+ mhz = (100 * (fid + 0x10)) >> did;
+ else if (boot_cpu_data.x86 == 0x11)
+ mhz = (100 * (fid + 8)) >> did;
+ else
+ BUG();
+
+ return mhz * 1000;
+}
+
static int fill_powernow_table(struct powernow_k8_data *data,
struct pst_s *pst, u8 maxvid)
{
@@ -821,7 +835,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
{
struct cpufreq_frequency_table *powernow_table;
int ret_val = -ENODEV;
- acpi_integer space_id;
+ acpi_integer control, status;
if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
dprintk("register performance failed: bad ACPI data\n");
@@ -834,12 +848,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
goto err_out;
}
- space_id = data->acpi_data.control_register.space_id;
- if ((space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
- (space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
+ control = data->acpi_data.control_register.space_id;
+ status = data->acpi_data.status_register.space_id;
+
+ if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
+ (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
dprintk("Invalid control/status registers (%x - %x)\n",
- data->acpi_data.control_register.space_id,
- space_id);
+ control, status);
goto err_out;
}
@@ -872,7 +887,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
/* notify BIOS that we exist */
acpi_processor_notify_smm(THIS_MODULE);
- if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
+ if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
printk(KERN_ERR PFX
"unable to alloc powernow_k8_data cpumask\n");
ret_val = -ENOMEM;
@@ -923,8 +938,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
powernow_table[i].index = index;
- powernow_table[i].frequency =
- data->acpi_data.states[i].core_frequency * 1000;
+ /* Frequency may be rounded for these */
+ if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) {
+ powernow_table[i].frequency =
+ freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
+ } else
+ powernow_table[i].frequency =
+ data->acpi_data.states[i].core_frequency * 1000;
}
return 0;
}
@@ -1215,13 +1235,16 @@ static int powernowk8_verify(struct cpufreq_policy *pol)
return cpufreq_frequency_table_verify(pol, data->powernow_table);
}
+static const char ACPI_PSS_BIOS_BUG_MSG[] =
+ KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
+ KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n";
+
/* per CPU init entry point to the driver */
static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
{
struct powernow_k8_data *data;
cpumask_t oldmask;
int rc;
- static int print_once;
if (!cpu_online(pol->cpu))
return -ENODEV;
@@ -1244,19 +1267,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
* an UP version, and is deprecated by AMD.
*/
if (num_online_cpus() != 1) {
- /*
- * Replace this one with print_once as soon as such a
- * thing gets introduced
- */
- if (!print_once) {
- WARN_ONCE(1, KERN_ERR FW_BUG PFX "Your BIOS "
- "does not provide ACPI _PSS objects "
- "in a way that Linux understands. "
- "Please report this to the Linux ACPI"
- " maintainers and complain to your "
- "BIOS vendor.\n");
- print_once++;
- }
+ printk_once(ACPI_PSS_BIOS_BUG_MSG);
goto err_out;
}
if (pol->cpu != 0) {
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index c9f1fdc0283..55c831ed71c 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -471,7 +471,7 @@ static int centrino_target (struct cpufreq_policy *policy,
if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL)))
return -ENOMEM;
- if (unlikely(!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
+ if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
free_cpumask_var(saved_mask);
return -ENOMEM;
}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7437fa133c0..daed39ba261 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -229,12 +229,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
}
#endif
-static void __cpuinit srat_detect_node(void)
+static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
{
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
unsigned node;
int cpu = smp_processor_id();
- int apicid = hard_smp_processor_id();
+ int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
/* Don't do the funky fallback heuristics the AMD version employs
for now. */
@@ -400,7 +400,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
}
/* Work around errata */
- srat_detect_node();
+ srat_detect_node(c);
if (cpu_has(c, X86_FEATURE_VMX))
detect_vmx_virtcap(c);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 483eda96e10..789efe217e1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,6 +17,7 @@
#include <asm/processor.h>
#include <asm/smp.h>
+#include <asm/k8.h>
#define LVL_1_INST 1
#define LVL_1_DATA 2
@@ -159,14 +160,6 @@ struct _cpuid4_info_regs {
unsigned long can_disable;
};
-#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
-static struct pci_device_id k8_nb_id[] = {
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
- {}
-};
-#endif
-
unsigned short num_cache_leaves;
/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -207,10 +200,17 @@ union l3_cache {
};
static const unsigned short __cpuinitconst assocs[] = {
- [1] = 1, [2] = 2, [4] = 4, [6] = 8,
- [8] = 16, [0xa] = 32, [0xb] = 48,
+ [1] = 1,
+ [2] = 2,
+ [4] = 4,
+ [6] = 8,
+ [8] = 16,
+ [0xa] = 32,
+ [0xb] = 48,
[0xc] = 64,
- [0xf] = 0xffff // ??
+ [0xd] = 96,
+ [0xe] = 128,
+ [0xf] = 0xffff /* fully associative - no way to show this currently */
};
static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
@@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
eax->split.type = types[leaf];
eax->split.level = levels[leaf];
if (leaf == 3)
- eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1;
+ eax->split.num_threads_sharing =
+ current_cpu_data.x86_max_cores - 1;
else
eax->split.num_threads_sharing = 0;
eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
@@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
{
if (index < 3)
return;
+
+ if (boot_cpu_data.x86 == 0x11)
+ return;
+
+ /* see erratum #382 */
+ if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
+ return;
+
this_leaf->can_disable = 1;
}
@@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
#define to_object(k) container_of(k, struct _index_kobject, kobj)
#define to_attr(a) container_of(a, struct _cache_attr, attr)
-#ifdef CONFIG_PCI
-static struct pci_dev *get_k8_northbridge(int node)
-{
- struct pci_dev *dev = NULL;
- int i;
-
- for (i = 0; i <= node; i++) {
- do {
- dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
- if (!dev)
- break;
- } while (!pci_match_id(&k8_nb_id[0], dev));
- if (!dev)
- break;
- }
- return dev;
-}
-#else
-static struct pci_dev *get_k8_northbridge(int node)
-{
- return NULL;
-}
-#endif
-
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
+static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+ unsigned int index)
{
- const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
- int node = cpu_to_node(cpumask_first(mask));
- struct pci_dev *dev = NULL;
- ssize_t ret = 0;
- int i;
+ int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ int node = cpu_to_node(cpu);
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+ unsigned int reg = 0;
if (!this_leaf->can_disable)
- return sprintf(buf, "Feature not enabled\n");
-
- dev = get_k8_northbridge(node);
- if (!dev) {
- printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
return -EINVAL;
- }
- for (i = 0; i < 2; i++) {
- unsigned int reg;
+ if (!dev)
+ return -EINVAL;
- pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
+ pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+ return sprintf(buf, "%x\n", reg);
+}
- ret += sprintf(buf, "%sEntry: %d\n", buf, i);
- ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n",
- buf,
- reg & 0x80000000 ? "Disabled" : "Allowed",
- reg & 0x40000000 ? "Disabled" : "Allowed");
- ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
- buf, (reg & 0x30000) >> 16, reg & 0xfff);
- }
- return ret;
+#define SHOW_CACHE_DISABLE(index) \
+static ssize_t \
+show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
+{ \
+ return show_cache_disable(this_leaf, buf, index); \
}
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
-static ssize_t
-store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
- size_t count)
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+ const char *buf, size_t count, unsigned int index)
{
- const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
- int node = cpu_to_node(cpumask_first(mask));
- struct pci_dev *dev = NULL;
- unsigned int ret, index, val;
+ int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ int node = cpu_to_node(cpu);
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+ unsigned long val = 0;
+ unsigned int scrubber = 0;
if (!this_leaf->can_disable)
- return 0;
-
- if (strlen(buf) > 15)
return -EINVAL;
- ret = sscanf(buf, "%x %x", &index, &val);
- if (ret != 2)
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!dev)
return -EINVAL;
- if (index > 1)
+
+ if (strict_strtoul(buf, 10, &val) < 0)
return -EINVAL;
val |= 0xc0000000;
- dev = get_k8_northbridge(node);
- if (!dev) {
- printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
- return -EINVAL;
- }
+
+ pci_read_config_dword(dev, 0x58, &scrubber);
+ scrubber &= ~0x1f000000;
+ pci_write_config_dword(dev, 0x58, scrubber);
pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
wbinvd();
pci_write_config_dword(dev, 0x1BC + index * 4, val);
+ return count;
+}
- return 1;
+#define STORE_CACHE_DISABLE(index) \
+static ssize_t \
+store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
+ const char *buf, size_t count) \
+{ \
+ return store_cache_disable(this_leaf, buf, count, index); \
}
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
struct _cache_attr {
struct attribute attr;
@@ -808,7 +795,10 @@ define_one_ro(size);
define_one_ro(shared_cpu_map);
define_one_ro(shared_cpu_list);
-static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
+static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
+ show_cache_disable_0, store_cache_disable_0);
+static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
+ show_cache_disable_1, store_cache_disable_1);
static struct attribute * default_attrs[] = {
&type.attr,
@@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = {
&size.attr,
&shared_cpu_map.attr,
&shared_cpu_list.attr,
- &cache_disable.attr,
+ &cache_disable_0.attr,
+ &cache_disable_1.attr,
NULL
};
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index b2f89829bbe..45004faf67e 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,7 +1,11 @@
-obj-y = mce_$(BITS).o therm_throt.o
+obj-y = mce.o therm_throt.o
-obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
-obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
+obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o
+obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o
+obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
+obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o
+obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o
obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
+obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index dd3af6e7b39..89e51042415 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -2,11 +2,10 @@
* Athlon specific Machine Check Exception Reporting
* (C) Copyright 2002 Dave Jones <davej@redhat.com>
*/
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
#include <linux/smp.h>
#include <asm/processor.h>
@@ -15,12 +14,12 @@
#include "mce.h"
-/* Machine Check Handler For AMD Athlon/Duron */
+/* Machine Check Handler For AMD Athlon/Duron: */
static void k7_machine_check(struct pt_regs *regs, long error_code)
{
- int recover = 1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
+ int recover = 1;
int i;
rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -32,15 +31,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
for (i = 1; i < nr_mce_banks; i++) {
rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
- if (high&(1<<31)) {
+ if (high & (1<<31)) {
char misc[20];
char addr[24];
- misc[0] = addr[0] = '\0';
+
+ misc[0] = '\0';
+ addr[0] = '\0';
+
if (high & (1<<29))
recover |= 1;
if (high & (1<<25))
recover |= 2;
high &= ~(1<<31);
+
if (high & (1<<27)) {
rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,27 +52,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
snprintf(addr, 24, " at %08x%08x", ahigh, alow);
}
+
printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
smp_processor_id(), i, high, low, misc, addr);
- /* Clear it */
+
+ /* Clear it: */
wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
- /* Serialize */
+ /* Serialize: */
wmb();
add_taint(TAINT_MACHINE_CHECK);
}
}
- if (recover&2)
+ if (recover & 2)
panic("CPU context corrupt");
- if (recover&1)
+ if (recover & 1)
panic("Unable to continue");
+
printk(KERN_EMERG "Attempting to continue.\n");
+
mcgstl &= ~(1<<2);
wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
}
-/* AMD K7 machine check is Intel like */
+/* AMD K7 machine check is Intel like: */
void amd_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
@@ -79,21 +86,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
return;
machine_check_vector = k7_machine_check;
+ /* Make sure the vector pointer is visible before we enable MCEs: */
wmb();
printk(KERN_INFO "Intel machine check architecture supported.\n");
+
rdmsr(MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
nr_mce_banks = l & 0xff;
- /* Clear status for MC index 0 separately, we don't touch CTL,
- * as some K7 Athlons cause spurious MCEs when its enabled. */
+ /*
+ * Clear status for MC index 0 separately, we don't touch CTL,
+ * as some K7 Athlons cause spurious MCEs when its enabled:
+ */
if (boot_cpu_data.x86 == 6) {
wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
i = 1;
} else
i = 0;
+
for (; i < nr_mce_banks; i++) {
wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
new file mode 100644
index 00000000000..a3a235a53f0
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -0,0 +1,127 @@
+/*
+ * Machine check injection support.
+ * Copyright 2008 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Authors:
+ * Andi Kleen
+ * Ying Huang
+ */
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/smp.h>
+#include <asm/mce.h>
+
+/* Update fake mce registers on current CPU. */
+static void inject_mce(struct mce *m)
+{
+ struct mce *i = &per_cpu(injectm, m->extcpu);
+
+ /* Make sure noone reads partially written injectm */
+ i->finished = 0;
+ mb();
+ m->finished = 0;
+ /* First set the fields after finished */
+ i->extcpu = m->extcpu;
+ mb();
+ /* Now write record in order, finished last (except above) */
+ memcpy(i, m, sizeof(struct mce));
+ /* Finally activate it */
+ mb();
+ i->finished = 1;
+}
+
+struct delayed_mce {
+ struct timer_list timer;
+ struct mce m;
+};
+
+/* Inject mce on current CPU */
+static void raise_mce(unsigned long data)
+{
+ struct delayed_mce *dm = (struct delayed_mce *)data;
+ struct mce *m = &dm->m;
+ int cpu = m->extcpu;
+
+ inject_mce(m);
+ if (m->status & MCI_STATUS_UC) {
+ struct pt_regs regs;
+ memset(&regs, 0, sizeof(struct pt_regs));
+ regs.ip = m->ip;
+ regs.cs = m->cs;
+ printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
+ do_machine_check(&regs, 0);
+ printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
+ } else {
+ mce_banks_t b;
+ memset(&b, 0xff, sizeof(mce_banks_t));
+ printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
+ machine_check_poll(0, &b);
+ mce_notify_irq();
+ printk(KERN_INFO "Finished machine check poll on CPU %d\n",
+ cpu);
+ }
+ kfree(dm);
+}
+
+/* Error injection interface */
+static ssize_t mce_write(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ struct delayed_mce *dm;
+ struct mce m;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ /*
+ * There are some cases where real MSR reads could slip
+ * through.
+ */
+ if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
+ return -EIO;
+
+ if ((unsigned long)usize > sizeof(struct mce))
+ usize = sizeof(struct mce);
+ if (copy_from_user(&m, ubuf, usize))
+ return -EFAULT;
+
+ if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
+ return -EINVAL;
+
+ dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
+ if (!dm)
+ return -ENOMEM;
+
+ /*
+ * Need to give user space some time to set everything up,
+ * so do it a jiffie or two later everywhere.
+ * Should we use a hrtimer here for better synchronization?
+ */
+ memcpy(&dm->m, &m, sizeof(struct mce));
+ setup_timer(&dm->timer, raise_mce, (unsigned long)dm);
+ dm->timer.expires = jiffies + 2;
+ add_timer_on(&dm->timer, m.extcpu);
+ return usize;
+}
+
+static int inject_init(void)
+{
+ printk(KERN_INFO "Machine check injector initialized\n");
+ mce_chrdev_ops.write = mce_write;
+ return 0;
+}
+
+module_init(inject_init);
+/*
+ * Cannot tolerate unloading currently because we cannot
+ * guarantee all openers of mce_chrdev will get a reference to us.
+ */
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
new file mode 100644
index 00000000000..54dcb8ff12e
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -0,0 +1,15 @@
+#include <asm/mce.h>
+
+enum severity_level {
+ MCE_NO_SEVERITY,
+ MCE_KEEP_SEVERITY,
+ MCE_SOME_SEVERITY,
+ MCE_AO_SEVERITY,
+ MCE_UC_SEVERITY,
+ MCE_AR_SEVERITY,
+ MCE_PANIC_SEVERITY,
+};
+
+int mce_severity(struct mce *a, int tolerant, char **msg);
+
+extern int mce_ser;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
new file mode 100644
index 00000000000..ff0807f9705
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -0,0 +1,218 @@
+/*
+ * MCE grading rules.
+ * Copyright 2008, 2009 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Author: Andi Kleen
+ */
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <asm/mce.h>
+
+#include "mce-internal.h"
+
+/*
+ * Grade an mce by severity. In general the most severe ones are processed
+ * first. Since there are quite a lot of combinations test the bits in a
+ * table-driven way. The rules are simply processed in order, first
+ * match wins.
+ *
+ * Note this is only used for machine check exceptions, the corrected
+ * errors use much simpler rules. The exceptions still check for the corrected
+ * errors, but only to leave them alone for the CMCI handler (except for
+ * panic situations)
+ */
+
+enum context { IN_KERNEL = 1, IN_USER = 2 };
+enum ser { SER_REQUIRED = 1, NO_SER = 2 };
+
+static struct severity {
+ u64 mask;
+ u64 result;
+ unsigned char sev;
+ unsigned char mcgmask;
+ unsigned char mcgres;
+ unsigned char ser;
+ unsigned char context;
+ unsigned char covered;
+ char *msg;
+} severities[] = {
+#define KERNEL .context = IN_KERNEL
+#define USER .context = IN_USER
+#define SER .ser = SER_REQUIRED
+#define NOSER .ser = NO_SER
+#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
+#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
+#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
+#define MCGMASK(x, res, s, m, r...) \
+ { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
+#define MASK(x, y, s, m, r...) \
+ { .mask = x, .result = y, SEV(s), .msg = m, ## r }
+#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
+#define MCACOD 0xffff
+
+ BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
+ BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
+ BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
+ /* When MCIP is not set something is very confused */
+ MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
+ /* Neither return not error IP -- no chance to recover -> PANIC */
+ MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
+ "Neither restart nor error IP"),
+ MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
+ KERNEL),
+ BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
+ MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
+ "Spurious not enabled", SER),
+
+ /* ignore OVER for UCNA */
+ MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
+ "Uncorrected no action required", SER),
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
+ "Illegal combination (UCNA with AR=1)", SER),
+ MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
+
+ /* AR add known MCACODs here */
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
+ "Action required with lost events", SER),
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
+ "Action required; unknown MCACOD", SER),
+
+ /* known AO MCACODs: */
+ MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
+ "Action optional: memory scrubbing error", SER),
+ MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
+ "Action optional: last level cache writeback error", SER),
+
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
+ "Action optional unknown MCACOD", SER),
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
+ "Action optional with lost events", SER),
+ BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
+ BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
+ BITSET(0, SOME, "No match") /* always matches. keep at end */
+};
+
+/*
+ * If the EIPV bit is set, it means the saved IP is the
+ * instruction which caused the MCE.
+ */
+static int error_context(struct mce *m)
+{
+ if (m->mcgstatus & MCG_STATUS_EIPV)
+ return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
+ /* Unknown, assume kernel */
+ return IN_KERNEL;
+}
+
+int mce_severity(struct mce *a, int tolerant, char **msg)
+{
+ enum context ctx = error_context(a);
+ struct severity *s;
+
+ for (s = severities;; s++) {
+ if ((a->status & s->mask) != s->result)
+ continue;
+ if ((a->mcgstatus & s->mcgmask) != s->mcgres)
+ continue;
+ if (s->ser == SER_REQUIRED && !mce_ser)
+ continue;
+ if (s->ser == NO_SER && mce_ser)
+ continue;
+ if (s->context && ctx != s->context)
+ continue;
+ if (msg)
+ *msg = s->msg;
+ s->covered = 1;
+ if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
+ if (panic_on_oops || tolerant < 1)
+ return MCE_PANIC_SEVERITY;
+ }
+ return s->sev;
+ }
+}
+
+static void *s_start(struct seq_file *f, loff_t *pos)
+{
+ if (*pos >= ARRAY_SIZE(severities))
+ return NULL;
+ return &severities[*pos];
+}
+
+static void *s_next(struct seq_file *f, void *data, loff_t *pos)
+{
+ if (++(*pos) >= ARRAY_SIZE(severities))
+ return NULL;
+ return &severities[*pos];
+}
+
+static void s_stop(struct seq_file *f, void *data)
+{
+}
+
+static int s_show(struct seq_file *f, void *data)
+{
+ struct severity *ser = data;
+ seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
+ return 0;
+}
+
+static const struct seq_operations severities_seq_ops = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+static int severities_coverage_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &severities_seq_ops);
+}
+
+static ssize_t severities_coverage_write(struct file *file,
+ const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(severities); i++)
+ severities[i].covered = 0;
+ return count;
+}
+
+static const struct file_operations severities_coverage_fops = {
+ .open = severities_coverage_open,
+ .release = seq_release,
+ .read = seq_read,
+ .write = severities_coverage_write,
+};
+
+static int __init severities_debugfs_init(void)
+{
+ struct dentry *dmce = NULL, *fseverities_coverage = NULL;
+
+ dmce = debugfs_create_dir("mce", NULL);
+ if (dmce == NULL)
+ goto err_out;
+ fseverities_coverage = debugfs_create_file("severities-coverage",
+ 0444, dmce, NULL,
+ &severities_coverage_fops);
+ if (fseverities_coverage == NULL)
+ goto err_out;
+
+ return 0;
+
+err_out:
+ if (fseverities_coverage)
+ debugfs_remove(fseverities_coverage);
+ if (dmce)
+ debugfs_remove(dmce);
+ return -ENOMEM;
+}
+late_initcall(severities_debugfs_init);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
new file mode 100644
index 00000000000..fabba15e455
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -0,0 +1,1964 @@
+/*
+ * Machine check handler.
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+#include <linux/thread_info.h>
+#include <linux/capability.h>
+#include <linux/miscdevice.h>
+#include <linux/interrupt.h>
+#include <linux/ratelimit.h>
+#include <linux/kallsyms.h>
+#include <linux/rcupdate.h>
+#include <linux/kobject.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
+#include <linux/sysdev.h>
+#include <linux/delay.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+#include <linux/nmi.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+#include <asm/processor.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/ipi.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+
+#include "mce-internal.h"
+#include "mce.h"
+
+/* Handle unconfigured int18 (should never happen) */
+static void unexpected_machine_check(struct pt_regs *regs, long error_code)
+{
+ printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+ smp_processor_id());
+}
+
+/* Call the installed machine check handler for this CPU setup. */
+void (*machine_check_vector)(struct pt_regs *, long error_code) =
+ unexpected_machine_check;
+
+int mce_disabled;
+
+#ifdef CONFIG_X86_NEW_MCE
+
+#define MISC_MCELOG_MINOR 227
+
+#define SPINUNIT 100 /* 100ns */
+
+atomic_t mce_entry;
+
+DEFINE_PER_CPU(unsigned, mce_exception_count);
+
+/*
+ * Tolerant levels:
+ * 0: always panic on uncorrected errors, log corrected errors
+ * 1: panic or SIGBUS on uncorrected errors, log corrected errors
+ * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
+ * 3: never panic or SIGBUS, log all errors (for testing only)
+ */
+static int tolerant = 1;
+static int banks;
+static u64 *bank;
+static unsigned long notify_user;
+static int rip_msr;
+static int mce_bootlog = -1;
+static int monarch_timeout = -1;
+static int mce_panic_timeout;
+static int mce_dont_log_ce;
+int mce_cmci_disabled;
+int mce_ignore_ce;
+int mce_ser;
+
+static char trigger[128];
+static char *trigger_argv[2] = { trigger, NULL };
+
+static unsigned long dont_init_banks;
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+static DEFINE_PER_CPU(struct mce, mces_seen);
+static int cpu_missing;
+
+
+/* MCA banks polled by the period polling timer for corrected events */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+ [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+static inline int skip_bank_init(int i)
+{
+ return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
+}
+
+static DEFINE_PER_CPU(struct work_struct, mce_work);
+
+/* Do initial initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+ memset(m, 0, sizeof(struct mce));
+ m->cpu = m->extcpu = smp_processor_id();
+ rdtscll(m->tsc);
+ /* We hope get_seconds stays lockless */
+ m->time = get_seconds();
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->cpuid = cpuid_eax(1);
+#ifdef CONFIG_SMP
+ m->socketid = cpu_data(m->extcpu).phys_proc_id;
+#endif
+ m->apicid = cpu_data(m->extcpu).initial_apicid;
+ rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
+}
+
+DEFINE_PER_CPU(struct mce, injectm);
+EXPORT_PER_CPU_SYMBOL_GPL(injectm);
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+static struct mce_log mcelog = {
+ .signature = MCE_LOG_SIGNATURE,
+ .len = MCE_LOG_LEN,
+ .recordlen = sizeof(struct mce),
+};
+
+void mce_log(struct mce *mce)
+{
+ unsigned next, entry;
+
+ mce->finished = 0;
+ wmb();
+ for (;;) {
+ entry = rcu_dereference(mcelog.next);
+ for (;;) {
+ /*
+ * When the buffer fills up discard new entries.
+ * Assume that the earlier errors are the more
+ * interesting ones:
+ */
+ if (entry >= MCE_LOG_LEN) {
+ set_bit(MCE_OVERFLOW,
+ (unsigned long *)&mcelog.flags);
+ return;
+ }
+ /* Old left over entry. Skip: */
+ if (mcelog.entry[entry].finished) {
+ entry++;
+ continue;
+ }
+ break;
+ }
+ smp_rmb();
+ next = entry + 1;
+ if (cmpxchg(&mcelog.next, entry, next) == entry)
+ break;
+ }
+ memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+ wmb();
+ mcelog.entry[entry].finished = 1;
+ wmb();
+
+ mce->finished = 1;
+ set_bit(0, &notify_user);
+}
+
+static void print_mce(struct mce *m)
+{
+ printk(KERN_EMERG
+ "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
+ m->extcpu, m->mcgstatus, m->bank, m->status);
+ if (m->ip) {
+ printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
+ !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
+ m->cs, m->ip);
+ if (m->cs == __KERNEL_CS)
+ print_symbol("{%s}", m->ip);
+ printk("\n");
+ }
+ printk(KERN_EMERG "TSC %llx ", m->tsc);
+ if (m->addr)
+ printk("ADDR %llx ", m->addr);
+ if (m->misc)
+ printk("MISC %llx ", m->misc);
+ printk("\n");
+ printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
+ m->cpuvendor, m->cpuid, m->time, m->socketid,
+ m->apicid);
+}
+
+static void print_mce_head(void)
+{
+ printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
+}
+
+static void print_mce_tail(void)
+{
+ printk(KERN_EMERG "This is not a software problem!\n"
+ KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
+}
+
+#define PANIC_TIMEOUT 5 /* 5 seconds */
+
+static atomic_t mce_paniced;
+
+/* Panic in progress. Enable interrupts and wait for final IPI */
+static void wait_for_panic(void)
+{
+ long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
+ preempt_disable();
+ local_irq_enable();
+ while (timeout-- > 0)
+ udelay(1);
+ if (panic_timeout == 0)
+ panic_timeout = mce_panic_timeout;
+ panic("Panicing machine check CPU died");
+}
+
+static void mce_panic(char *msg, struct mce *final, char *exp)
+{
+ int i;
+
+ /*
+ * Make sure only one CPU runs in machine check panic
+ */
+ if (atomic_add_return(1, &mce_paniced) > 1)
+ wait_for_panic();
+ barrier();
+
+ bust_spinlocks(1);
+ console_verbose();
+ print_mce_head();
+ /* First print corrected ones that are still unlogged */
+ for (i = 0; i < MCE_LOG_LEN; i++) {
+ struct mce *m = &mcelog.entry[i];
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+ if (!(m->status & MCI_STATUS_UC))
+ print_mce(m);
+ }
+ /* Now print uncorrected but with the final one last */
+ for (i = 0; i < MCE_LOG_LEN; i++) {
+ struct mce *m = &mcelog.entry[i];
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+ if (!(m->status & MCI_STATUS_UC))
+ continue;
+ if (!final || memcmp(m, final, sizeof(struct mce)))
+ print_mce(m);
+ }
+ if (final)
+ print_mce(final);
+ if (cpu_missing)
+ printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
+ print_mce_tail();
+ if (exp)
+ printk(KERN_EMERG "Machine check: %s\n", exp);
+ if (panic_timeout == 0)
+ panic_timeout = mce_panic_timeout;
+ panic(msg);
+}
+
+/* Support code for software error injection */
+
+static int msr_to_offset(u32 msr)
+{
+ unsigned bank = __get_cpu_var(injectm.bank);
+ if (msr == rip_msr)
+ return offsetof(struct mce, ip);
+ if (msr == MSR_IA32_MC0_STATUS + bank*4)
+ return offsetof(struct mce, status);
+ if (msr == MSR_IA32_MC0_ADDR + bank*4)
+ return offsetof(struct mce, addr);
+ if (msr == MSR_IA32_MC0_MISC + bank*4)
+ return offsetof(struct mce, misc);
+ if (msr == MSR_IA32_MCG_STATUS)
+ return offsetof(struct mce, mcgstatus);
+ return -1;
+}
+
+/* MSR access wrappers used for error injection */
+static u64 mce_rdmsrl(u32 msr)
+{
+ u64 v;
+ if (__get_cpu_var(injectm).finished) {
+ int offset = msr_to_offset(msr);
+ if (offset < 0)
+ return 0;
+ return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
+ }
+ rdmsrl(msr, v);
+ return v;
+}
+
+static void mce_wrmsrl(u32 msr, u64 v)
+{
+ if (__get_cpu_var(injectm).finished) {
+ int offset = msr_to_offset(msr);
+ if (offset >= 0)
+ *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
+ return;
+ }
+ wrmsrl(msr, v);
+}
+
+/*
+ * Simple lockless ring to communicate PFNs from the exception handler with the
+ * process context work function. This is vastly simplified because there's
+ * only a single reader and a single writer.
+ */
+#define MCE_RING_SIZE 16 /* we use one entry less */
+
+struct mce_ring {
+ unsigned short start;
+ unsigned short end;
+ unsigned long ring[MCE_RING_SIZE];
+};
+static DEFINE_PER_CPU(struct mce_ring, mce_ring);
+
+/* Runs with CPU affinity in workqueue */
+static int mce_ring_empty(void)
+{
+ struct mce_ring *r = &__get_cpu_var(mce_ring);
+
+ return r->start == r->end;
+}
+
+static int mce_ring_get(unsigned long *pfn)
+{
+ struct mce_ring *r;
+ int ret = 0;
+
+ *pfn = 0;
+ get_cpu();
+ r = &__get_cpu_var(mce_ring);
+ if (r->start == r->end)
+ goto out;
+ *pfn = r->ring[r->start];
+ r->start = (r->start + 1) % MCE_RING_SIZE;
+ ret = 1;
+out:
+ put_cpu();
+ return ret;
+}
+
+/* Always runs in MCE context with preempt off */
+static int mce_ring_add(unsigned long pfn)
+{
+ struct mce_ring *r = &__get_cpu_var(mce_ring);
+ unsigned next;
+
+ next = (r->end + 1) % MCE_RING_SIZE;
+ if (next == r->start)
+ return -1;
+ r->ring[r->end] = pfn;
+ wmb();
+ r->end = next;
+ return 0;
+}
+
+int mce_available(struct cpuinfo_x86 *c)
+{
+ if (mce_disabled)
+ return 0;
+ return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
+}
+
+static void mce_schedule_work(void)
+{
+ if (!mce_ring_empty()) {
+ struct work_struct *work = &__get_cpu_var(mce_work);
+ if (!work_pending(work))
+ schedule_work(work);
+ }
+}
+
+/*
+ * Get the address of the instruction at the time of the machine check
+ * error.
+ */
+static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
+{
+
+ if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+ } else {
+ m->ip = 0;
+ m->cs = 0;
+ }
+ if (rip_msr)
+ m->ip = mce_rdmsrl(rip_msr);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Called after interrupts have been reenabled again
+ * when a MCE happened during an interrupts off region
+ * in the kernel.
+ */
+asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
+{
+ ack_APIC_irq();
+ exit_idle();
+ irq_enter();
+ mce_notify_irq();
+ mce_schedule_work();
+ irq_exit();
+}
+#endif
+
+static void mce_report_event(struct pt_regs *regs)
+{
+ if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
+ mce_notify_irq();
+ /*
+ * Triggering the work queue here is just an insurance
+ * policy in case the syscall exit notify handler
+ * doesn't run soon enough or ends up running on the
+ * wrong CPU (can happen when audit sleeps)
+ */
+ mce_schedule_work();
+ return;
+ }
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * Without APIC do not notify. The event will be picked
+ * up eventually.
+ */
+ if (!cpu_has_apic)
+ return;
+
+ /*
+ * When interrupts are disabled we cannot use
+ * kernel services safely. Trigger an self interrupt
+ * through the APIC to instead do the notification
+ * after interrupts are reenabled again.
+ */
+ apic->send_IPI_self(MCE_SELF_VECTOR);
+
+ /*
+ * Wait for idle afterwards again so that we don't leave the
+ * APIC in a non idle state because the normal APIC writes
+ * cannot exclude us.
+ */
+ apic_wait_icr_idle();
+#endif
+}
+
+DEFINE_PER_CPU(unsigned, mce_poll_count);
+
+/*
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ *
+ * Note: spec recommends to panic for fatal unsignalled
+ * errors here. However this would be quite problematic --
+ * we would need to reimplement the Monarch handling and
+ * it would mess up the exclusion between exception handler
+ * and poll hander -- * so we skip this for now.
+ * These cases should not happen anyways, or only when the CPU
+ * is already totally * confused. In this case it's likely it will
+ * not fully execute the machine check handler either.
+ */
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+ struct mce m;
+ int i;
+
+ __get_cpu_var(mce_poll_count)++;
+
+ mce_setup(&m);
+
+ m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ for (i = 0; i < banks; i++) {
+ if (!bank[i] || !test_bit(i, *b))
+ continue;
+
+ m.misc = 0;
+ m.addr = 0;
+ m.bank = i;
+ m.tsc = 0;
+
+ barrier();
+ m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
+ if (!(m.status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * Uncorrected or signalled events are handled by the exception
+ * handler when it is enabled, so don't process those here.
+ *
+ * TBD do the same check for MCI_STATUS_EN here?
+ */
+ if (!(flags & MCP_UC) &&
+ (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
+ continue;
+
+ if (m.status & MCI_STATUS_MISCV)
+ m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
+ if (m.status & MCI_STATUS_ADDRV)
+ m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
+
+ if (!(flags & MCP_TIMESTAMP))
+ m.tsc = 0;
+ /*
+ * Don't get the IP here because it's unlikely to
+ * have anything to do with the actual error location.
+ */
+ if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
+ mce_log(&m);
+ add_taint(TAINT_MACHINE_CHECK);
+ }
+
+ /*
+ * Clear state for this bank.
+ */
+ mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
+
+ /*
+ * Don't clear MCG_STATUS here because it's only defined for
+ * exceptions.
+ */
+
+ sync_core();
+}
+EXPORT_SYMBOL_GPL(machine_check_poll);
+
+/*
+ * Do a quick check if any of the events requires a panic.
+ * This decides if we keep the events around or clear them.
+ */
+static int mce_no_way_out(struct mce *m, char **msg)
+{
+ int i;
+
+ for (i = 0; i < banks; i++) {
+ m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
+ if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins initially until executing is equal its number.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines order of CPUs on entry. First CPU becomes Monarch.
+ */
+static atomic_t mce_callin;
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ */
+static int mce_timed_out(u64 *t)
+{
+ /*
+ * The others already did panic for some reason.
+ * Bail out like in a timeout.
+ * rmb() to tell the compiler that system_state
+ * might have been modified by someone else.
+ */
+ rmb();
+ if (atomic_read(&mce_paniced))
+ wait_for_panic();
+ if (!monarch_timeout)
+ goto out;
+ if ((s64)*t < SPINUNIT) {
+ /* CHECKME: Make panic default for 1 too? */
+ if (tolerant < 1)
+ mce_panic("Timeout synchronizing machine check over CPUs",
+ NULL, NULL);
+ cpu_missing = 1;
+ return 1;
+ }
+ *t -= SPINUNIT;
+out:
+ touch_nmi_watchdog();
+ return 0;
+}
+
+/*
+ * The Monarch's reign. The Monarch is the CPU who entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. When any
+ * error is fatal panic. Only then let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in a unrecoverable case
+ * and also makes sure always all CPU's errors are examined.
+ *
+ * Also this detects the case of an machine check event coming from outer
+ * space (not detected by any CPUs) In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in a unrecoverable place, but in this case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's ok to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+ int cpu;
+ struct mce *m = NULL;
+ int global_worst = 0;
+ char *msg = NULL;
+ char *nmsg = NULL;
+
+ /*
+ * This CPU is the Monarch and the other CPUs have run
+ * through their handlers.
+ * Grade the severity of the errors of all the CPUs.
+ */
+ for_each_possible_cpu(cpu) {
+ int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
+ &nmsg);
+ if (severity > global_worst) {
+ msg = nmsg;
+ global_worst = severity;
+ m = &per_cpu(mces_seen, cpu);
+ }
+ }
+
+ /*
+ * Cannot recover? Panic here then.
+ * This dumps all the mces in the log buffer and stops the
+ * other CPUs.
+ */
+ if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
+ mce_panic("Fatal Machine check", m, msg);
+
+ /*
+ * For UC somewhere we let the CPU who detects it handle it.
+ * Also must let continue the others, otherwise the handling
+ * CPU could deadlock on a lock.
+ */
+
+ /*
+ * No machine check event found. Must be some external
+ * source or one CPU is hung. Panic.
+ */
+ if (!m && tolerant < 3)
+ mce_panic("Machine check from unknown source", NULL, NULL);
+
+ /*
+ * Now clear all the mces_seen so that they don't reappear on
+ * the next mce.
+ */
+ for_each_possible_cpu(cpu)
+ memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+}
+
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines if any of them
+ * saw a fatal event that requires panic. Then it executes them
+ * in the entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static int mce_start(int no_way_out, int *order)
+{
+ int nwo;
+ int cpus = num_online_cpus();
+ u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+ if (!timeout) {
+ *order = -1;
+ return no_way_out;
+ }
+
+ atomic_add(no_way_out, &global_nwo);
+
+ /*
+ * Wait for everyone.
+ */
+ while (atomic_read(&mce_callin) != cpus) {
+ if (mce_timed_out(&timeout)) {
+ atomic_set(&global_nwo, 0);
+ *order = -1;
+ return no_way_out;
+ }
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * Cache the global no_way_out state.
+ */
+ nwo = atomic_read(&global_nwo);
+
+ /*
+ * Monarch starts executing now, the others wait.
+ */
+ if (*order == 1) {
+ atomic_set(&mce_executing, 1);
+ return nwo;
+ }
+
+ /*
+ * Now start the scanning loop one by one
+ * in the original callin order.
+ * This way when there are any shared banks it will
+ * be only seen by one CPU before cleared, avoiding duplicates.
+ */
+ while (atomic_read(&mce_executing) < *order) {
+ if (mce_timed_out(&timeout)) {
+ atomic_set(&global_nwo, 0);
+ *order = -1;
+ return no_way_out;
+ }
+ ndelay(SPINUNIT);
+ }
+ return nwo;
+}
+
+/*
+ * Synchronize between CPUs after main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ */
+static int mce_end(int order)
+{
+ int ret = -1;
+ u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+ if (!timeout)
+ goto reset;
+ if (order < 0)
+ goto reset;
+
+ /*
+ * Allow others to run.
+ */
+ atomic_inc(&mce_executing);
+
+ if (order == 1) {
+ /* CHECKME: Can this race with a parallel hotplug? */
+ int cpus = num_online_cpus();
+
+ /*
+ * Monarch: Wait for everyone to go through their scanning
+ * loops.
+ */
+ while (atomic_read(&mce_executing) <= cpus) {
+ if (mce_timed_out(&timeout))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ mce_reign();
+ barrier();
+ ret = 0;
+ } else {
+ /*
+ * Subject: Wait for Monarch to finish.
+ */
+ while (atomic_read(&mce_executing) != 0) {
+ if (mce_timed_out(&timeout))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * Don't reset anything. That's done by the Monarch.
+ */
+ return 0;
+ }
+
+ /*
+ * Reset all global state.
+ */
+reset:
+ atomic_set(&global_nwo, 0);
+ atomic_set(&mce_callin, 0);
+ barrier();
+
+ /*
+ * Let others run again.
+ */
+ atomic_set(&mce_executing, 0);
+ return ret;
+}
+
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses upto page granuality for now.
+ */
+static int mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
+ return 0;
+ if ((m->misc & 0x3f) > PAGE_SHIFT)
+ return 0;
+ if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
+ return 0;
+ return 1;
+}
+
+static void mce_clear_state(unsigned long *toclear)
+{
+ int i;
+
+ for (i = 0; i < banks; i++) {
+ if (test_bit(i, toclear))
+ mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However some CPUs might be broken beyond repair,
+ * so be always careful when synchronizing with others.
+ */
+void do_machine_check(struct pt_regs *regs, long error_code)
+{
+ struct mce m, *final;
+ int i;
+ int worst = 0;
+ int severity;
+ /*
+ * Establish sequential order between the CPUs entering the machine
+ * check handler.
+ */
+ int order;
+
+ /*
+ * If no_way_out gets set, there is no safe way to recover from this
+ * MCE. If tolerant is cranked up, we'll try anyway.
+ */
+ int no_way_out = 0;
+ /*
+ * If kill_it gets set, there might be a way to recover from this
+ * error.
+ */
+ int kill_it = 0;
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+ char *msg = "Unknown";
+
+ atomic_inc(&mce_entry);
+
+ __get_cpu_var(mce_exception_count)++;
+
+ if (notify_die(DIE_NMI, "machine check", regs, error_code,
+ 18, SIGKILL) == NOTIFY_STOP)
+ goto out;
+ if (!banks)
+ goto out;
+
+ order = atomic_add_return(1, &mce_callin);
+ mce_setup(&m);
+
+ m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ no_way_out = mce_no_way_out(&m, &msg);
+
+ final = &__get_cpu_var(mces_seen);
+ *final = m;
+
+ barrier();
+
+ /*
+ * When no restart IP must always kill or panic.
+ */
+ if (!(m.mcgstatus & MCG_STATUS_RIPV))
+ kill_it = 1;
+
+ /*
+ * Go through all the banks in exclusion of the other CPUs.
+ * This way we don't report duplicated events on shared banks
+ * because the first one to see it will clear it.
+ */
+ no_way_out = mce_start(no_way_out, &order);
+ for (i = 0; i < banks; i++) {
+ __clear_bit(i, toclear);
+ if (!bank[i])
+ continue;
+
+ m.misc = 0;
+ m.addr = 0;
+ m.bank = i;
+
+ m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
+ if ((m.status & MCI_STATUS_VAL) == 0)
+ continue;
+
+ /*
+ * Non uncorrected or non signaled errors are handled by
+ * machine_check_poll. Leave them alone, unless this panics.
+ */
+ if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+ !no_way_out)
+ continue;
+
+ /*
+ * Set taint even when machine check was not enabled.
+ */
+ add_taint(TAINT_MACHINE_CHECK);
+
+ severity = mce_severity(&m, tolerant, NULL);
+
+ /*
+ * When machine check was for corrected handler don't touch,
+ * unless we're panicing.
+ */
+ if (severity == MCE_KEEP_SEVERITY && !no_way_out)
+ continue;
+ __set_bit(i, toclear);
+ if (severity == MCE_NO_SEVERITY) {
+ /*
+ * Machine check event was not enabled. Clear, but
+ * ignore.
+ */
+ continue;
+ }
+
+ /*
+ * Kill on action required.
+ */
+ if (severity == MCE_AR_SEVERITY)
+ kill_it = 1;
+
+ if (m.status & MCI_STATUS_MISCV)
+ m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
+ if (m.status & MCI_STATUS_ADDRV)
+ m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
+
+ /*
+ * Action optional error. Queue address for later processing.
+ * When the ring overflows we just ignore the AO error.
+ * RED-PEN add some logging mechanism when
+ * usable_address or mce_add_ring fails.
+ * RED-PEN don't ignore overflow for tolerant == 0
+ */
+ if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
+ mce_ring_add(m.addr >> PAGE_SHIFT);
+
+ mce_get_rip(&m, regs);
+ mce_log(&m);
+
+ if (severity > worst) {
+ *final = m;
+ worst = severity;
+ }
+ }
+
+ if (!no_way_out)
+ mce_clear_state(toclear);
+
+ /*
+ * Do most of the synchronization with other CPUs.
+ * When there's any problem use only local no_way_out state.
+ */
+ if (mce_end(order) < 0)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
+
+ /*
+ * If we have decided that we just CAN'T continue, and the user
+ * has not set tolerant to an insane level, give up and die.
+ *
+ * This is mainly used in the case when the system doesn't
+ * support MCE broadcasting or it has been disabled.
+ */
+ if (no_way_out && tolerant < 3)
+ mce_panic("Fatal machine check on current CPU", final, msg);
+
+ /*
+ * If the error seems to be unrecoverable, something should be
+ * done. Try to kill as little as possible. If we can kill just
+ * one task, do that. If the user has set the tolerance very
+ * high, don't try to do anything at all.
+ */
+
+ if (kill_it && tolerant < 3)
+ force_sig(SIGBUS, current);
+
+ /* notify userspace ASAP */
+ set_thread_flag(TIF_MCE_NOTIFY);
+
+ if (worst > 0)
+ mce_report_event(regs);
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+out:
+ atomic_dec(&mce_entry);
+ sync_core();
+}
+EXPORT_SYMBOL_GPL(do_machine_check);
+
+/* dummy to break dependency. actual code is in mm/memory-failure.c */
+void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+{
+ printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+}
+
+/*
+ * Called after mce notification in process context. This code
+ * is allowed to sleep. Call the high level VM handler to process
+ * any corrupted pages.
+ * Assume that the work queue code only calls this one at a time
+ * per CPU.
+ * Note we don't disable preemption, so this code might run on the wrong
+ * CPU. In this case the event is picked up by the scheduled work queue.
+ * This is merely a fast path to expedite processing in some common
+ * cases.
+ */
+void mce_notify_process(void)
+{
+ unsigned long pfn;
+ mce_notify_irq();
+ while (mce_ring_get(&pfn))
+ memory_failure(pfn, MCE_VECTOR);
+}
+
+static void mce_process_work(struct work_struct *dummy)
+{
+ mce_notify_process();
+}
+
+#ifdef CONFIG_X86_MCE_INTEL
+/***
+ * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
+ * @cpu: The CPU on which the event occurred.
+ * @status: Event status information
+ *
+ * This function should be called by the thermal interrupt after the
+ * event has been processed and the decision was made to log the event
+ * further.
+ *
+ * The status parameter will be saved to the 'status' field of 'struct mce'
+ * and historically has been the register value of the
+ * MSR_IA32_THERMAL_STATUS (Intel) msr.
+ */
+void mce_log_therm_throt_event(__u64 status)
+{
+ struct mce m;
+
+ mce_setup(&m);
+ m.bank = MCE_THERMAL_BANK;
+ m.status = status;
+ mce_log(&m);
+}
+#endif /* CONFIG_X86_MCE_INTEL */
+
+/*
+ * Periodic polling timer for "silent" machine check errors. If the
+ * poller finds an MCE, poll 2x faster. When the poller finds no more
+ * errors, poll 2x slower (up to check_interval seconds).
+ */
+static int check_interval = 5 * 60; /* 5 minutes */
+
+static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
+
+static void mcheck_timer(unsigned long data)
+{
+ struct timer_list *t = &per_cpu(mce_timer, data);
+ int *n;
+
+ WARN_ON(smp_processor_id() != data);
+
+ if (mce_available(&current_cpu_data)) {
+ machine_check_poll(MCP_TIMESTAMP,
+ &__get_cpu_var(mce_poll_banks));
+ }
+
+ /*
+ * Alert userspace if needed. If we logged an MCE, reduce the
+ * polling interval, otherwise increase the polling interval.
+ */
+ n = &__get_cpu_var(next_interval);
+ if (mce_notify_irq())
+ *n = max(*n/2, HZ/100);
+ else
+ *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
+
+ t->expires = jiffies + *n;
+ add_timer(t);
+}
+
+static void mce_do_trigger(struct work_struct *work)
+{
+ call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+/*
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
+ */
+int mce_notify_irq(void)
+{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
+ clear_thread_flag(TIF_MCE_NOTIFY);
+
+ if (test_and_clear_bit(0, &notify_user)) {
+ wake_up_interruptible(&mce_wait);
+
+ /*
+ * There is no risk of missing notifications because
+ * work_pending is always cleared before the function is
+ * executed.
+ */
+ if (trigger[0] && !work_pending(&mce_trigger_work))
+ schedule_work(&mce_trigger_work);
+
+ if (__ratelimit(&ratelimit))
+ printk(KERN_INFO "Machine check events logged\n");
+
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mce_notify_irq);
+
+/*
+ * Initialize Machine Checks for a CPU.
+ */
+static int mce_cap_init(void)
+{
+ unsigned b;
+ u64 cap;
+
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+
+ b = cap & MCG_BANKCNT_MASK;
+ printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
+
+ if (b > MAX_NR_BANKS) {
+ printk(KERN_WARNING
+ "MCE: Using only %u machine check banks out of %u\n",
+ MAX_NR_BANKS, b);
+ b = MAX_NR_BANKS;
+ }
+
+ /* Don't support asymmetric configurations today */
+ WARN_ON(banks != 0 && b != banks);
+ banks = b;
+ if (!bank) {
+ bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
+ if (!bank)
+ return -ENOMEM;
+ memset(bank, 0xff, banks * sizeof(u64));
+ }
+
+ /* Use accurate RIP reporting if available. */
+ if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
+ rip_msr = MSR_IA32_MCG_EIP;
+
+ if (cap & MCG_SER_P)
+ mce_ser = 1;
+
+ return 0;
+}
+
+static void mce_init(void)
+{
+ mce_banks_t all_banks;
+ u64 cap;
+ int i;
+
+ /*
+ * Log the machine checks left over from the previous reset.
+ */
+ bitmap_fill(all_banks, MAX_NR_BANKS);
+ machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
+
+ set_in_cr4(X86_CR4_MCE);
+
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+ if (cap & MCG_CTL_P)
+ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+
+ for (i = 0; i < banks; i++) {
+ if (skip_bank_init(i))
+ continue;
+ wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+ wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
+}
+
+/* Add per CPU specific workarounds here */
+static void mce_cpu_quirks(struct cpuinfo_x86 *c)
+{
+ /* This should be disabled by the BIOS, but isn't always */
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (c->x86 == 15 && banks > 4) {
+ /*
+ * disable GART TBL walk error reporting, which
+ * trips off incorrectly with the IOMMU & 3ware
+ * & Cerberus:
+ */
+ clear_bit(10, (unsigned long *)&bank[4]);
+ }
+ if (c->x86 <= 17 && mce_bootlog < 0) {
+ /*
+ * Lots of broken BIOS around that don't clear them
+ * by default and leave crap in there. Don't log:
+ */
+ mce_bootlog = 0;
+ }
+ /*
+ * Various K7s with broken bank 0 around. Always disable
+ * by default.
+ */
+ if (c->x86 == 6)
+ bank[0] = 0;
+ }
+
+ if (c->x86_vendor == X86_VENDOR_INTEL) {
+ /*
+ * SDM documents that on family 6 bank 0 should not be written
+ * because it aliases to another special BIOS controlled
+ * register.
+ * But it's not aliased anymore on model 0x1a+
+ * Don't ignore bank 0 completely because there could be a
+ * valid event later, merely don't write CTL0.
+ */
+
+ if (c->x86 == 6 && c->x86_model < 0x1A)
+ __set_bit(0, &dont_init_banks);
+
+ /*
+ * All newer Intel systems support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
+ monarch_timeout < 0)
+ monarch_timeout = USEC_PER_SEC;
+ }
+ if (monarch_timeout < 0)
+ monarch_timeout = 0;
+ if (mce_bootlog != 0)
+ mce_panic_timeout = 30;
+}
+
+static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
+{
+ if (c->x86 != 5)
+ return;
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (mce_p5_enabled())
+ intel_p5_mcheck_init(c);
+ break;
+ case X86_VENDOR_CENTAUR:
+ winchip_mcheck_init(c);
+ break;
+ }
+}
+
+static void mce_cpu_features(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_init(c);
+ break;
+ case X86_VENDOR_AMD:
+ mce_amd_feature_init(c);
+ break;
+ default:
+ break;
+ }
+}
+
+static void mce_init_timer(void)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ int *n = &__get_cpu_var(next_interval);
+
+ if (mce_ignore_ce)
+ return;
+
+ *n = check_interval * HZ;
+ if (!*n)
+ return;
+ setup_timer(t, mcheck_timer, smp_processor_id());
+ t->expires = round_jiffies(jiffies + *n);
+ add_timer(t);
+}
+
+/*
+ * Called for each booted CPU to set up machine checks.
+ * Must be called with preempt off:
+ */
+void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
+{
+ if (mce_disabled)
+ return;
+
+ mce_ancient_init(c);
+
+ if (!mce_available(c))
+ return;
+
+ if (mce_cap_init() < 0) {
+ mce_disabled = 1;
+ return;
+ }
+ mce_cpu_quirks(c);
+
+ machine_check_vector = do_machine_check;
+
+ mce_init();
+ mce_cpu_features(c);
+ mce_init_timer();
+ INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
+}
+
+/*
+ * Character device to read and clear the MCE log.
+ */
+
+static DEFINE_SPINLOCK(mce_state_lock);
+static int open_count; /* #times opened */
+static int open_exclu; /* already open exclusive? */
+
+static int mce_open(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_state_lock);
+
+ if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_state_lock);
+
+ return -EBUSY;
+ }
+
+ if (file->f_flags & O_EXCL)
+ open_exclu = 1;
+ open_count++;
+
+ spin_unlock(&mce_state_lock);
+
+ return nonseekable_open(inode, file);
+}
+
+static int mce_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_state_lock);
+
+ open_count--;
+ open_exclu = 0;
+
+ spin_unlock(&mce_state_lock);
+
+ return 0;
+}
+
+static void collect_tscs(void *data)
+{
+ unsigned long *cpu_tsc = (unsigned long *)data;
+
+ rdtscll(cpu_tsc[smp_processor_id()]);
+}
+
+static DEFINE_MUTEX(mce_read_mutex);
+
+static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
+ loff_t *off)
+{
+ char __user *buf = ubuf;
+ unsigned long *cpu_tsc;
+ unsigned prev, next;
+ int i, err;
+
+ cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
+ if (!cpu_tsc)
+ return -ENOMEM;
+
+ mutex_lock(&mce_read_mutex);
+ next = rcu_dereference(mcelog.next);
+
+ /* Only supports full reads right now */
+ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
+ mutex_unlock(&mce_read_mutex);
+ kfree(cpu_tsc);
+
+ return -EINVAL;
+ }
+
+ err = 0;
+ prev = 0;
+ do {
+ for (i = prev; i < next; i++) {
+ unsigned long start = jiffies;
+
+ while (!mcelog.entry[i].finished) {
+ if (time_after_eq(jiffies, start + 2)) {
+ memset(mcelog.entry + i, 0,
+ sizeof(struct mce));
+ goto timeout;
+ }
+ cpu_relax();
+ }
+ smp_rmb();
+ err |= copy_to_user(buf, mcelog.entry + i,
+ sizeof(struct mce));
+ buf += sizeof(struct mce);
+timeout:
+ ;
+ }
+
+ memset(mcelog.entry + prev, 0,
+ (next - prev) * sizeof(struct mce));
+ prev = next;
+ next = cmpxchg(&mcelog.next, prev, 0);
+ } while (next != prev);
+
+ synchronize_sched();
+
+ /*
+ * Collect entries that were still getting written before the
+ * synchronize.
+ */
+ on_each_cpu(collect_tscs, cpu_tsc, 1);
+
+ for (i = next; i < MCE_LOG_LEN; i++) {
+ if (mcelog.entry[i].finished &&
+ mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
+ err |= copy_to_user(buf, mcelog.entry+i,
+ sizeof(struct mce));
+ smp_rmb();
+ buf += sizeof(struct mce);
+ memset(&mcelog.entry[i], 0, sizeof(struct mce));
+ }
+ }
+ mutex_unlock(&mce_read_mutex);
+ kfree(cpu_tsc);
+
+ return err ? -EFAULT : buf - ubuf;
+}
+
+static unsigned int mce_poll(struct file *file, poll_table *wait)
+{
+ poll_wait(file, &mce_wait, wait);
+ if (rcu_dereference(mcelog.next))
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
+static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+ int __user *p = (int __user *)arg;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case MCE_GET_RECORD_LEN:
+ return put_user(sizeof(struct mce), p);
+ case MCE_GET_LOG_LEN:
+ return put_user(MCE_LOG_LEN, p);
+ case MCE_GETCLEAR_FLAGS: {
+ unsigned flags;
+
+ do {
+ flags = mcelog.flags;
+ } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+
+ return put_user(flags, p);
+ }
+ default:
+ return -ENOTTY;
+ }
+}
+
+/* Modified in mce-inject.c, so not static or const */
+struct file_operations mce_chrdev_ops = {
+ .open = mce_open,
+ .release = mce_release,
+ .read = mce_read,
+ .poll = mce_poll,
+ .unlocked_ioctl = mce_ioctl,
+};
+EXPORT_SYMBOL_GPL(mce_chrdev_ops);
+
+static struct miscdevice mce_log_device = {
+ MISC_MCELOG_MINOR,
+ "mcelog",
+ &mce_chrdev_ops,
+};
+
+/*
+ * mce=off Disables machine check
+ * mce=no_cmci Disables CMCI
+ * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
+ * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
+ * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
+ * monarchtimeout is how long to wait for other CPUs on machine
+ * check, or 0 to not wait
+ * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
+ * mce=nobootlog Don't log MCEs from before booting.
+ */
+static int __init mcheck_enable(char *str)
+{
+ if (*str == 0)
+ enable_p5_mce();
+ if (*str == '=')
+ str++;
+ if (!strcmp(str, "off"))
+ mce_disabled = 1;
+ else if (!strcmp(str, "no_cmci"))
+ mce_cmci_disabled = 1;
+ else if (!strcmp(str, "dont_log_ce"))
+ mce_dont_log_ce = 1;
+ else if (!strcmp(str, "ignore_ce"))
+ mce_ignore_ce = 1;
+ else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
+ mce_bootlog = (str[0] == 'b');
+ else if (isdigit(str[0])) {
+ get_option(&str, &tolerant);
+ if (*str == ',') {
+ ++str;
+ get_option(&str, &monarch_timeout);
+ }
+ } else {
+ printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
+ str);
+ return 0;
+ }
+ return 1;
+}
+__setup("mce", mcheck_enable);
+
+/*
+ * Sysfs support
+ */
+
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static int mce_disable(void)
+{
+ int i;
+
+ for (i = 0; i < banks; i++) {
+ if (!skip_bank_init(i))
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+ }
+ return 0;
+}
+
+static int mce_suspend(struct sys_device *dev, pm_message_t state)
+{
+ return mce_disable();
+}
+
+static int mce_shutdown(struct sys_device *dev)
+{
+ return mce_disable();
+}
+
+/*
+ * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
+ * Only one CPU is active at this time, the others get re-added later using
+ * CPU hotplug:
+ */
+static int mce_resume(struct sys_device *dev)
+{
+ mce_init();
+ mce_cpu_features(&current_cpu_data);
+
+ return 0;
+}
+
+static void mce_cpu_restart(void *data)
+{
+ del_timer_sync(&__get_cpu_var(mce_timer));
+ if (mce_available(&current_cpu_data))
+ mce_init();
+ mce_init_timer();
+}
+
+/* Reinit MCEs after user configuration changes */
+static void mce_restart(void)
+{
+ on_each_cpu(mce_cpu_restart, NULL, 1);
+}
+
+static struct sysdev_class mce_sysclass = {
+ .suspend = mce_suspend,
+ .shutdown = mce_shutdown,
+ .resume = mce_resume,
+ .name = "machinecheck",
+};
+
+DEFINE_PER_CPU(struct sys_device, mce_dev);
+
+__cpuinitdata
+void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
+
+static struct sysdev_attribute *bank_attrs;
+
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+ char *buf)
+{
+ u64 b = bank[attr - bank_attrs];
+
+ return sprintf(buf, "%llx\n", b);
+}
+
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+ const char *buf, size_t size)
+{
+ u64 new;
+
+ if (strict_strtoull(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ bank[attr - bank_attrs] = new;
+ mce_restart();
+
+ return size;
+}
+
+static ssize_t
+show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
+{
+ strcpy(buf, trigger);
+ strcat(buf, "\n");
+ return strlen(trigger) + 1;
+}
+
+static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+ const char *buf, size_t siz)
+{
+ char *p;
+ int len;
+
+ strncpy(trigger, buf, sizeof(trigger));
+ trigger[sizeof(trigger)-1] = 0;
+ len = strlen(trigger);
+ p = strchr(trigger, '\n');
+
+ if (*p)
+ *p = 0;
+
+ return len;
+}
+
+static ssize_t store_int_with_restart(struct sys_device *s,
+ struct sysdev_attribute *attr,
+ const char *buf, size_t size)
+{
+ ssize_t ret = sysdev_store_int(s, attr, buf, size);
+ mce_restart();
+ return ret;
+}
+
+static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
+static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
+static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
+
+static struct sysdev_ext_attribute attr_check_interval = {
+ _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
+ store_int_with_restart),
+ &check_interval
+};
+
+static struct sysdev_attribute *mce_attrs[] = {
+ &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
+ &attr_monarch_timeout.attr,
+ NULL
+};
+
+static cpumask_var_t mce_dev_initialized;
+
+/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
+static __cpuinit int mce_create_device(unsigned int cpu)
+{
+ int err;
+ int i;
+
+ if (!mce_available(&boot_cpu_data))
+ return -EIO;
+
+ memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
+ per_cpu(mce_dev, cpu).id = cpu;
+ per_cpu(mce_dev, cpu).cls = &mce_sysclass;
+
+ err = sysdev_register(&per_cpu(mce_dev, cpu));
+ if (err)
+ return err;
+
+ for (i = 0; mce_attrs[i]; i++) {
+ err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+ if (err)
+ goto error;
+ }
+ for (i = 0; i < banks; i++) {
+ err = sysdev_create_file(&per_cpu(mce_dev, cpu),
+ &bank_attrs[i]);
+ if (err)
+ goto error2;
+ }
+ cpumask_set_cpu(cpu, mce_dev_initialized);
+
+ return 0;
+error2:
+ while (--i >= 0)
+ sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
+error:
+ while (--i >= 0)
+ sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+
+ sysdev_unregister(&per_cpu(mce_dev, cpu));
+
+ return err;
+}
+
+static __cpuinit void mce_remove_device(unsigned int cpu)
+{
+ int i;
+
+ if (!cpumask_test_cpu(cpu, mce_dev_initialized))
+ return;
+
+ for (i = 0; mce_attrs[i]; i++)
+ sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+
+ for (i = 0; i < banks; i++)
+ sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
+
+ sysdev_unregister(&per_cpu(mce_dev, cpu));
+ cpumask_clear_cpu(cpu, mce_dev_initialized);
+}
+
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void *h)
+{
+ unsigned long action = *(unsigned long *)h;
+ int i;
+
+ if (!mce_available(&current_cpu_data))
+ return;
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_clear();
+ for (i = 0; i < banks; i++) {
+ if (!skip_bank_init(i))
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+ }
+}
+
+static void mce_reenable_cpu(void *h)
+{
+ unsigned long action = *(unsigned long *)h;
+ int i;
+
+ if (!mce_available(&current_cpu_data))
+ return;
+
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_reenable();
+ for (i = 0; i < banks; i++) {
+ if (!skip_bank_init(i))
+ wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+ }
+}
+
+/* Get notified when a cpu comes on/off. Be hotplug friendly. */
+static int __cpuinit
+mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ struct timer_list *t = &per_cpu(mce_timer, cpu);
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ mce_create_device(cpu);
+ if (threshold_cpu_callback)
+ threshold_cpu_callback(action, cpu);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ if (threshold_cpu_callback)
+ threshold_cpu_callback(action, cpu);
+ mce_remove_device(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ del_timer_sync(t);
+ smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+ break;
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ t->expires = round_jiffies(jiffies +
+ __get_cpu_var(next_interval));
+ add_timer_on(t, cpu);
+ smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+ break;
+ case CPU_POST_DEAD:
+ /* intentionally ignoring frozen here */
+ cmci_rediscover(cpu);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block mce_cpu_notifier __cpuinitdata = {
+ .notifier_call = mce_cpu_callback,
+};
+
+static __init int mce_init_banks(void)
+{
+ int i;
+
+ bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
+ GFP_KERNEL);
+ if (!bank_attrs)
+ return -ENOMEM;
+
+ for (i = 0; i < banks; i++) {
+ struct sysdev_attribute *a = &bank_attrs[i];
+
+ a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+ if (!a->attr.name)
+ goto nomem;
+
+ a->attr.mode = 0644;
+ a->show = show_bank;
+ a->store = set_bank;
+ }
+ return 0;
+
+nomem:
+ while (--i >= 0)
+ kfree(bank_attrs[i].attr.name);
+ kfree(bank_attrs);
+ bank_attrs = NULL;
+
+ return -ENOMEM;
+}
+
+static __init int mce_init_device(void)
+{
+ int err;
+ int i = 0;
+
+ if (!mce_available(&boot_cpu_data))
+ return -EIO;
+
+ alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
+
+ err = mce_init_banks();
+ if (err)
+ return err;
+
+ err = sysdev_class_register(&mce_sysclass);
+ if (err)
+ return err;
+
+ for_each_online_cpu(i) {
+ err = mce_create_device(i);
+ if (err)
+ return err;
+ }
+
+ register_hotcpu_notifier(&mce_cpu_notifier);
+ misc_register(&mce_log_device);
+
+ return err;
+}
+
+device_initcall(mce_init_device);
+
+#else /* CONFIG_X86_OLD_MCE: */
+
+int nr_mce_banks;
+EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
+
+/* This has to be run for each processor */
+void mcheck_init(struct cpuinfo_x86 *c)
+{
+ if (mce_disabled == 1)
+ return;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ amd_mcheck_init(c);
+ break;
+
+ case X86_VENDOR_INTEL:
+ if (c->x86 == 5)
+ intel_p5_mcheck_init(c);
+ if (c->x86 == 6)
+ intel_p6_mcheck_init(c);
+ if (c->x86 == 15)
+ intel_p4_mcheck_init(c);
+ break;
+
+ case X86_VENDOR_CENTAUR:
+ if (c->x86 == 5)
+ winchip_mcheck_init(c);
+ break;
+
+ default:
+ break;
+ }
+ printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
+}
+
+static int __init mcheck_enable(char *str)
+{
+ mce_disabled = -1;
+ return 1;
+}
+
+__setup("mce", mcheck_enable);
+
+#endif /* CONFIG_X86_OLD_MCE */
+
+/*
+ * Old style boot options parsing. Only for compatibility.
+ */
+static int __init mcheck_disable(char *str)
+{
+ mce_disabled = 1;
+ return 1;
+}
+__setup("nomce", mcheck_disable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
index ae9f628838f..84a552b458c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ b/arch/x86/kernel/cpu/mcheck/mce.h
@@ -1,14 +1,38 @@
#include <linux/init.h>
#include <asm/mce.h>
+#ifdef CONFIG_X86_OLD_MCE
void amd_mcheck_init(struct cpuinfo_x86 *c);
void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
+#endif
+
+#ifdef CONFIG_X86_ANCIENT_MCE
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
void winchip_mcheck_init(struct cpuinfo_x86 *c);
+extern int mce_p5_enable;
+static inline int mce_p5_enabled(void) { return mce_p5_enable; }
+static inline void enable_p5_mce(void) { mce_p5_enable = 1; }
+#else
+static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline int mce_p5_enabled(void) { return 0; }
+static inline void enable_p5_mce(void) { }
+#endif
/* Call the installed machine check handler for this CPU setup. */
extern void (*machine_check_vector)(struct pt_regs *, long error_code);
+#ifdef CONFIG_X86_OLD_MCE
+
extern int nr_mce_banks;
+void intel_set_thermal_handler(void);
+
+#else
+
+static inline void intel_set_thermal_handler(void) { }
+
+#endif
+
+void intel_init_thermal(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
deleted file mode 100644
index 3552119b091..00000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * mce.c - x86 Machine Check Exception Reporting
- * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com>
- */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/smp.h>
-#include <linux/thread_info.h>
-
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/mce.h>
-
-#include "mce.h"
-
-int mce_disabled;
-int nr_mce_banks;
-
-EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
-
-/* Handle unconfigured int18 (should never happen) */
-static void unexpected_machine_check(struct pt_regs *regs, long error_code)
-{
- printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
-}
-
-/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
-
-/* This has to be run for each processor */
-void mcheck_init(struct cpuinfo_x86 *c)
-{
- if (mce_disabled == 1)
- return;
-
- switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- amd_mcheck_init(c);
- break;
-
- case X86_VENDOR_INTEL:
- if (c->x86 == 5)
- intel_p5_mcheck_init(c);
- if (c->x86 == 6)
- intel_p6_mcheck_init(c);
- if (c->x86 == 15)
- intel_p4_mcheck_init(c);
- break;
-
- case X86_VENDOR_CENTAUR:
- if (c->x86 == 5)
- winchip_mcheck_init(c);
- break;
-
- default:
- break;
- }
-}
-
-static int __init mcheck_disable(char *str)
-{
- mce_disabled = 1;
- return 1;
-}
-
-static int __init mcheck_enable(char *str)
-{
- mce_disabled = -1;
- return 1;
-}
-
-__setup("nomce", mcheck_disable);
-__setup("mce", mcheck_enable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
deleted file mode 100644
index 6fb0b359d2a..00000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ /dev/null
@@ -1,1187 +0,0 @@
-/*
- * Machine check handler.
- * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
- * Rest from unknown author(s).
- * 2004 Andi Kleen. Rewrote most of it.
- * Copyright 2008 Intel Corporation
- * Author: Andi Kleen
- */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/smp_lock.h>
-#include <linux/string.h>
-#include <linux/rcupdate.h>
-#include <linux/kallsyms.h>
-#include <linux/sysdev.h>
-#include <linux/miscdevice.h>
-#include <linux/fs.h>
-#include <linux/capability.h>
-#include <linux/cpu.h>
-#include <linux/percpu.h>
-#include <linux/poll.h>
-#include <linux/thread_info.h>
-#include <linux/ctype.h>
-#include <linux/kmod.h>
-#include <linux/kdebug.h>
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
-#include <linux/ratelimit.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/mce.h>
-#include <asm/uaccess.h>
-#include <asm/smp.h>
-#include <asm/idle.h>
-
-#define MISC_MCELOG_MINOR 227
-
-atomic_t mce_entry;
-
-static int mce_dont_init;
-
-/*
- * Tolerant levels:
- * 0: always panic on uncorrected errors, log corrected errors
- * 1: panic or SIGBUS on uncorrected errors, log corrected errors
- * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
- * 3: never panic or SIGBUS, log all errors (for testing only)
- */
-static int tolerant = 1;
-static int banks;
-static u64 *bank;
-static unsigned long notify_user;
-static int rip_msr;
-static int mce_bootlog = -1;
-static atomic_t mce_events;
-
-static char trigger[128];
-static char *trigger_argv[2] = { trigger, NULL };
-
-static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
-
-/* MCA banks polled by the period polling timer for corrected events */
-DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
- [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
-};
-
-/* Do initial initialization of a struct mce */
-void mce_setup(struct mce *m)
-{
- memset(m, 0, sizeof(struct mce));
- m->cpu = smp_processor_id();
- rdtscll(m->tsc);
-}
-
-/*
- * Lockless MCE logging infrastructure.
- * This avoids deadlocks on printk locks without having to break locks. Also
- * separate MCEs from kernel messages to avoid bogus bug reports.
- */
-
-static struct mce_log mcelog = {
- MCE_LOG_SIGNATURE,
- MCE_LOG_LEN,
-};
-
-void mce_log(struct mce *mce)
-{
- unsigned next, entry;
- atomic_inc(&mce_events);
- mce->finished = 0;
- wmb();
- for (;;) {
- entry = rcu_dereference(mcelog.next);
- for (;;) {
- /* When the buffer fills up discard new entries. Assume
- that the earlier errors are the more interesting. */
- if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
- return;
- }
- /* Old left over entry. Skip. */
- if (mcelog.entry[entry].finished) {
- entry++;
- continue;
- }
- break;
- }
- smp_rmb();
- next = entry + 1;
- if (cmpxchg(&mcelog.next, entry, next) == entry)
- break;
- }
- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
- wmb();
- mcelog.entry[entry].finished = 1;
- wmb();
-
- set_bit(0, &notify_user);
-}
-
-static void print_mce(struct mce *m)
-{
- printk(KERN_EMERG "\n"
- KERN_EMERG "HARDWARE ERROR\n"
- KERN_EMERG
- "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
- m->cpu, m->mcgstatus, m->bank, m->status);
- if (m->ip) {
- printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
- !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
- m->cs, m->ip);
- if (m->cs == __KERNEL_CS)
- print_symbol("{%s}", m->ip);
- printk("\n");
- }
- printk(KERN_EMERG "TSC %llx ", m->tsc);
- if (m->addr)
- printk("ADDR %llx ", m->addr);
- if (m->misc)
- printk("MISC %llx ", m->misc);
- printk("\n");
- printk(KERN_EMERG "This is not a software problem!\n");
- printk(KERN_EMERG "Run through mcelog --ascii to decode "
- "and contact your hardware vendor\n");
-}
-
-static void mce_panic(char *msg, struct mce *backup, unsigned long start)
-{
- int i;
-
- oops_begin();
- for (i = 0; i < MCE_LOG_LEN; i++) {
- unsigned long tsc = mcelog.entry[i].tsc;
-
- if (time_before(tsc, start))
- continue;
- print_mce(&mcelog.entry[i]);
- if (backup && mcelog.entry[i].tsc == backup->tsc)
- backup = NULL;
- }
- if (backup)
- print_mce(backup);
- panic(msg);
-}
-
-int mce_available(struct cpuinfo_x86 *c)
-{
- if (mce_dont_init)
- return 0;
- return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
-}
-
-static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
-{
- if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
- m->ip = regs->ip;
- m->cs = regs->cs;
- } else {
- m->ip = 0;
- m->cs = 0;
- }
- if (rip_msr) {
- /* Assume the RIP in the MSR is exact. Is this true? */
- m->mcgstatus |= MCG_STATUS_EIPV;
- rdmsrl(rip_msr, m->ip);
- m->cs = 0;
- }
-}
-
-/*
- * Poll for corrected events or events that happened before reset.
- * Those are just logged through /dev/mcelog.
- *
- * This is executed in standard interrupt context.
- */
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
-{
- struct mce m;
- int i;
-
- mce_setup(&m);
-
- rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
- for (i = 0; i < banks; i++) {
- if (!bank[i] || !test_bit(i, *b))
- continue;
-
- m.misc = 0;
- m.addr = 0;
- m.bank = i;
- m.tsc = 0;
-
- barrier();
- rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
- if (!(m.status & MCI_STATUS_VAL))
- continue;
-
- /*
- * Uncorrected events are handled by the exception handler
- * when it is enabled. But when the exception is disabled log
- * everything.
- *
- * TBD do the same check for MCI_STATUS_EN here?
- */
- if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
- continue;
-
- if (m.status & MCI_STATUS_MISCV)
- rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
- if (m.status & MCI_STATUS_ADDRV)
- rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
-
- if (!(flags & MCP_TIMESTAMP))
- m.tsc = 0;
- /*
- * Don't get the IP here because it's unlikely to
- * have anything to do with the actual error location.
- */
- if (!(flags & MCP_DONTLOG)) {
- mce_log(&m);
- add_taint(TAINT_MACHINE_CHECK);
- }
-
- /*
- * Clear state for this bank.
- */
- wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
- }
-
- /*
- * Don't clear MCG_STATUS here because it's only defined for
- * exceptions.
- */
-}
-
-/*
- * The actual machine check handler. This only handles real
- * exceptions when something got corrupted coming in through int 18.
- *
- * This is executed in NMI context not subject to normal locking rules. This
- * implies that most kernel services cannot be safely used. Don't even
- * think about putting a printk in there!
- */
-void do_machine_check(struct pt_regs * regs, long error_code)
-{
- struct mce m, panicm;
- u64 mcestart = 0;
- int i;
- int panicm_found = 0;
- /*
- * If no_way_out gets set, there is no safe way to recover from this
- * MCE. If tolerant is cranked up, we'll try anyway.
- */
- int no_way_out = 0;
- /*
- * If kill_it gets set, there might be a way to recover from this
- * error.
- */
- int kill_it = 0;
- DECLARE_BITMAP(toclear, MAX_NR_BANKS);
-
- atomic_inc(&mce_entry);
-
- if (notify_die(DIE_NMI, "machine check", regs, error_code,
- 18, SIGKILL) == NOTIFY_STOP)
- goto out2;
- if (!banks)
- goto out2;
-
- mce_setup(&m);
-
- rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
- /* if the restart IP is not valid, we're done for */
- if (!(m.mcgstatus & MCG_STATUS_RIPV))
- no_way_out = 1;
-
- rdtscll(mcestart);
- barrier();
-
- for (i = 0; i < banks; i++) {
- __clear_bit(i, toclear);
- if (!bank[i])
- continue;
-
- m.misc = 0;
- m.addr = 0;
- m.bank = i;
-
- rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
- if ((m.status & MCI_STATUS_VAL) == 0)
- continue;
-
- /*
- * Non uncorrected errors are handled by machine_check_poll
- * Leave them alone.
- */
- if ((m.status & MCI_STATUS_UC) == 0)
- continue;
-
- /*
- * Set taint even when machine check was not enabled.
- */
- add_taint(TAINT_MACHINE_CHECK);
-
- __set_bit(i, toclear);
-
- if (m.status & MCI_STATUS_EN) {
- /* if PCC was set, there's no way out */
- no_way_out |= !!(m.status & MCI_STATUS_PCC);
- /*
- * If this error was uncorrectable and there was
- * an overflow, we're in trouble. If no overflow,
- * we might get away with just killing a task.
- */
- if (m.status & MCI_STATUS_UC) {
- if (tolerant < 1 || m.status & MCI_STATUS_OVER)
- no_way_out = 1;
- kill_it = 1;
- }
- } else {
- /*
- * Machine check event was not enabled. Clear, but
- * ignore.
- */
- continue;
- }
-
- if (m.status & MCI_STATUS_MISCV)
- rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
- if (m.status & MCI_STATUS_ADDRV)
- rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
-
- mce_get_rip(&m, regs);
- mce_log(&m);
-
- /* Did this bank cause the exception? */
- /* Assume that the bank with uncorrectable errors did it,
- and that there is only a single one. */
- if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
- panicm = m;
- panicm_found = 1;
- }
- }
-
- /* If we didn't find an uncorrectable error, pick
- the last one (shouldn't happen, just being safe). */
- if (!panicm_found)
- panicm = m;
-
- /*
- * If we have decided that we just CAN'T continue, and the user
- * has not set tolerant to an insane level, give up and die.
- */
- if (no_way_out && tolerant < 3)
- mce_panic("Machine check", &panicm, mcestart);
-
- /*
- * If the error seems to be unrecoverable, something should be
- * done. Try to kill as little as possible. If we can kill just
- * one task, do that. If the user has set the tolerance very
- * high, don't try to do anything at all.
- */
- if (kill_it && tolerant < 3) {
- int user_space = 0;
-
- /*
- * If the EIPV bit is set, it means the saved IP is the
- * instruction which caused the MCE.
- */
- if (m.mcgstatus & MCG_STATUS_EIPV)
- user_space = panicm.ip && (panicm.cs & 3);
-
- /*
- * If we know that the error was in user space, send a
- * SIGBUS. Otherwise, panic if tolerance is low.
- *
- * force_sig() takes an awful lot of locks and has a slight
- * risk of deadlocking.
- */
- if (user_space) {
- force_sig(SIGBUS, current);
- } else if (panic_on_oops || tolerant < 2) {
- mce_panic("Uncorrected machine check",
- &panicm, mcestart);
- }
- }
-
- /* notify userspace ASAP */
- set_thread_flag(TIF_MCE_NOTIFY);
-
- /* the last thing we do is clear state */
- for (i = 0; i < banks; i++) {
- if (test_bit(i, toclear))
- wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
- }
- wrmsrl(MSR_IA32_MCG_STATUS, 0);
- out2:
- atomic_dec(&mce_entry);
-}
-
-#ifdef CONFIG_X86_MCE_INTEL
-/***
- * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
- * @cpu: The CPU on which the event occurred.
- * @status: Event status information
- *
- * This function should be called by the thermal interrupt after the
- * event has been processed and the decision was made to log the event
- * further.
- *
- * The status parameter will be saved to the 'status' field of 'struct mce'
- * and historically has been the register value of the
- * MSR_IA32_THERMAL_STATUS (Intel) msr.
- */
-void mce_log_therm_throt_event(__u64 status)
-{
- struct mce m;
-
- mce_setup(&m);
- m.bank = MCE_THERMAL_BANK;
- m.status = status;
- mce_log(&m);
-}
-#endif /* CONFIG_X86_MCE_INTEL */
-
-/*
- * Periodic polling timer for "silent" machine check errors. If the
- * poller finds an MCE, poll 2x faster. When the poller finds no more
- * errors, poll 2x slower (up to check_interval seconds).
- */
-
-static int check_interval = 5 * 60; /* 5 minutes */
-static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
-static void mcheck_timer(unsigned long);
-static DEFINE_PER_CPU(struct timer_list, mce_timer);
-
-static void mcheck_timer(unsigned long data)
-{
- struct timer_list *t = &per_cpu(mce_timer, data);
- int *n;
-
- WARN_ON(smp_processor_id() != data);
-
- if (mce_available(&current_cpu_data))
- machine_check_poll(MCP_TIMESTAMP,
- &__get_cpu_var(mce_poll_banks));
-
- /*
- * Alert userspace if needed. If we logged an MCE, reduce the
- * polling interval, otherwise increase the polling interval.
- */
- n = &__get_cpu_var(next_interval);
- if (mce_notify_user()) {
- *n = max(*n/2, HZ/100);
- } else {
- *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
- }
-
- t->expires = jiffies + *n;
- add_timer(t);
-}
-
-static void mce_do_trigger(struct work_struct *work)
-{
- call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
-}
-
-static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
-
-/*
- * Notify the user(s) about new machine check events.
- * Can be called from interrupt context, but not from machine check/NMI
- * context.
- */
-int mce_notify_user(void)
-{
- /* Not more than two messages every minute */
- static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
-
- clear_thread_flag(TIF_MCE_NOTIFY);
- if (test_and_clear_bit(0, &notify_user)) {
- wake_up_interruptible(&mce_wait);
-
- /*
- * There is no risk of missing notifications because
- * work_pending is always cleared before the function is
- * executed.
- */
- if (trigger[0] && !work_pending(&mce_trigger_work))
- schedule_work(&mce_trigger_work);
-
- if (__ratelimit(&ratelimit))
- printk(KERN_INFO "Machine check events logged\n");
-
- return 1;
- }
- return 0;
-}
-
-/* see if the idle task needs to notify userspace */
-static int
-mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
-{
- /* IDLE_END should be safe - interrupts are back on */
- if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
- mce_notify_user();
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block mce_idle_notifier = {
- .notifier_call = mce_idle_callback,
-};
-
-static __init int periodic_mcheck_init(void)
-{
- idle_notifier_register(&mce_idle_notifier);
- return 0;
-}
-__initcall(periodic_mcheck_init);
-
-/*
- * Initialize Machine Checks for a CPU.
- */
-static int mce_cap_init(void)
-{
- u64 cap;
- unsigned b;
-
- rdmsrl(MSR_IA32_MCG_CAP, cap);
- b = cap & 0xff;
- if (b > MAX_NR_BANKS) {
- printk(KERN_WARNING
- "MCE: Using only %u machine check banks out of %u\n",
- MAX_NR_BANKS, b);
- b = MAX_NR_BANKS;
- }
-
- /* Don't support asymmetric configurations today */
- WARN_ON(banks != 0 && b != banks);
- banks = b;
- if (!bank) {
- bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
- if (!bank)
- return -ENOMEM;
- memset(bank, 0xff, banks * sizeof(u64));
- }
-
- /* Use accurate RIP reporting if available. */
- if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
- rip_msr = MSR_IA32_MCG_EIP;
-
- return 0;
-}
-
-static void mce_init(void *dummy)
-{
- u64 cap;
- int i;
- mce_banks_t all_banks;
-
- /*
- * Log the machine checks left over from the previous reset.
- */
- bitmap_fill(all_banks, MAX_NR_BANKS);
- machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
-
- set_in_cr4(X86_CR4_MCE);
-
- rdmsrl(MSR_IA32_MCG_CAP, cap);
- if (cap & MCG_CTL_P)
- wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
-
- for (i = 0; i < banks; i++) {
- wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
- wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
- }
-}
-
-/* Add per CPU specific workarounds here */
-static void mce_cpu_quirks(struct cpuinfo_x86 *c)
-{
- /* This should be disabled by the BIOS, but isn't always */
- if (c->x86_vendor == X86_VENDOR_AMD) {
- if (c->x86 == 15 && banks > 4)
- /* disable GART TBL walk error reporting, which trips off
- incorrectly with the IOMMU & 3ware & Cerberus. */
- clear_bit(10, (unsigned long *)&bank[4]);
- if(c->x86 <= 17 && mce_bootlog < 0)
- /* Lots of broken BIOS around that don't clear them
- by default and leave crap in there. Don't log. */
- mce_bootlog = 0;
- }
-
-}
-
-static void mce_cpu_features(struct cpuinfo_x86 *c)
-{
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- mce_intel_feature_init(c);
- break;
- case X86_VENDOR_AMD:
- mce_amd_feature_init(c);
- break;
- default:
- break;
- }
-}
-
-static void mce_init_timer(void)
-{
- struct timer_list *t = &__get_cpu_var(mce_timer);
- int *n = &__get_cpu_var(next_interval);
-
- *n = check_interval * HZ;
- if (!*n)
- return;
- setup_timer(t, mcheck_timer, smp_processor_id());
- t->expires = round_jiffies(jiffies + *n);
- add_timer(t);
-}
-
-/*
- * Called for each booted CPU to set up machine checks.
- * Must be called with preempt off.
- */
-void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
-{
- if (!mce_available(c))
- return;
-
- if (mce_cap_init() < 0) {
- mce_dont_init = 1;
- return;
- }
- mce_cpu_quirks(c);
-
- mce_init(NULL);
- mce_cpu_features(c);
- mce_init_timer();
-}
-
-/*
- * Character device to read and clear the MCE log.
- */
-
-static DEFINE_SPINLOCK(mce_state_lock);
-static int open_count; /* #times opened */
-static int open_exclu; /* already open exclusive? */
-
-static int mce_open(struct inode *inode, struct file *file)
-{
- lock_kernel();
- spin_lock(&mce_state_lock);
-
- if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
- spin_unlock(&mce_state_lock);
- unlock_kernel();
- return -EBUSY;
- }
-
- if (file->f_flags & O_EXCL)
- open_exclu = 1;
- open_count++;
-
- spin_unlock(&mce_state_lock);
- unlock_kernel();
-
- return nonseekable_open(inode, file);
-}
-
-static int mce_release(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_state_lock);
-
- open_count--;
- open_exclu = 0;
-
- spin_unlock(&mce_state_lock);
-
- return 0;
-}
-
-static void collect_tscs(void *data)
-{
- unsigned long *cpu_tsc = (unsigned long *)data;
-
- rdtscll(cpu_tsc[smp_processor_id()]);
-}
-
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
- loff_t *off)
-{
- unsigned long *cpu_tsc;
- static DEFINE_MUTEX(mce_read_mutex);
- unsigned prev, next;
- char __user *buf = ubuf;
- int i, err;
-
- cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
- if (!cpu_tsc)
- return -ENOMEM;
-
- mutex_lock(&mce_read_mutex);
- next = rcu_dereference(mcelog.next);
-
- /* Only supports full reads right now */
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
- mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
- return -EINVAL;
- }
-
- err = 0;
- prev = 0;
- do {
- for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
-
- while (!mcelog.entry[i].finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(mcelog.entry + i, 0,
- sizeof(struct mce));
- goto timeout;
- }
- cpu_relax();
- }
- smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i,
- sizeof(struct mce));
- buf += sizeof(struct mce);
-timeout:
- ;
- }
-
- memset(mcelog.entry + prev, 0,
- (next - prev) * sizeof(struct mce));
- prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
- } while (next != prev);
-
- synchronize_sched();
-
- /*
- * Collect entries that were still getting written before the
- * synchronize.
- */
- on_each_cpu(collect_tscs, cpu_tsc, 1);
- for (i = next; i < MCE_LOG_LEN; i++) {
- if (mcelog.entry[i].finished &&
- mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
- err |= copy_to_user(buf, mcelog.entry+i,
- sizeof(struct mce));
- smp_rmb();
- buf += sizeof(struct mce);
- memset(&mcelog.entry[i], 0, sizeof(struct mce));
- }
- }
- mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
- return err ? -EFAULT : buf - ubuf;
-}
-
-static unsigned int mce_poll(struct file *file, poll_table *wait)
-{
- poll_wait(file, &mce_wait, wait);
- if (rcu_dereference(mcelog.next))
- return POLLIN | POLLRDNORM;
- return 0;
-}
-
-static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
-{
- int __user *p = (int __user *)arg;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- switch (cmd) {
- case MCE_GET_RECORD_LEN:
- return put_user(sizeof(struct mce), p);
- case MCE_GET_LOG_LEN:
- return put_user(MCE_LOG_LEN, p);
- case MCE_GETCLEAR_FLAGS: {
- unsigned flags;
-
- do {
- flags = mcelog.flags;
- } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
- return put_user(flags, p);
- }
- default:
- return -ENOTTY;
- }
-}
-
-static const struct file_operations mce_chrdev_ops = {
- .open = mce_open,
- .release = mce_release,
- .read = mce_read,
- .poll = mce_poll,
- .unlocked_ioctl = mce_ioctl,
-};
-
-static struct miscdevice mce_log_device = {
- MISC_MCELOG_MINOR,
- "mcelog",
- &mce_chrdev_ops,
-};
-
-/*
- * Old style boot options parsing. Only for compatibility.
- */
-static int __init mcheck_disable(char *str)
-{
- mce_dont_init = 1;
- return 1;
-}
-
-/* mce=off disables machine check.
- mce=TOLERANCELEVEL (number, see above)
- mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
- mce=nobootlog Don't log MCEs from before booting. */
-static int __init mcheck_enable(char *str)
-{
- if (!strcmp(str, "off"))
- mce_dont_init = 1;
- else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
- mce_bootlog = str[0] == 'b';
- else if (isdigit(str[0]))
- get_option(&str, &tolerant);
- else
- printk("mce= argument %s ignored. Please use /sys", str);
- return 1;
-}
-
-__setup("nomce", mcheck_disable);
-__setup("mce=", mcheck_enable);
-
-/*
- * Sysfs support
- */
-
-/*
- * Disable machine checks on suspend and shutdown. We can't really handle
- * them later.
- */
-static int mce_disable(void)
-{
- int i;
-
- for (i = 0; i < banks; i++)
- wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
- return 0;
-}
-
-static int mce_suspend(struct sys_device *dev, pm_message_t state)
-{
- return mce_disable();
-}
-
-static int mce_shutdown(struct sys_device *dev)
-{
- return mce_disable();
-}
-
-/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
- Only one CPU is active at this time, the others get readded later using
- CPU hotplug. */
-static int mce_resume(struct sys_device *dev)
-{
- mce_init(NULL);
- mce_cpu_features(&current_cpu_data);
- return 0;
-}
-
-static void mce_cpu_restart(void *data)
-{
- del_timer_sync(&__get_cpu_var(mce_timer));
- if (mce_available(&current_cpu_data))
- mce_init(NULL);
- mce_init_timer();
-}
-
-/* Reinit MCEs after user configuration changes */
-static void mce_restart(void)
-{
- on_each_cpu(mce_cpu_restart, NULL, 1);
-}
-
-static struct sysdev_class mce_sysclass = {
- .suspend = mce_suspend,
- .shutdown = mce_shutdown,
- .resume = mce_resume,
- .name = "machinecheck",
-};
-
-DEFINE_PER_CPU(struct sys_device, device_mce);
-void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
-
-/* Why are there no generic functions for this? */
-#define ACCESSOR(name, var, start) \
- static ssize_t show_ ## name(struct sys_device *s, \
- struct sysdev_attribute *attr, \
- char *buf) { \
- return sprintf(buf, "%lx\n", (unsigned long)var); \
- } \
- static ssize_t set_ ## name(struct sys_device *s, \
- struct sysdev_attribute *attr, \
- const char *buf, size_t siz) { \
- char *end; \
- unsigned long new = simple_strtoul(buf, &end, 0); \
- if (end == buf) return -EINVAL; \
- var = new; \
- start; \
- return end-buf; \
- } \
- static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
-
-static struct sysdev_attribute *bank_attrs;
-
-static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
- char *buf)
-{
- u64 b = bank[attr - bank_attrs];
- return sprintf(buf, "%llx\n", b);
-}
-
-static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
- const char *buf, size_t siz)
-{
- char *end;
- u64 new = simple_strtoull(buf, &end, 0);
- if (end == buf)
- return -EINVAL;
- bank[attr - bank_attrs] = new;
- mce_restart();
- return end-buf;
-}
-
-static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
- char *buf)
-{
- strcpy(buf, trigger);
- strcat(buf, "\n");
- return strlen(trigger) + 1;
-}
-
-static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
- const char *buf,size_t siz)
-{
- char *p;
- int len;
- strncpy(trigger, buf, sizeof(trigger));
- trigger[sizeof(trigger)-1] = 0;
- len = strlen(trigger);
- p = strchr(trigger, '\n');
- if (*p) *p = 0;
- return len;
-}
-
-static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
-static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
-ACCESSOR(check_interval,check_interval,mce_restart())
-static struct sysdev_attribute *mce_attributes[] = {
- &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
- NULL
-};
-
-static cpumask_var_t mce_device_initialized;
-
-/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
-static __cpuinit int mce_create_device(unsigned int cpu)
-{
- int err;
- int i;
-
- if (!mce_available(&boot_cpu_data))
- return -EIO;
-
- memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
- per_cpu(device_mce,cpu).id = cpu;
- per_cpu(device_mce,cpu).cls = &mce_sysclass;
-
- err = sysdev_register(&per_cpu(device_mce,cpu));
- if (err)
- return err;
-
- for (i = 0; mce_attributes[i]; i++) {
- err = sysdev_create_file(&per_cpu(device_mce,cpu),
- mce_attributes[i]);
- if (err)
- goto error;
- }
- for (i = 0; i < banks; i++) {
- err = sysdev_create_file(&per_cpu(device_mce, cpu),
- &bank_attrs[i]);
- if (err)
- goto error2;
- }
- cpumask_set_cpu(cpu, mce_device_initialized);
-
- return 0;
-error2:
- while (--i >= 0) {
- sysdev_remove_file(&per_cpu(device_mce, cpu),
- &bank_attrs[i]);
- }
-error:
- while (--i >= 0) {
- sysdev_remove_file(&per_cpu(device_mce,cpu),
- mce_attributes[i]);
- }
- sysdev_unregister(&per_cpu(device_mce,cpu));
-
- return err;
-}
-
-static __cpuinit void mce_remove_device(unsigned int cpu)
-{
- int i;
-
- if (!cpumask_test_cpu(cpu, mce_device_initialized))
- return;
-
- for (i = 0; mce_attributes[i]; i++)
- sysdev_remove_file(&per_cpu(device_mce,cpu),
- mce_attributes[i]);
- for (i = 0; i < banks; i++)
- sysdev_remove_file(&per_cpu(device_mce, cpu),
- &bank_attrs[i]);
- sysdev_unregister(&per_cpu(device_mce,cpu));
- cpumask_clear_cpu(cpu, mce_device_initialized);
-}
-
-/* Make sure there are no machine checks on offlined CPUs. */
-static void mce_disable_cpu(void *h)
-{
- int i;
- unsigned long action = *(unsigned long *)h;
-
- if (!mce_available(&current_cpu_data))
- return;
- if (!(action & CPU_TASKS_FROZEN))
- cmci_clear();
- for (i = 0; i < banks; i++)
- wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
-}
-
-static void mce_reenable_cpu(void *h)
-{
- int i;
- unsigned long action = *(unsigned long *)h;
-
- if (!mce_available(&current_cpu_data))
- return;
- if (!(action & CPU_TASKS_FROZEN))
- cmci_reenable();
- for (i = 0; i < banks; i++)
- wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
-}
-
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct timer_list *t = &per_cpu(mce_timer, cpu);
-
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- mce_create_device(cpu);
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- mce_remove_device(cpu);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- del_timer_sync(t);
- smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
- break;
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- t->expires = round_jiffies(jiffies +
- __get_cpu_var(next_interval));
- add_timer_on(t, cpu);
- smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
- break;
- case CPU_POST_DEAD:
- /* intentionally ignoring frozen here */
- cmci_rediscover(cpu);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block mce_cpu_notifier __cpuinitdata = {
- .notifier_call = mce_cpu_callback,
-};
-
-static __init int mce_init_banks(void)
-{
- int i;
-
- bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
- GFP_KERNEL);
- if (!bank_attrs)
- return -ENOMEM;
-
- for (i = 0; i < banks; i++) {
- struct sysdev_attribute *a = &bank_attrs[i];
- a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
- if (!a->attr.name)
- goto nomem;
- a->attr.mode = 0644;
- a->show = show_bank;
- a->store = set_bank;
- }
- return 0;
-
-nomem:
- while (--i >= 0)
- kfree(bank_attrs[i].attr.name);
- kfree(bank_attrs);
- bank_attrs = NULL;
- return -ENOMEM;
-}
-
-static __init int mce_init_device(void)
-{
- int err;
- int i = 0;
-
- if (!mce_available(&boot_cpu_data))
- return -EIO;
-
- alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
-
- err = mce_init_banks();
- if (err)
- return err;
-
- err = sysdev_class_register(&mce_sysclass);
- if (err)
- return err;
-
- for_each_online_cpu(i) {
- err = mce_create_device(i);
- if (err)
- return err;
- }
-
- register_hotcpu_notifier(&mce_cpu_notifier);
- misc_register(&mce_log_device);
- return err;
-}
-
-device_initcall(mce_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 56dde9c4bc9..ddae21620bd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -13,22 +13,22 @@
*
* All MC4_MISCi registers are shared between multi-cores
*/
-
-#include <linux/cpu.h>
-#include <linux/errno.h>
-#include <linux/init.h>
#include <linux/interrupt.h>
-#include <linux/kobject.h>
#include <linux/notifier.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
+#include <linux/kobject.h>
+#include <linux/percpu.h>
#include <linux/sysdev.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+
#include <asm/apic.h>
+#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
-#include <asm/percpu.h>
-#include <asm/idle.h>
#define PFX "mce_threshold: "
#define VERSION "version 1.1.1"
@@ -48,26 +48,26 @@
#define MCG_XBLK_ADDR 0xC0000400
struct threshold_block {
- unsigned int block;
- unsigned int bank;
- unsigned int cpu;
- u32 address;
- u16 interrupt_enable;
- u16 threshold_limit;
- struct kobject kobj;
- struct list_head miscj;
+ unsigned int block;
+ unsigned int bank;
+ unsigned int cpu;
+ u32 address;
+ u16 interrupt_enable;
+ u16 threshold_limit;
+ struct kobject kobj;
+ struct list_head miscj;
};
/* defaults used early on boot */
static struct threshold_block threshold_defaults = {
- .interrupt_enable = 0,
- .threshold_limit = THRESHOLD_MAX,
+ .interrupt_enable = 0,
+ .threshold_limit = THRESHOLD_MAX,
};
struct threshold_bank {
- struct kobject *kobj;
- struct threshold_block *blocks;
- cpumask_var_t cpus;
+ struct kobject *kobj;
+ struct threshold_block *blocks;
+ cpumask_var_t cpus;
};
static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
@@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void);
*/
struct thresh_restart {
- struct threshold_block *b;
- int reset;
- u16 old_limit;
+ struct threshold_block *b;
+ int reset;
+ u16 old_limit;
};
/* must be called with correct cpu affinity */
@@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr)
} else if (tr->old_limit) { /* change limit w/o reset */
int new_count = (mci_misc_hi & THRESHOLD_MAX) +
(tr->old_limit - tr->b->threshold_limit);
+
mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
(new_count & THRESHOLD_MAX);
}
@@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr)
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
- unsigned int bank, block;
unsigned int cpu = smp_processor_id();
- u8 lvt_off;
u32 low = 0, high = 0, address = 0;
+ unsigned int bank, block;
struct thresh_restart tr;
+ u8 lvt_off;
for (bank = 0; bank < NR_BANKS; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
@@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (!address)
break;
address += MCG_XBLK_ADDR;
- }
- else
+ } else
++address;
if (rdmsr_safe(address, &low, &high))
@@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
*/
static void amd_threshold_interrupt(void)
{
+ u32 low = 0, high = 0, address = 0;
unsigned int bank, block;
struct mce m;
- u32 low = 0, high = 0, address = 0;
mce_setup(&m);
@@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void)
if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
continue;
for (block = 0; block < NR_BLOCKS; ++block) {
- if (block == 0)
+ if (block == 0) {
address = MSR_IA32_MC0_MISC + bank * 4;
- else if (block == 1) {
+ } else if (block == 1) {
address = (low & MASK_BLKPTR_LO) >> 21;
if (!address)
break;
address += MCG_XBLK_ADDR;
- }
- else
+ } else {
++address;
+ }
if (rdmsr_safe(address, &low, &high))
break;
@@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void)
(high & MASK_LOCKED_HI))
continue;
- /* Log the machine check that caused the threshold
- event. */
+ /*
+ * Log the machine check that caused the threshold
+ * event.
+ */
machine_check_poll(MCP_TIMESTAMP,
&__get_cpu_var(mce_poll_banks));
@@ -254,48 +256,52 @@ static void amd_threshold_interrupt(void)
struct threshold_attr {
struct attribute attr;
- ssize_t(*show) (struct threshold_block *, char *);
- ssize_t(*store) (struct threshold_block *, const char *, size_t count);
+ ssize_t (*show) (struct threshold_block *, char *);
+ ssize_t (*store) (struct threshold_block *, const char *, size_t count);
};
-#define SHOW_FIELDS(name) \
-static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
-{ \
- return sprintf(buf, "%lx\n", (unsigned long) b->name); \
+#define SHOW_FIELDS(name) \
+static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
+{ \
+ return sprintf(buf, "%lx\n", (unsigned long) b->name); \
}
SHOW_FIELDS(interrupt_enable)
SHOW_FIELDS(threshold_limit)
-static ssize_t store_interrupt_enable(struct threshold_block *b,
- const char *buf, size_t count)
+static ssize_t
+store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
{
- char *end;
struct thresh_restart tr;
- unsigned long new = simple_strtoul(buf, &end, 0);
- if (end == buf)
+ unsigned long new;
+
+ if (strict_strtoul(buf, 0, &new) < 0)
return -EINVAL;
+
b->interrupt_enable = !!new;
- tr.b = b;
- tr.reset = 0;
- tr.old_limit = 0;
+ tr.b = b;
+ tr.reset = 0;
+ tr.old_limit = 0;
+
smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
- return end - buf;
+ return size;
}
-static ssize_t store_threshold_limit(struct threshold_block *b,
- const char *buf, size_t count)
+static ssize_t
+store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
{
- char *end;
struct thresh_restart tr;
- unsigned long new = simple_strtoul(buf, &end, 0);
- if (end == buf)
+ unsigned long new;
+
+ if (strict_strtoul(buf, 0, &new) < 0)
return -EINVAL;
+
if (new > THRESHOLD_MAX)
new = THRESHOLD_MAX;
if (new < 1)
new = 1;
+
tr.old_limit = b->threshold_limit;
b->threshold_limit = new;
tr.b = b;
@@ -303,12 +309,12 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
- return end - buf;
+ return size;
}
struct threshold_block_cross_cpu {
- struct threshold_block *tb;
- long retval;
+ struct threshold_block *tb;
+ long retval;
};
static void local_error_count_handler(void *_tbcc)
@@ -338,16 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b,
return 1;
}
-#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \
- .attr = {.name = __stringify(_name), .mode = _mode }, \
- .show = _show, \
- .store = _store, \
+#define RW_ATTR(val) \
+static struct threshold_attr val = { \
+ .attr = {.name = __stringify(val), .mode = 0644 }, \
+ .show = show_## val, \
+ .store = store_## val, \
};
-#define RW_ATTR(name) \
-static struct threshold_attr name = \
- THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
-
RW_ATTR(interrupt_enable);
RW_ATTR(threshold_limit);
RW_ATTR(error_count);
@@ -359,15 +362,17 @@ static struct attribute *default_attrs[] = {
NULL
};
-#define to_block(k) container_of(k, struct threshold_block, kobj)
-#define to_attr(a) container_of(a, struct threshold_attr, attr)
+#define to_block(k) container_of(k, struct threshold_block, kobj)
+#define to_attr(a) container_of(a, struct threshold_attr, attr)
static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
{
struct threshold_block *b = to_block(kobj);
struct threshold_attr *a = to_attr(attr);
ssize_t ret;
+
ret = a->show ? a->show(b, buf) : -EIO;
+
return ret;
}
@@ -377,18 +382,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
struct threshold_block *b = to_block(kobj);
struct threshold_attr *a = to_attr(attr);
ssize_t ret;
+
ret = a->store ? a->store(b, buf, count) : -EIO;
+
return ret;
}
static struct sysfs_ops threshold_ops = {
- .show = show,
- .store = store,
+ .show = show,
+ .store = store,
};
static struct kobj_type threshold_ktype = {
- .sysfs_ops = &threshold_ops,
- .default_attrs = default_attrs,
+ .sysfs_ops = &threshold_ops,
+ .default_attrs = default_attrs,
};
static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
@@ -396,9 +403,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
unsigned int block,
u32 address)
{
- int err;
- u32 low, high;
struct threshold_block *b = NULL;
+ u32 low, high;
+ int err;
if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
return 0;
@@ -421,20 +428,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
if (!b)
return -ENOMEM;
- b->block = block;
- b->bank = bank;
- b->cpu = cpu;
- b->address = address;
- b->interrupt_enable = 0;
- b->threshold_limit = THRESHOLD_MAX;
+ b->block = block;
+ b->bank = bank;
+ b->cpu = cpu;
+ b->address = address;
+ b->interrupt_enable = 0;
+ b->threshold_limit = THRESHOLD_MAX;
INIT_LIST_HEAD(&b->miscj);
- if (per_cpu(threshold_banks, cpu)[bank]->blocks)
+ if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
list_add(&b->miscj,
&per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
- else
+ } else {
per_cpu(threshold_banks, cpu)[bank]->blocks = b;
+ }
err = kobject_init_and_add(&b->kobj, &threshold_ktype,
per_cpu(threshold_banks, cpu)[bank]->kobj,
@@ -447,8 +455,9 @@ recurse:
if (!address)
return 0;
address += MCG_XBLK_ADDR;
- } else
+ } else {
++address;
+ }
err = allocate_threshold_blocks(cpu, bank, ++block, address);
if (err)
@@ -500,13 +509,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (!b)
goto out;
- err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,
+ err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
b->kobj, name);
if (err)
goto out;
cpumask_copy(b->cpus, cpu_core_mask(cpu));
per_cpu(threshold_banks, cpu)[bank] = b;
+
goto out;
}
#endif
@@ -522,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;
}
- b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
+ b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
if (!b->kobj)
goto out_free;
@@ -542,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (i == cpu)
continue;
- err = sysfs_create_link(&per_cpu(device_mce, i).kobj,
+ err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
b->kobj, name);
if (err)
goto out;
@@ -605,15 +615,13 @@ static void deallocate_threshold_block(unsigned int cpu,
static void threshold_remove_bank(unsigned int cpu, int bank)
{
- int i = 0;
struct threshold_bank *b;
char name[32];
+ int i = 0;
b = per_cpu(threshold_banks, cpu)[bank];
-
if (!b)
return;
-
if (!b->blocks)
goto free_out;
@@ -622,8 +630,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
#ifdef CONFIG_SMP
/* sibling symlink */
if (shared_bank[bank] && b->blocks->cpu != cpu) {
- sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
+ sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
per_cpu(threshold_banks, cpu)[bank] = NULL;
+
return;
}
#endif
@@ -633,7 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
if (i == cpu)
continue;
- sysfs_remove_link(&per_cpu(device_mce, i).kobj, name);
+ sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
per_cpu(threshold_banks, i)[bank] = NULL;
}
@@ -659,12 +668,9 @@ static void threshold_remove_device(unsigned int cpu)
}
/* get notified when a cpu comes on/off */
-static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action,
- unsigned int cpu)
+static void __cpuinit
+amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
{
- if (cpu >= NR_CPUS)
- return;
-
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
@@ -686,11 +692,12 @@ static __init int threshold_init_device(void)
/* to hit CPUs online before the notifier is up */
for_each_online_cpu(lcpu) {
int err = threshold_create_device(lcpu);
+
if (err)
return err;
}
threshold_cpu_callback = amd_64_threshold_cpu_callback;
+
return 0;
}
-
device_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
new file mode 100644
index 00000000000..2b011d2d857
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -0,0 +1,74 @@
+/*
+ * Common code for Intel machine checks
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/therm_throt.h>
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/apic.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+
+void intel_init_thermal(struct cpuinfo_x86 *c)
+{
+ unsigned int cpu = smp_processor_id();
+ int tm2 = 0;
+ u32 l, h;
+
+ /* Thermal monitoring depends on ACPI and clock modulation*/
+ if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
+ return;
+
+ /*
+ * First check if its enabled already, in which case there might
+ * be some SMM goo which handles it, so we can't even put a handler
+ * since it might be delivered via SMI already:
+ */
+ rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+ h = apic_read(APIC_LVTTHMR);
+ if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
+ printk(KERN_DEBUG
+ "CPU%d: Thermal monitoring handled by SMI\n", cpu);
+ return;
+ }
+
+ if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
+ tm2 = 1;
+
+ /* Check whether a vector already exists */
+ if (h & APIC_VECTOR_MASK) {
+ printk(KERN_DEBUG
+ "CPU%d: Thermal LVT vector (%#x) already installed\n",
+ cpu, (h & APIC_VECTOR_MASK));
+ return;
+ }
+
+ /* We'll mask the thermal vector in the lapic till we're ready: */
+ h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
+ apic_write(APIC_LVTTHMR, h);
+
+ rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+
+ intel_set_thermal_handler();
+
+ rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+ wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
+
+ /* Unmask the thermal vector: */
+ l = apic_read(APIC_LVTTHMR);
+ apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+
+ printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
+ cpu, tm2 ? "TM2" : "TM1");
+
+ /* enable thermal throttle processing */
+ atomic_set(&therm_throt_en, 1);
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index cef3ee30744..f2ef6952c40 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -15,7 +15,8 @@
#include <asm/hw_irq.h>
#include <asm/idle.h>
#include <asm/therm_throt.h>
-#include <asm/apic.h>
+
+#include "mce.h"
asmlinkage void smp_thermal_interrupt(void)
{
@@ -27,67 +28,13 @@ asmlinkage void smp_thermal_interrupt(void)
irq_enter();
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
- if (therm_throt_process(msr_val & 1))
+ if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
mce_log_therm_throt_event(msr_val);
inc_irq_stat(irq_thermal_count);
irq_exit();
}
-static void intel_init_thermal(struct cpuinfo_x86 *c)
-{
- u32 l, h;
- int tm2 = 0;
- unsigned int cpu = smp_processor_id();
-
- if (!cpu_has(c, X86_FEATURE_ACPI))
- return;
-
- if (!cpu_has(c, X86_FEATURE_ACC))
- return;
-
- /* first check if TM1 is already enabled by the BIOS, in which
- * case there might be some SMM goo which handles it, so we can't even
- * put a handler since it might be delivered via SMI already.
- */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- h = apic_read(APIC_LVTTHMR);
- if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
- printk(KERN_DEBUG
- "CPU%d: Thermal monitoring handled by SMI\n", cpu);
- return;
- }
-
- if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
- tm2 = 1;
-
- if (h & APIC_VECTOR_MASK) {
- printk(KERN_DEBUG
- "CPU%d: Thermal LVT vector (%#x) already "
- "installed\n", cpu, (h & APIC_VECTOR_MASK));
- return;
- }
-
- h = THERMAL_APIC_VECTOR;
- h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
- apic_write(APIC_LVTTHMR, h);
-
- rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
-
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
-
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
- printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
- cpu, tm2 ? "TM2" : "TM1");
-
- /* enable thermal throttle processing */
- atomic_set(&therm_throt_en, 1);
- return;
-}
-
/*
* Support for Intel Correct Machine Check Interrupts. This allows
* the CPU to raise an interrupt when a corrected machine check happened.
@@ -109,6 +56,9 @@ static int cmci_supported(int *banks)
{
u64 cap;
+ if (mce_cmci_disabled || mce_ignore_ce)
+ return 0;
+
/*
* Vendor check is not strictly needed, but the initial
* initialization is vendor keyed and this
@@ -132,7 +82,7 @@ static int cmci_supported(int *banks)
static void intel_threshold_interrupt(void)
{
machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
- mce_notify_user();
+ mce_notify_irq();
}
static void print_update(char *type, int *hdr, int num)
@@ -248,7 +198,7 @@ void cmci_rediscover(int dying)
return;
cpumask_copy(old, &current->cpus_allowed);
- for_each_online_cpu (cpu) {
+ for_each_online_cpu(cpu) {
if (cpu == dying)
continue;
if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index a74af128efc..70b710420f7 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -6,15 +6,14 @@
* This file contains routines to check for non-fatal MCEs every 15s
*
*/
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/jiffies.h>
-#include <linux/workqueue.h>
#include <linux/interrupt.h>
-#include <linux/smp.h>
+#include <linux/workqueue.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
@@ -22,9 +21,9 @@
#include "mce.h"
-static int firstbank;
+static int firstbank;
-#define MCE_RATE 15*HZ /* timer rate is 15s */
+#define MCE_RATE (15*HZ) /* timer rate is 15s */
static void mce_checkregs(void *info)
{
@@ -34,23 +33,24 @@ static void mce_checkregs(void *info)
for (i = firstbank; i < nr_mce_banks; i++) {
rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
- if (high & (1<<31)) {
- printk(KERN_INFO "MCE: The hardware reports a non "
- "fatal, correctable incident occurred on "
- "CPU %d.\n",
+ if (!(high & (1<<31)))
+ continue;
+
+ printk(KERN_INFO "MCE: The hardware reports a non fatal, "
+ "correctable incident occurred on CPU %d.\n",
smp_processor_id());
- printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
-
- /*
- * Scrub the error so we don't pick it up in MCE_RATE
- * seconds time.
- */
- wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
-
- /* Serialize */
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
+
+ printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
+
+ /*
+ * Scrub the error so we don't pick it up in MCE_RATE
+ * seconds time:
+ */
+ wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
+
+ /* Serialize: */
+ wmb();
+ add_taint(TAINT_MACHINE_CHECK);
}
}
@@ -77,16 +77,17 @@ static int __init init_nonfatal_mce_checker(void)
/* Some Athlons misbehave when we frob bank 0 */
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 == 6)
- firstbank = 1;
+ boot_cpu_data.x86 == 6)
+ firstbank = 1;
else
- firstbank = 0;
+ firstbank = 0;
/*
* Check for non-fatal errors every MCE_RATE s
*/
schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
printk(KERN_INFO "Machine check exception polling timer started.\n");
+
return 0;
}
module_init(init_nonfatal_mce_checker);
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index f53bdcbaf38..82cee108a2d 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -2,18 +2,17 @@
* P4 specific Machine Check Exception Reporting
*/
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
#include <linux/smp.h>
+#include <asm/therm_throt.h>
#include <asm/processor.h>
#include <asm/system.h>
-#include <asm/msr.h>
#include <asm/apic.h>
-
-#include <asm/therm_throt.h>
+#include <asm/msr.h>
#include "mce.h"
@@ -36,6 +35,7 @@ static int mce_num_extended_msrs;
#ifdef CONFIG_X86_MCE_P4THERMAL
+
static void unexpected_thermal_interrupt(struct pt_regs *regs)
{
printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
@@ -43,7 +43,7 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs)
add_taint(TAINT_MACHINE_CHECK);
}
-/* P4/Xeon Thermal transition interrupt handler */
+/* P4/Xeon Thermal transition interrupt handler: */
static void intel_thermal_interrupt(struct pt_regs *regs)
{
__u64 msr_val;
@@ -51,11 +51,12 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
ack_APIC_irq();
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
- therm_throt_process(msr_val & 0x1);
+ therm_throt_process(msr_val & THERM_STATUS_PROCHOT);
}
-/* Thermal interrupt handler for this CPU setup */
-static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
+/* Thermal interrupt handler for this CPU setup: */
+static void (*vendor_thermal_interrupt)(struct pt_regs *regs) =
+ unexpected_thermal_interrupt;
void smp_thermal_interrupt(struct pt_regs *regs)
{
@@ -65,67 +66,15 @@ void smp_thermal_interrupt(struct pt_regs *regs)
irq_exit();
}
-/* P4/Xeon Thermal regulation detect and init */
-static void intel_init_thermal(struct cpuinfo_x86 *c)
+void intel_set_thermal_handler(void)
{
- u32 l, h;
- unsigned int cpu = smp_processor_id();
-
- /* Thermal monitoring */
- if (!cpu_has(c, X86_FEATURE_ACPI))
- return; /* -ENODEV */
-
- /* Clock modulation */
- if (!cpu_has(c, X86_FEATURE_ACC))
- return; /* -ENODEV */
-
- /* first check if its enabled already, in which case there might
- * be some SMM goo which handles it, so we can't even put a handler
- * since it might be delivered via SMI already -zwanem.
- */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- h = apic_read(APIC_LVTTHMR);
- if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
- printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
- cpu);
- return; /* -EBUSY */
- }
-
- /* check whether a vector already exists, temporarily masked? */
- if (h & APIC_VECTOR_MASK) {
- printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
- "installed\n",
- cpu, (h & APIC_VECTOR_MASK));
- return; /* -EBUSY */
- }
-
- /* The temperature transition interrupt handler setup */
- h = THERMAL_APIC_VECTOR; /* our delivery vector */
- h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
- apic_write(APIC_LVTTHMR, h);
-
- rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
-
- /* ok we're good to go... */
vendor_thermal_interrupt = intel_thermal_interrupt;
-
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
-
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
- printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
-
- /* enable thermal throttle processing */
- atomic_set(&therm_throt_en, 1);
- return;
}
-#endif /* CONFIG_X86_MCE_P4THERMAL */
+#endif /* CONFIG_X86_MCE_P4THERMAL */
/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
-static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
+static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
{
u32 h;
@@ -143,9 +92,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
static void intel_machine_check(struct pt_regs *regs, long error_code)
{
- int recover = 1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
+ int recover = 1;
int i;
rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -157,7 +106,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
if (mce_num_extended_msrs > 0) {
struct intel_mce_extended_msrs dbg;
+
intel_get_extended_msrs(&dbg);
+
printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
"\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
"\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
@@ -171,6 +122,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
if (high & (1<<31)) {
char misc[20];
char addr[24];
+
misc[0] = addr[0] = '\0';
if (high & (1<<29))
recover |= 1;
@@ -196,6 +148,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
panic("Unable to continue");
printk(KERN_EMERG "Attempting to continue.\n");
+
/*
* Do not clear the MSR_IA32_MCi_STATUS if the error is not
* recoverable/continuable.This will allow BIOS to look at the MSRs
@@ -217,7 +170,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
}
-
void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index c9f77ea69ed..015f481ab1b 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -2,11 +2,10 @@
* P5 specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
*/
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
#include <linux/smp.h>
#include <asm/processor.h>
@@ -15,39 +14,58 @@
#include "mce.h"
-/* Machine check handler for Pentium class Intel */
+/* By default disabled */
+int mce_p5_enable;
+
+/* Machine check handler for Pentium class Intel CPUs: */
static void pentium_machine_check(struct pt_regs *regs, long error_code)
{
u32 loaddr, hi, lotype;
+
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
- printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype);
- if (lotype&(1<<5))
- printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id());
+
+ printk(KERN_EMERG
+ "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
+ smp_processor_id(), loaddr, lotype);
+
+ if (lotype & (1<<5)) {
+ printk(KERN_EMERG
+ "CPU#%d: Possible thermal failure (CPU on fire ?).\n",
+ smp_processor_id());
+ }
+
add_taint(TAINT_MACHINE_CHECK);
}
-/* Set up machine check reporting for processors with Intel style MCE */
+/* Set up machine check reporting for processors with Intel style MCE: */
void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
- /*Check for MCE support */
+ /* Check for MCE support: */
if (!cpu_has(c, X86_FEATURE_MCE))
return;
- /* Default P5 to off as its often misconnected */
+#ifdef CONFIG_X86_OLD_MCE
+ /* Default P5 to off as its often misconnected: */
if (mce_disabled != -1)
return;
+#endif
+
machine_check_vector = pentium_machine_check;
+ /* Make sure the vector pointer is visible before we enable MCEs: */
wmb();
- /* Read registers before enabling */
+ /* Read registers before enabling: */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
- printk(KERN_INFO "Intel old style machine check architecture supported.\n");
+ printk(KERN_INFO
+ "Intel old style machine check architecture supported.\n");
- /* Enable MCE */
+ /* Enable MCE: */
set_in_cr4(X86_CR4_MCE);
- printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
+ printk(KERN_INFO
+ "Intel old style machine check reporting enabled on CPU#%d.\n",
+ smp_processor_id());
}
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index 2ac52d7b434..43c24e66745 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -2,11 +2,10 @@
* P6 specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
*/
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
#include <linux/smp.h>
#include <asm/processor.h>
@@ -18,9 +17,9 @@
/* Machine Check Handler For PII/PIII */
static void intel_machine_check(struct pt_regs *regs, long error_code)
{
- int recover = 1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
+ int recover = 1;
int i;
rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -35,12 +34,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
if (high & (1<<31)) {
char misc[20];
char addr[24];
- misc[0] = addr[0] = '\0';
+
+ misc[0] = '\0';
+ addr[0] = '\0';
+
if (high & (1<<29))
recover |= 1;
if (high & (1<<25))
recover |= 2;
high &= ~(1<<31);
+
if (high & (1<<27)) {
rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,6 +52,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
snprintf(addr, 24, " at %08x%08x", ahigh, alow);
}
+
printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
smp_processor_id(), i, high, low, misc, addr);
}
@@ -63,16 +67,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
/*
* Do not clear the MSR_IA32_MCi_STATUS if the error is not
* recoverable/continuable.This will allow BIOS to look at the MSRs
- * for errors if the OS could not log the error.
+ * for errors if the OS could not log the error:
*/
for (i = 0; i < nr_mce_banks; i++) {
unsigned int msr;
+
msr = MSR_IA32_MC0_STATUS+i*4;
rdmsr(msr, low, high);
if (high & (1<<31)) {
- /* Clear it */
+ /* Clear it: */
wrmsr(msr, 0UL, 0UL);
- /* Serialize */
+ /* Serialize: */
wmb();
add_taint(TAINT_MACHINE_CHECK);
}
@@ -81,7 +86,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
}
-/* Set up machine check reporting for processors with Intel style MCE */
+/* Set up machine check reporting for processors with Intel style MCE: */
void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
@@ -97,6 +102,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
/* Ok machine check is available */
machine_check_vector = intel_machine_check;
+ /* Make sure the vector pointer is visible before we enable MCEs: */
wmb();
printk(KERN_INFO "Intel machine check architecture supported.\n");
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d5ae2243f0b..7b1ae2e20ba 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -1,7 +1,7 @@
/*
- *
* Thermal throttle event support code (such as syslog messaging and rate
* limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
+ *
* This allows consistent reporting of CPU thermal throttle events.
*
* Maintains a counter in /sys that keeps track of the number of thermal
@@ -13,43 +13,43 @@
* Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
* Inspired by Ross Biro's and Al Borchers' counter code.
*/
-
+#include <linux/notifier.h>
+#include <linux/jiffies.h>
#include <linux/percpu.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
-#include <asm/cpu.h>
-#include <linux/notifier.h>
-#include <linux/jiffies.h>
+
#include <asm/therm_throt.h>
/* How long to wait between reporting thermal events */
-#define CHECK_INTERVAL (300 * HZ)
+#define CHECK_INTERVAL (300 * HZ)
static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
-atomic_t therm_throt_en = ATOMIC_INIT(0);
+
+atomic_t therm_throt_en = ATOMIC_INIT(0);
#ifdef CONFIG_SYSFS
-#define define_therm_throt_sysdev_one_ro(_name) \
- static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
-
-#define define_therm_throt_sysdev_show_func(name) \
-static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
- struct sysdev_attribute *attr, \
- char *buf) \
-{ \
- unsigned int cpu = dev->id; \
- ssize_t ret; \
- \
- preempt_disable(); /* CPU hotplug */ \
- if (cpu_online(cpu)) \
- ret = sprintf(buf, "%lu\n", \
- per_cpu(thermal_throttle_##name, cpu)); \
- else \
- ret = 0; \
- preempt_enable(); \
- \
- return ret; \
+#define define_therm_throt_sysdev_one_ro(_name) \
+ static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
+
+#define define_therm_throt_sysdev_show_func(name) \
+static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
+ struct sysdev_attribute *attr, \
+ char *buf) \
+{ \
+ unsigned int cpu = dev->id; \
+ ssize_t ret; \
+ \
+ preempt_disable(); /* CPU hotplug */ \
+ if (cpu_online(cpu)) \
+ ret = sprintf(buf, "%lu\n", \
+ per_cpu(thermal_throttle_##name, cpu)); \
+ else \
+ ret = 0; \
+ preempt_enable(); \
+ \
+ return ret; \
}
define_therm_throt_sysdev_show_func(count);
@@ -61,8 +61,8 @@ static struct attribute *thermal_throttle_attrs[] = {
};
static struct attribute_group thermal_throttle_attr_group = {
- .attrs = thermal_throttle_attrs,
- .name = "thermal_throttle"
+ .attrs = thermal_throttle_attrs,
+ .name = "thermal_throttle"
};
#endif /* CONFIG_SYSFS */
@@ -110,10 +110,11 @@ int therm_throt_process(int curr)
}
#ifdef CONFIG_SYSFS
-/* Add/Remove thermal_throttle interface for CPU device */
+/* Add/Remove thermal_throttle interface for CPU device: */
static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
{
- return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+ return sysfs_create_group(&sys_dev->kobj,
+ &thermal_throttle_attr_group);
}
static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
@@ -121,19 +122,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
}
-/* Mutex protecting device creation against CPU hotplug */
+/* Mutex protecting device creation against CPU hotplug: */
static DEFINE_MUTEX(therm_cpu_lock);
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static __cpuinit int
+thermal_throttle_cpu_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
struct sys_device *sys_dev;
int err = 0;
sys_dev = get_cpu_sysdev(cpu);
+
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 23ee9e730f7..d746df2909c 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -17,7 +17,7 @@ static void default_threshold_interrupt(void)
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
-asmlinkage void mce_threshold_interrupt(void)
+asmlinkage void smp_threshold_interrupt(void)
{
exit_idle();
irq_enter();
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 2a043d89811..81b02487090 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -2,11 +2,10 @@
* IDT Winchip specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
*/
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
#include <asm/processor.h>
#include <asm/system.h>
@@ -14,7 +13,7 @@
#include "mce.h"
-/* Machine check handler for WinChip C6 */
+/* Machine check handler for WinChip C6: */
static void winchip_machine_check(struct pt_regs *regs, long error_code)
{
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
@@ -25,12 +24,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)
void winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
+
machine_check_vector = winchip_machine_check;
+ /* Make sure the vector pointer is visible before we enable MCEs: */
wmb();
+
rdmsr(MSR_IDT_FCR1, lo, hi);
lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo &= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
+
set_in_cr4(X86_CR4_MCE);
- printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
+
+ printk(KERN_INFO
+ "Winchip machine check reporting enabled on CPU#0.\n");
}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ce0fe4b5c04..1d584a18a50 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits)
if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
return 0;
- rdmsr(MTRRdefType_MSR, def, dummy);
+ rdmsr(MSR_MTRRdefType, def, dummy);
def &= 0xff;
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
@@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
*/
if (!is_cpu(INTEL) || disable_mtrr_trim)
return 0;
- rdmsr(MTRRdefType_MSR, def, dummy);
+ rdmsr(MSR_MTRRdefType, def, dummy);
def &= 0xff;
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0b776c09aff..0543f69f0b2 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -20,9 +20,9 @@ struct fixed_range_block {
};
static struct fixed_range_block fixed_range_blocks[] = {
- { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */
- { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */
- { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */
+ { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
+ { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
+ { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
{}
};
@@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs)
k8_check_syscfg_dram_mod_en();
- rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
+ rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
for (i = 0; i < 2; i++)
- rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]);
+ rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
for (i = 0; i < 8; i++)
- rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
+ rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
}
void mtrr_save_fixed_ranges(void *info)
@@ -275,7 +275,11 @@ static void __init print_mtrr_state(void)
}
printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
mtrr_state.enabled & 2 ? "en" : "dis");
- high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
+ if (size_or_mask & 0xffffffffUL)
+ high_width = ffs(size_or_mask & 0xffffffffUL) - 1;
+ else
+ high_width = ffs(size_or_mask>>32) + 32 - 1;
+ high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4;
for (i = 0; i < num_var_ranges; ++i) {
if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n",
@@ -306,7 +310,7 @@ void __init get_mtrr_state(void)
vrs = mtrr_state.var_ranges;
- rdmsr(MTRRcap_MSR, lo, dummy);
+ rdmsr(MSR_MTRRcap, lo, dummy);
mtrr_state.have_fixed = (lo >> 8) & 1;
for (i = 0; i < num_var_ranges; i++)
@@ -314,7 +318,7 @@ void __init get_mtrr_state(void)
if (mtrr_state.have_fixed)
get_fixed_ranges(mtrr_state.fixed_ranges);
- rdmsr(MTRRdefType_MSR, lo, dummy);
+ rdmsr(MSR_MTRRdefType, lo, dummy);
mtrr_state.def_type = (lo & 0xff);
mtrr_state.enabled = (lo & 0xc00) >> 10;
@@ -579,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
__flush_tlb();
/* Save MTRR state */
- rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
+ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Disable MTRRs, and set the default type to uncached */
- mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
+ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
}
static void post_set(void) __releases(set_atomicity_lock)
@@ -591,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock)
__flush_tlb();
/* Intel (P6) standard MTRRs */
- mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
+ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Enable caches */
write_cr0(read_cr0() & 0xbfffffff);
@@ -703,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
static int generic_have_wrcomb(void)
{
unsigned long config, dummy;
- rdmsr(MTRRcap_MSR, config, dummy);
+ rdmsr(MSR_MTRRcap, config, dummy);
return (config & (1 << 10));
}
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 03cda01f57c..8fc248b5aea 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void)
unsigned long config = 0, dummy;
if (use_intel()) {
- rdmsr(MTRRcap_MSR, config, dummy);
+ rdmsr(MSR_MTRRcap, config, dummy);
} else if (is_cpu(AMD))
config = 2;
else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 77f67f7b347..7538b767f20 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,21 +5,6 @@
#include <linux/types.h>
#include <linux/stddef.h>
-#define MTRRcap_MSR 0x0fe
-#define MTRRdefType_MSR 0x2ff
-
-#define MTRRfix64K_00000_MSR 0x250
-#define MTRRfix16K_80000_MSR 0x258
-#define MTRRfix16K_A0000_MSR 0x259
-#define MTRRfix4K_C0000_MSR 0x268
-#define MTRRfix4K_C8000_MSR 0x269
-#define MTRRfix4K_D0000_MSR 0x26a
-#define MTRRfix4K_D8000_MSR 0x26b
-#define MTRRfix4K_E0000_MSR 0x26c
-#define MTRRfix4K_E8000_MSR 0x26d
-#define MTRRfix4K_F0000_MSR 0x26e
-#define MTRRfix4K_F8000_MSR 0x26f
-
#define MTRR_CHANGE_MASK_FIXED 0x01
#define MTRR_CHANGE_MASK_VARIABLE 0x02
#define MTRR_CHANGE_MASK_DEFTYPE 0x04
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 7f7e2753685..1f5fb1588d1 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
if (use_intel())
/* Save MTRR state */
- rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
+ rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
else
/* Cyrix ARRs - everything else were excluded at the top */
ctxt->ccr3 = getCx86(CX86_CCR3);
@@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
{
if (use_intel())
/* Disable MTRRs, and set the default type to uncached */
- mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL,
+ mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
ctxt->deftype_hi);
else if (is_cpu(CYRIX))
/* Cyrix ARRs - everything else were excluded at the top */
@@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)
/* Restore MTRRdefType */
if (use_intel())
/* Intel (P6) standard MTRRs */
- mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
+ mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
else
/* Cyrix ARRs - everything else was excluded at the top */
setCx86(CX86_CCR3, ctxt->ccr3);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 00000000000..275bc142cd5
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,1711 @@
+/*
+ * Performance counter x86 architecture code
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2009 Jaswinder Singh Rajput
+ * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_counter.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+
+static u64 perf_counter_mask __read_mostly;
+
+struct cpu_hw_counters {
+ struct perf_counter *counters[X86_PMC_IDX_MAX];
+ unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long interrupts;
+ int enabled;
+};
+
+/*
+ * struct x86_pmu - generic x86 pmu
+ */
+struct x86_pmu {
+ const char *name;
+ int version;
+ int (*handle_irq)(struct pt_regs *);
+ void (*disable_all)(void);
+ void (*enable_all)(void);
+ void (*enable)(struct hw_perf_counter *, int);
+ void (*disable)(struct hw_perf_counter *, int);
+ unsigned eventsel;
+ unsigned perfctr;
+ u64 (*event_map)(int);
+ u64 (*raw_event)(u64);
+ int max_events;
+ int num_counters;
+ int num_counters_fixed;
+ int counter_bits;
+ u64 counter_mask;
+ u64 max_period;
+ u64 intel_ctrl;
+};
+
+static struct x86_pmu x86_pmu __read_mostly;
+
+static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
+ .enabled = 1,
+};
+
+/*
+ * Intel PerfMon v3. Used on Core2 and later.
+ */
+static const u64 intel_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
+};
+
+static u64 intel_pmu_event_map(int event)
+{
+ return intel_perfmon_event_map[event];
+}
+
+/*
+ * Generalized hw caching related event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'event makes no sense on
+ * this CPU', any other value means the raw event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+static u64 __read_mostly hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+static const u64 nehalem_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
+ [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
+ [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
+ [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static const u64 core2_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static const u64 atom_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static u64 intel_pmu_raw_event(u64 event)
+{
+#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
+#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
+#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
+#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
+#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
+
+#define CORE_EVNTSEL_MASK \
+ (CORE_EVNTSEL_EVENT_MASK | \
+ CORE_EVNTSEL_UNIT_MASK | \
+ CORE_EVNTSEL_EDGE_MASK | \
+ CORE_EVNTSEL_INV_MASK | \
+ CORE_EVNTSEL_COUNTER_MASK)
+
+ return event & CORE_EVNTSEL_MASK;
+}
+
+static const u64 amd_0f_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
+ [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
+ [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
+ [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const u64 amd_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+};
+
+static u64 amd_pmu_event_map(int event)
+{
+ return amd_perfmon_event_map[event];
+}
+
+static u64 amd_pmu_raw_event(u64 event)
+{
+#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
+#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
+#define K7_EVNTSEL_INV_MASK 0x000800000ULL
+#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
+
+#define K7_EVNTSEL_MASK \
+ (K7_EVNTSEL_EVENT_MASK | \
+ K7_EVNTSEL_UNIT_MASK | \
+ K7_EVNTSEL_EDGE_MASK | \
+ K7_EVNTSEL_INV_MASK | \
+ K7_EVNTSEL_COUNTER_MASK)
+
+ return event & K7_EVNTSEL_MASK;
+}
+
+/*
+ * Propagate counter elapsed time into the generic counter.
+ * Can only be executed on the CPU where the counter is active.
+ * Returns the delta events processed.
+ */
+static u64
+x86_perf_counter_update(struct perf_counter *counter,
+ struct hw_perf_counter *hwc, int idx)
+{
+ int shift = 64 - x86_pmu.counter_bits;
+ u64 prev_raw_count, new_raw_count;
+ s64 delta;
+
+ /*
+ * Careful: an NMI might modify the previous counter value.
+ *
+ * Our tactic to handle this is to first atomically read and
+ * exchange a new raw count - then add that new-prev delta
+ * count to the generic counter atomically:
+ */
+again:
+ prev_raw_count = atomic64_read(&hwc->prev_count);
+ rdmsrl(hwc->counter_base + idx, new_raw_count);
+
+ if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count)
+ goto again;
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (counter-)time and add that to the generic counter.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ atomic64_add(delta, &counter->count);
+ atomic64_sub(delta, &hwc->period_left);
+
+ return new_raw_count;
+}
+
+static atomic_t active_counters;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+static bool reserve_pmc_hardware(void)
+{
+ int i;
+
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ disable_lapic_nmi_watchdog();
+
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
+ goto perfctr_fail;
+ }
+
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
+ goto eventsel_fail;
+ }
+
+ return true;
+
+eventsel_fail:
+ for (i--; i >= 0; i--)
+ release_evntsel_nmi(x86_pmu.eventsel + i);
+
+ i = x86_pmu.num_counters;
+
+perfctr_fail:
+ for (i--; i >= 0; i--)
+ release_perfctr_nmi(x86_pmu.perfctr + i);
+
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ enable_lapic_nmi_watchdog();
+
+ return false;
+}
+
+static void release_pmc_hardware(void)
+{
+ int i;
+
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ release_perfctr_nmi(x86_pmu.perfctr + i);
+ release_evntsel_nmi(x86_pmu.eventsel + i);
+ }
+
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ enable_lapic_nmi_watchdog();
+}
+
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+ if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
+ release_pmc_hardware();
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+}
+
+static inline int x86_pmu_initialized(void)
+{
+ return x86_pmu.handle_irq != NULL;
+}
+
+static inline int
+set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+ unsigned int cache_type, cache_op, cache_result;
+ u64 config, val;
+
+ config = attr->config;
+
+ cache_type = (config >> 0) & 0xff;
+ if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+ return -EINVAL;
+
+ cache_op = (config >> 8) & 0xff;
+ if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+ return -EINVAL;
+
+ cache_result = (config >> 16) & 0xff;
+ if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+ return -EINVAL;
+
+ val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+
+ if (val == 0)
+ return -ENOENT;
+
+ if (val == -1)
+ return -EINVAL;
+
+ hwc->config |= val;
+
+ return 0;
+}
+
+/*
+ * Setup the hardware configuration for a given attr_type
+ */
+static int __hw_perf_counter_init(struct perf_counter *counter)
+{
+ struct perf_counter_attr *attr = &counter->attr;
+ struct hw_perf_counter *hwc = &counter->hw;
+ int err;
+
+ if (!x86_pmu_initialized())
+ return -ENODEV;
+
+ err = 0;
+ if (!atomic_inc_not_zero(&active_counters)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
+ err = -EBUSY;
+ else
+ atomic_inc(&active_counters);
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+ if (err)
+ return err;
+
+ /*
+ * Generate PMC IRQs:
+ * (keep 'enabled' bit clear for now)
+ */
+ hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+
+ /*
+ * Count user and OS events unless requested not to.
+ */
+ if (!attr->exclude_user)
+ hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
+ if (!attr->exclude_kernel)
+ hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+
+ if (!hwc->sample_period) {
+ hwc->sample_period = x86_pmu.max_period;
+ hwc->last_period = hwc->sample_period;
+ atomic64_set(&hwc->period_left, hwc->sample_period);
+ }
+
+ counter->destroy = hw_perf_counter_destroy;
+
+ /*
+ * Raw event type provide the config in the event structure
+ */
+ if (attr->type == PERF_TYPE_RAW) {
+ hwc->config |= x86_pmu.raw_event(attr->config);
+ return 0;
+ }
+
+ if (attr->type == PERF_TYPE_HW_CACHE)
+ return set_ext_hw_attr(hwc, attr);
+
+ if (attr->config >= x86_pmu.max_events)
+ return -EINVAL;
+ /*
+ * The generic map:
+ */
+ hwc->config |= x86_pmu.event_map(attr->config);
+
+ return 0;
+}
+
+static void intel_pmu_disable_all(void)
+{
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+}
+
+static void amd_pmu_disable_all(void)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ int idx;
+
+ if (!cpuc->enabled)
+ return;
+
+ cpuc->enabled = 0;
+ /*
+ * ensure we write the disable before we start disabling the
+ * counters proper, so that amd_pmu_enable_counter() does the
+ * right thing.
+ */
+ barrier();
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ u64 val;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
+ continue;
+ val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ }
+}
+
+void hw_perf_disable(void)
+{
+ if (!x86_pmu_initialized())
+ return;
+ return x86_pmu.disable_all();
+}
+
+static void intel_pmu_enable_all(void)
+{
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+}
+
+static void amd_pmu_enable_all(void)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ int idx;
+
+ if (cpuc->enabled)
+ return;
+
+ cpuc->enabled = 1;
+ barrier();
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ u64 val;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
+ continue;
+ val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ }
+}
+
+void hw_perf_enable(void)
+{
+ if (!x86_pmu_initialized())
+ return;
+ x86_pmu.enable_all();
+}
+
+static inline u64 intel_pmu_get_status(void)
+{
+ u64 status;
+
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+ return status;
+}
+
+static inline void intel_pmu_ack_status(u64 ack)
+{
+ wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ int err;
+ err = checking_wrmsrl(hwc->config_base + idx,
+ hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
+}
+
+static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ int err;
+ err = checking_wrmsrl(hwc->config_base + idx,
+ hwc->config);
+}
+
+static inline void
+intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
+{
+ int idx = __idx - X86_PMC_IDX_FIXED;
+ u64 ctrl_val, mask;
+ int err;
+
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrl(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static inline void
+intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+ intel_pmu_disable_fixed(hwc, idx);
+ return;
+ }
+
+ x86_pmu_disable_counter(hwc, idx);
+}
+
+static inline void
+amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ x86_pmu_disable_counter(hwc, idx);
+}
+
+static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
+
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the counter disabled in hw:
+ */
+static int
+x86_perf_counter_set_period(struct perf_counter *counter,
+ struct hw_perf_counter *hwc, int idx)
+{
+ s64 left = atomic64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ int err, ret = 0;
+
+ /*
+ * If we are way outside a reasoable range then just skip forward:
+ */
+ if (unlikely(left <= -period)) {
+ left = period;
+ atomic64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+
+ if (unlikely(left <= 0)) {
+ left += period;
+ atomic64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+ /*
+ * Quirk: certain CPUs dont like it if just 1 event is left:
+ */
+ if (unlikely(left < 2))
+ left = 2;
+
+ if (left > x86_pmu.max_period)
+ left = x86_pmu.max_period;
+
+ per_cpu(prev_left[idx], smp_processor_id()) = left;
+
+ /*
+ * The hw counter starts counting from this counter offset,
+ * mark it to be able to extra future deltas:
+ */
+ atomic64_set(&hwc->prev_count, (u64)-left);
+
+ err = checking_wrmsrl(hwc->counter_base + idx,
+ (u64)(-left) & x86_pmu.counter_mask);
+
+ return ret;
+}
+
+static inline void
+intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
+{
+ int idx = __idx - X86_PMC_IDX_FIXED;
+ u64 ctrl_val, bits, mask;
+ int err;
+
+ /*
+ * Enable IRQ generation (0x8),
+ * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+ * if requested:
+ */
+ bits = 0x8ULL;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+ bits |= 0x2;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+ bits |= 0x1;
+ bits <<= (idx * 4);
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrl(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ ctrl_val |= bits;
+ err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+ intel_pmu_enable_fixed(hwc, idx);
+ return;
+ }
+
+ x86_pmu_enable_counter(hwc, idx);
+}
+
+static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ if (cpuc->enabled)
+ x86_pmu_enable_counter(hwc, idx);
+ else
+ x86_pmu_disable_counter(hwc, idx);
+}
+
+static int
+fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
+{
+ unsigned int event;
+
+ if (!x86_pmu.num_counters_fixed)
+ return -1;
+
+ /*
+ * Quirk, IA32_FIXED_CTRs do not work on current Atom processors:
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+ boot_cpu_data.x86_model == 28)
+ return -1;
+
+ event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+ if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
+ return X86_PMC_IDX_FIXED_INSTRUCTIONS;
+ if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
+ return X86_PMC_IDX_FIXED_CPU_CYCLES;
+ if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
+ return X86_PMC_IDX_FIXED_BUS_CYCLES;
+
+ return -1;
+}
+
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in counter:
+ */
+static int x86_pmu_enable(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct hw_perf_counter *hwc = &counter->hw;
+ int idx;
+
+ idx = fixed_mode_idx(counter, hwc);
+ if (idx >= 0) {
+ /*
+ * Try to get the fixed counter, if that is already taken
+ * then try to get a generic counter:
+ */
+ if (test_and_set_bit(idx, cpuc->used_mask))
+ goto try_generic;
+
+ hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ /*
+ * We set it so that counter_base + idx in wrmsr/rdmsr maps to
+ * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+ */
+ hwc->counter_base =
+ MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
+ hwc->idx = idx;
+ } else {
+ idx = hwc->idx;
+ /* Try to get the previous generic counter again */
+ if (test_and_set_bit(idx, cpuc->used_mask)) {
+try_generic:
+ idx = find_first_zero_bit(cpuc->used_mask,
+ x86_pmu.num_counters);
+ if (idx == x86_pmu.num_counters)
+ return -EAGAIN;
+
+ set_bit(idx, cpuc->used_mask);
+ hwc->idx = idx;
+ }
+ hwc->config_base = x86_pmu.eventsel;
+ hwc->counter_base = x86_pmu.perfctr;
+ }
+
+ perf_counters_lapic_init();
+
+ x86_pmu.disable(hwc, idx);
+
+ cpuc->counters[idx] = counter;
+ set_bit(idx, cpuc->active_mask);
+
+ x86_perf_counter_set_period(counter, hwc, idx);
+ x86_pmu.enable(hwc, idx);
+
+ return 0;
+}
+
+static void x86_pmu_unthrottle(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct hw_perf_counter *hwc = &counter->hw;
+
+ if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
+ cpuc->counters[hwc->idx] != counter))
+ return;
+
+ x86_pmu.enable(hwc, hwc->idx);
+}
+
+void perf_counter_print_debug(void)
+{
+ u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+ struct cpu_hw_counters *cpuc;
+ unsigned long flags;
+ int cpu, idx;
+
+ if (!x86_pmu.num_counters)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+ if (x86_pmu.version >= 2) {
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+ rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+ pr_info("\n");
+ pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
+ pr_info("CPU#%d: status: %016llx\n", cpu, status);
+ pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
+ pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
+ }
+ pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
+ rdmsrl(x86_pmu.perfctr + idx, pmc_count);
+
+ prev_left = per_cpu(prev_left[idx], cpu);
+
+ pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
+ cpu, idx, pmc_ctrl);
+ pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
+ cpu, idx, pmc_count);
+ pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
+ cpu, idx, prev_left);
+ }
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+ pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
+ cpu, idx, pmc_count);
+ }
+ local_irq_restore(flags);
+}
+
+static void x86_pmu_disable(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct hw_perf_counter *hwc = &counter->hw;
+ int idx = hwc->idx;
+
+ /*
+ * Must be done before we disable, otherwise the nmi handler
+ * could reenable again:
+ */
+ clear_bit(idx, cpuc->active_mask);
+ x86_pmu.disable(hwc, idx);
+
+ /*
+ * Make sure the cleared pointer becomes visible before we
+ * (potentially) free the counter:
+ */
+ barrier();
+
+ /*
+ * Drain the remaining delta count out of a counter
+ * that we are disabling:
+ */
+ x86_perf_counter_update(counter, hwc, idx);
+ cpuc->counters[idx] = NULL;
+ clear_bit(idx, cpuc->used_mask);
+}
+
+/*
+ * Save and restart an expired counter. Called by NMI contexts,
+ * so it has to be careful about preempting normal counter ops:
+ */
+static int intel_pmu_save_and_restart(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ int idx = hwc->idx;
+ int ret;
+
+ x86_perf_counter_update(counter, hwc, idx);
+ ret = x86_perf_counter_set_period(counter, hwc, idx);
+
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+ intel_pmu_enable_counter(hwc, idx);
+
+ return ret;
+}
+
+static void intel_pmu_reset(void)
+{
+ unsigned long flags;
+ int idx;
+
+ if (!x86_pmu.num_counters)
+ return;
+
+ local_irq_save(flags);
+
+ printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
+ checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
+ }
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+ }
+
+ local_irq_restore(flags);
+}
+
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_counters *cpuc;
+ int bit, cpu, loops;
+ u64 ack, status;
+
+ data.regs = regs;
+ data.addr = 0;
+
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+ perf_disable();
+ status = intel_pmu_get_status();
+ if (!status) {
+ perf_enable();
+ return 0;
+ }
+
+ loops = 0;
+again:
+ if (++loops > 100) {
+ WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
+ perf_counter_print_debug();
+ intel_pmu_reset();
+ perf_enable();
+ return 1;
+ }
+
+ inc_irq_stat(apic_perf_irqs);
+ ack = status;
+ for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+ struct perf_counter *counter = cpuc->counters[bit];
+
+ clear_bit(bit, (unsigned long *) &status);
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+
+ if (!intel_pmu_save_and_restart(counter))
+ continue;
+
+ if (perf_counter_overflow(counter, 1, &data))
+ intel_pmu_disable_counter(&counter->hw, bit);
+ }
+
+ intel_pmu_ack_status(ack);
+
+ /*
+ * Repeat if there is more work to be done:
+ */
+ status = intel_pmu_get_status();
+ if (status)
+ goto again;
+
+ perf_enable();
+
+ return 1;
+}
+
+static int amd_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_counters *cpuc;
+ struct perf_counter *counter;
+ struct hw_perf_counter *hwc;
+ int cpu, idx, handled = 0;
+ u64 val;
+
+ data.regs = regs;
+ data.addr = 0;
+
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ counter = cpuc->counters[idx];
+ hwc = &counter->hw;
+
+ val = x86_perf_counter_update(counter, hwc, idx);
+ if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+ continue;
+
+ /*
+ * counter overflow
+ */
+ handled = 1;
+ data.period = counter->hw.last_period;
+
+ if (!x86_perf_counter_set_period(counter, hwc, idx))
+ continue;
+
+ if (perf_counter_overflow(counter, 1, &data))
+ amd_pmu_disable_counter(hwc, idx);
+ }
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ return handled;
+}
+
+void smp_perf_pending_interrupt(struct pt_regs *regs)
+{
+ irq_enter();
+ ack_APIC_irq();
+ inc_irq_stat(apic_pending_irqs);
+ perf_counter_do_pending();
+ irq_exit();
+}
+
+void set_perf_counter_pending(void)
+{
+ apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+}
+
+void perf_counters_lapic_init(void)
+{
+ if (!x86_pmu_initialized())
+ return;
+
+ /*
+ * Always use NMI for PMU
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+}
+
+static int __kprobes
+perf_counter_nmi_handler(struct notifier_block *self,
+ unsigned long cmd, void *__args)
+{
+ struct die_args *args = __args;
+ struct pt_regs *regs;
+
+ if (!atomic_read(&active_counters))
+ return NOTIFY_DONE;
+
+ switch (cmd) {
+ case DIE_NMI:
+ case DIE_NMI_IPI:
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ regs = args->regs;
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ /*
+ * Can't rely on the handled return value to say it was our NMI, two
+ * counters could trigger 'simultaneously' raising two back-to-back NMIs.
+ *
+ * If the first NMI handles both, the latter will be empty and daze
+ * the CPU.
+ */
+ x86_pmu.handle_irq(regs);
+
+ return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
+ .notifier_call = perf_counter_nmi_handler,
+ .next = NULL,
+ .priority = 1
+};
+
+static struct x86_pmu intel_pmu = {
+ .name = "Intel",
+ .handle_irq = intel_pmu_handle_irq,
+ .disable_all = intel_pmu_disable_all,
+ .enable_all = intel_pmu_enable_all,
+ .enable = intel_pmu_enable_counter,
+ .disable = intel_pmu_disable_counter,
+ .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .event_map = intel_pmu_event_map,
+ .raw_event = intel_pmu_raw_event,
+ .max_events = ARRAY_SIZE(intel_perfmon_event_map),
+ /*
+ * Intel PMCs cannot be accessed sanely above 32 bit width,
+ * so we install an artificial 1<<31 period regardless of
+ * the generic counter period:
+ */
+ .max_period = (1ULL << 31) - 1,
+};
+
+static struct x86_pmu amd_pmu = {
+ .name = "AMD",
+ .handle_irq = amd_pmu_handle_irq,
+ .disable_all = amd_pmu_disable_all,
+ .enable_all = amd_pmu_enable_all,
+ .enable = amd_pmu_enable_counter,
+ .disable = amd_pmu_disable_counter,
+ .eventsel = MSR_K7_EVNTSEL0,
+ .perfctr = MSR_K7_PERFCTR0,
+ .event_map = amd_pmu_event_map,
+ .raw_event = amd_pmu_raw_event,
+ .max_events = ARRAY_SIZE(amd_perfmon_event_map),
+ .num_counters = 4,
+ .counter_bits = 48,
+ .counter_mask = (1ULL << 48) - 1,
+ /* use highest bit to detect overflow */
+ .max_period = (1ULL << 47) - 1,
+};
+
+static int intel_pmu_init(void)
+{
+ union cpuid10_edx edx;
+ union cpuid10_eax eax;
+ unsigned int unused;
+ unsigned int ebx;
+ int version;
+
+ if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return -ENODEV;
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * Branch Misses Retired Event or not.
+ */
+ cpuid(10, &eax.full, &ebx, &unused, &edx.full);
+ if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+ return -ENODEV;
+
+ version = eax.split.version_id;
+ if (version < 2)
+ return -ENODEV;
+
+ x86_pmu = intel_pmu;
+ x86_pmu.version = version;
+ x86_pmu.num_counters = eax.split.num_counters;
+ x86_pmu.counter_bits = eax.split.bit_width;
+ x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
+
+ /*
+ * Quirk: v2 perfmon does not report fixed-purpose counters, so
+ * assume at least 3 counters:
+ */
+ x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
+
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+ /*
+ * Install the hw-cache-events table:
+ */
+ switch (boot_cpu_data.x86_model) {
+ case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+ case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+ case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+ case 29: /* six-core 45 nm xeon "Dunnington" */
+ memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Core2 events, ");
+ break;
+ default:
+ case 26:
+ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Nehalem/Corei7 events, ");
+ break;
+ case 28:
+ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Atom events, ");
+ break;
+ }
+ return 0;
+}
+
+static int amd_pmu_init(void)
+{
+ x86_pmu = amd_pmu;
+
+ switch (boot_cpu_data.x86) {
+ case 0x0f:
+ case 0x10:
+ case 0x11:
+ memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("AMD Family 0f/10/11 events, ");
+ break;
+ }
+ return 0;
+}
+
+void __init init_hw_perf_counters(void)
+{
+ int err;
+
+ pr_info("Performance Counters: ");
+
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ err = intel_pmu_init();
+ break;
+ case X86_VENDOR_AMD:
+ err = amd_pmu_init();
+ break;
+ default:
+ return;
+ }
+ if (err != 0) {
+ pr_cont("no PMU driver, software counters only.\n");
+ return;
+ }
+
+ pr_cont("%s PMU driver.\n", x86_pmu.name);
+
+ if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
+ x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
+ WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
+ x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
+ }
+ perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
+ perf_max_counters = x86_pmu.num_counters;
+
+ if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
+ x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
+ WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
+ x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
+ }
+
+ perf_counter_mask |=
+ ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
+
+ perf_counters_lapic_init();
+ register_die_notifier(&perf_counter_nmi_notifier);
+
+ pr_info("... version: %d\n", x86_pmu.version);
+ pr_info("... bit width: %d\n", x86_pmu.counter_bits);
+ pr_info("... generic counters: %d\n", x86_pmu.num_counters);
+ pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask);
+ pr_info("... max period: %016Lx\n", x86_pmu.max_period);
+ pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed);
+ pr_info("... counter mask: %016Lx\n", perf_counter_mask);
+}
+
+static inline void x86_pmu_read(struct perf_counter *counter)
+{
+ x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
+}
+
+static const struct pmu pmu = {
+ .enable = x86_pmu_enable,
+ .disable = x86_pmu_disable,
+ .read = x86_pmu_read,
+ .unthrottle = x86_pmu_unthrottle,
+};
+
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+ int err;
+
+ err = __hw_perf_counter_init(counter);
+ if (err)
+ return ERR_PTR(err);
+
+ return &pmu;
+}
+
+/*
+ * callchain support
+ */
+
+static inline
+void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
+{
+ if (entry->nr < MAX_STACK_DEPTH)
+ entry->ip[entry->nr++] = ip;
+}
+
+static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+
+
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ /* Ignore warnings */
+}
+
+static void backtrace_warning(void *data, char *msg)
+{
+ /* Ignore warnings */
+}
+
+static int backtrace_stack(void *data, char *name)
+{
+ /* Don't bother with IRQ stacks for now */
+ return -1;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+ struct perf_callchain_entry *entry = data;
+
+ if (reliable)
+ callchain_store(entry, addr);
+}
+
+static const struct stacktrace_ops backtrace_ops = {
+ .warning = backtrace_warning,
+ .warning_symbol = backtrace_warning_symbol,
+ .stack = backtrace_stack,
+ .address = backtrace_address,
+};
+
+static void
+perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+ unsigned long bp;
+ char *stack;
+ int nr = entry->nr;
+
+ callchain_store(entry, instruction_pointer(regs));
+
+ stack = ((char *)regs + sizeof(struct pt_regs));
+#ifdef CONFIG_FRAME_POINTER
+ bp = frame_pointer(regs);
+#else
+ bp = 0;
+#endif
+
+ dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
+
+ entry->kernel = entry->nr - nr;
+}
+
+
+struct stack_frame {
+ const void __user *next_fp;
+ unsigned long return_address;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+ return 0;
+
+ ret = 1;
+ pagefault_disable();
+ if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+ ret = 0;
+ pagefault_enable();
+
+ return ret;
+}
+
+static void
+perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+ struct stack_frame frame;
+ const void __user *fp;
+ int nr = entry->nr;
+
+ regs = (struct pt_regs *)current->thread.sp0 - 1;
+ fp = (void __user *)regs->bp;
+
+ callchain_store(entry, regs->ip);
+
+ while (entry->nr < MAX_STACK_DEPTH) {
+ frame.next_fp = NULL;
+ frame.return_address = 0;
+
+ if (!copy_stack_frame(fp, &frame))
+ break;
+
+ if ((unsigned long)fp < user_stack_pointer(regs))
+ break;
+
+ callchain_store(entry, frame.return_address);
+ fp = frame.next_fp;
+ }
+
+ entry->user = entry->nr - nr;
+}
+
+static void
+perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+ int is_user;
+
+ if (!regs)
+ return;
+
+ is_user = user_mode(regs);
+
+ if (!current || current->pid == 0)
+ return;
+
+ if (is_user && current->state != TASK_RUNNING)
+ return;
+
+ if (!is_user)
+ perf_callchain_kernel(regs, entry);
+
+ if (current->mm)
+ perf_callchain_user(regs, entry);
+}
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+ struct perf_callchain_entry *entry;
+
+ if (in_nmi())
+ entry = &__get_cpu_var(nmi_entry);
+ else
+ entry = &__get_cpu_var(irq_entry);
+
+ entry->nr = 0;
+ entry->hv = 0;
+ entry->kernel = 0;
+ entry->user = 0;
+
+ perf_do_callchain(regs, entry);
+
+ return entry;
+}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f6c70a164e3..d6f5b9fbde3 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
#include <linux/nmi.h>
#include <linux/kprobes.h>
-#include <asm/genapic.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/apic.h>
+#include <asm/perf_counter.h>
struct nmi_watchdog_ctlblk {
unsigned int cccr_msr;
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 87b67e3a765..48bfe138603 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -19,45 +19,61 @@
* Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
*/
-
-#include <asm/ds.h>
-
-#include <linux/errno.h>
+#include <linux/kernel.h>
#include <linux/string.h>
-#include <linux/slab.h>
+#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/slab.h>
#include <linux/mm.h>
-#include <linux/kernel.h>
+#include <linux/trace_clock.h>
+
+#include <asm/ds.h>
+#include "ds_selftest.h"
/*
- * The configuration for a particular DS hardware implementation.
+ * The configuration for a particular DS hardware implementation:
*/
struct ds_configuration {
- /* the name of the configuration */
- const char *name;
- /* the size of one pointer-typed field in the DS structure and
- in the BTS and PEBS buffers in bytes;
- this covers the first 8 DS fields related to buffer management. */
- unsigned char sizeof_field;
- /* the size of a BTS/PEBS record in bytes */
- unsigned char sizeof_rec[2];
- /* a series of bit-masks to control various features indexed
- * by enum ds_feature */
- unsigned long ctl[dsf_ctl_max];
+ /* The name of the configuration: */
+ const char *name;
+
+ /* The size of pointer-typed fields in DS, BTS, and PEBS: */
+ unsigned char sizeof_ptr_field;
+
+ /* The size of a BTS/PEBS record in bytes: */
+ unsigned char sizeof_rec[2];
+
+ /* The number of pebs counter reset values in the DS structure. */
+ unsigned char nr_counter_reset;
+
+ /* Control bit-masks indexed by enum ds_feature: */
+ unsigned long ctl[dsf_ctl_max];
};
-static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
+static struct ds_configuration ds_cfg __read_mostly;
+
+
+/* Maximal size of a DS configuration: */
+#define MAX_SIZEOF_DS 0x80
-#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
+/* Maximal size of a BTS record: */
+#define MAX_SIZEOF_BTS (3 * 8)
-#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */
-#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */
-#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
+/* BTS and PEBS buffer alignment: */
+#define DS_ALIGNMENT (1 << 3)
-#define BTS_CONTROL \
- (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
- ds_cfg.ctl[dsf_bts_overflow])
+/* Number of buffer pointers in DS: */
+#define NUM_DS_PTR_FIELDS 8
+/* Size of a pebs reset value in DS: */
+#define PEBS_RESET_FIELD_SIZE 8
+
+/* Mask of control bits in the DS MSR register: */
+#define BTS_CONTROL \
+ ( ds_cfg.ctl[dsf_bts] | \
+ ds_cfg.ctl[dsf_bts_kernel] | \
+ ds_cfg.ctl[dsf_bts_user] | \
+ ds_cfg.ctl[dsf_bts_overflow] )
/*
* A BTS or PEBS tracer.
@@ -66,29 +82,36 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
* to identify tracers.
*/
struct ds_tracer {
- /* the DS context (partially) owned by this tracer */
- struct ds_context *context;
- /* the buffer provided on ds_request() and its size in bytes */
- void *buffer;
- size_t size;
+ /* The DS context (partially) owned by this tracer. */
+ struct ds_context *context;
+ /* The buffer provided on ds_request() and its size in bytes. */
+ void *buffer;
+ size_t size;
};
struct bts_tracer {
- /* the common DS part */
- struct ds_tracer ds;
- /* the trace including the DS configuration */
- struct bts_trace trace;
- /* buffer overflow notification function */
- bts_ovfl_callback_t ovfl;
+ /* The common DS part: */
+ struct ds_tracer ds;
+
+ /* The trace including the DS configuration: */
+ struct bts_trace trace;
+
+ /* Buffer overflow notification function: */
+ bts_ovfl_callback_t ovfl;
+
+ /* Active flags affecting trace collection. */
+ unsigned int flags;
};
struct pebs_tracer {
- /* the common DS part */
- struct ds_tracer ds;
- /* the trace including the DS configuration */
- struct pebs_trace trace;
- /* buffer overflow notification function */
- pebs_ovfl_callback_t ovfl;
+ /* The common DS part: */
+ struct ds_tracer ds;
+
+ /* The trace including the DS configuration: */
+ struct pebs_trace trace;
+
+ /* Buffer overflow notification function: */
+ pebs_ovfl_callback_t ovfl;
};
/*
@@ -97,6 +120,7 @@ struct pebs_tracer {
*
* The DS configuration consists of the following fields; different
* architetures vary in the size of those fields.
+ *
* - double-word aligned base linear address of the BTS buffer
* - write pointer into the BTS buffer
* - end linear address of the BTS buffer (one byte beyond the end of
@@ -135,21 +159,22 @@ enum ds_field {
};
enum ds_qualifier {
- ds_bts = 0,
+ ds_bts = 0,
ds_pebs
};
-static inline unsigned long ds_get(const unsigned char *base,
- enum ds_qualifier qual, enum ds_field field)
+static inline unsigned long
+ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
{
- base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+ base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
return *(unsigned long *)base;
}
-static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
- enum ds_field field, unsigned long value)
+static inline void
+ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
+ unsigned long value)
{
- base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+ base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
(*(unsigned long *)base) = value;
}
@@ -159,7 +184,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
*/
static DEFINE_SPINLOCK(ds_lock);
-
/*
* We either support (system-wide) per-cpu or per-thread allocation.
* We distinguish the two based on the task_struct pointer, where a
@@ -178,12 +202,28 @@ static DEFINE_SPINLOCK(ds_lock);
*/
static atomic_t tracers = ATOMIC_INIT(0);
-static inline void get_tracer(struct task_struct *task)
+static inline int get_tracer(struct task_struct *task)
{
- if (task)
+ int error;
+
+ spin_lock_irq(&ds_lock);
+
+ if (task) {
+ error = -EPERM;
+ if (atomic_read(&tracers) < 0)
+ goto out;
atomic_inc(&tracers);
- else
+ } else {
+ error = -EPERM;
+ if (atomic_read(&tracers) > 0)
+ goto out;
atomic_dec(&tracers);
+ }
+
+ error = 0;
+out:
+ spin_unlock_irq(&ds_lock);
+ return error;
}
static inline void put_tracer(struct task_struct *task)
@@ -194,14 +234,6 @@ static inline void put_tracer(struct task_struct *task)
atomic_inc(&tracers);
}
-static inline int check_tracer(struct task_struct *task)
-{
- return task ?
- (atomic_read(&tracers) >= 0) :
- (atomic_read(&tracers) <= 0);
-}
-
-
/*
* The DS context is either attached to a thread or to a cpu:
* - in the former case, the thread_struct contains a pointer to the
@@ -213,61 +245,58 @@ static inline int check_tracer(struct task_struct *task)
* deallocated when the last user puts the context.
*/
struct ds_context {
- /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
- unsigned char ds[MAX_SIZEOF_DS];
- /* the owner of the BTS and PEBS configuration, respectively */
- struct bts_tracer *bts_master;
- struct pebs_tracer *pebs_master;
- /* use count */
- unsigned long count;
- /* a pointer to the context location inside the thread_struct
- * or the per_cpu context array */
- struct ds_context **this;
- /* a pointer to the task owning this context, or NULL, if the
- * context is owned by a cpu */
- struct task_struct *task;
-};
+ /* The DS configuration; goes into MSR_IA32_DS_AREA: */
+ unsigned char ds[MAX_SIZEOF_DS];
+
+ /* The owner of the BTS and PEBS configuration, respectively: */
+ struct bts_tracer *bts_master;
+ struct pebs_tracer *pebs_master;
-static DEFINE_PER_CPU(struct ds_context *, system_context_array);
+ /* Use count: */
+ unsigned long count;
-#define system_context per_cpu(system_context_array, smp_processor_id())
+ /* Pointer to the context pointer field: */
+ struct ds_context **this;
+
+ /* The traced task; NULL for cpu tracing: */
+ struct task_struct *task;
+
+ /* The traced cpu; only valid if task is NULL: */
+ int cpu;
+};
+static DEFINE_PER_CPU(struct ds_context *, cpu_context);
-static inline struct ds_context *ds_get_context(struct task_struct *task)
+
+static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
{
struct ds_context **p_context =
- (task ? &task->thread.ds_ctx : &system_context);
+ (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu));
struct ds_context *context = NULL;
struct ds_context *new_context = NULL;
- unsigned long irq;
/* Chances are small that we already have a context. */
new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
if (!new_context)
return NULL;
- spin_lock_irqsave(&ds_lock, irq);
+ spin_lock_irq(&ds_lock);
context = *p_context;
- if (!context) {
+ if (likely(!context)) {
context = new_context;
context->this = p_context;
context->task = task;
+ context->cpu = cpu;
context->count = 0;
- if (task)
- set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-
- if (!task || (task == current))
- wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
-
*p_context = context;
}
context->count++;
- spin_unlock_irqrestore(&ds_lock, irq);
+ spin_unlock_irq(&ds_lock);
if (context != new_context)
kfree(new_context);
@@ -275,8 +304,9 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
return context;
}
-static inline void ds_put_context(struct ds_context *context)
+static void ds_put_context(struct ds_context *context)
{
+ struct task_struct *task;
unsigned long irq;
if (!context)
@@ -291,17 +321,55 @@ static inline void ds_put_context(struct ds_context *context)
*(context->this) = NULL;
- if (context->task)
- clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+ task = context->task;
+
+ if (task)
+ clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
- if (!context->task || (context->task == current))
- wrmsrl(MSR_IA32_DS_AREA, 0);
+ /*
+ * We leave the (now dangling) pointer to the DS configuration in
+ * the DS_AREA msr. This is as good or as bad as replacing it with
+ * NULL - the hardware would crash if we enabled tracing.
+ *
+ * This saves us some problems with having to write an msr on a
+ * different cpu while preventing others from doing the same for the
+ * next context for that same cpu.
+ */
spin_unlock_irqrestore(&ds_lock, irq);
+ /* The context might still be in use for context switching. */
+ if (task && (task != current))
+ wait_task_context_switch(task);
+
kfree(context);
}
+static void ds_install_ds_area(struct ds_context *context)
+{
+ unsigned long ds;
+
+ ds = (unsigned long)context->ds;
+
+ /*
+ * There is a race between the bts master and the pebs master.
+ *
+ * The thread/cpu access is synchronized via get/put_cpu() for
+ * task tracing and via wrmsr_on_cpu for cpu tracing.
+ *
+ * If bts and pebs are collected for the same task or same cpu,
+ * the same confiuration is written twice.
+ */
+ if (context->task) {
+ get_cpu();
+ if (context->task == current)
+ wrmsrl(MSR_IA32_DS_AREA, ds);
+ set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+ put_cpu();
+ } else
+ wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
+ (u32)((u64)ds), (u32)((u64)ds >> 32));
+}
/*
* Call the tracer's callback on a buffer overflow.
@@ -332,9 +400,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
* The remainder of any partially written record is zeroed out.
*
* context: the DS context
- * qual: the buffer type
- * record: the data to write
- * size: the size of the data
+ * qual: the buffer type
+ * record: the data to write
+ * size: the size of the data
*/
static int ds_write(struct ds_context *context, enum ds_qualifier qual,
const void *record, size_t size)
@@ -349,14 +417,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
unsigned long write_size, adj_write_size;
/*
- * write as much as possible without producing an
+ * Write as much as possible without producing an
* overflow interrupt.
*
- * interrupt_threshold must either be
+ * Interrupt_threshold must either be
* - bigger than absolute_maximum or
* - point to a record between buffer_base and absolute_maximum
*
- * index points to a valid record.
+ * Index points to a valid record.
*/
base = ds_get(context->ds, qual, ds_buffer_base);
index = ds_get(context->ds, qual, ds_index);
@@ -365,8 +433,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
write_end = min(end, int_th);
- /* if we are already beyond the interrupt threshold,
- * we fill the entire buffer */
+ /*
+ * If we are already beyond the interrupt threshold,
+ * we fill the entire buffer.
+ */
if (write_end <= index)
write_end = end;
@@ -383,7 +453,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
adj_write_size *= ds_cfg.sizeof_rec[qual];
- /* zero out trailing bytes */
+ /* Zero out trailing bytes. */
memset((char *)index + write_size, 0,
adj_write_size - write_size);
index += adj_write_size;
@@ -410,7 +480,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
* Later architectures use 64bit pointers throughout, whereas earlier
* architectures use 32bit pointers in 32bit mode.
*
- * We compute the base address for the first 8 fields based on:
+ * We compute the base address for the fields based on:
* - the field size stored in the DS configuration
* - the relative field position
*
@@ -431,23 +501,23 @@ enum bts_field {
bts_to,
bts_flags,
- bts_qual = bts_from,
- bts_jiffies = bts_to,
- bts_pid = bts_flags,
+ bts_qual = bts_from,
+ bts_clock = bts_to,
+ bts_pid = bts_flags,
- bts_qual_mask = (bts_qual_max - 1),
- bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
+ bts_qual_mask = (bts_qual_max - 1),
+ bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
};
static inline unsigned long bts_get(const char *base, enum bts_field field)
{
- base += (ds_cfg.sizeof_field * field);
+ base += (ds_cfg.sizeof_ptr_field * field);
return *(unsigned long *)base;
}
static inline void bts_set(char *base, enum bts_field field, unsigned long val)
{
- base += (ds_cfg.sizeof_field * field);;
+ base += (ds_cfg.sizeof_ptr_field * field);;
(*(unsigned long *)base) = val;
}
@@ -463,8 +533,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val)
*
* return: bytes read/written on success; -Eerrno, otherwise
*/
-static int bts_read(struct bts_tracer *tracer, const void *at,
- struct bts_struct *out)
+static int
+bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
{
if (!tracer)
return -EINVAL;
@@ -478,8 +548,8 @@ static int bts_read(struct bts_tracer *tracer, const void *at,
memset(out, 0, sizeof(*out));
if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
- out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
- out->variant.timestamp.pid = bts_get(at, bts_pid);
+ out->variant.event.clock = bts_get(at, bts_clock);
+ out->variant.event.pid = bts_get(at, bts_pid);
} else {
out->qualifier = bts_branch;
out->variant.lbr.from = bts_get(at, bts_from);
@@ -516,8 +586,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
case bts_task_arrives:
case bts_task_departs:
bts_set(raw, bts_qual, (bts_escape | in->qualifier));
- bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
- bts_set(raw, bts_pid, in->variant.timestamp.pid);
+ bts_set(raw, bts_clock, in->variant.event.clock);
+ bts_set(raw, bts_pid, in->variant.event.pid);
break;
default:
return -EINVAL;
@@ -555,7 +625,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
unsigned int flags) {
unsigned long buffer, adj;
- /* adjust the buffer address and size to meet alignment
+ /*
+ * Adjust the buffer address and size to meet alignment
* constraints:
* - buffer is double-word aligned
* - size is multiple of record size
@@ -577,9 +648,11 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
trace->begin = (void *)buffer;
trace->top = trace->begin;
trace->end = (void *)(buffer + size);
- /* The value for 'no threshold' is -1, which will set the
+ /*
+ * The value for 'no threshold' is -1, which will set the
* threshold outside of the buffer, just like we want it.
*/
+ ith *= ds_cfg.sizeof_rec[qual];
trace->ith = (void *)(buffer + size - ith);
trace->flags = flags;
@@ -588,18 +661,27 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
enum ds_qualifier qual, struct task_struct *task,
- void *base, size_t size, size_t th, unsigned int flags)
+ int cpu, void *base, size_t size, size_t th)
{
struct ds_context *context;
int error;
+ size_t req_size;
+
+ error = -EOPNOTSUPP;
+ if (!ds_cfg.sizeof_rec[qual])
+ goto out;
error = -EINVAL;
if (!base)
goto out;
- /* we require some space to do alignment adjustments below */
+ req_size = ds_cfg.sizeof_rec[qual];
+ /* We might need space for alignment adjustments. */
+ if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
+ req_size += DS_ALIGNMENT;
+
error = -EINVAL;
- if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+ if (size < req_size)
goto out;
if (th != (size_t)-1) {
@@ -614,182 +696,318 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
tracer->size = size;
error = -ENOMEM;
- context = ds_get_context(task);
+ context = ds_get_context(task, cpu);
if (!context)
goto out;
tracer->context = context;
- ds_init_ds_trace(trace, qual, base, size, th, flags);
+ /*
+ * Defer any tracer-specific initialization work for the context until
+ * context ownership has been clarified.
+ */
error = 0;
out:
return error;
}
-struct bts_tracer *ds_request_bts(struct task_struct *task,
- void *base, size_t size,
- bts_ovfl_callback_t ovfl, size_t th,
- unsigned int flags)
+static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
+ void *base, size_t size,
+ bts_ovfl_callback_t ovfl, size_t th,
+ unsigned int flags)
{
struct bts_tracer *tracer;
- unsigned long irq;
int error;
+ /* Buffer overflow notification is not yet implemented. */
error = -EOPNOTSUPP;
- if (!ds_cfg.ctl[dsf_bts])
+ if (ovfl)
goto out;
- /* buffer overflow notification is not yet implemented */
- error = -EOPNOTSUPP;
- if (ovfl)
+ error = get_tracer(task);
+ if (error < 0)
goto out;
error = -ENOMEM;
tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
if (!tracer)
- goto out;
+ goto out_put_tracer;
tracer->ovfl = ovfl;
+ /* Do some more error checking and acquire a tracing context. */
error = ds_request(&tracer->ds, &tracer->trace.ds,
- ds_bts, task, base, size, th, flags);
+ ds_bts, task, cpu, base, size, th);
if (error < 0)
goto out_tracer;
-
- spin_lock_irqsave(&ds_lock, irq);
-
- error = -EPERM;
- if (!check_tracer(task))
- goto out_unlock;
- get_tracer(task);
+ /* Claim the bts part of the tracing context we acquired above. */
+ spin_lock_irq(&ds_lock);
error = -EPERM;
if (tracer->ds.context->bts_master)
- goto out_put_tracer;
+ goto out_unlock;
tracer->ds.context->bts_master = tracer;
- spin_unlock_irqrestore(&ds_lock, irq);
+ spin_unlock_irq(&ds_lock);
+ /*
+ * Now that we own the bts part of the context, let's complete the
+ * initialization for that part.
+ */
+ ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
+ ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+ ds_install_ds_area(tracer->ds.context);
tracer->trace.read = bts_read;
tracer->trace.write = bts_write;
- ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+ /* Start tracing. */
ds_resume_bts(tracer);
return tracer;
- out_put_tracer:
- put_tracer(task);
out_unlock:
- spin_unlock_irqrestore(&ds_lock, irq);
+ spin_unlock_irq(&ds_lock);
ds_put_context(tracer->ds.context);
out_tracer:
kfree(tracer);
+ out_put_tracer:
+ put_tracer(task);
out:
return ERR_PTR(error);
}
-struct pebs_tracer *ds_request_pebs(struct task_struct *task,
- void *base, size_t size,
- pebs_ovfl_callback_t ovfl, size_t th,
- unsigned int flags)
+struct bts_tracer *ds_request_bts_task(struct task_struct *task,
+ void *base, size_t size,
+ bts_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags)
+{
+ return ds_request_bts(task, 0, base, size, ovfl, th, flags);
+}
+
+struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
+ bts_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags)
+{
+ return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
+}
+
+static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
+ void *base, size_t size,
+ pebs_ovfl_callback_t ovfl, size_t th,
+ unsigned int flags)
{
struct pebs_tracer *tracer;
- unsigned long irq;
int error;
- /* buffer overflow notification is not yet implemented */
+ /* Buffer overflow notification is not yet implemented. */
error = -EOPNOTSUPP;
if (ovfl)
goto out;
+ error = get_tracer(task);
+ if (error < 0)
+ goto out;
+
error = -ENOMEM;
tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
if (!tracer)
- goto out;
+ goto out_put_tracer;
tracer->ovfl = ovfl;
+ /* Do some more error checking and acquire a tracing context. */
error = ds_request(&tracer->ds, &tracer->trace.ds,
- ds_pebs, task, base, size, th, flags);
+ ds_pebs, task, cpu, base, size, th);
if (error < 0)
goto out_tracer;
- spin_lock_irqsave(&ds_lock, irq);
-
- error = -EPERM;
- if (!check_tracer(task))
- goto out_unlock;
- get_tracer(task);
+ /* Claim the pebs part of the tracing context we acquired above. */
+ spin_lock_irq(&ds_lock);
error = -EPERM;
if (tracer->ds.context->pebs_master)
- goto out_put_tracer;
+ goto out_unlock;
tracer->ds.context->pebs_master = tracer;
- spin_unlock_irqrestore(&ds_lock, irq);
+ spin_unlock_irq(&ds_lock);
+ /*
+ * Now that we own the pebs part of the context, let's complete the
+ * initialization for that part.
+ */
+ ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
+ ds_install_ds_area(tracer->ds.context);
+
+ /* Start tracing. */
ds_resume_pebs(tracer);
return tracer;
- out_put_tracer:
- put_tracer(task);
out_unlock:
- spin_unlock_irqrestore(&ds_lock, irq);
+ spin_unlock_irq(&ds_lock);
ds_put_context(tracer->ds.context);
out_tracer:
kfree(tracer);
+ out_put_tracer:
+ put_tracer(task);
out:
return ERR_PTR(error);
}
-void ds_release_bts(struct bts_tracer *tracer)
+struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
+ void *base, size_t size,
+ pebs_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags)
{
- if (!tracer)
- return;
+ return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
+}
- ds_suspend_bts(tracer);
+struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
+ pebs_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags)
+{
+ return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
+}
+
+static void ds_free_bts(struct bts_tracer *tracer)
+{
+ struct task_struct *task;
+
+ task = tracer->ds.context->task;
WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
tracer->ds.context->bts_master = NULL;
- put_tracer(tracer->ds.context->task);
+ /* Make sure tracing stopped and the tracer is not in use. */
+ if (task && (task != current))
+ wait_task_context_switch(task);
+
ds_put_context(tracer->ds.context);
+ put_tracer(task);
kfree(tracer);
}
+void ds_release_bts(struct bts_tracer *tracer)
+{
+ might_sleep();
+
+ if (!tracer)
+ return;
+
+ ds_suspend_bts(tracer);
+ ds_free_bts(tracer);
+}
+
+int ds_release_bts_noirq(struct bts_tracer *tracer)
+{
+ struct task_struct *task;
+ unsigned long irq;
+ int error;
+
+ if (!tracer)
+ return 0;
+
+ task = tracer->ds.context->task;
+
+ local_irq_save(irq);
+
+ error = -EPERM;
+ if (!task &&
+ (tracer->ds.context->cpu != smp_processor_id()))
+ goto out;
+
+ error = -EPERM;
+ if (task && (task != current))
+ goto out;
+
+ ds_suspend_bts_noirq(tracer);
+ ds_free_bts(tracer);
+
+ error = 0;
+ out:
+ local_irq_restore(irq);
+ return error;
+}
+
+static void update_task_debugctlmsr(struct task_struct *task,
+ unsigned long debugctlmsr)
+{
+ task->thread.debugctlmsr = debugctlmsr;
+
+ get_cpu();
+ if (task == current)
+ update_debugctlmsr(debugctlmsr);
+ put_cpu();
+}
+
void ds_suspend_bts(struct bts_tracer *tracer)
{
struct task_struct *task;
+ unsigned long debugctlmsr;
+ int cpu;
if (!tracer)
return;
+ tracer->flags = 0;
+
task = tracer->ds.context->task;
+ cpu = tracer->ds.context->cpu;
- if (!task || (task == current))
- update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
+ WARN_ON(!task && irqs_disabled());
- if (task) {
- task->thread.debugctlmsr &= ~BTS_CONTROL;
+ debugctlmsr = (task ?
+ task->thread.debugctlmsr :
+ get_debugctlmsr_on_cpu(cpu));
+ debugctlmsr &= ~BTS_CONTROL;
- if (!task->thread.debugctlmsr)
- clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
- }
+ if (task)
+ update_task_debugctlmsr(task, debugctlmsr);
+ else
+ update_debugctlmsr_on_cpu(cpu, debugctlmsr);
}
-void ds_resume_bts(struct bts_tracer *tracer)
+int ds_suspend_bts_noirq(struct bts_tracer *tracer)
{
struct task_struct *task;
- unsigned long control;
+ unsigned long debugctlmsr, irq;
+ int cpu, error = 0;
if (!tracer)
- return;
+ return 0;
+
+ tracer->flags = 0;
task = tracer->ds.context->task;
+ cpu = tracer->ds.context->cpu;
+
+ local_irq_save(irq);
+
+ error = -EPERM;
+ if (!task && (cpu != smp_processor_id()))
+ goto out;
+
+ debugctlmsr = (task ?
+ task->thread.debugctlmsr :
+ get_debugctlmsr());
+ debugctlmsr &= ~BTS_CONTROL;
+
+ if (task)
+ update_task_debugctlmsr(task, debugctlmsr);
+ else
+ update_debugctlmsr(debugctlmsr);
+
+ error = 0;
+ out:
+ local_irq_restore(irq);
+ return error;
+}
+
+static unsigned long ds_bts_control(struct bts_tracer *tracer)
+{
+ unsigned long control;
control = ds_cfg.ctl[dsf_bts];
if (!(tracer->trace.ds.flags & BTS_KERNEL))
@@ -797,41 +1015,149 @@ void ds_resume_bts(struct bts_tracer *tracer)
if (!(tracer->trace.ds.flags & BTS_USER))
control |= ds_cfg.ctl[dsf_bts_user];
- if (task) {
- task->thread.debugctlmsr |= control;
- set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
- }
-
- if (!task || (task == current))
- update_debugctlmsr(get_debugctlmsr() | control);
+ return control;
}
-void ds_release_pebs(struct pebs_tracer *tracer)
+void ds_resume_bts(struct bts_tracer *tracer)
{
+ struct task_struct *task;
+ unsigned long debugctlmsr;
+ int cpu;
+
if (!tracer)
return;
- ds_suspend_pebs(tracer);
+ tracer->flags = tracer->trace.ds.flags;
+
+ task = tracer->ds.context->task;
+ cpu = tracer->ds.context->cpu;
+
+ WARN_ON(!task && irqs_disabled());
+
+ debugctlmsr = (task ?
+ task->thread.debugctlmsr :
+ get_debugctlmsr_on_cpu(cpu));
+ debugctlmsr |= ds_bts_control(tracer);
+
+ if (task)
+ update_task_debugctlmsr(task, debugctlmsr);
+ else
+ update_debugctlmsr_on_cpu(cpu, debugctlmsr);
+}
+
+int ds_resume_bts_noirq(struct bts_tracer *tracer)
+{
+ struct task_struct *task;
+ unsigned long debugctlmsr, irq;
+ int cpu, error = 0;
+
+ if (!tracer)
+ return 0;
+
+ tracer->flags = tracer->trace.ds.flags;
+
+ task = tracer->ds.context->task;
+ cpu = tracer->ds.context->cpu;
+
+ local_irq_save(irq);
+
+ error = -EPERM;
+ if (!task && (cpu != smp_processor_id()))
+ goto out;
+
+ debugctlmsr = (task ?
+ task->thread.debugctlmsr :
+ get_debugctlmsr());
+ debugctlmsr |= ds_bts_control(tracer);
+
+ if (task)
+ update_task_debugctlmsr(task, debugctlmsr);
+ else
+ update_debugctlmsr(debugctlmsr);
+
+ error = 0;
+ out:
+ local_irq_restore(irq);
+ return error;
+}
+
+static void ds_free_pebs(struct pebs_tracer *tracer)
+{
+ struct task_struct *task;
+
+ task = tracer->ds.context->task;
WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
tracer->ds.context->pebs_master = NULL;
- put_tracer(tracer->ds.context->task);
ds_put_context(tracer->ds.context);
+ put_tracer(task);
kfree(tracer);
}
+void ds_release_pebs(struct pebs_tracer *tracer)
+{
+ might_sleep();
+
+ if (!tracer)
+ return;
+
+ ds_suspend_pebs(tracer);
+ ds_free_pebs(tracer);
+}
+
+int ds_release_pebs_noirq(struct pebs_tracer *tracer)
+{
+ struct task_struct *task;
+ unsigned long irq;
+ int error;
+
+ if (!tracer)
+ return 0;
+
+ task = tracer->ds.context->task;
+
+ local_irq_save(irq);
+
+ error = -EPERM;
+ if (!task &&
+ (tracer->ds.context->cpu != smp_processor_id()))
+ goto out;
+
+ error = -EPERM;
+ if (task && (task != current))
+ goto out;
+
+ ds_suspend_pebs_noirq(tracer);
+ ds_free_pebs(tracer);
+
+ error = 0;
+ out:
+ local_irq_restore(irq);
+ return error;
+}
+
void ds_suspend_pebs(struct pebs_tracer *tracer)
{
}
+int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
+{
+ return 0;
+}
+
void ds_resume_pebs(struct pebs_tracer *tracer)
{
}
+int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
+{
+ return 0;
+}
+
const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
{
if (!tracer)
@@ -847,8 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
return NULL;
ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
- tracer->trace.reset_value =
- *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
+
+ tracer->trace.counters = ds_cfg.nr_counter_reset;
+ memcpy(tracer->trace.counter_reset,
+ tracer->ds.context->ds +
+ (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
+ ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
return &tracer->trace;
}
@@ -873,18 +1203,24 @@ int ds_reset_pebs(struct pebs_tracer *tracer)
tracer->trace.ds.top = tracer->trace.ds.begin;
- ds_set(tracer->ds.context->ds, ds_bts, ds_index,
+ ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
(unsigned long)tracer->trace.ds.top);
return 0;
}
-int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
+int ds_set_pebs_reset(struct pebs_tracer *tracer,
+ unsigned int counter, u64 value)
{
if (!tracer)
return -EINVAL;
- *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
+ if (ds_cfg.nr_counter_reset < counter)
+ return -EINVAL;
+
+ *(u64 *)(tracer->ds.context->ds +
+ (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
+ (counter * PEBS_RESET_FIELD_SIZE)) = value;
return 0;
}
@@ -894,73 +1230,117 @@ static const struct ds_configuration ds_cfg_netburst = {
.ctl[dsf_bts] = (1 << 2) | (1 << 3),
.ctl[dsf_bts_kernel] = (1 << 5),
.ctl[dsf_bts_user] = (1 << 6),
-
- .sizeof_field = sizeof(long),
- .sizeof_rec[ds_bts] = sizeof(long) * 3,
-#ifdef __i386__
- .sizeof_rec[ds_pebs] = sizeof(long) * 10,
-#else
- .sizeof_rec[ds_pebs] = sizeof(long) * 18,
-#endif
+ .nr_counter_reset = 1,
};
static const struct ds_configuration ds_cfg_pentium_m = {
.name = "Pentium M",
.ctl[dsf_bts] = (1 << 6) | (1 << 7),
-
- .sizeof_field = sizeof(long),
- .sizeof_rec[ds_bts] = sizeof(long) * 3,
-#ifdef __i386__
- .sizeof_rec[ds_pebs] = sizeof(long) * 10,
-#else
- .sizeof_rec[ds_pebs] = sizeof(long) * 18,
-#endif
+ .nr_counter_reset = 1,
};
static const struct ds_configuration ds_cfg_core2_atom = {
.name = "Core 2/Atom",
.ctl[dsf_bts] = (1 << 6) | (1 << 7),
.ctl[dsf_bts_kernel] = (1 << 9),
.ctl[dsf_bts_user] = (1 << 10),
-
- .sizeof_field = 8,
- .sizeof_rec[ds_bts] = 8 * 3,
- .sizeof_rec[ds_pebs] = 8 * 18,
+ .nr_counter_reset = 1,
+};
+static const struct ds_configuration ds_cfg_core_i7 = {
+ .name = "Core i7",
+ .ctl[dsf_bts] = (1 << 6) | (1 << 7),
+ .ctl[dsf_bts_kernel] = (1 << 9),
+ .ctl[dsf_bts_user] = (1 << 10),
+ .nr_counter_reset = 4,
};
static void
-ds_configure(const struct ds_configuration *cfg)
+ds_configure(const struct ds_configuration *cfg,
+ struct cpuinfo_x86 *cpu)
{
+ unsigned long nr_pebs_fields = 0;
+
+ printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
+
+#ifdef __i386__
+ nr_pebs_fields = 10;
+#else
+ nr_pebs_fields = 18;
+#endif
+
+ /*
+ * Starting with version 2, architectural performance
+ * monitoring supports a format specifier.
+ */
+ if ((cpuid_eax(0xa) & 0xff) > 1) {
+ unsigned long perf_capabilities, format;
+
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
+
+ format = (perf_capabilities >> 8) & 0xf;
+
+ switch (format) {
+ case 0:
+ nr_pebs_fields = 18;
+ break;
+ case 1:
+ nr_pebs_fields = 22;
+ break;
+ default:
+ printk(KERN_INFO
+ "[ds] unknown PEBS format: %lu\n", format);
+ nr_pebs_fields = 0;
+ break;
+ }
+ }
+
memset(&ds_cfg, 0, sizeof(ds_cfg));
ds_cfg = *cfg;
- printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
+ ds_cfg.sizeof_ptr_field =
+ (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
+
+ ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3;
+ ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
- if (!cpu_has_bts) {
- ds_cfg.ctl[dsf_bts] = 0;
+ if (!cpu_has(cpu, X86_FEATURE_BTS)) {
+ ds_cfg.sizeof_rec[ds_bts] = 0;
printk(KERN_INFO "[ds] bts not available\n");
}
- if (!cpu_has_pebs)
+ if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
+ ds_cfg.sizeof_rec[ds_pebs] = 0;
printk(KERN_INFO "[ds] pebs not available\n");
+ }
+
+ printk(KERN_INFO "[ds] sizes: address: %u bit, ",
+ 8 * ds_cfg.sizeof_ptr_field);
+ printk("bts/pebs record: %u/%u bytes\n",
+ ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
- WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
+ WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
}
void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
{
+ /* Only configure the first cpu. Others are identical. */
+ if (ds_cfg.name)
+ return;
+
switch (c->x86) {
case 0x6:
switch (c->x86_model) {
case 0x9:
case 0xd: /* Pentium M */
- ds_configure(&ds_cfg_pentium_m);
+ ds_configure(&ds_cfg_pentium_m, c);
break;
case 0xf:
case 0x17: /* Core2 */
case 0x1c: /* Atom */
- ds_configure(&ds_cfg_core2_atom);
+ ds_configure(&ds_cfg_core2_atom, c);
+ break;
+ case 0x1a: /* Core i7 */
+ ds_configure(&ds_cfg_core_i7, c);
break;
- case 0x1a: /* i7 */
default:
- /* sorry, don't know about them */
+ /* Sorry, don't know about them. */
break;
}
break;
@@ -969,64 +1349,89 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
case 0x0:
case 0x1:
case 0x2: /* Netburst */
- ds_configure(&ds_cfg_netburst);
+ ds_configure(&ds_cfg_netburst, c);
break;
default:
- /* sorry, don't know about them */
+ /* Sorry, don't know about them. */
break;
}
break;
default:
- /* sorry, don't know about them */
+ /* Sorry, don't know about them. */
break;
}
}
+static inline void ds_take_timestamp(struct ds_context *context,
+ enum bts_qualifier qualifier,
+ struct task_struct *task)
+{
+ struct bts_tracer *tracer = context->bts_master;
+ struct bts_struct ts;
+
+ /* Prevent compilers from reading the tracer pointer twice. */
+ barrier();
+
+ if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
+ return;
+
+ memset(&ts, 0, sizeof(ts));
+ ts.qualifier = qualifier;
+ ts.variant.event.clock = trace_clock_global();
+ ts.variant.event.pid = task->pid;
+
+ bts_write(tracer, &ts);
+}
+
/*
* Change the DS configuration from tracing prev to tracing next.
*/
void ds_switch_to(struct task_struct *prev, struct task_struct *next)
{
- struct ds_context *prev_ctx = prev->thread.ds_ctx;
- struct ds_context *next_ctx = next->thread.ds_ctx;
+ struct ds_context *prev_ctx = prev->thread.ds_ctx;
+ struct ds_context *next_ctx = next->thread.ds_ctx;
+ unsigned long debugctlmsr = next->thread.debugctlmsr;
+
+ /* Make sure all data is read before we start. */
+ barrier();
if (prev_ctx) {
update_debugctlmsr(0);
- if (prev_ctx->bts_master &&
- (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
- struct bts_struct ts = {
- .qualifier = bts_task_departs,
- .variant.timestamp.jiffies = jiffies_64,
- .variant.timestamp.pid = prev->pid
- };
- bts_write(prev_ctx->bts_master, &ts);
- }
+ ds_take_timestamp(prev_ctx, bts_task_departs, prev);
}
if (next_ctx) {
- if (next_ctx->bts_master &&
- (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
- struct bts_struct ts = {
- .qualifier = bts_task_arrives,
- .variant.timestamp.jiffies = jiffies_64,
- .variant.timestamp.pid = next->pid
- };
- bts_write(next_ctx->bts_master, &ts);
- }
+ ds_take_timestamp(next_ctx, bts_task_arrives, next);
wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
}
- update_debugctlmsr(next->thread.debugctlmsr);
+ update_debugctlmsr(debugctlmsr);
}
-void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
+static __init int ds_selftest(void)
{
- clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
- tsk->thread.ds_ctx = NULL;
-}
+ if (ds_cfg.sizeof_rec[ds_bts]) {
+ int error;
-void ds_exit_thread(struct task_struct *tsk)
-{
+ error = ds_selftest_bts();
+ if (error) {
+ WARN(1, "[ds] selftest failed. disabling bts.\n");
+ ds_cfg.sizeof_rec[ds_bts] = 0;
+ }
+ }
+
+ if (ds_cfg.sizeof_rec[ds_pebs]) {
+ int error;
+
+ error = ds_selftest_pebs();
+ if (error) {
+ WARN(1, "[ds] selftest failed. disabling pebs.\n");
+ ds_cfg.sizeof_rec[ds_pebs] = 0;
+ }
+ }
+
+ return 0;
}
+device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
new file mode 100644
index 00000000000..6bc7c199ab9
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.c
@@ -0,0 +1,408 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#include "ds_selftest.h"
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+
+#include <asm/ds.h>
+
+
+#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */
+#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */
+
+struct ds_selftest_bts_conf {
+ struct bts_tracer *tracer;
+ int error;
+ int (*suspend)(struct bts_tracer *);
+ int (*resume)(struct bts_tracer *);
+};
+
+static int ds_selftest_bts_consistency(const struct bts_trace *trace)
+{
+ int error = 0;
+
+ if (!trace) {
+ printk(KERN_CONT "failed to access trace...");
+ /* Bail out. Other tests are pointless. */
+ return -1;
+ }
+
+ if (!trace->read) {
+ printk(KERN_CONT "bts read not available...");
+ error = -1;
+ }
+
+ /* Do some sanity checks on the trace configuration. */
+ if (!trace->ds.n) {
+ printk(KERN_CONT "empty bts buffer...");
+ error = -1;
+ }
+ if (!trace->ds.size) {
+ printk(KERN_CONT "bad bts trace setup...");
+ error = -1;
+ }
+ if (trace->ds.end !=
+ (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
+ printk(KERN_CONT "bad bts buffer setup...");
+ error = -1;
+ }
+ /*
+ * We allow top in [begin; end], since its not clear when the
+ * overflow adjustment happens: after the increment or before the
+ * write.
+ */
+ if ((trace->ds.top < trace->ds.begin) ||
+ (trace->ds.end < trace->ds.top)) {
+ printk(KERN_CONT "bts top out of bounds...");
+ error = -1;
+ }
+
+ return error;
+}
+
+static int ds_selftest_bts_read(struct bts_tracer *tracer,
+ const struct bts_trace *trace,
+ const void *from, const void *to)
+{
+ const unsigned char *at;
+
+ /*
+ * Check a few things which do not belong to this test.
+ * They should be covered by other tests.
+ */
+ if (!trace)
+ return -1;
+
+ if (!trace->read)
+ return -1;
+
+ if (to < from)
+ return -1;
+
+ if (from < trace->ds.begin)
+ return -1;
+
+ if (trace->ds.end < to)
+ return -1;
+
+ if (!trace->ds.size)
+ return -1;
+
+ /* Now to the test itself. */
+ for (at = from; (void *)at < to; at += trace->ds.size) {
+ struct bts_struct bts;
+ unsigned long index;
+ int error;
+
+ if (((void *)at - trace->ds.begin) % trace->ds.size) {
+ printk(KERN_CONT
+ "read from non-integer index...");
+ return -1;
+ }
+ index = ((void *)at - trace->ds.begin) / trace->ds.size;
+
+ memset(&bts, 0, sizeof(bts));
+ error = trace->read(tracer, at, &bts);
+ if (error < 0) {
+ printk(KERN_CONT
+ "error reading bts trace at [%lu] (0x%p)...",
+ index, at);
+ return error;
+ }
+
+ switch (bts.qualifier) {
+ case BTS_BRANCH:
+ break;
+ default:
+ printk(KERN_CONT
+ "unexpected bts entry %llu at [%lu] (0x%p)...",
+ bts.qualifier, index, at);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static void ds_selftest_bts_cpu(void *arg)
+{
+ struct ds_selftest_bts_conf *conf = arg;
+ const struct bts_trace *trace;
+ void *top;
+
+ if (IS_ERR(conf->tracer)) {
+ conf->error = PTR_ERR(conf->tracer);
+ conf->tracer = NULL;
+
+ printk(KERN_CONT
+ "initialization failed (err: %d)...", conf->error);
+ return;
+ }
+
+ /* We should meanwhile have enough trace. */
+ conf->error = conf->suspend(conf->tracer);
+ if (conf->error < 0)
+ return;
+
+ /* Let's see if we can access the trace. */
+ trace = ds_read_bts(conf->tracer);
+
+ conf->error = ds_selftest_bts_consistency(trace);
+ if (conf->error < 0)
+ return;
+
+ /* If everything went well, we should have a few trace entries. */
+ if (trace->ds.top == trace->ds.begin) {
+ /*
+ * It is possible but highly unlikely that we got a
+ * buffer overflow and end up at exactly the same
+ * position we started from.
+ * Let's issue a warning, but continue.
+ */
+ printk(KERN_CONT "no trace/overflow...");
+ }
+
+ /* Let's try to read the trace we collected. */
+ conf->error =
+ ds_selftest_bts_read(conf->tracer, trace,
+ trace->ds.begin, trace->ds.top);
+ if (conf->error < 0)
+ return;
+
+ /*
+ * Let's read the trace again.
+ * Since we suspended tracing, we should get the same result.
+ */
+ top = trace->ds.top;
+
+ trace = ds_read_bts(conf->tracer);
+ conf->error = ds_selftest_bts_consistency(trace);
+ if (conf->error < 0)
+ return;
+
+ if (top != trace->ds.top) {
+ printk(KERN_CONT "suspend not working...");
+ conf->error = -1;
+ return;
+ }
+
+ /* Let's collect some more trace - see if resume is working. */
+ conf->error = conf->resume(conf->tracer);
+ if (conf->error < 0)
+ return;
+
+ conf->error = conf->suspend(conf->tracer);
+ if (conf->error < 0)
+ return;
+
+ trace = ds_read_bts(conf->tracer);
+
+ conf->error = ds_selftest_bts_consistency(trace);
+ if (conf->error < 0)
+ return;
+
+ if (trace->ds.top == top) {
+ /*
+ * It is possible but highly unlikely that we got a
+ * buffer overflow and end up at exactly the same
+ * position we started from.
+ * Let's issue a warning and check the full trace.
+ */
+ printk(KERN_CONT
+ "no resume progress/overflow...");
+
+ conf->error =
+ ds_selftest_bts_read(conf->tracer, trace,
+ trace->ds.begin, trace->ds.end);
+ } else if (trace->ds.top < top) {
+ /*
+ * We had a buffer overflow - the entire buffer should
+ * contain trace records.
+ */
+ conf->error =
+ ds_selftest_bts_read(conf->tracer, trace,
+ trace->ds.begin, trace->ds.end);
+ } else {
+ /*
+ * It is quite likely that the buffer did not overflow.
+ * Let's just check the delta trace.
+ */
+ conf->error =
+ ds_selftest_bts_read(conf->tracer, trace, top,
+ trace->ds.top);
+ }
+ if (conf->error < 0)
+ return;
+
+ conf->error = 0;
+}
+
+static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
+{
+ ds_suspend_bts(tracer);
+ return 0;
+}
+
+static int ds_resume_bts_wrap(struct bts_tracer *tracer)
+{
+ ds_resume_bts(tracer);
+ return 0;
+}
+
+static void ds_release_bts_noirq_wrap(void *tracer)
+{
+ (void)ds_release_bts_noirq(tracer);
+}
+
+static int ds_selftest_bts_bad_release_noirq(int cpu,
+ struct bts_tracer *tracer)
+{
+ int error = -EPERM;
+
+ /* Try to release the tracer on the wrong cpu. */
+ get_cpu();
+ if (cpu != smp_processor_id()) {
+ error = ds_release_bts_noirq(tracer);
+ if (error != -EPERM)
+ printk(KERN_CONT "release on wrong cpu...");
+ }
+ put_cpu();
+
+ return error ? 0 : -1;
+}
+
+static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
+{
+ struct bts_tracer *tracer;
+ int error;
+
+ /* Try to request cpu tracing while task tracing is active. */
+ tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
+ (size_t)-1, BTS_KERNEL);
+ error = PTR_ERR(tracer);
+ if (!IS_ERR(tracer)) {
+ ds_release_bts(tracer);
+ error = 0;
+ }
+
+ if (error != -EPERM)
+ printk(KERN_CONT "cpu/task tracing overlap...");
+
+ return error ? 0 : -1;
+}
+
+static int ds_selftest_bts_bad_request_task(void *buffer)
+{
+ struct bts_tracer *tracer;
+ int error;
+
+ /* Try to request cpu tracing while task tracing is active. */
+ tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
+ (size_t)-1, BTS_KERNEL);
+ error = PTR_ERR(tracer);
+ if (!IS_ERR(tracer)) {
+ error = 0;
+ ds_release_bts(tracer);
+ }
+
+ if (error != -EPERM)
+ printk(KERN_CONT "task/cpu tracing overlap...");
+
+ return error ? 0 : -1;
+}
+
+int ds_selftest_bts(void)
+{
+ struct ds_selftest_bts_conf conf;
+ unsigned char buffer[BUFFER_SIZE], *small_buffer;
+ unsigned long irq;
+ int cpu;
+
+ printk(KERN_INFO "[ds] bts selftest...");
+ conf.error = 0;
+
+ small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ conf.suspend = ds_suspend_bts_wrap;
+ conf.resume = ds_resume_bts_wrap;
+ conf.tracer =
+ ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+ NULL, (size_t)-1, BTS_KERNEL);
+ ds_selftest_bts_cpu(&conf);
+ if (conf.error >= 0)
+ conf.error = ds_selftest_bts_bad_request_task(buffer);
+ ds_release_bts(conf.tracer);
+ if (conf.error < 0)
+ goto out;
+
+ conf.suspend = ds_suspend_bts_noirq;
+ conf.resume = ds_resume_bts_noirq;
+ conf.tracer =
+ ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+ NULL, (size_t)-1, BTS_KERNEL);
+ smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
+ if (conf.error >= 0) {
+ conf.error =
+ ds_selftest_bts_bad_release_noirq(cpu,
+ conf.tracer);
+ /* We must not release the tracer twice. */
+ if (conf.error < 0)
+ conf.tracer = NULL;
+ }
+ if (conf.error >= 0)
+ conf.error = ds_selftest_bts_bad_request_task(buffer);
+ smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
+ conf.tracer, 1);
+ if (conf.error < 0)
+ goto out;
+ }
+
+ conf.suspend = ds_suspend_bts_wrap;
+ conf.resume = ds_resume_bts_wrap;
+ conf.tracer =
+ ds_request_bts_task(current, buffer, BUFFER_SIZE,
+ NULL, (size_t)-1, BTS_KERNEL);
+ ds_selftest_bts_cpu(&conf);
+ if (conf.error >= 0)
+ conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+ ds_release_bts(conf.tracer);
+ if (conf.error < 0)
+ goto out;
+
+ conf.suspend = ds_suspend_bts_noirq;
+ conf.resume = ds_resume_bts_noirq;
+ conf.tracer =
+ ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
+ NULL, (size_t)-1, BTS_KERNEL);
+ local_irq_save(irq);
+ ds_selftest_bts_cpu(&conf);
+ if (conf.error >= 0)
+ conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+ ds_release_bts_noirq(conf.tracer);
+ local_irq_restore(irq);
+ if (conf.error < 0)
+ goto out;
+
+ conf.error = 0;
+ out:
+ put_online_cpus();
+ printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
+
+ return conf.error;
+}
+
+int ds_selftest_pebs(void)
+{
+ return 0;
+}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
new file mode 100644
index 00000000000..2ba8745c666
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.h
@@ -0,0 +1,15 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#ifdef CONFIG_X86_DS_SELFTEST
+extern int ds_selftest_bts(void);
+extern int ds_selftest_pebs(void);
+#else
+static inline int ds_selftest_bts(void) { return 0; }
+static inline int ds_selftest_pebs(void) { return 0; }
+#endif
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index da87590b869..81086c227ab 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *sp, unsigned long bp, char *log_lvl);
extern unsigned int code_bytes;
-extern int kstack_depth_to_print;
/* The form of the top of the frame on the stack */
struct stack_frame {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 00628130292..7271fa33d79 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
*/
__init void e820_setup_gap(void)
{
- unsigned long gapstart, gapsize, round;
+ unsigned long gapstart, gapsize;
int found;
gapstart = 0x10000000;
@@ -635,14 +635,9 @@ __init void e820_setup_gap(void)
#endif
/*
- * See how much we want to round up: start off with
- * rounding to the next 1MB area.
+ * e820_reserve_resources_late protect stolen RAM already
*/
- round = 0x100000;
- while ((gapsize >> 4) > round)
- round += round;
- /* Fun with two's complement */
- pci_mem_start = (gapstart + round) & -round;
+ pci_mem_start = gapstart;
printk(KERN_INFO
"Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
@@ -1371,6 +1366,23 @@ void __init e820_reserve_resources(void)
}
}
+/* How much should we pad RAM ending depending on where it is? */
+static unsigned long ram_alignment(resource_size_t pos)
+{
+ unsigned long mb = pos >> 20;
+
+ /* To 64kB in the first megabyte */
+ if (!mb)
+ return 64*1024;
+
+ /* To 1MB in the first 16MB */
+ if (mb < 16)
+ return 1024*1024;
+
+ /* To 32MB for anything above that */
+ return 32*1024*1024;
+}
+
void __init e820_reserve_resources_late(void)
{
int i;
@@ -1382,6 +1394,24 @@ void __init e820_reserve_resources_late(void)
insert_resource_expand_to_fit(&iomem_resource, res);
res++;
}
+
+ /*
+ * Try to bump up RAM regions to reasonable boundaries to
+ * avoid stolen RAM:
+ */
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *entry = &e820_saved.map[i];
+ resource_size_t start, end;
+
+ if (entry->type != E820_RAM)
+ continue;
+ start = entry->addr + entry->size;
+ end = round_up(start, ram_alignment(start));
+ if (start == end)
+ continue;
+ reserve_region_with_split(&iomem_resource, start,
+ end - 1, "RAM buffer");
+ }
}
char *__init default_machine_specific_memory_setup(void)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 76b8cd953de..ebdb85cf268 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func)
}
#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
+#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
{
u32 d;
@@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
d &= 0xff;
return d;
}
+#endif
static void __init ati_bugs(int num, int slot, int func)
{
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 38946c6e843..de74f0a3e0e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -147,27 +147,14 @@ END(ftrace_graph_caller)
GLOBAL(return_to_handler)
subq $80, %rsp
+ /* Save the return values */
movq %rax, (%rsp)
- movq %rcx, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rsi, 24(%rsp)
- movq %rdi, 32(%rsp)
- movq %r8, 40(%rsp)
- movq %r9, 48(%rsp)
- movq %r10, 56(%rsp)
- movq %r11, 64(%rsp)
+ movq %rdx, 8(%rsp)
call ftrace_return_to_handler
movq %rax, 72(%rsp)
- movq 64(%rsp), %r11
- movq 56(%rsp), %r10
- movq 48(%rsp), %r9
- movq 40(%rsp), %r8
- movq 32(%rsp), %rdi
- movq 24(%rsp), %rsi
- movq 16(%rsp), %rdx
- movq 8(%rsp), %rcx
+ movq 8(%rsp), %rdx
movq (%rsp), %rax
addq $72, %rsp
retq
@@ -976,6 +963,8 @@ END(\sym)
#ifdef CONFIG_SMP
apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
+apicinterrupt REBOOT_VECTOR \
+ reboot_interrupt smp_reboot_interrupt
#endif
#ifdef CONFIG_X86_UV
@@ -1007,10 +996,15 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
#endif
apicinterrupt THRESHOLD_APIC_VECTOR \
- threshold_interrupt mce_threshold_interrupt
+ threshold_interrupt smp_threshold_interrupt
apicinterrupt THERMAL_APIC_VECTOR \
thermal_interrupt smp_thermal_interrupt
+#ifdef CONFIG_X86_MCE
+apicinterrupt MCE_SELF_VECTOR \
+ mce_self_interrupt smp_mce_self_interrupt
+#endif
+
#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
call_function_single_interrupt smp_call_function_single_interrupt
@@ -1025,6 +1019,11 @@ apicinterrupt ERROR_APIC_VECTOR \
apicinterrupt SPURIOUS_APIC_VECTOR \
spurious_interrupt smp_spurious_interrupt
+#ifdef CONFIG_PERF_COUNTERS
+apicinterrupt LOCAL_PENDING_VECTOR \
+ perf_pending_interrupt smp_perf_pending_interrupt
+#endif
+
/*
* Exception entry points.
*/
@@ -1379,10 +1378,15 @@ END(xen_failsafe_callback)
paranoidzeroentry_ist debug do_debug DEBUG_STACK
paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
paranoiderrorentry stack_segment do_stack_segment
+#ifdef CONFIG_XEN
+zeroentry xen_debug do_debug
+zeroentry xen_int3 do_int3
+errorentry xen_stack_segment do_stack_segment
+#endif
errorentry general_protection do_general_protection
errorentry page_fault do_page_fault
#ifdef CONFIG_X86_MCE
-paranoidzeroentry machine_check do_machine_check
+paranoidzeroentry machine_check *machine_check_vector(%rip)
#endif
/*
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 18dfa30795c..b79c5533c42 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -442,7 +442,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
_ASM_EXTABLE(1b, 4b)
_ASM_EXTABLE(2b, 4b)
- : [old] "=r" (old), [faulted] "=r" (faulted)
+ : [old] "=&r" (old), [faulted] "=r" (faulted)
: [parent] "r" (parent), [return_hooker] "r" (return_hooker)
: "memory"
);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 30683883e0c..dc5ed4bdd88 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -608,13 +608,6 @@ ignore_int:
ENTRY(initial_code)
.long i386_start_kernel
-.section .text
-/*
- * Real beginning of normal "text" segment
- */
-ENTRY(stext)
-ENTRY(_stext)
-
/*
* BSS section
*/
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c3fe010d74c..b0cdde6932f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,8 @@
#include <asm/io_apic.h>
#include <asm/irq.h>
#include <asm/idle.h>
+#include <asm/mce.h>
+#include <asm/hw_irq.h>
atomic_t irq_err_count;
@@ -24,9 +26,9 @@ void (*generic_interrupt_extension)(void) = NULL;
*/
void ack_bad_irq(unsigned int irq)
{
- printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+ if (printk_ratelimit())
+ pr_err("unexpected IRQ trap at vector %02x\n", irq);
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* Currently unexpected vectors happen only on SMP and APIC.
* We _must_ ack these because every local APIC has only N
@@ -36,9 +38,7 @@ void ack_bad_irq(unsigned int irq)
* completely.
* But only ack when the APIC is enabled -AK
*/
- if (cpu_has_apic)
- ack_APIC_irq();
-#endif
+ ack_APIC_irq();
}
#define irq_stats(x) (&per_cpu(irq_stat, x))
@@ -63,6 +63,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
seq_printf(p, " Spurious interrupts\n");
+ seq_printf(p, "%*s: ", prec, "CNT");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+ seq_printf(p, " Performance counter interrupts\n");
+ seq_printf(p, "%*s: ", prec, "PND");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
+ seq_printf(p, " Performance pending work\n");
#endif
if (generic_interrupt_extension) {
seq_printf(p, "%*s: ", prec, "PLT");
@@ -89,13 +97,23 @@ static int show_other_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
seq_printf(p, " Thermal event interrupts\n");
-# ifdef CONFIG_X86_64
+# ifdef CONFIG_X86_MCE_THRESHOLD
seq_printf(p, "%*s: ", prec, "THR");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
seq_printf(p, " Threshold APIC interrupts\n");
# endif
#endif
+#ifdef CONFIG_X86_NEW_MCE
+ seq_printf(p, "%*s: ", prec, "MCE");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
+ seq_printf(p, " Machine check exceptions\n");
+ seq_printf(p, "%*s: ", prec, "MCP");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
+ seq_printf(p, " Machine check polls\n");
+#endif
seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
@@ -166,6 +184,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_X86_LOCAL_APIC
sum += irq_stats(cpu)->apic_timer_irqs;
sum += irq_stats(cpu)->irq_spurious_count;
+ sum += irq_stats(cpu)->apic_perf_irqs;
+ sum += irq_stats(cpu)->apic_pending_irqs;
#endif
if (generic_interrupt_extension)
sum += irq_stats(cpu)->generic_irqs;
@@ -176,9 +196,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#endif
#ifdef CONFIG_X86_MCE
sum += irq_stats(cpu)->irq_thermal_count;
-# ifdef CONFIG_X86_64
+# ifdef CONFIG_X86_MCE_THRESHOLD
sum += irq_stats(cpu)->irq_threshold_count;
+# endif
#endif
+#ifdef CONFIG_X86_NEW_MCE
+ sum += per_cpu(mce_exception_count, cpu);
+ sum += per_cpu(mce_poll_count, cpu);
#endif
return sum;
}
@@ -213,14 +237,11 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
irq = __get_cpu_var(vector_irq)[vector];
if (!handle_irq(irq, regs)) {
-#ifdef CONFIG_X86_64
- if (!disable_apic)
- ack_APIC_irq();
-#endif
+ ack_APIC_irq();
if (printk_ratelimit())
- printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n",
- __func__, smp_processor_id(), vector, irq);
+ pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
+ __func__, smp_processor_id(), vector, irq);
}
irq_exit();
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit.c
index 368b0a8836f..696f0e475c2 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit.c
@@ -1,20 +1,25 @@
+#include <linux/linkage.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
+#include <linux/timex.h>
#include <linux/slab.h>
#include <linux/random.h>
+#include <linux/kprobes.h>
#include <linux/init.h>
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
#include <linux/bitops.h>
+#include <linux/acpi.h>
#include <linux/io.h>
#include <linux/delay.h>
#include <asm/atomic.h>
#include <asm/system.h>
#include <asm/timer.h>
+#include <asm/hw_irq.h>
#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/apic.h>
@@ -22,7 +27,23 @@
#include <asm/i8259.h>
#include <asm/traps.h>
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+#ifdef CONFIG_X86_32
/*
* Note that on a 486, we don't want to do a SIGFPE on an irq13
* as the irq is unreliable, and exception 16 works correctly
@@ -52,30 +73,7 @@ static struct irqaction fpu_irq = {
.handler = math_error_irq,
.name = "fpu",
};
-
-void __init init_ISA_irqs(void)
-{
- int i;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- init_bsp_APIC();
#endif
- init_8259A(0);
-
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- for (i = 0; i < NR_IRQS_LEGACY; i++) {
- struct irq_desc *desc = irq_to_desc(i);
-
- desc->status = IRQ_DISABLED;
- desc->action = NULL;
- desc->depth = 1;
-
- set_irq_chip_and_handler_name(i, &i8259A_chip,
- handle_level_irq, "XT");
- }
-}
/*
* IRQ2 is cascade interrupt to second interrupt controller
@@ -118,29 +116,37 @@ int vector_used_by_percpu_irq(unsigned int vector)
return 0;
}
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
+static void __init init_ISA_irqs(void)
{
int i;
- /* Execute any quirks before the call gates are initialised: */
- x86_quirk_pre_intr_init();
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
+ init_bsp_APIC();
+#endif
+ init_8259A(0);
/*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
+ * 16 old-style INTA-cycle interrupts:
*/
- for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
- /* SYSCALL_VECTOR was reserved in trap_init. */
- if (i != SYSCALL_VECTOR)
- set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+ for (i = 0; i < NR_IRQS_LEGACY; i++) {
+ struct irq_desc *desc = irq_to_desc(i);
+
+ desc->status = IRQ_DISABLED;
+ desc->action = NULL;
+ desc->depth = 1;
+
+ set_irq_chip_and_handler_name(i, &i8259A_chip,
+ handle_level_irq, "XT");
}
+}
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
+static void __init smp_intr_init(void)
+{
+#ifdef CONFIG_SMP
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
/*
* The reschedule interrupt is a CPU-to-CPU reschedule-helper
* IPI, driven by wakeup.
@@ -160,16 +166,35 @@ void __init native_init_IRQ(void)
/* IPI for generic function call */
alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
- /* IPI for single call function */
+ /* IPI for generic single function call */
alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
- call_function_single_interrupt);
+ call_function_single_interrupt);
/* Low priority IPI to cleanup after moving an irq */
set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
+
+ /* IPI used for rebooting/stopping */
+ alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);
#endif
+#endif /* CONFIG_SMP */
+}
+
+static void __init apic_intr_init(void)
+{
+ smp_intr_init();
-#ifdef CONFIG_X86_LOCAL_APIC
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+#ifdef CONFIG_X86_THRESHOLD
+ alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+#endif
+#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
+ alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
+#endif
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
/* self generated IPI for local APIC timer */
alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
@@ -179,16 +204,59 @@ void __init native_init_IRQ(void)
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+ /* Performance monitoring interrupts: */
+# ifdef CONFIG_PERF_COUNTERS
+ alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
+# endif
+
#endif
+}
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
- /* thermal monitor LVT interrupt */
- alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+/**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ * Perform any necessary interrupt initialisation prior to setting up
+ * the "ordinary" interrupt call gates. For legacy reasons, the ISA
+ * interrupts should be initialised here if the machine emulates a PC
+ * in any way.
+ **/
+static void __init x86_quirk_pre_intr_init(void)
+{
+#ifdef CONFIG_X86_32
+ if (x86_quirks->arch_pre_intr_init) {
+ if (x86_quirks->arch_pre_intr_init())
+ return;
+ }
#endif
+ init_ISA_irqs();
+}
+
+void __init native_init_IRQ(void)
+{
+ int i;
+
+ /* Execute any quirks before the call gates are initialised: */
+ x86_quirk_pre_intr_init();
+
+ apic_intr_init();
+
+ /*
+ * Cover the whole vector space, no vector can escape
+ * us. (some of these will be overridden and become
+ * 'special' SMP interrupts)
+ */
+ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+ /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
+ if (!test_bit(i, used_vectors))
+ set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+ }
if (!acpi_ioapic)
setup_irq(2, &irq2);
+#ifdef CONFIG_X86_32
/*
* Call quirks after call gates are initialised (usually add in
* the architecture specific gates):
@@ -203,4 +271,5 @@ void __init native_init_IRQ(void)
setup_irq(FPU_IRQ, &fpu_irq);
irq_ctx_init(smp_processor_id());
+#endif
}
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
deleted file mode 100644
index 8cd10537fd4..00000000000
--- a/arch/x86/kernel/irqinit_64.c
+++ /dev/null
@@ -1,177 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/timex.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/i8259.h>
-
-/*
- * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
- * (these are usually mapped to vectors 0x30-0x3f)
- */
-
-/*
- * The IO-APIC gives us many more interrupt sources. Most of these
- * are unused but an SMP system is supposed to have enough memory ...
- * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
- * across the spectrum, so we really want to be prepared to get all
- * of these. Plus, more powerful systems might have more than 64
- * IO-APIC registers.
- *
- * (these are usually mapped into the 0x30-0xff vector range)
- */
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-
-static struct irqaction irq2 = {
- .handler = no_action,
- .name = "cascade",
-};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... IRQ0_VECTOR - 1] = -1,
- [IRQ0_VECTOR] = 0,
- [IRQ1_VECTOR] = 1,
- [IRQ2_VECTOR] = 2,
- [IRQ3_VECTOR] = 3,
- [IRQ4_VECTOR] = 4,
- [IRQ5_VECTOR] = 5,
- [IRQ6_VECTOR] = 6,
- [IRQ7_VECTOR] = 7,
- [IRQ8_VECTOR] = 8,
- [IRQ9_VECTOR] = 9,
- [IRQ10_VECTOR] = 10,
- [IRQ11_VECTOR] = 11,
- [IRQ12_VECTOR] = 12,
- [IRQ13_VECTOR] = 13,
- [IRQ14_VECTOR] = 14,
- [IRQ15_VECTOR] = 15,
- [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
-
-int vector_used_by_percpu_irq(unsigned int vector)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- if (per_cpu(vector_irq, cpu)[vector] != -1)
- return 1;
- }
-
- return 0;
-}
-
-static void __init init_ISA_irqs(void)
-{
- int i;
-
- init_bsp_APIC();
- init_8259A(0);
-
- for (i = 0; i < NR_IRQS_LEGACY; i++) {
- struct irq_desc *desc = irq_to_desc(i);
-
- desc->status = IRQ_DISABLED;
- desc->action = NULL;
- desc->depth = 1;
-
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- set_irq_chip_and_handler_name(i, &i8259A_chip,
- handle_level_irq, "XT");
- }
-}
-
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-static void __init smp_intr_init(void)
-{
-#ifdef CONFIG_SMP
- /*
- * The reschedule interrupt is a CPU-to-CPU reschedule-helper
- * IPI, driven by wakeup.
- */
- alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
- /* IPIs for invalidation */
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
- /* IPI for generic function call */
- alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
- /* IPI for generic single function call */
- alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
- call_function_single_interrupt);
-
- /* Low priority IPI to cleanup after moving an irq */
- set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
- set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-#endif
-}
-
-static void __init apic_intr_init(void)
-{
- smp_intr_init();
-
- alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
- alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-
- /* self generated IPI for local APIC timer */
- alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
- /* generic IPI for platform specific use */
- alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
-
- /* IPI vectors for APIC spurious and error interrupts */
- alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
- alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-}
-
-void __init native_init_IRQ(void)
-{
- int i;
-
- init_ISA_irqs();
- /*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
- */
- for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
- int vector = FIRST_EXTERNAL_VECTOR + i;
- if (vector != IA32_SYSCALL_VECTOR)
- set_intr_gate(vector, interrupt[i]);
- }
-
- apic_intr_init();
-
- if (!acpi_ioapic)
- setup_irq(2, &irq2);
-}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index b1f4dffb919..8d82a77a3f3 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -142,7 +142,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
gdb_regs32[GDB_CS] = __KERNEL_CS;
gdb_regs32[GDB_SS] = __KERNEL_DS;
- gdb_regs[GDB_PC] = p->thread.ip;
+ gdb_regs[GDB_PC] = 0;
gdb_regs[GDB_R8] = 0;
gdb_regs[GDB_R9] = 0;
gdb_regs[GDB_R10] = 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33019ddb56b..a78ecad0c90 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,6 +27,7 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
+#include <asm/timer.h>
#define MMU_QUEUE_SIZE 1024
@@ -195,7 +196,7 @@ static void kvm_leave_lazy_mmu(void)
struct kvm_para_state *state = kvm_para_state();
mmu_queue_flush(state);
- paravirt_leave_lazy(paravirt_get_lazy_mode());
+ paravirt_leave_lazy_mmu();
state->mode = paravirt_get_lazy_mode();
}
@@ -230,6 +231,9 @@ static void paravirt_ops_setup(void)
pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
}
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
}
void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 453b5795a5c..366baa17991 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,25 +13,13 @@
* Licensed under the terms of the GNU General Public
* License version 2. See file COPYING for details.
*/
-#include <linux/platform_device.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
#include <linux/firmware.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
#include <linux/pci_ids.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
#include <linux/pci.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
#include <asm/microcode.h>
#include <asm/processor.h>
@@ -79,9 +67,6 @@ struct microcode_amd {
#define UCODE_CONTAINER_SECTION_HDR 8
#define UCODE_CONTAINER_HEADER_SIZE 12
-/* serialize access to the physical write */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
static struct equiv_cpu_entry *equiv_cpu_table;
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
@@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
return 1;
}
-static void apply_microcode_amd(int cpu)
+static int apply_microcode_amd(int cpu)
{
- unsigned long flags;
u32 rev, dummy;
int cpu_num = raw_smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
@@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu)
BUG_ON(cpu_num != cpu);
if (mc_amd == NULL)
- return;
+ return 0;
- spin_lock_irqsave(&microcode_update_lock, flags);
wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
/* get patch id after patching */
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- spin_unlock_irqrestore(&microcode_update_lock, flags);
/* check current patch id and patch's id for match */
if (rev != mc_amd->hdr.patch_id) {
printk(KERN_ERR "microcode: CPU%d: update failed "
"(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
- return;
+ return -1;
}
printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
cpu, rev);
uci->cpu_sig.rev = rev;
+
+ return 0;
}
static int get_ucode_data(void *to, const u8 *from, size_t n)
@@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf)
static void free_equiv_cpu_table(void)
{
- if (equiv_cpu_table) {
- vfree(equiv_cpu_table);
- equiv_cpu_table = NULL;
- }
+ vfree(equiv_cpu_table);
+ equiv_cpu_table = NULL;
}
-static int generic_load_microcode(int cpu, const u8 *data, size_t size)
+static enum ucode_state
+generic_load_microcode(int cpu, const u8 *data, size_t size)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
const u8 *ucode_ptr = data;
@@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
int new_rev = uci->cpu_sig.rev;
unsigned int leftover;
unsigned long offset;
+ enum ucode_state state = UCODE_OK;
offset = install_equiv_cpu_table(ucode_ptr);
if (!offset) {
printk(KERN_ERR "microcode: failed to create "
"equivalent cpu table\n");
- return -EINVAL;
+ return UCODE_ERROR;
}
ucode_ptr += offset;
@@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
mc_header = (struct microcode_header_amd *)mc;
if (get_matching_microcode(cpu, mc, new_rev)) {
- if (new_mc)
- vfree(new_mc);
+ vfree(new_mc);
new_rev = mc_header->patch_id;
new_mc = mc;
} else
@@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
if (new_mc) {
if (!leftover) {
- if (uci->mc)
- vfree(uci->mc);
+ vfree(uci->mc);
uci->mc = new_mc;
pr_debug("microcode: CPU%d found a matching microcode "
"update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
- } else
+ } else {
vfree(new_mc);
- }
+ state = UCODE_ERROR;
+ }
+ } else
+ state = UCODE_NFOUND;
free_equiv_cpu_table();
- return (int)leftover;
+ return state;
}
-static int request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
{
const char *fw_name = "amd-ucode/microcode_amd.bin";
const struct firmware *firmware;
- int ret;
-
- /* We should bind the task to the CPU */
- BUG_ON(cpu != raw_smp_processor_id());
+ enum ucode_state ret;
- ret = request_firmware(&firmware, fw_name, device);
- if (ret) {
+ if (request_firmware(&firmware, fw_name, device)) {
printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
- return ret;
+ return UCODE_NFOUND;
}
ret = generic_load_microcode(cpu, firmware->data, firmware->size);
@@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device)
return ret;
}
-static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
{
printk(KERN_INFO "microcode: AMD microcode update via "
"/dev/cpu/microcode not supported\n");
- return -1;
+ return UCODE_ERROR;
}
static void microcode_fini_cpu_amd(int cpu)
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 98c470c069d..9c4461501fc 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -71,27 +71,18 @@
* Thanks to Stuart Swales for pointing out this bug.
*/
#include <linux/platform_device.h>
-#include <linux/capability.h>
#include <linux/miscdevice.h>
-#include <linux/firmware.h>
+#include <linux/capability.h>
#include <linux/smp_lock.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <asm/microcode.h>
#include <asm/processor.h>
-#include <asm/msr.h>
MODULE_DESCRIPTION("Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -101,36 +92,110 @@ MODULE_LICENSE("GPL");
static struct microcode_ops *microcode_ops;
-/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - microcode_mutex to synchronize with each other;
+ * - get/put_online_cpus() to synchronize with
+ * the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single cpu is being
+ * updated at any particular moment of time.
+ */
static DEFINE_MUTEX(microcode_mutex);
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
EXPORT_SYMBOL_GPL(ucode_cpu_info);
+/*
+ * Operations that are run on a target cpu:
+ */
+
+struct cpu_info_ctx {
+ struct cpu_signature *cpu_sig;
+ int err;
+};
+
+static void collect_cpu_info_local(void *arg)
+{
+ struct cpu_info_ctx *ctx = arg;
+
+ ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
+ ctx->cpu_sig);
+}
+
+static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
+{
+ struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
+ int ret;
+
+ ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
+ if (!ret)
+ ret = ctx.err;
+
+ return ret;
+}
+
+static int collect_cpu_info(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ int ret;
+
+ memset(uci, 0, sizeof(*uci));
+
+ ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
+ if (!ret)
+ uci->valid = 1;
+
+ return ret;
+}
+
+struct apply_microcode_ctx {
+ int err;
+};
+
+static void apply_microcode_local(void *arg)
+{
+ struct apply_microcode_ctx *ctx = arg;
+
+ ctx->err = microcode_ops->apply_microcode(smp_processor_id());
+}
+
+static int apply_microcode_on_target(int cpu)
+{
+ struct apply_microcode_ctx ctx = { .err = 0 };
+ int ret;
+
+ ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
+ if (!ret)
+ ret = ctx.err;
+
+ return ret;
+}
+
#ifdef CONFIG_MICROCODE_OLD_INTERFACE
static int do_microcode_update(const void __user *buf, size_t size)
{
- cpumask_t old;
int error = 0;
int cpu;
- old = current->cpus_allowed;
-
for_each_online_cpu(cpu) {
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ enum ucode_state ustate;
if (!uci->valid)
continue;
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
- error = microcode_ops->request_microcode_user(cpu, buf, size);
- if (error < 0)
- goto out;
- if (!error)
- microcode_ops->apply_microcode(cpu);
+ ustate = microcode_ops->request_microcode_user(cpu, buf, size);
+ if (ustate == UCODE_ERROR) {
+ error = -1;
+ break;
+ } else if (ustate == UCODE_OK)
+ apply_microcode_on_target(cpu);
}
-out:
- set_cpus_allowed_ptr(current, &old);
+
return error;
}
@@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2)
static ssize_t microcode_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos)
{
- ssize_t ret;
+ ssize_t ret = -EINVAL;
if ((len >> PAGE_SHIFT) > num_physpages) {
- printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
- num_physpages);
- return -EINVAL;
+ pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
+ return ret;
}
get_online_cpus();
mutex_lock(&microcode_mutex);
- ret = do_microcode_update(buf, len);
- if (!ret)
+ if (do_microcode_update(buf, len) == 0)
ret = (ssize_t)len;
mutex_unlock(&microcode_mutex);
@@ -165,15 +228,15 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
}
static const struct file_operations microcode_fops = {
- .owner = THIS_MODULE,
- .write = microcode_write,
- .open = microcode_open,
+ .owner = THIS_MODULE,
+ .write = microcode_write,
+ .open = microcode_open,
};
static struct miscdevice microcode_dev = {
- .minor = MICROCODE_MINOR,
- .name = "microcode",
- .fops = &microcode_fops,
+ .minor = MICROCODE_MINOR,
+ .name = "microcode",
+ .fops = &microcode_fops,
};
static int __init microcode_dev_init(void)
@@ -182,9 +245,7 @@ static int __init microcode_dev_init(void)
error = misc_register(&microcode_dev);
if (error) {
- printk(KERN_ERR
- "microcode: can't misc_register on minor=%d\n",
- MICROCODE_MINOR);
+ pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);
return error;
}
@@ -205,42 +266,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
/* fake device for request_firmware */
static struct platform_device *microcode_pdev;
-static long reload_for_cpu(void *unused)
+static int reload_for_cpu(int cpu)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
int err = 0;
mutex_lock(&microcode_mutex);
if (uci->valid) {
- err = microcode_ops->request_microcode_fw(smp_processor_id(),
- &microcode_pdev->dev);
- if (!err)
- microcode_ops->apply_microcode(smp_processor_id());
+ enum ucode_state ustate;
+
+ ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
+ if (ustate == UCODE_OK)
+ apply_microcode_on_target(cpu);
+ else
+ if (ustate == UCODE_ERROR)
+ err = -EINVAL;
}
mutex_unlock(&microcode_mutex);
+
return err;
}
static ssize_t reload_store(struct sys_device *dev,
struct sysdev_attribute *attr,
- const char *buf, size_t sz)
+ const char *buf, size_t size)
{
- char *end;
- unsigned long val = simple_strtoul(buf, &end, 0);
- int err = 0;
+ unsigned long val;
int cpu = dev->id;
+ int ret = 0;
+ char *end;
+ val = simple_strtoul(buf, &end, 0);
if (end == buf)
return -EINVAL;
+
if (val == 1) {
get_online_cpus();
if (cpu_online(cpu))
- err = work_on_cpu(cpu, reload_for_cpu, NULL);
+ ret = reload_for_cpu(cpu);
put_online_cpus();
}
- if (err)
- return err;
- return sz;
+
+ if (!ret)
+ ret = size;
+
+ return ret;
}
static ssize_t version_show(struct sys_device *dev,
@@ -271,11 +341,11 @@ static struct attribute *mc_default_attrs[] = {
};
static struct attribute_group mc_attr_group = {
- .attrs = mc_default_attrs,
- .name = "microcode",
+ .attrs = mc_default_attrs,
+ .name = "microcode",
};
-static void __microcode_fini_cpu(int cpu)
+static void microcode_fini_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -283,103 +353,68 @@ static void __microcode_fini_cpu(int cpu)
uci->valid = 0;
}
-static void microcode_fini_cpu(int cpu)
-{
- mutex_lock(&microcode_mutex);
- __microcode_fini_cpu(cpu);
- mutex_unlock(&microcode_mutex);
-}
-
-static void collect_cpu_info(int cpu)
+static enum ucode_state microcode_resume_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- memset(uci, 0, sizeof(*uci));
- if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
- uci->valid = 1;
+ if (!uci->mc)
+ return UCODE_NFOUND;
+
+ pr_debug("microcode: CPU%d updated upon resume\n", cpu);
+ apply_microcode_on_target(cpu);
+
+ return UCODE_OK;
}
-static int microcode_resume_cpu(int cpu)
+static enum ucode_state microcode_init_cpu(int cpu)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- struct cpu_signature nsig;
+ enum ucode_state ustate;
- pr_debug("microcode: CPU%d resumed\n", cpu);
+ if (collect_cpu_info(cpu))
+ return UCODE_ERROR;
- if (!uci->mc)
- return 1;
+ /* --dimm. Trigger a delayed update? */
+ if (system_state != SYSTEM_RUNNING)
+ return UCODE_NFOUND;
- /*
- * Let's verify that the 'cached' ucode does belong
- * to this cpu (a bit of paranoia):
- */
- if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
- __microcode_fini_cpu(cpu);
- printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
- cpu);
- return -1;
- }
+ ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
- if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) {
- __microcode_fini_cpu(cpu);
- printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n",
- cpu);
- /* Should we look for a new ucode here? */
- return 1;
+ if (ustate == UCODE_OK) {
+ pr_debug("microcode: CPU%d updated upon init\n", cpu);
+ apply_microcode_on_target(cpu);
}
- return 0;
+ return ustate;
}
-static long microcode_update_cpu(void *unused)
+static enum ucode_state microcode_update_cpu(int cpu)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
- int err = 0;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ enum ucode_state ustate;
- /*
- * Check if the system resume is in progress (uci->valid != NULL),
- * otherwise just request a firmware:
- */
- if (uci->valid) {
- err = microcode_resume_cpu(smp_processor_id());
- } else {
- collect_cpu_info(smp_processor_id());
- if (uci->valid && system_state == SYSTEM_RUNNING)
- err = microcode_ops->request_microcode_fw(
- smp_processor_id(),
- &microcode_pdev->dev);
- }
- if (!err)
- microcode_ops->apply_microcode(smp_processor_id());
- return err;
-}
+ if (uci->valid)
+ ustate = microcode_resume_cpu(cpu);
+ else
+ ustate = microcode_init_cpu(cpu);
-static int microcode_init_cpu(int cpu)
-{
- int err;
- mutex_lock(&microcode_mutex);
- err = work_on_cpu(cpu, microcode_update_cpu, NULL);
- mutex_unlock(&microcode_mutex);
-
- return err;
+ return ustate;
}
static int mc_sysdev_add(struct sys_device *sys_dev)
{
int err, cpu = sys_dev->id;
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
if (!cpu_online(cpu))
return 0;
pr_debug("microcode: CPU%d added\n", cpu);
- memset(uci, 0, sizeof(*uci));
err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
if (err)
return err;
- err = microcode_init_cpu(cpu);
+ if (microcode_init_cpu(cpu) == UCODE_ERROR)
+ err = -EINVAL;
return err;
}
@@ -400,19 +435,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
static int mc_sysdev_resume(struct sys_device *dev)
{
int cpu = dev->id;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
if (!cpu_online(cpu))
return 0;
- /* only CPU 0 will apply ucode here */
- microcode_update_cpu(NULL);
+ /*
+ * All non-bootup cpus are still disabled,
+ * so only CPU 0 will apply ucode here.
+ *
+ * Moreover, there can be no concurrent
+ * updates from any other places at this point.
+ */
+ WARN_ON(cpu != 0);
+
+ if (uci->valid && uci->mc)
+ microcode_ops->apply_microcode(cpu);
+
return 0;
}
static struct sysdev_driver mc_sysdev_driver = {
- .add = mc_sysdev_add,
- .remove = mc_sysdev_remove,
- .resume = mc_sysdev_resume,
+ .add = mc_sysdev_add,
+ .remove = mc_sysdev_remove,
+ .resume = mc_sysdev_resume,
};
static __cpuinit int
@@ -425,15 +471,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- if (microcode_init_cpu(cpu))
- printk(KERN_ERR "microcode: failed to init CPU%d\n",
- cpu);
+ microcode_update_cpu(cpu);
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
pr_debug("microcode: CPU%d added\n", cpu);
if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
- printk(KERN_ERR "microcode: Failed to create the sysfs "
- "group for CPU%d\n", cpu);
+ pr_err("microcode: Failed to create group for CPU%d\n", cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
@@ -465,13 +508,10 @@ static int __init microcode_init(void)
microcode_ops = init_amd_microcode();
if (!microcode_ops) {
- printk(KERN_ERR "microcode: no support for this CPU vendor\n");
+ pr_err("microcode: no support for this CPU vendor\n");
return -ENODEV;
}
- error = microcode_dev_init();
- if (error)
- return error;
microcode_pdev = platform_device_register_simple("microcode", -1,
NULL, 0);
if (IS_ERR(microcode_pdev)) {
@@ -480,23 +520,31 @@ static int __init microcode_init(void)
}
get_online_cpus();
+ mutex_lock(&microcode_mutex);
+
error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+
+ mutex_unlock(&microcode_mutex);
put_online_cpus();
+
if (error) {
- microcode_dev_exit();
platform_device_unregister(microcode_pdev);
return error;
}
+ error = microcode_dev_init();
+ if (error)
+ return error;
+
register_hotcpu_notifier(&mc_cpu_notifier);
- printk(KERN_INFO
- "Microcode Update Driver: v" MICROCODE_VERSION
+ pr_info("Microcode Update Driver: v" MICROCODE_VERSION
" <tigran@aivazian.fsnet.co.uk>,"
" Peter Oruba\n");
return 0;
}
+module_init(microcode_init);
static void __exit microcode_exit(void)
{
@@ -505,16 +553,17 @@ static void __exit microcode_exit(void)
unregister_hotcpu_notifier(&mc_cpu_notifier);
get_online_cpus();
+ mutex_lock(&microcode_mutex);
+
sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+
+ mutex_unlock(&microcode_mutex);
put_online_cpus();
platform_device_unregister(microcode_pdev);
microcode_ops = NULL;
- printk(KERN_INFO
- "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
+ pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
}
-
-module_init(microcode_init);
module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 149b9ec7c1a..0d334ddd0a9 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,24 +70,11 @@
* Fix sigmatch() macro to handle old CPUs with pf == 0.
* Thanks to Stuart Swales for pointing out this bug.
*/
-#include <linux/platform_device.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
#include <linux/firmware.h>
-#include <linux/smp_lock.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
+#include <linux/vmalloc.h>
#include <asm/microcode.h>
#include <asm/processor.h>
@@ -150,13 +137,9 @@ struct extended_sigtable {
#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-/* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu_num);
- unsigned long flags;
unsigned int val[2];
memset(csig, 0, sizeof(*csig));
@@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
csig->pf = 1 << ((val[1] >> 18) & 7);
}
- /* serialize access to the physical write to MSR 0x79 */
- spin_lock_irqsave(&microcode_update_lock, flags);
-
wrmsr(MSR_IA32_UCODE_REV, 0, 0);
/* see notes above for revision 1.07. Apparent chip bug */
sync_core();
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
- spin_unlock_irqrestore(&microcode_update_lock, flags);
- pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
- csig->sig, csig->pf, csig->rev);
+ printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
+ cpu_num, csig->sig, csig->pf, csig->rev);
return 0;
}
@@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
return 0;
}
-static void apply_microcode(int cpu)
+static int apply_microcode(int cpu)
{
struct microcode_intel *mc_intel;
struct ucode_cpu_info *uci;
- unsigned long flags;
unsigned int val[2];
int cpu_num;
@@ -334,10 +312,7 @@ static void apply_microcode(int cpu)
BUG_ON(cpu_num != cpu);
if (mc_intel == NULL)
- return;
-
- /* serialize access to the physical write to MSR 0x79 */
- spin_lock_irqsave(&microcode_update_lock, flags);
+ return 0;
/* write microcode via MSR 0x79 */
wrmsr(MSR_IA32_UCODE_WRITE,
@@ -351,30 +326,32 @@ static void apply_microcode(int cpu)
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
- spin_unlock_irqrestore(&microcode_update_lock, flags);
if (val[1] != mc_intel->hdr.rev) {
- printk(KERN_ERR "microcode: CPU%d update from revision "
- "0x%x to 0x%x failed\n",
- cpu_num, uci->cpu_sig.rev, val[1]);
- return;
+ printk(KERN_ERR "microcode: CPU%d update "
+ "to revision 0x%x failed\n",
+ cpu_num, mc_intel->hdr.rev);
+ return -1;
}
- printk(KERN_INFO "microcode: CPU%d updated from revision "
- "0x%x to 0x%x, date = %04x-%02x-%02x \n",
- cpu_num, uci->cpu_sig.rev, val[1],
+ printk(KERN_INFO "microcode: CPU%d updated to revision "
+ "0x%x, date = %04x-%02x-%02x \n",
+ cpu_num, val[1],
mc_intel->hdr.date & 0xffff,
mc_intel->hdr.date >> 24,
(mc_intel->hdr.date >> 16) & 0xff);
uci->cpu_sig.rev = val[1];
+
+ return 0;
}
-static int generic_load_microcode(int cpu, void *data, size_t size,
- int (*get_ucode_data)(void *, const void *, size_t))
+static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
+ int (*get_ucode_data)(void *, const void *, size_t))
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
u8 *ucode_ptr = data, *new_mc = NULL, *mc;
int new_rev = uci->cpu_sig.rev;
unsigned int leftover = size;
+ enum ucode_state state = UCODE_OK;
while (leftover) {
struct microcode_header_intel mc_header;
@@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
leftover -= mc_size;
}
- if (!new_mc)
+ if (leftover) {
+ if (new_mc)
+ vfree(new_mc);
+ state = UCODE_ERROR;
goto out;
+ }
- if (leftover) {
- vfree(new_mc);
+ if (!new_mc) {
+ state = UCODE_NFOUND;
goto out;
}
@@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
pr_debug("microcode: CPU%d found a matching microcode update with"
" version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
-
- out:
- return (int)leftover;
+out:
+ return state;
}
static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
return 0;
}
-static int request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
{
char name[30];
struct cpuinfo_x86 *c = &cpu_data(cpu);
const struct firmware *firmware;
- int ret;
+ enum ucode_state ret;
- /* We should bind the task to the CPU */
- BUG_ON(cpu != raw_smp_processor_id());
sprintf(name, "intel-ucode/%02x-%02x-%02x",
c->x86, c->x86_model, c->x86_mask);
- ret = request_firmware(&firmware, name, device);
- if (ret) {
+
+ if (request_firmware(&firmware, name, device)) {
pr_debug("microcode: data file %s load failed\n", name);
- return ret;
+ return UCODE_NFOUND;
}
ret = generic_load_microcode(cpu, (void *)firmware->data,
@@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)
return copy_from_user(to, from, n);
}
-static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
{
- /* We should bind the task to the CPU */
- BUG_ON(cpu != raw_smp_processor_id());
-
return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
}
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module.c
index c23880b90b5..89f386f044e 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module.c
@@ -1,6 +1,5 @@
-/* Kernel module help for x86-64
+/* Kernel module help for x86.
Copyright (C) 2001 Rusty Russell.
- Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -22,23 +21,18 @@
#include <linux/fs.h>
#include <linux/string.h>
#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
#include <linux/bug.h>
+#include <linux/mm.h>
#include <asm/system.h>
#include <asm/page.h>
#include <asm/pgtable.h>
+#if 0
+#define DEBUGP printk
+#else
#define DEBUGP(fmt...)
-
-#ifndef CONFIG_UML
-void module_free(struct module *mod, void *module_region)
-{
- vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
-}
+#endif
void *module_alloc(unsigned long size)
{
@@ -54,9 +48,15 @@ void *module_alloc(unsigned long size)
if (!area)
return NULL;
- return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
+ return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
+ PAGE_KERNEL_EXEC);
+}
+
+/* Free memory returned from module_alloc */
+void module_free(struct module *mod, void *module_region)
+{
+ vfree(module_region);
}
-#endif
/* We don't need anything special. */
int module_frob_arch_sections(Elf_Ehdr *hdr,
@@ -67,6 +67,58 @@ int module_frob_arch_sections(Elf_Ehdr *hdr,
return 0;
}
+#ifdef CONFIG_X86_32
+int apply_relocate(Elf32_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ unsigned int i;
+ Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
+ Elf32_Sym *sym;
+ uint32_t *location;
+
+ DEBUGP("Applying relocate section %u to %u\n", relsec,
+ sechdrs[relsec].sh_info);
+ for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+ /* This is where to make the change */
+ location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+ + rel[i].r_offset;
+ /* This is the symbol it is referring to. Note that all
+ undefined symbols have been resolved. */
+ sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
+ + ELF32_R_SYM(rel[i].r_info);
+
+ switch (ELF32_R_TYPE(rel[i].r_info)) {
+ case R_386_32:
+ /* We add the value into the location given */
+ *location += sym->st_value;
+ break;
+ case R_386_PC32:
+ /* Add the value, subtract its postition */
+ *location += sym->st_value - (uint32_t)location;
+ break;
+ default:
+ printk(KERN_ERR "module %s: Unknown relocation: %u\n",
+ me->name, ELF32_R_TYPE(rel[i].r_info));
+ return -ENOEXEC;
+ }
+ }
+ return 0;
+}
+
+int apply_relocate_add(Elf32_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
+ me->name);
+ return -ENOEXEC;
+}
+#else /*X86_64*/
int apply_relocate_add(Elf64_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
@@ -147,6 +199,8 @@ int apply_relocate(Elf_Shdr *sechdrs,
return -ENOSYS;
}
+#endif
+
int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
deleted file mode 100644
index 0edd819050e..00000000000
--- a/arch/x86/kernel/module_32.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/* Kernel module help for i386.
- Copyright (C) 2001 Rusty Russell.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-*/
-#include <linux/moduleloader.h>
-#include <linux/elf.h>
-#include <linux/vmalloc.h>
-#include <linux/fs.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/bug.h>
-
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(fmt...)
-#endif
-
-void *module_alloc(unsigned long size)
-{
- if (size == 0)
- return NULL;
- return vmalloc_exec(size);
-}
-
-
-/* Free memory returned from module_alloc */
-void module_free(struct module *mod, void *module_region)
-{
- vfree(module_region);
- /* FIXME: If module_region == mod->init_region, trim exception
- table entries. */
-}
-
-/* We don't need anything special. */
-int module_frob_arch_sections(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- char *secstrings,
- struct module *mod)
-{
- return 0;
-}
-
-int apply_relocate(Elf32_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- unsigned int i;
- Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
- Elf32_Sym *sym;
- uint32_t *location;
-
- DEBUGP("Applying relocate section %u to %u\n", relsec,
- sechdrs[relsec].sh_info);
- for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
- /* This is where to make the change */
- location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
- + rel[i].r_offset;
- /* This is the symbol it is referring to. Note that all
- undefined symbols have been resolved. */
- sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
- + ELF32_R_SYM(rel[i].r_info);
-
- switch (ELF32_R_TYPE(rel[i].r_info)) {
- case R_386_32:
- /* We add the value into the location given */
- *location += sym->st_value;
- break;
- case R_386_PC32:
- /* Add the value, subtract its postition */
- *location += sym->st_value - (uint32_t)location;
- break;
- default:
- printk(KERN_ERR "module %s: Unknown relocation: %u\n",
- me->name, ELF32_R_TYPE(rel[i].r_info));
- return -ENOEXEC;
- }
- }
- return 0;
-}
-
-int apply_relocate_add(Elf32_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
- me->name);
- return -ENOEXEC;
-}
-
-int module_finalize(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- struct module *me)
-{
- const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
- *para = NULL;
- char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
-
- for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
- if (!strcmp(".text", secstrings + s->sh_name))
- text = s;
- if (!strcmp(".altinstructions", secstrings + s->sh_name))
- alt = s;
- if (!strcmp(".smp_locks", secstrings + s->sh_name))
- locks = s;
- if (!strcmp(".parainstructions", secstrings + s->sh_name))
- para = s;
- }
-
- if (alt) {
- /* patch .altinstructions */
- void *aseg = (void *)alt->sh_addr;
- apply_alternatives(aseg, aseg + alt->sh_size);
- }
- if (locks && text) {
- void *lseg = (void *)locks->sh_addr;
- void *tseg = (void *)text->sh_addr;
- alternatives_smp_module_add(me, me->name,
- lseg, lseg + locks->sh_size,
- tseg, tseg + text->sh_size);
- }
-
- if (para) {
- void *pseg = (void *)para->sh_addr;
- apply_paravirt(pseg, pseg + para->sh_size);
- }
-
- return module_bug_finalize(hdr, sechdrs, me);
-}
-
-void module_arch_cleanup(struct module *mod)
-{
- alternatives_smp_module_del(mod);
- module_bug_cleanup(mod);
-}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 70fd7e414c1..651c93b2886 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -17,6 +17,7 @@
#include <linux/acpi.h>
#include <linux/module.h>
#include <linux/smp.h>
+#include <linux/pci.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
@@ -870,24 +871,17 @@ static
inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
#endif /* CONFIG_X86_IO_APIC */
-static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
- int count)
+static int
+check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
{
- if (!mpc_new_phys) {
- pr_info("No spare slots, try to append...take your risk, "
- "new mpc_length %x\n", count);
- } else {
- if (count <= mpc_new_length)
- pr_info("No spare slots, try to append..., "
- "new mpc_length %x\n", count);
- else {
- pr_err("mpc_new_length %lx is too small\n",
- mpc_new_length);
- return -1;
- }
+ int ret = 0;
+
+ if (!mpc_new_phys || count <= mpc_new_length) {
+ WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
+ return -1;
}
- return 0;
+ return ret;
}
static int __init replace_intsrc_all(struct mpc_table *mpc,
@@ -946,7 +940,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
} else {
struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
count += sizeof(struct mpc_intsrc);
- if (!check_slot(mpc_new_phys, mpc_new_length, count))
+ if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
goto out;
assign_to_mpc_intsrc(&mp_irqs[i], m);
mpc->length = count;
@@ -963,11 +957,14 @@ out:
return 0;
}
-static int __initdata enable_update_mptable;
+int enable_update_mptable;
static int __init update_mptable_setup(char *str)
{
enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+ pci_routeirq = 1;
+#endif
return 0;
}
early_param("update_mptable", update_mptable_setup);
@@ -980,6 +977,9 @@ static int __initdata alloc_mptable;
static int __init parse_alloc_mptable_opt(char *p)
{
enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+ pci_routeirq = 1;
+#endif
alloc_mptable = 1;
if (!p)
return 0;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 8e45f446488..70ec9b951d7 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -134,7 +134,9 @@ static void *get_call_destination(u8 type)
.pv_irq_ops = pv_irq_ops,
.pv_apic_ops = pv_apic_ops,
.pv_mmu_ops = pv_mmu_ops,
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
.pv_lock_ops = pv_lock_ops,
+#endif
};
return *((void **)&tmpl + type);
}
@@ -246,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
static inline void enter_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
- BUG_ON(preemptible());
+ BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
- __get_cpu_var(paravirt_lazy_mode) = mode;
+ percpu_write(paravirt_lazy_mode, mode);
}
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
+static void leave_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
- BUG_ON(preemptible());
+ BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
- __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
+ percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
}
void paravirt_enter_lazy_mmu(void)
@@ -267,22 +267,36 @@ void paravirt_enter_lazy_mmu(void)
void paravirt_leave_lazy_mmu(void)
{
- paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
+ leave_lazy(PARAVIRT_LAZY_MMU);
}
-void paravirt_enter_lazy_cpu(void)
+void paravirt_start_context_switch(struct task_struct *prev)
{
+ BUG_ON(preemptible());
+
+ if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
+ }
enter_lazy(PARAVIRT_LAZY_CPU);
}
-void paravirt_leave_lazy_cpu(void)
+void paravirt_end_context_switch(struct task_struct *next)
{
- paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
+ BUG_ON(preemptible());
+
+ leave_lazy(PARAVIRT_LAZY_CPU);
+
+ if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
+ arch_enter_lazy_mmu_mode();
}
enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
{
- return __get_cpu_var(paravirt_lazy_mode);
+ if (in_interrupt())
+ return PARAVIRT_LAZY_NONE;
+
+ return percpu_read(paravirt_lazy_mode);
}
void arch_flush_lazy_mmu_mode(void)
@@ -290,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void)
preempt_disable();
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- WARN_ON(preempt_count() == 1);
arch_leave_lazy_mmu_mode();
arch_enter_lazy_mmu_mode();
}
@@ -298,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void)
preempt_enable();
}
-void arch_flush_lazy_cpu_mode(void)
-{
- preempt_disable();
-
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
- WARN_ON(preempt_count() == 1);
- arch_leave_lazy_cpu_mode();
- arch_enter_lazy_cpu_mode();
- }
-
- preempt_enable();
-}
-
struct pv_info pv_info = {
.name = "bare hardware",
.paravirt_enabled = 0,
@@ -402,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = {
.set_iopl_mask = native_set_iopl_mask,
.io_delay = native_io_delay,
- .lazy_mode = {
- .enter = paravirt_nop,
- .leave = paravirt_nop,
- },
+ .start_context_switch = paravirt_nop,
+ .end_context_switch = paravirt_nop,
};
struct pv_apic_ops pv_apic_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 755c21e906f..971a3bec47a 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = {
static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
-/* enable this to stress test the chip's TCE cache */
-#ifdef CONFIG_IOMMU_DEBUG
-static int debugging = 1;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
- int expected, unsigned long start, unsigned long end)
-{
- unsigned long idx = start;
-
- BUG_ON(start >= end);
-
- while (idx < end) {
- if (!!test_bit(idx, bitmap) != expected)
- return idx;
- ++idx;
- }
-
- /* all bits have the expected value */
- return ~0UL;
-}
-#else /* debugging is disabled */
-static int debugging;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
- int expected, unsigned long start, unsigned long end)
-{
- return ~0UL;
-}
-
-#endif /* CONFIG_IOMMU_DEBUG */
-
static inline int translation_enabled(struct iommu_table *tbl)
{
/* only PHBs with translation enabled have an IOMMU table */
@@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
{
unsigned long index;
unsigned long end;
- unsigned long badbit;
unsigned long flags;
index = start_addr >> PAGE_SHIFT;
@@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
spin_lock_irqsave(&tbl->it_lock, flags);
- badbit = verify_bit_range(tbl->it_map, 0, index, end);
- if (badbit != ~0UL) {
- if (printk_ratelimit())
- printk(KERN_ERR "Calgary: entry already allocated at "
- "0x%lx tbl %p dma 0x%lx npages %u\n",
- badbit, tbl, start_addr, npages);
- }
-
iommu_area_reserve(tbl->it_map, index, npages);
spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
unsigned int npages)
{
unsigned long entry;
- unsigned long badbit;
unsigned long badend;
unsigned long flags;
@@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
spin_lock_irqsave(&tbl->it_lock, flags);
- badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
- if (badbit != ~0UL) {
- if (printk_ratelimit())
- printk(KERN_ERR "Calgary: bit is off at 0x%lx "
- "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
- badbit, tbl, dma_addr, entry, npages);
- }
-
iommu_area_free(tbl->it_map, entry, npages);
spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -1488,9 +1439,8 @@ void __init detect_calgary(void)
iommu_detected = 1;
calgary_detected = 1;
printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
- printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
- "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
- debugging ? "enabled" : "disabled");
+ printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
+ specified_table_size);
/* swiotlb for devices that aren't behind the Calgary. */
if (max_pfn > MAX_DMA32_PFN)
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index b284b58c035..cfd9f906389 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -144,48 +144,21 @@ static void flush_gart(void)
}
#ifdef CONFIG_IOMMU_LEAK
-
-#define SET_LEAK(x) \
- do { \
- if (iommu_leak_tab) \
- iommu_leak_tab[x] = __builtin_return_address(0);\
- } while (0)
-
-#define CLEAR_LEAK(x) \
- do { \
- if (iommu_leak_tab) \
- iommu_leak_tab[x] = NULL; \
- } while (0)
-
/* Debugging aid for drivers that don't free their IOMMU tables */
-static void **iommu_leak_tab;
static int leak_trace;
static int iommu_leak_pages = 20;
static void dump_leak(void)
{
- int i;
static int dump;
- if (dump || !iommu_leak_tab)
+ if (dump)
return;
dump = 1;
- show_stack(NULL, NULL);
- /* Very crude. dump some from the end of the table too */
- printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n",
- iommu_leak_pages);
- for (i = 0; i < iommu_leak_pages; i += 2) {
- printk(KERN_DEBUG "%lu: ", iommu_pages-i);
- printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
- 0);
- printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
- }
- printk(KERN_DEBUG "\n");
+ show_stack(NULL, NULL);
+ debug_dma_dump_mappings(NULL);
}
-#else
-# define SET_LEAK(x)
-# define CLEAR_LEAK(x)
#endif
static void iommu_full(struct device *dev, size_t size, int dir)
@@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
for (i = 0; i < npages; i++) {
iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
- SET_LEAK(iommu_page + i);
phys_mem += PAGE_SIZE;
}
return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
@@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
for (i = 0; i < npages; i++) {
iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
- CLEAR_LEAK(iommu_page + i);
}
free_iommu(iommu_page, npages);
}
@@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
while (pages--) {
iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
- SET_LEAK(iommu_page);
addr += PAGE_SIZE;
iommu_page++;
}
@@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
agp_gatt_table = gatt;
- enable_gart_translations();
-
error = sysdev_class_register(&gart_sysdev_class);
if (!error)
error = sysdev_register(&device_gart);
@@ -801,11 +769,12 @@ void __init gart_iommu_init(void)
#ifdef CONFIG_IOMMU_LEAK
if (leak_trace) {
- iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
- get_order(iommu_pages*sizeof(void *)));
- if (!iommu_leak_tab)
+ int ret;
+
+ ret = dma_debug_resize_entries(iommu_pages);
+ if (ret)
printk(KERN_DEBUG
- "PCI-DMA: Cannot allocate leak trace area\n");
+ "PCI-DMA: Cannot trace all the entries\n");
}
#endif
@@ -845,6 +814,14 @@ void __init gart_iommu_init(void)
* the pages as Not-Present:
*/
wbinvd();
+
+ /*
+ * Now all caches are flushed and we can safely enable
+ * GART hardware. Doing it early leaves the possibility
+ * of stale cache entries that can lead to GART PTE
+ * errors.
+ */
+ enable_gart_translations();
/*
* Try to workaround a bug (thanks to BenH):
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 221a3853e26..a1712f2b50f 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
return paddr;
}
-phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
+phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
{
return baddr;
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ca989158e84..3bb2be1649b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -8,12 +8,15 @@
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
+#include <linux/random.h>
#include <trace/power.h>
#include <asm/system.h>
#include <asm/apic.h>
+#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
+#include <asm/ds.h>
unsigned long idle_halt;
EXPORT_SYMBOL(idle_halt);
@@ -45,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk)
kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
tsk->thread.xstate = NULL;
}
+
+ WARN(tsk->thread.ds_ctx, "leaking DS context\n");
}
void free_thread_info(struct thread_info *ti)
@@ -83,8 +88,6 @@ void exit_thread(void)
put_cpu();
kfree(bp);
}
-
- ds_exit_thread(current);
}
void flush_thread(void)
@@ -613,3 +616,16 @@ static int __init idle_setup(char *str)
}
early_param("idle", idle_setup);
+unsigned long arch_align_stack(unsigned long sp)
+{
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+ sp -= get_random_int() % 8192;
+ return sp & ~0xf;
+}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ unsigned long range_end = mm->brk + 0x02000000;
+ return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
+
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 76f8f84043a..59f4524984a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,8 +9,6 @@
* This file handles the architecture-dependent parts of process handling..
*/
-#include <stdarg.h>
-
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
@@ -33,7 +31,6 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/ptrace.h>
-#include <linux/random.h>
#include <linux/personality.h>
#include <linux/tick.h>
#include <linux/percpu.h>
@@ -290,7 +287,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.io_bitmap_max = 0;
}
- ds_copy_thread(p, current);
+ clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+ p->thread.ds_ctx = NULL;
clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
p->thread.debugctlmsr = 0;
@@ -407,7 +405,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* done before math_state_restore, so the TS bit is up
* to date.
*/
- arch_leave_lazy_cpu_mode();
+ arch_end_context_switch(next_p);
/* If the task has used fpu the last 5 timeslices, just do a full
* restore of the math state immediately to avoid the trap; the
@@ -497,15 +495,3 @@ unsigned long get_wchan(struct task_struct *p)
return 0;
}
-unsigned long arch_align_stack(unsigned long sp)
-{
- if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
- sp -= get_random_int() % 8192;
- return sp & ~0xf;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
- unsigned long range_end = mm->brk + 0x02000000;
- return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
-}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b751a41392b..ebefb5407b9 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,8 +14,6 @@
* This file handles the architecture-dependent parts of process handling..
*/
-#include <stdarg.h>
-
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
@@ -32,7 +30,6 @@
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
-#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
@@ -335,7 +332,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
goto out;
}
- ds_copy_thread(p, me);
+ clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+ p->thread.ds_ctx = NULL;
clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
p->thread.debugctlmsr = 0;
@@ -428,7 +426,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* done before math_state_restore, so the TS bit is up
* to date.
*/
- arch_leave_lazy_cpu_mode();
+ arch_end_context_switch(next_p);
/*
* Switch FS and GS.
@@ -660,15 +658,3 @@ long sys_arch_prctl(int code, unsigned long addr)
return do_arch_prctl(current, code, addr);
}
-unsigned long arch_align_stack(unsigned long sp)
-{
- if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
- sp -= get_random_int() % 8192;
- return sp & ~0xf;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
- unsigned long range_end = mm->brk + 0x02000000;
- return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
-}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 23b7c8f017e..09ecbde91c1 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
+#include <linux/workqueue.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -578,17 +579,130 @@ static int ioperm_get(struct task_struct *target,
}
#ifdef CONFIG_X86_PTRACE_BTS
+/*
+ * A branch trace store context.
+ *
+ * Contexts may only be installed by ptrace_bts_config() and only for
+ * ptraced tasks.
+ *
+ * Contexts are destroyed when the tracee is detached from the tracer.
+ * The actual destruction work requires interrupts enabled, so the
+ * work is deferred and will be scheduled during __ptrace_unlink().
+ *
+ * Contexts hold an additional task_struct reference on the traced
+ * task, as well as a reference on the tracer's mm.
+ *
+ * Ptrace already holds a task_struct for the duration of ptrace operations,
+ * but since destruction is deferred, it may be executed after both
+ * tracer and tracee exited.
+ */
+struct bts_context {
+ /* The branch trace handle. */
+ struct bts_tracer *tracer;
+
+ /* The buffer used to store the branch trace and its size. */
+ void *buffer;
+ unsigned int size;
+
+ /* The mm that paid for the above buffer. */
+ struct mm_struct *mm;
+
+ /* The task this context belongs to. */
+ struct task_struct *task;
+
+ /* The signal to send on a bts buffer overflow. */
+ unsigned int bts_ovfl_signal;
+
+ /* The work struct to destroy a context. */
+ struct work_struct work;
+};
+
+static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
+{
+ void *buffer = NULL;
+ int err = -ENOMEM;
+
+ err = account_locked_memory(current->mm, current->signal->rlim, size);
+ if (err < 0)
+ return err;
+
+ buffer = kzalloc(size, GFP_KERNEL);
+ if (!buffer)
+ goto out_refund;
+
+ context->buffer = buffer;
+ context->size = size;
+ context->mm = get_task_mm(current);
+
+ return 0;
+
+ out_refund:
+ refund_locked_memory(current->mm, size);
+ return err;
+}
+
+static inline void free_bts_buffer(struct bts_context *context)
+{
+ if (!context->buffer)
+ return;
+
+ kfree(context->buffer);
+ context->buffer = NULL;
+
+ refund_locked_memory(context->mm, context->size);
+ context->size = 0;
+
+ mmput(context->mm);
+ context->mm = NULL;
+}
+
+static void free_bts_context_work(struct work_struct *w)
+{
+ struct bts_context *context;
+
+ context = container_of(w, struct bts_context, work);
+
+ ds_release_bts(context->tracer);
+ put_task_struct(context->task);
+ free_bts_buffer(context);
+ kfree(context);
+}
+
+static inline void free_bts_context(struct bts_context *context)
+{
+ INIT_WORK(&context->work, free_bts_context_work);
+ schedule_work(&context->work);
+}
+
+static inline struct bts_context *alloc_bts_context(struct task_struct *task)
+{
+ struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
+ if (context) {
+ context->task = task;
+ task->bts = context;
+
+ get_task_struct(task);
+ }
+
+ return context;
+}
+
static int ptrace_bts_read_record(struct task_struct *child, size_t index,
struct bts_struct __user *out)
{
+ struct bts_context *context;
const struct bts_trace *trace;
struct bts_struct bts;
const unsigned char *at;
int error;
- trace = ds_read_bts(child->bts);
+ context = child->bts;
+ if (!context)
+ return -ESRCH;
+
+ trace = ds_read_bts(context->tracer);
if (!trace)
- return -EPERM;
+ return -ESRCH;
at = trace->ds.top - ((index + 1) * trace->ds.size);
if ((void *)at < trace->ds.begin)
@@ -597,7 +711,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
if (!trace->read)
return -EOPNOTSUPP;
- error = trace->read(child->bts, at, &bts);
+ error = trace->read(context->tracer, at, &bts);
if (error < 0)
return error;
@@ -611,13 +725,18 @@ static int ptrace_bts_drain(struct task_struct *child,
long size,
struct bts_struct __user *out)
{
+ struct bts_context *context;
const struct bts_trace *trace;
const unsigned char *at;
int error, drained = 0;
- trace = ds_read_bts(child->bts);
+ context = child->bts;
+ if (!context)
+ return -ESRCH;
+
+ trace = ds_read_bts(context->tracer);
if (!trace)
- return -EPERM;
+ return -ESRCH;
if (!trace->read)
return -EOPNOTSUPP;
@@ -628,9 +747,8 @@ static int ptrace_bts_drain(struct task_struct *child,
for (at = trace->ds.begin; (void *)at < trace->ds.top;
out++, drained++, at += trace->ds.size) {
struct bts_struct bts;
- int error;
- error = trace->read(child->bts, at, &bts);
+ error = trace->read(context->tracer, at, &bts);
if (error < 0)
return error;
@@ -640,35 +758,18 @@ static int ptrace_bts_drain(struct task_struct *child,
memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
- error = ds_reset_bts(child->bts);
+ error = ds_reset_bts(context->tracer);
if (error < 0)
return error;
return drained;
}
-static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
-{
- child->bts_buffer = alloc_locked_buffer(size);
- if (!child->bts_buffer)
- return -ENOMEM;
-
- child->bts_size = size;
-
- return 0;
-}
-
-static void ptrace_bts_free_buffer(struct task_struct *child)
-{
- free_locked_buffer(child->bts_buffer, child->bts_size);
- child->bts_buffer = NULL;
- child->bts_size = 0;
-}
-
static int ptrace_bts_config(struct task_struct *child,
long cfg_size,
const struct ptrace_bts_config __user *ucfg)
{
+ struct bts_context *context;
struct ptrace_bts_config cfg;
unsigned int flags = 0;
@@ -678,28 +779,33 @@ static int ptrace_bts_config(struct task_struct *child,
if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
return -EFAULT;
- if (child->bts) {
- ds_release_bts(child->bts);
- child->bts = NULL;
- }
+ context = child->bts;
+ if (!context)
+ context = alloc_bts_context(child);
+ if (!context)
+ return -ENOMEM;
if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
if (!cfg.signal)
return -EINVAL;
- child->thread.bts_ovfl_signal = cfg.signal;
return -EOPNOTSUPP;
+ context->bts_ovfl_signal = cfg.signal;
}
- if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
- (cfg.size != child->bts_size)) {
- int error;
+ ds_release_bts(context->tracer);
+ context->tracer = NULL;
- ptrace_bts_free_buffer(child);
+ if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
+ int err;
- error = ptrace_bts_allocate_buffer(child, cfg.size);
- if (error < 0)
- return error;
+ free_bts_buffer(context);
+ if (!cfg.size)
+ return 0;
+
+ err = alloc_bts_buffer(context, cfg.size);
+ if (err < 0)
+ return err;
}
if (cfg.flags & PTRACE_BTS_O_TRACE)
@@ -708,15 +814,14 @@ static int ptrace_bts_config(struct task_struct *child,
if (cfg.flags & PTRACE_BTS_O_SCHED)
flags |= BTS_TIMESTAMPS;
- child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
- /* ovfl = */ NULL, /* th = */ (size_t)-1,
- flags);
- if (IS_ERR(child->bts)) {
- int error = PTR_ERR(child->bts);
-
- ptrace_bts_free_buffer(child);
- child->bts = NULL;
+ context->tracer =
+ ds_request_bts_task(child, context->buffer, context->size,
+ NULL, (size_t)-1, flags);
+ if (unlikely(IS_ERR(context->tracer))) {
+ int error = PTR_ERR(context->tracer);
+ free_bts_buffer(context);
+ context->tracer = NULL;
return error;
}
@@ -727,20 +832,25 @@ static int ptrace_bts_status(struct task_struct *child,
long cfg_size,
struct ptrace_bts_config __user *ucfg)
{
+ struct bts_context *context;
const struct bts_trace *trace;
struct ptrace_bts_config cfg;
+ context = child->bts;
+ if (!context)
+ return -ESRCH;
+
if (cfg_size < sizeof(cfg))
return -EIO;
- trace = ds_read_bts(child->bts);
+ trace = ds_read_bts(context->tracer);
if (!trace)
- return -EPERM;
+ return -ESRCH;
memset(&cfg, 0, sizeof(cfg));
- cfg.size = trace->ds.end - trace->ds.begin;
- cfg.signal = child->thread.bts_ovfl_signal;
- cfg.bts_size = sizeof(struct bts_struct);
+ cfg.size = trace->ds.end - trace->ds.begin;
+ cfg.signal = context->bts_ovfl_signal;
+ cfg.bts_size = sizeof(struct bts_struct);
if (cfg.signal)
cfg.flags |= PTRACE_BTS_O_SIGNAL;
@@ -759,80 +869,51 @@ static int ptrace_bts_status(struct task_struct *child,
static int ptrace_bts_clear(struct task_struct *child)
{
+ struct bts_context *context;
const struct bts_trace *trace;
- trace = ds_read_bts(child->bts);
+ context = child->bts;
+ if (!context)
+ return -ESRCH;
+
+ trace = ds_read_bts(context->tracer);
if (!trace)
- return -EPERM;
+ return -ESRCH;
memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
- return ds_reset_bts(child->bts);
+ return ds_reset_bts(context->tracer);
}
static int ptrace_bts_size(struct task_struct *child)
{
+ struct bts_context *context;
const struct bts_trace *trace;
- trace = ds_read_bts(child->bts);
+ context = child->bts;
+ if (!context)
+ return -ESRCH;
+
+ trace = ds_read_bts(context->tracer);
if (!trace)
- return -EPERM;
+ return -ESRCH;
return (trace->ds.top - trace->ds.begin) / trace->ds.size;
}
-static void ptrace_bts_fork(struct task_struct *tsk)
-{
- tsk->bts = NULL;
- tsk->bts_buffer = NULL;
- tsk->bts_size = 0;
- tsk->thread.bts_ovfl_signal = 0;
-}
-
-static void ptrace_bts_untrace(struct task_struct *child)
+/*
+ * Called from __ptrace_unlink() after the child has been moved back
+ * to its original parent.
+ */
+void ptrace_bts_untrace(struct task_struct *child)
{
if (unlikely(child->bts)) {
- ds_release_bts(child->bts);
+ free_bts_context(child->bts);
child->bts = NULL;
-
- /* We cannot update total_vm and locked_vm since
- child's mm is already gone. But we can reclaim the
- memory. */
- kfree(child->bts_buffer);
- child->bts_buffer = NULL;
- child->bts_size = 0;
}
}
-
-static void ptrace_bts_detach(struct task_struct *child)
-{
- /*
- * Ptrace_detach() races with ptrace_untrace() in case
- * the child dies and is reaped by another thread.
- *
- * We only do the memory accounting at this point and
- * leave the buffer deallocation and the bts tracer
- * release to ptrace_bts_untrace() which will be called
- * later on with tasklist_lock held.
- */
- release_locked_buffer(child->bts_buffer, child->bts_size);
-}
-#else
-static inline void ptrace_bts_fork(struct task_struct *tsk) {}
-static inline void ptrace_bts_detach(struct task_struct *child) {}
-static inline void ptrace_bts_untrace(struct task_struct *child) {}
#endif /* CONFIG_X86_PTRACE_BTS */
-void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
- ptrace_bts_fork(child);
-}
-
-void x86_ptrace_untrace(struct task_struct *child)
-{
- ptrace_bts_untrace(child);
-}
-
/*
* Called by kernel/ptrace.c when detaching..
*
@@ -844,7 +925,6 @@ void ptrace_disable(struct task_struct *child)
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
#endif
- ptrace_bts_detach(child);
}
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 7563b31b4f0..af71d06624b 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,5 +491,42 @@ void force_hpet_resume(void)
break;
}
}
+#endif
+
+#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
+/* Set correct numa_node information for AMD NB functions */
+static void __init quirk_amd_nb_node(struct pci_dev *dev)
+{
+ struct pci_dev *nb_ht;
+ unsigned int devfn;
+ u32 val;
+
+ devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
+ nb_ht = pci_get_slot(dev->bus, devfn);
+ if (!nb_ht)
+ return;
+
+ pci_read_config_dword(nb_ht, 0x60, &val);
+ set_dev_node(&dev->dev, val & 7);
+ pci_dev_put(dev);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
+ quirk_amd_nb_node);
#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1340dad417f..d2d1ce8170f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
},
},
+ { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 360",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
+ DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
+ },
+ },
{ /* Handle problems with rebooting on Dell 2400's */
.callback = set_bios_reboot,
.ident = "Dell PowerEdge 2400",
@@ -232,6 +241,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
},
},
+ { /* Handle problems with rebooting on Sony VGN-Z540N */
+ .callback = set_bios_reboot,
+ .ident = "Sony VGN-Z540N",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
+ },
+ },
{ }
};
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4158439bf6..be5ae80f897 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,14 @@
#define ARCH_SETUP
#endif
+/*
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */
+unsigned long max_low_pfn_mapped;
+unsigned long max_pfn_mapped;
+
RESERVE_BRK(dmi_alloc, 65536);
unsigned int boot_cpu_id __read_mostly;
@@ -214,8 +222,8 @@ unsigned long mmu_cr4_features;
unsigned long mmu_cr4_features = X86_CR4_PAE;
#endif
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
+/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
+int bootloader_type, bootloader_version;
/*
* Setup options
@@ -293,15 +301,13 @@ static void __init reserve_brk(void)
#ifdef CONFIG_BLK_DEV_INITRD
-#ifdef CONFIG_X86_32
-
#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
static void __init relocate_initrd(void)
{
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
- u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+ u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
u64 ramdisk_here;
unsigned long slop, clen, mapaddr;
char *p, *q;
@@ -357,14 +363,13 @@ static void __init relocate_initrd(void)
ramdisk_image, ramdisk_image + ramdisk_size - 1,
ramdisk_here, ramdisk_here + ramdisk_size - 1);
}
-#endif
static void __init reserve_initrd(void)
{
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
u64 ramdisk_end = ramdisk_image + ramdisk_size;
- u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+ u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
if (!boot_params.hdr.type_of_loader ||
!ramdisk_image || !ramdisk_size)
@@ -394,14 +399,8 @@ static void __init reserve_initrd(void)
return;
}
-#ifdef CONFIG_X86_32
relocate_initrd();
-#else
- printk(KERN_ERR "initrd extends beyond end of memory "
- "(0x%08llx > 0x%08llx)\ndisabling initrd\n",
- ramdisk_end, end_of_lowmem);
- initrd_start = 0;
-#endif
+
free_early(ramdisk_image, ramdisk_end);
}
#else
@@ -706,6 +705,12 @@ void __init setup_arch(char **cmdline_p)
#endif
saved_video_mode = boot_params.hdr.vid_mode;
bootloader_type = boot_params.hdr.type_of_loader;
+ if ((bootloader_type >> 4) == 0xe) {
+ bootloader_type &= 0xf;
+ bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+ }
+ bootloader_version = bootloader_type & 0xf;
+ bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
#ifdef CONFIG_BLK_DEV_RAM
rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
@@ -854,12 +859,16 @@ void __init setup_arch(char **cmdline_p)
max_low_pfn = max_pfn;
high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+ max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
#endif
#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
setup_bios_corruption_check();
#endif
+ printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
+ max_pfn_mapped<<PAGE_SHIFT);
+
reserve_brk();
/* max_pfn_mapped is updated here */
@@ -997,24 +1006,6 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_X86_32
/**
- * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
- *
- * Description:
- * Perform any necessary interrupt initialisation prior to setting up
- * the "ordinary" interrupt call gates. For legacy reasons, the ISA
- * interrupts should be initialised here if the machine emulates a PC
- * in any way.
- **/
-void __init x86_quirk_pre_intr_init(void)
-{
- if (x86_quirks->arch_pre_intr_init) {
- if (x86_quirks->arch_pre_intr_init())
- return;
- }
- init_ISA_irqs();
-}
-
-/**
* x86_quirk_intr_init - post gate setup interrupt initialisation
*
* Description:
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 3a97a4cf187..9c3f0823e6a 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -160,8 +160,10 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
/*
* If large page isn't supported, there's no benefit in doing
* this. Also, on non-NUMA, embedding is better.
+ *
+ * NOTE: disabled for now.
*/
- if (!cpu_has_pse || !pcpu_need_numa())
+ if (true || !cpu_has_pse || !pcpu_need_numa())
return -EINVAL;
/*
@@ -423,6 +425,14 @@ void __init setup_per_cpu_areas(void)
early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif
+#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
+ /*
+ * make sure boot cpu node_number is right, when boot cpu is on the
+ * node that doesn't have mem installed
+ */
+ per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
+#endif
+
/* Setup node to cpumask map */
setup_node_to_cpumask_map();
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 14425166b8e..4c578751e94 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
* 2000-2002 x86-64 support by Andi Kleen
*/
-
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
@@ -25,11 +24,11 @@
#include <asm/ucontext.h>
#include <asm/i387.h>
#include <asm/vdso.h>
+#include <asm/mce.h>
#ifdef CONFIG_X86_64
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
-#include <asm/mce.h>
#endif /* CONFIG_X86_64 */
#include <asm/syscall.h>
@@ -857,10 +856,10 @@ static void do_signal(struct pt_regs *regs)
void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
-#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#ifdef CONFIG_X86_NEW_MCE
/* notify userspace of pending MCEs */
if (thread_info_flags & _TIF_MCE_NOTIFY)
- mce_notify_user();
+ mce_notify_process();
#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
/* deal with pending signal delivery */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 13f33ea8cca..ec1de97600e 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask)
* this function calls the 'stop' function on all other CPUs in the system.
*/
+asmlinkage void smp_reboot_interrupt(void)
+{
+ ack_APIC_irq();
+ irq_enter();
+ stop_this_cpu(NULL);
+ irq_exit();
+}
+
static void native_smp_send_stop(void)
{
unsigned long flags;
+ unsigned long wait;
if (reboot_force)
return;
- smp_call_function(stop_this_cpu, NULL, 0);
+ /*
+ * Use an own vector here because smp_call_function
+ * does lots of things not suitable in a panic situation.
+ * On most systems we could also use an NMI here,
+ * but there are a few systems around where NMI
+ * is problematic so stay with an non NMI for now
+ * (this implies we cannot stop CPUs spinning with irq off
+ * currently)
+ */
+ if (num_online_cpus() > 1) {
+ apic->send_IPI_allbutself(REBOOT_VECTOR);
+
+ /* Don't wait longer than a second */
+ wait = USEC_PER_SEC;
+ while (num_online_cpus() > 1 && wait--)
+ udelay(1);
+ }
+
local_irq_save(flags);
disable_local_APIC();
local_irq_restore(flags);
@@ -172,6 +198,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
inc_irq_stat(irq_resched_count);
+ /*
+ * KVM uses this interrupt to force a cpu out of guest mode
+ */
}
void smp_call_function_interrupt(struct pt_regs *regs)
@@ -193,19 +222,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
}
struct smp_ops smp_ops = {
- .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
- .smp_prepare_cpus = native_smp_prepare_cpus,
- .smp_cpus_done = native_smp_cpus_done,
+ .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = native_smp_prepare_cpus,
+ .smp_cpus_done = native_smp_cpus_done,
- .smp_send_stop = native_smp_send_stop,
- .smp_send_reschedule = native_smp_send_reschedule,
+ .smp_send_stop = native_smp_send_stop,
+ .smp_send_reschedule = native_smp_send_reschedule,
- .cpu_up = native_cpu_up,
- .cpu_die = native_cpu_die,
- .cpu_disable = native_cpu_disable,
- .play_dead = native_play_dead,
+ .cpu_up = native_cpu_up,
+ .cpu_die = native_cpu_die,
+ .cpu_disable = native_cpu_disable,
+ .play_dead = native_play_dead,
- .send_call_func_ipi = native_send_call_func_ipi,
+ .send_call_func_ipi = native_send_call_func_ipi,
.send_call_func_single_ipi = native_send_call_func_single_ipi,
};
EXPORT_SYMBOL_GPL(smp_ops);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 58d24ef917d..2fecda69ee6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid)
* INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
* won't ... remember to clear down the APIC, etc later.
*/
-int __devinit
+int __cpuinit
wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
@@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
-int __devinit
+static int __cpuinit
wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
@@ -822,10 +822,12 @@ do_rest:
/* mark "stuck" area as not stuck */
*((volatile unsigned long *)trampoline_base) = 0;
- /*
- * Cleanup possible dangling ends...
- */
- smpboot_restore_warm_reset_vector();
+ if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
+ /*
+ * Cleanup possible dangling ends...
+ */
+ smpboot_restore_warm_reset_vector();
+ }
return boot_error;
}
@@ -871,7 +873,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
err = do_boot_cpu(apicid, cpu);
- zap_low_mappings();
+ zap_low_mappings(false);
low_mappings = 0;
#else
err = do_boot_cpu(apicid, cpu);
@@ -990,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
*/
if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
!cpu_has_apic) {
- printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
- boot_cpu_physical_apicid);
- printk(KERN_ERR "... forcing use of dummy APIC emulation."
+ if (!disable_apic) {
+ pr_err("BIOS bug, local APIC #%d not detected!...\n",
+ boot_cpu_physical_apicid);
+ pr_err("... forcing use of dummy APIC emulation."
"(tell your hw vendor)\n");
+ }
smpboot_clear_io_apic();
arch_disable_smp_support();
return -1;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index f7bddc2e37d..4aaf7e48394 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
static int save_stack_stack(void *data, char *name)
{
- return -1;
+ return 0;
}
static void save_stack_address(void *data, unsigned long addr, int reliable)
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c8736b49..d51321ddafd 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,5 @@ ENTRY(sys_call_table)
.long sys_inotify_init1
.long sys_preadv
.long sys_pwritev
+ .long sys_rt_tgsigqueueinfo /* 335 */
+ .long sys_perf_counter_open
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index ed0c33761e6..124d40c575d 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode)
struct bau_desc *adp;
struct bau_desc *ad2;
- adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node);
+ /*
+ * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
+ * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
+ */
+ adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
+ UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
BUG_ON(!adp);
pa = uv_gpa(adp); /* need the real nasid*/
@@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode)
(n << UV_DESC_BASE_PNODE_SHIFT | m));
}
- for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
+ /*
+ * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
+ * cpu even though we only use the first one; one descriptor can
+ * describe a broadcast to 256 nodes.
+ */
+ for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
+ i++, ad2++) {
memset(ad2, 0, sizeof(struct bau_desc));
ad2->header.sw_ack_flag = 1;
/*
@@ -832,7 +843,7 @@ static int __init uv_bau_init(void)
return 0;
for_each_possible_cpu(cur_cpu)
- alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
+ zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
GFP_KERNEL, cpu_to_node(cur_cpu));
uv_bau_retry_limit = 1;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a1d288327ff..1e1e27b7d43 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -798,15 +798,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
return new_kesp;
}
-#else
+#endif
+
asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
{
}
-asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
+asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
{
}
-#endif
/*
* 'math_state_restore()' saves the current math information in the
@@ -839,9 +839,6 @@ asmlinkage void math_state_restore(void)
}
clts(); /* Allow maths ops (or we recurse) */
-#ifdef CONFIG_X86_32
- restore_fpu(tsk);
-#else
/*
* Paranoid restore. send a SIGSEGV if we fail to restore the state.
*/
@@ -850,7 +847,7 @@ asmlinkage void math_state_restore(void)
force_sig(SIGSEGV, tsk);
return;
}
-#endif
+
thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
tsk->fpu_counter++;
}
@@ -945,8 +942,13 @@ void __init trap_init(void)
#endif
set_intr_gate(19, &simd_coprocessor_error);
+ /* Reserve all the builtin and the syscall vector: */
+ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
+ set_bit(i, used_vectors);
+
#ifdef CONFIG_IA32_EMULATION
set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+ set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif
#ifdef CONFIG_X86_32
@@ -963,17 +965,9 @@ void __init trap_init(void)
}
set_system_trap_gate(SYSCALL_VECTOR, &system_call);
-#endif
-
- /* Reserve all the builtin and the syscall vector: */
- for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
- set_bit(i, used_vectors);
-
-#ifdef CONFIG_X86_64
- set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#else
set_bit(SYSCALL_VECTOR, used_vectors);
#endif
+
/*
* Should be a barrier for any external CPU state:
*/
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index d57de05dc43..3e1c057e98f 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void)
{
u64 tsc1, tsc2, delta, ref1, ref2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
- unsigned long flags, latch, ms, fast_calibrate, tsc_khz;
+ unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;
int hpet = is_hpet_enabled(), i, loopmin;
- tsc_khz = get_hypervisor_tsc_freq();
- if (tsc_khz) {
+ hv_tsc_khz = get_hypervisor_tsc_freq();
+ if (hv_tsc_khz) {
printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
- return tsc_khz;
+ return hv_tsc_khz;
}
local_irq_save(flags);
@@ -710,7 +710,16 @@ static cycle_t read_tsc(struct clocksource *cs)
#ifdef CONFIG_X86_64
static cycle_t __vsyscall_fn vread_tsc(void)
{
- cycle_t ret = (cycle_t)vget_cycles();
+ cycle_t ret;
+
+ /*
+ * Surround the RDTSC by barriers, to make sure it's not
+ * speculated to outside the seqlock critical section and
+ * does not cause time warps:
+ */
+ rdtsc_barrier();
+ ret = (cycle_t)vget_cycles();
+ rdtsc_barrier();
return ret >= __vsyscall_gtod_data.clock.cycle_last ?
ret : __vsyscall_gtod_data.clock.cycle_last;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index bf36328f6ef..027b5b49899 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count;
* of a critical section, to be able to prove TSC time-warps:
*/
static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+
static __cpuinitdata cycles_t last_tsc;
static __cpuinitdata cycles_t max_warp;
static __cpuinitdata int nr_warps;
@@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
return;
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
- printk(KERN_INFO
- "Skipping synchronization checks as TSC is reliable.\n");
+ pr_info("Skipping synchronization checks as TSC is reliable.\n");
return;
}
- printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
- smp_processor_id(), cpu);
+ pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
+ smp_processor_id(), cpu);
/*
* Reset it - in case this is a second bootup:
@@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu)
if (nr_warps) {
printk("\n");
- printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
- " turning off TSC clock.\n", max_warp);
+ pr_warning("Measured %Ld cycles TSC warp between CPUs, "
+ "turning off TSC clock.\n", max_warp);
mark_tsc_unstable("check_tsc_sync_source failed");
} else {
printk(" passed.\n");
@@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void)
while (atomic_read(&stop_count) != cpus)
cpu_relax();
}
-#undef NR_LOOPS
-
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index d7ac84e7fc1..9c4e6253905 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
info->regs.pt.ds = 0;
info->regs.pt.es = 0;
info->regs.pt.fs = 0;
-
-/* we are clearing gs later just before "jmp resume_userspace",
- * because it is not saved/restored.
- */
+#ifndef CONFIG_X86_32_LAZY_GS
+ info->regs.pt.gs = 0;
+#endif
/*
* The flags register is also special: we cannot trust that the user
@@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
}
/*
- * Save old state, set default return value (%ax) to 0
+ * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
*/
- info->regs32->ax = 0;
+ info->regs32->ax = VM86_SIGNAL;
tsk->thread.saved_sp0 = tsk->thread.sp0;
tsk->thread.saved_fs = info->regs32->fs;
tsk->thread.saved_gs = get_user_gs(info->regs32);
@@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
__asm__ __volatile__(
"movl %0,%%esp\n\t"
"movl %1,%%ebp\n\t"
+#ifdef CONFIG_X86_32_LAZY_GS
"mov %2, %%gs\n\t"
+#endif
"jmp resume_userspace"
: /* no outputs */
:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 95deb9f2211..b263423fbe2 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
}
#endif
-static void vmi_enter_lazy_cpu(void)
+static void vmi_start_context_switch(struct task_struct *prev)
{
- paravirt_enter_lazy_cpu();
+ paravirt_start_context_switch(prev);
vmi_ops.set_lazy_mode(2);
}
+static void vmi_end_context_switch(struct task_struct *next)
+{
+ vmi_ops.set_lazy_mode(0);
+ paravirt_end_context_switch(next);
+}
+
static void vmi_enter_lazy_mmu(void)
{
paravirt_enter_lazy_mmu();
vmi_ops.set_lazy_mode(1);
}
-static void vmi_leave_lazy(void)
+static void vmi_leave_lazy_mmu(void)
{
- paravirt_leave_lazy(paravirt_get_lazy_mode());
vmi_ops.set_lazy_mode(0);
+ paravirt_leave_lazy_mmu();
}
static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -711,14 +717,14 @@ static inline int __init activate_vmi(void)
para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
para_fill(pv_cpu_ops.io_delay, IODelay);
- para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
+ para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
set_lazy_mode, SetLazyMode);
- para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy,
+ para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
set_lazy_mode, SetLazyMode);
para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
set_lazy_mode, SetLazyMode);
- para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy,
+ para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
set_lazy_mode, SetLazyMode);
/* user and kernel flush are just handled with different flags to FlushTLB */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 849ee611f01..367e8788204 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -1,5 +1,433 @@
+/*
+ * ld script for the x86 kernel
+ *
+ * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * Modernisation, unification and other changes and fixes:
+ * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org>
+ *
+ *
+ * Don't define absolute symbols until and unless you know that symbol
+ * value is should remain constant even if kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If symbol value should
+ * change if kernel is relocated, make the symbol section relative and
+ * put it inside the section definition.
+ */
+
#ifdef CONFIG_X86_32
-# include "vmlinux_32.lds.S"
+#define LOAD_OFFSET __PAGE_OFFSET
#else
-# include "vmlinux_64.lds.S"
+#define LOAD_OFFSET __START_KERNEL_map
#endif
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/page_types.h>
+#include <asm/cache.h>
+#include <asm/boot.h>
+
+#undef i386 /* in case the preprocessor is a 32bit one */
+
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#ifdef CONFIG_X86_32
+OUTPUT_ARCH(i386)
+ENTRY(phys_startup_32)
+jiffies = jiffies_64;
+#else
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(phys_startup_64)
+jiffies_64 = jiffies;
+#endif
+
+PHDRS {
+ text PT_LOAD FLAGS(5); /* R_E */
+ data PT_LOAD FLAGS(7); /* RWE */
+#ifdef CONFIG_X86_64
+ user PT_LOAD FLAGS(7); /* RWE */
+ data.init PT_LOAD FLAGS(7); /* RWE */
+#ifdef CONFIG_SMP
+ percpu PT_LOAD FLAGS(7); /* RWE */
+#endif
+ data.init2 PT_LOAD FLAGS(7); /* RWE */
+#endif
+ note PT_NOTE FLAGS(0); /* ___ */
+}
+
+SECTIONS
+{
+#ifdef CONFIG_X86_32
+ . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+ phys_startup_32 = startup_32 - LOAD_OFFSET;
+#else
+ . = __START_KERNEL;
+ phys_startup_64 = startup_64 - LOAD_OFFSET;
+#endif
+
+ /* Text and read-only data */
+
+ /* bootstrapping code */
+ .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
+ _text = .;
+ *(.text.head)
+ } :text = 0x9090
+
+ /* The rest of the text */
+ .text : AT(ADDR(.text) - LOAD_OFFSET) {
+#ifdef CONFIG_X86_32
+ /* not really needed, already page aligned */
+ . = ALIGN(PAGE_SIZE);
+ *(.text.page_aligned)
+#endif
+ . = ALIGN(8);
+ _stext = .;
+ TEXT_TEXT
+ SCHED_TEXT
+ LOCK_TEXT
+ KPROBES_TEXT
+ IRQENTRY_TEXT
+ *(.fixup)
+ *(.gnu.warning)
+ /* End of text section */
+ _etext = .;
+ } :text = 0x9090
+
+ NOTES :text :note
+
+ /* Exception table */
+ . = ALIGN(16);
+ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+ __start___ex_table = .;
+ *(__ex_table)
+ __stop___ex_table = .;
+ } :text = 0x9090
+
+ RODATA
+
+ /* Data */
+ . = ALIGN(PAGE_SIZE);
+ .data : AT(ADDR(.data) - LOAD_OFFSET) {
+ /* Start of data section */
+ _sdata = .;
+ DATA_DATA
+ CONSTRUCTORS
+
+#ifdef CONFIG_X86_64
+ /* End of data section */
+ _edata = .;
+#endif
+ } :data
+
+#ifdef CONFIG_X86_32
+ /* 32 bit has nosave before _edata */
+ . = ALIGN(PAGE_SIZE);
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+ __nosave_begin = .;
+ *(.data.nosave)
+ . = ALIGN(PAGE_SIZE);
+ __nosave_end = .;
+ }
+#endif
+
+ . = ALIGN(PAGE_SIZE);
+ .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+ *(.data.page_aligned)
+ *(.data.idt)
+ }
+
+#ifdef CONFIG_X86_32
+ . = ALIGN(32);
+#else
+ . = ALIGN(PAGE_SIZE);
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+#endif
+ .data.cacheline_aligned :
+ AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+ *(.data.cacheline_aligned)
+ }
+
+ /* rarely changed data like cpu maps */
+#ifdef CONFIG_X86_32
+ . = ALIGN(32);
+#else
+ . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
+#endif
+ .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+ *(.data.read_mostly)
+
+#ifdef CONFIG_X86_32
+ /* End of data section */
+ _edata = .;
+#endif
+ }
+
+#ifdef CONFIG_X86_64
+
+#define VSYSCALL_ADDR (-10*1024*1024)
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
+ SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
+ SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+
+#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
+#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
+
+#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
+#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
+
+ . = VSYSCALL_ADDR;
+ .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
+ *(.vsyscall_0)
+ } :user
+
+ __vsyscall_0 = VSYSCALL_VIRT_ADDR;
+
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
+ *(.vsyscall_fn)
+ }
+
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
+ *(.vsyscall_gtod_data)
+ }
+
+ vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
+ .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
+ *(.vsyscall_clock)
+ }
+ vsyscall_clock = VVIRT(.vsyscall_clock);
+
+
+ .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
+ *(.vsyscall_1)
+ }
+ .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
+ *(.vsyscall_2)
+ }
+
+ .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
+ *(.vgetcpu_mode)
+ }
+ vgetcpu_mode = VVIRT(.vgetcpu_mode);
+
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ .jiffies : AT(VLOAD(.jiffies)) {
+ *(.jiffies)
+ }
+ jiffies = VVIRT(.jiffies);
+
+ .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
+ *(.vsyscall_3)
+ }
+
+ . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
+
+#undef VSYSCALL_ADDR
+#undef VSYSCALL_PHYS_ADDR
+#undef VSYSCALL_VIRT_ADDR
+#undef VLOAD_OFFSET
+#undef VLOAD
+#undef VVIRT_OFFSET
+#undef VVIRT
+
+#endif /* CONFIG_X86_64 */
+
+ /* init_task */
+ . = ALIGN(THREAD_SIZE);
+ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+ *(.data.init_task)
+ }
+#ifdef CONFIG_X86_64
+ :data.init
+#endif
+
+ /*
+ * smp_locks might be freed after init
+ * start/end must be page aligned
+ */
+ . = ALIGN(PAGE_SIZE);
+ .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+ __smp_locks = .;
+ *(.smp_locks)
+ __smp_locks_end = .;
+ . = ALIGN(PAGE_SIZE);
+ }
+
+ /* Init code and data - will be freed after init */
+ . = ALIGN(PAGE_SIZE);
+ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+ __init_begin = .; /* paired with __init_end */
+ _sinittext = .;
+ INIT_TEXT
+ _einittext = .;
+ }
+
+ .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
+ INIT_DATA
+ }
+
+ . = ALIGN(16);
+ .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+ __setup_start = .;
+ *(.init.setup)
+ __setup_end = .;
+ }
+ .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+ __initcall_start = .;
+ INITCALLS
+ __initcall_end = .;
+ }
+
+ .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+ __con_initcall_start = .;
+ *(.con_initcall.init)
+ __con_initcall_end = .;
+ }
+
+ .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+ __x86_cpu_dev_start = .;
+ *(.x86_cpu_dev.init)
+ __x86_cpu_dev_end = .;
+ }
+
+ SECURITY_INIT
+
+ . = ALIGN(8);
+ .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+ __parainstructions = .;
+ *(.parainstructions)
+ __parainstructions_end = .;
+ }
+
+ . = ALIGN(8);
+ .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+ __alt_instructions = .;
+ *(.altinstructions)
+ __alt_instructions_end = .;
+ }
+
+ .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+ *(.altinstr_replacement)
+ }
+
+ /*
+ * .exit.text is discard at runtime, not link time, to deal with
+ * references from .altinstructions and .eh_frame
+ */
+ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+ EXIT_TEXT
+ }
+
+ .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
+ EXIT_DATA
+ }
+
+#ifdef CONFIG_BLK_DEV_INITRD
+ . = ALIGN(PAGE_SIZE);
+ .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+ __initramfs_start = .;
+ *(.init.ramfs)
+ __initramfs_end = .;
+ }
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
+ /*
+ * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
+ * output PHDR, so the next output section - __data_nosave - should
+ * start another section data.init2. Also, pda should be at the head of
+ * percpu area. Preallocate it and define the percpu offset symbol
+ * so that it can be accessed as a percpu variable.
+ */
+ . = ALIGN(PAGE_SIZE);
+ PERCPU_VADDR(0, :percpu)
+#else
+ PERCPU(PAGE_SIZE)
+#endif
+
+ . = ALIGN(PAGE_SIZE);
+
+ /* freed after init ends here */
+ .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) {
+ __init_end = .;
+ }
+
+#ifdef CONFIG_X86_64
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+ . = ALIGN(PAGE_SIZE);
+ __nosave_begin = .;
+ *(.data.nosave)
+ . = ALIGN(PAGE_SIZE);
+ __nosave_end = .;
+ } :data.init2
+ /* use another section data.init2, see PERCPU_VADDR() above */
+#endif
+
+ /* BSS */
+ . = ALIGN(PAGE_SIZE);
+ .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+ __bss_start = .;
+ *(.bss.page_aligned)
+ *(.bss)
+ . = ALIGN(4);
+ __bss_stop = .;
+ }
+
+ . = ALIGN(PAGE_SIZE);
+ .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+ __brk_base = .;
+ . += 64 * 1024; /* 64k alignment slop space */
+ *(.brk_reservation) /* areas brk users have reserved */
+ __brk_limit = .;
+ }
+
+ .end : AT(ADDR(.end) - LOAD_OFFSET) {
+ _end = .;
+ }
+
+ /* Sections to be discarded */
+ /DISCARD/ : {
+ *(.exitcall.exit)
+ *(.eh_frame)
+ *(.discard)
+ }
+
+ STABS_DEBUG
+ DWARF_DEBUG
+}
+
+
+#ifdef CONFIG_X86_32
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE")
+#else
+/*
+ * Per-cpu symbols which need to be offset from __per_cpu_load
+ * for the boot processor.
+ */
+#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(irq_stack_union);
+
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE")
+
+#ifdef CONFIG_SMP
+ASSERT((per_cpu__irq_stack_union == 0),
+ "irq_stack_union is not at start of per-cpu area");
+#endif
+
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+ "kexec control code size is too big")
+#endif
+
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
deleted file mode 100644
index 62ad500d55f..00000000000
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ /dev/null
@@ -1,229 +0,0 @@
-/* ld script to make i386 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- *
- * Don't define absolute symbols until and unless you know that symbol
- * value is should remain constant even if kernel image is relocated
- * at run time. Absolute symbols are not relocated. If symbol value should
- * change if kernel is relocated, make the symbol section relative and
- * put it inside the section definition.
- */
-
-#define LOAD_OFFSET __PAGE_OFFSET
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/thread_info.h>
-#include <asm/page_types.h>
-#include <asm/cache.h>
-#include <asm/boot.h>
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(phys_startup_32)
-jiffies = jiffies_64;
-
-PHDRS {
- text PT_LOAD FLAGS(5); /* R_E */
- data PT_LOAD FLAGS(7); /* RWE */
- note PT_NOTE FLAGS(0); /* ___ */
-}
-SECTIONS
-{
- . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
- phys_startup_32 = startup_32 - LOAD_OFFSET;
-
- .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
- _text = .; /* Text and read-only data */
- *(.text.head)
- } :text = 0x9090
-
- /* read-only */
- .text : AT(ADDR(.text) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */
- *(.text.page_aligned)
- TEXT_TEXT
- SCHED_TEXT
- LOCK_TEXT
- KPROBES_TEXT
- IRQENTRY_TEXT
- *(.fixup)
- *(.gnu.warning)
- _etext = .; /* End of text section */
- } :text = 0x9090
-
- NOTES :text :note
-
- . = ALIGN(16); /* Exception table */
- __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
- __start___ex_table = .;
- *(__ex_table)
- __stop___ex_table = .;
- } :text = 0x9090
-
- RODATA
-
- /* writeable */
- . = ALIGN(PAGE_SIZE);
- .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
- DATA_DATA
- CONSTRUCTORS
- } :data
-
- . = ALIGN(PAGE_SIZE);
- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
- __nosave_begin = .;
- *(.data.nosave)
- . = ALIGN(PAGE_SIZE);
- __nosave_end = .;
- }
-
- . = ALIGN(PAGE_SIZE);
- .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
- *(.data.page_aligned)
- *(.data.idt)
- }
-
- . = ALIGN(32);
- .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
- *(.data.cacheline_aligned)
- }
-
- /* rarely changed data like cpu maps */
- . = ALIGN(32);
- .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
- *(.data.read_mostly)
- _edata = .; /* End of data section */
- }
-
- . = ALIGN(THREAD_SIZE); /* init_task */
- .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
- *(.data.init_task)
- }
-
- /* might get freed after init */
- . = ALIGN(PAGE_SIZE);
- .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
- __smp_locks = .;
- *(.smp_locks)
- __smp_locks_end = .;
- }
- /* will be freed after init
- * Following ALIGN() is required to make sure no other data falls on the
- * same page where __smp_alt_end is pointing as that page might be freed
- * after boot. Always make sure that ALIGN() directive is present after
- * the section which contains __smp_alt_end.
- */
- . = ALIGN(PAGE_SIZE);
-
- /* will be freed after init */
- . = ALIGN(PAGE_SIZE); /* Init code and data */
- .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
- __init_begin = .;
- _sinittext = .;
- INIT_TEXT
- _einittext = .;
- }
- .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
- INIT_DATA
- }
- . = ALIGN(16);
- .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
- __setup_start = .;
- *(.init.setup)
- __setup_end = .;
- }
- .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
- __initcall_start = .;
- INITCALLS
- __initcall_end = .;
- }
- .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
- __con_initcall_start = .;
- *(.con_initcall.init)
- __con_initcall_end = .;
- }
- .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
- __x86_cpu_dev_start = .;
- *(.x86_cpu_dev.init)
- __x86_cpu_dev_end = .;
- }
- SECURITY_INIT
- . = ALIGN(4);
- .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
- __alt_instructions = .;
- *(.altinstructions)
- __alt_instructions_end = .;
- }
- .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
- *(.altinstr_replacement)
- }
- . = ALIGN(4);
- .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
- __parainstructions = .;
- *(.parainstructions)
- __parainstructions_end = .;
- }
- /* .exit.text is discard at runtime, not link time, to deal with references
- from .altinstructions and .eh_frame */
- .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
- EXIT_TEXT
- }
- .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
- EXIT_DATA
- }
-#if defined(CONFIG_BLK_DEV_INITRD)
- . = ALIGN(PAGE_SIZE);
- .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
- __initramfs_start = .;
- *(.init.ramfs)
- __initramfs_end = .;
- }
-#endif
- PERCPU(PAGE_SIZE)
- . = ALIGN(PAGE_SIZE);
- /* freed after init ends here */
-
- .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
- __init_end = .;
- __bss_start = .; /* BSS */
- *(.bss.page_aligned)
- *(.bss)
- . = ALIGN(4);
- __bss_stop = .;
- }
-
- .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- __brk_base = . ;
- . += 64 * 1024 ; /* 64k alignment slop space */
- *(.brk_reservation) /* areas brk users have reserved */
- __brk_limit = . ;
- }
-
- .end : AT(ADDR(.end) - LOAD_OFFSET) {
- _end = . ;
- }
-
- /* Sections to be discarded */
- /DISCARD/ : {
- *(.exitcall.exit)
- *(.discard)
- }
-
- STABS_DEBUG
-
- DWARF_DEBUG
-}
-
-/*
- * Build-time check on the image size:
- */
-ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
- "kernel image bigger than KERNEL_IMAGE_SIZE")
-
-#ifdef CONFIG_KEXEC
-/* Link time checks */
-#include <asm/kexec.h>
-
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
- "kexec control code size is too big")
-#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
deleted file mode 100644
index c8742507b03..00000000000
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ /dev/null
@@ -1,298 +0,0 @@
-/* ld script to make x86-64 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- */
-
-#define LOAD_OFFSET __START_KERNEL_map
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/asm-offsets.h>
-#include <asm/page_types.h>
-
-#undef i386 /* in case the preprocessor is a 32bit one */
-
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
-OUTPUT_ARCH(i386:x86-64)
-ENTRY(phys_startup_64)
-jiffies_64 = jiffies;
-PHDRS {
- text PT_LOAD FLAGS(5); /* R_E */
- data PT_LOAD FLAGS(7); /* RWE */
- user PT_LOAD FLAGS(7); /* RWE */
- data.init PT_LOAD FLAGS(7); /* RWE */
-#ifdef CONFIG_SMP
- percpu PT_LOAD FLAGS(7); /* RWE */
-#endif
- data.init2 PT_LOAD FLAGS(7); /* RWE */
- note PT_NOTE FLAGS(0); /* ___ */
-}
-SECTIONS
-{
- . = __START_KERNEL;
- phys_startup_64 = startup_64 - LOAD_OFFSET;
- .text : AT(ADDR(.text) - LOAD_OFFSET) {
- _text = .; /* Text and read-only data */
- /* First the code that has to be first for bootstrapping */
- *(.text.head)
- _stext = .;
- /* Then the rest */
- TEXT_TEXT
- SCHED_TEXT
- LOCK_TEXT
- KPROBES_TEXT
- IRQENTRY_TEXT
- *(.fixup)
- *(.gnu.warning)
- _etext = .; /* End of text section */
- } :text = 0x9090
-
- NOTES :text :note
-
- . = ALIGN(16); /* Exception table */
- __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
- __start___ex_table = .;
- *(__ex_table)
- __stop___ex_table = .;
- } :text = 0x9090
-
- RODATA
-
- . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
- /* Data */
- .data : AT(ADDR(.data) - LOAD_OFFSET) {
- DATA_DATA
- CONSTRUCTORS
- _edata = .; /* End of data section */
- } :data
-
-
- .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- *(.data.cacheline_aligned)
- }
- . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
- .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
- *(.data.read_mostly)
- }
-
-#define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-
-#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
-#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
-
-#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
-#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
-
- . = VSYSCALL_ADDR;
- .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
- __vsyscall_0 = VSYSCALL_VIRT_ADDR;
-
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
- { *(.vsyscall_gtod_data) }
- vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
- .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
- { *(.vsyscall_clock) }
- vsyscall_clock = VVIRT(.vsyscall_clock);
-
-
- .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
- { *(.vsyscall_1) }
- .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
- { *(.vsyscall_2) }
-
- .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
- vgetcpu_mode = VVIRT(.vgetcpu_mode);
-
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
- jiffies = VVIRT(.jiffies);
-
- .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
- { *(.vsyscall_3) }
-
- . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
-
-#undef VSYSCALL_ADDR
-#undef VSYSCALL_PHYS_ADDR
-#undef VSYSCALL_VIRT_ADDR
-#undef VLOAD_OFFSET
-#undef VLOAD
-#undef VVIRT_OFFSET
-#undef VVIRT
-
- .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
- . = ALIGN(THREAD_SIZE); /* init_task */
- *(.data.init_task)
- }:data.init
-
- .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- *(.data.page_aligned)
- }
-
- .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
- /* might get freed after init */
- . = ALIGN(PAGE_SIZE);
- __smp_alt_begin = .;
- __smp_locks = .;
- *(.smp_locks)
- __smp_locks_end = .;
- . = ALIGN(PAGE_SIZE);
- __smp_alt_end = .;
- }
-
- . = ALIGN(PAGE_SIZE); /* Init code and data */
- __init_begin = .; /* paired with __init_end */
- .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
- _sinittext = .;
- INIT_TEXT
- _einittext = .;
- }
- .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
- __initdata_begin = .;
- INIT_DATA
- __initdata_end = .;
- }
-
- .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
- . = ALIGN(16);
- __setup_start = .;
- *(.init.setup)
- __setup_end = .;
- }
- .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
- __initcall_start = .;
- INITCALLS
- __initcall_end = .;
- }
- .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
- __con_initcall_start = .;
- *(.con_initcall.init)
- __con_initcall_end = .;
- }
- .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
- __x86_cpu_dev_start = .;
- *(.x86_cpu_dev.init)
- __x86_cpu_dev_end = .;
- }
- SECURITY_INIT
-
- . = ALIGN(8);
- .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
- __parainstructions = .;
- *(.parainstructions)
- __parainstructions_end = .;
- }
-
- .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
- . = ALIGN(8);
- __alt_instructions = .;
- *(.altinstructions)
- __alt_instructions_end = .;
- }
- .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
- *(.altinstr_replacement)
- }
- /* .exit.text is discard at runtime, not link time, to deal with references
- from .altinstructions and .eh_frame */
- .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
- EXIT_TEXT
- }
- .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
- EXIT_DATA
- }
-
-#ifdef CONFIG_BLK_DEV_INITRD
- . = ALIGN(PAGE_SIZE);
- .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
- __initramfs_start = .;
- *(.init.ramfs)
- __initramfs_end = .;
- }
-#endif
-
-#ifdef CONFIG_SMP
- /*
- * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
- * output PHDR, so the next output section - __data_nosave - should
- * start another section data.init2. Also, pda should be at the head of
- * percpu area. Preallocate it and define the percpu offset symbol
- * so that it can be accessed as a percpu variable.
- */
- . = ALIGN(PAGE_SIZE);
- PERCPU_VADDR(0, :percpu)
-#else
- PERCPU(PAGE_SIZE)
-#endif
-
- . = ALIGN(PAGE_SIZE);
- __init_end = .;
-
- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- __nosave_begin = .;
- *(.data.nosave)
- . = ALIGN(PAGE_SIZE);
- __nosave_end = .;
- } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
-
- .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- __bss_start = .; /* BSS */
- *(.bss.page_aligned)
- *(.bss)
- __bss_stop = .;
- }
-
- .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- __brk_base = . ;
- . += 64 * 1024 ; /* 64k alignment slop space */
- *(.brk_reservation) /* areas brk users have reserved */
- __brk_limit = . ;
- }
-
- _end = . ;
-
- /* Sections to be discarded */
- /DISCARD/ : {
- *(.exitcall.exit)
- *(.eh_frame)
- *(.discard)
- }
-
- STABS_DEBUG
-
- DWARF_DEBUG
-}
-
- /*
- * Per-cpu symbols which need to be offset from __per_cpu_load
- * for the boot processor.
- */
-#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
-INIT_PER_CPU(gdt_page);
-INIT_PER_CPU(irq_stack_union);
-
-/*
- * Build-time check on the image size:
- */
-ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
- "kernel image bigger than KERNEL_IMAGE_SIZE")
-
-#ifdef CONFIG_SMP
-ASSERT((per_cpu__irq_stack_union == 0),
- "irq_stack_union is not at start of per-cpu area");
-#endif
-
-#ifdef CONFIG_KEXEC
-#include <asm/kexec.h>
-
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
- "kexec control code size is too big")
-#endif
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 44153afc906..25ee06a80aa 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
return;
}
- /*
- * Surround the RDTSC by barriers, to make sure it's not
- * speculated to outside the seqlock critical section and
- * does not cause time warps:
- */
- rdtsc_barrier();
now = vread();
- rdtsc_barrier();
-
base = __vsyscall_gtod_data.clock.cycle_last;
mask = __vsyscall_gtod_data.clock.mask;
mult = __vsyscall_gtod_data.clock.mult;