summaryrefslogtreecommitdiffstats
path: root/arch/x86_64/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r--arch/x86_64/kernel/Makefile9
-rw-r--r--arch/x86_64/kernel/aperture.c25
-rw-r--r--arch/x86_64/kernel/apic.c229
-rw-r--r--arch/x86_64/kernel/crash.c26
-rw-r--r--arch/x86_64/kernel/e820.c291
-rw-r--r--arch/x86_64/kernel/early-quirks.c122
-rw-r--r--arch/x86_64/kernel/early_printk.c20
-rw-r--r--arch/x86_64/kernel/entry.S63
-rw-r--r--arch/x86_64/kernel/genapic_cluster.c1
-rw-r--r--arch/x86_64/kernel/genapic_flat.c5
-rw-r--r--arch/x86_64/kernel/head.S15
-rw-r--r--arch/x86_64/kernel/head64.c44
-rw-r--r--arch/x86_64/kernel/i8259.c15
-rw-r--r--arch/x86_64/kernel/io_apic.c482
-rw-r--r--arch/x86_64/kernel/ioport.c1
-rw-r--r--arch/x86_64/kernel/irq.c12
-rw-r--r--arch/x86_64/kernel/machine_kexec.c99
-rw-r--r--arch/x86_64/kernel/mce.c29
-rw-r--r--arch/x86_64/kernel/mce_intel.c30
-rw-r--r--arch/x86_64/kernel/mpparse.c275
-rw-r--r--arch/x86_64/kernel/nmi.c844
-rw-r--r--arch/x86_64/kernel/pci-calgary.c142
-rw-r--r--arch/x86_64/kernel/pci-dma.c100
-rw-r--r--arch/x86_64/kernel/pci-gart.c3
-rw-r--r--arch/x86_64/kernel/pci-nommu.c1
-rw-r--r--arch/x86_64/kernel/pci-swiotlb.c3
-rw-r--r--arch/x86_64/kernel/process.c110
-rw-r--r--arch/x86_64/kernel/ptrace.c29
-rw-r--r--arch/x86_64/kernel/relocate_kernel.S171
-rw-r--r--arch/x86_64/kernel/setup.c260
-rw-r--r--arch/x86_64/kernel/setup64.c45
-rw-r--r--arch/x86_64/kernel/signal.c87
-rw-r--r--arch/x86_64/kernel/smp.c23
-rw-r--r--arch/x86_64/kernel/smpboot.c17
-rw-r--r--arch/x86_64/kernel/stacktrace.c220
-rw-r--r--arch/x86_64/kernel/suspend_asm.S2
-rw-r--r--arch/x86_64/kernel/tce.c12
-rw-r--r--arch/x86_64/kernel/time.c151
-rw-r--r--arch/x86_64/kernel/trampoline.S2
-rw-r--r--arch/x86_64/kernel/traps.c204
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S40
-rw-r--r--arch/x86_64/kernel/vsmp.c3
-rw-r--r--arch/x86_64/kernel/vsyscall.c101
-rw-r--r--arch/x86_64/kernel/x8664_ksyms.c1
44 files changed, 2182 insertions, 2182 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index b5aaeafc1cd..3c7cbff04d3 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -11,7 +11,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \
pci-dma.o pci-nommu.o alternative.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
-obj-$(CONFIG_X86_MCE) += mce.o
+obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/
@@ -20,8 +20,8 @@ obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_MICROCODE) += microcode.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
-obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
+obj-y += apic.o nmi.o
+obj-y += io_apic.o mpparse.o \
genapic.o genapic_cluster.o genapic_flat.o
obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
@@ -39,12 +39,14 @@ obj-$(CONFIG_K8_NB) += k8.o
obj-$(CONFIG_AUDIT) += audit.o
obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_PCI) += early-quirks.o
obj-y += topology.o
obj-y += intel_cacheinfo.o
CFLAGS_vsyscall.o := $(PROFILING) -g0
+therm_throt-y += ../../i386/kernel/cpu/mcheck/therm_throt.o
bootflag-y += ../../i386/kernel/bootflag.o
cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o
topology-y += ../../i386/kernel/topology.o
@@ -54,4 +56,3 @@ quirks-y += ../../i386/kernel/quirks.o
i8237-y += ../../i386/kernel/i8237.o
msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o
alternative-y += ../../i386/kernel/alternative.o
-
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index 58af8e73738..b487396c4c5 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -17,6 +17,7 @@
#include <linux/pci_ids.h>
#include <linux/pci.h>
#include <linux/bitops.h>
+#include <linux/ioport.h>
#include <asm/e820.h>
#include <asm/io.h>
#include <asm/proto.h>
@@ -33,6 +34,18 @@ int fallback_aper_force __initdata = 0;
int fix_aperture __initdata = 1;
+static struct resource gart_resource = {
+ .name = "GART",
+ .flags = IORESOURCE_MEM,
+};
+
+static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
+{
+ gart_resource.start = aper_base;
+ gart_resource.end = aper_base + aper_size - 1;
+ insert_resource(&iomem_resource, &gart_resource);
+}
+
/* This code runs before the PCI subsystem is initialized, so just
access the northbridge directly. */
@@ -48,7 +61,7 @@ static u32 __init allocate_aperture(void)
/*
* Aperture has to be naturally aligned. This means an 2GB aperture won't
- * have much chances to find a place in the lower 4GB of memory.
+ * have much chance of finding a place in the lower 4GB of memory.
* Unfortunately we cannot move it up because that would make the
* IOMMU useless.
*/
@@ -62,6 +75,7 @@ static u32 __init allocate_aperture(void)
}
printk("Mapping aperture over %d KB of RAM @ %lx\n",
aper_size >> 10, __pa(p));
+ insert_aperture_resource((u32)__pa(p), aper_size);
return (u32)__pa(p);
}
@@ -198,7 +212,7 @@ void __init iommu_hole_init(void)
u64 aper_base, last_aper_base = 0;
int valid_agp = 0;
- if (iommu_aperture_disabled || !fix_aperture)
+ if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed())
return;
printk("Checking aperture...\n");
@@ -233,8 +247,13 @@ void __init iommu_hole_init(void)
last_aper_base = aper_base;
}
- if (!fix && !fallback_aper_force)
+ if (!fix && !fallback_aper_force) {
+ if (last_aper_base) {
+ unsigned long n = (32 * 1024 * 1024) << last_aper_order;
+ insert_aperture_resource((u32)last_aper_base, n);
+ }
return;
+ }
if (!fallback_aper_force)
aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 2b8cef037a6..135ff25e6b4 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -25,6 +25,7 @@
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
#include <linux/module.h>
+#include <linux/ioport.h>
#include <asm/atomic.h>
#include <asm/smp.h>
@@ -36,13 +37,20 @@
#include <asm/idle.h>
#include <asm/proto.h>
#include <asm/timex.h>
+#include <asm/apic.h>
+int apic_mapped;
int apic_verbosity;
int apic_runs_main_timer;
int apic_calibrate_pmtmr __initdata;
int disable_apic_timer __initdata;
+static struct resource lapic_resource = {
+ .name = "Local APIC",
+ .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
/*
* cpu_mask that denotes the CPUs that needs timer interrupt coming in as
* IPIs in place of local APIC timers
@@ -136,72 +144,40 @@ void clear_local_APIC(void)
apic_read(APIC_ESR);
}
-void __init connect_bsp_APIC(void)
-{
- if (pic_mode) {
- /*
- * Do not trust the local APIC being empty at bootup.
- */
- clear_local_APIC();
- /*
- * PIC mode, enable APIC mode in the IMCR, i.e.
- * connect BSP's local APIC to INT and NMI lines.
- */
- apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n");
- outb(0x70, 0x22);
- outb(0x01, 0x23);
- }
-}
-
void disconnect_bsp_APIC(int virt_wire_setup)
{
- if (pic_mode) {
- /*
- * Put the board back into PIC mode (has an effect
- * only on certain older boards). Note that APIC
- * interrupts, including IPIs, won't work beyond
- * this point! The only exception are INIT IPIs.
- */
- apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n");
- outb(0x70, 0x22);
- outb(0x00, 0x23);
- }
- else {
- /* Go back to Virtual Wire compatibility mode */
- unsigned long value;
-
- /* For the spurious interrupt use vector F, and enable it */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- value |= APIC_SPIV_APIC_ENABLED;
- value |= 0xf;
- apic_write(APIC_SPIV, value);
-
- if (!virt_wire_setup) {
- /* For LVT0 make it edge triggered, active high, external and enabled */
- value = apic_read(APIC_LVT0);
- value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
- APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
- value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
- apic_write(APIC_LVT0, value);
- }
- else {
- /* Disable LVT0 */
- apic_write(APIC_LVT0, APIC_LVT_MASKED);
- }
+ /* Go back to Virtual Wire compatibility mode */
+ unsigned long value;
+
+ /* For the spurious interrupt use vector F, and enable it */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= 0xf;
+ apic_write(APIC_SPIV, value);
- /* For LVT1 make it edge triggered, active high, nmi and enabled */
- value = apic_read(APIC_LVT1);
- value &= ~(
- APIC_MODE_MASK | APIC_SEND_PENDING |
+ if (!virt_wire_setup) {
+ /* For LVT0 make it edge triggered, active high, external and enabled */
+ value = apic_read(APIC_LVT0);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
- apic_write(APIC_LVT1, value);
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+ apic_write(APIC_LVT0, value);
+ } else {
+ /* Disable LVT0 */
+ apic_write(APIC_LVT0, APIC_LVT_MASKED);
}
+
+ /* For LVT1 make it edge triggered, active high, nmi and enabled */
+ value = apic_read(APIC_LVT1);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+ apic_write(APIC_LVT1, value);
}
void disable_local_APIC(void)
@@ -297,8 +273,6 @@ void __init sync_Arb_IDs(void)
| APIC_DM_INIT);
}
-extern void __error_in_apic_c (void);
-
/*
* An initial setup of the virtual wire mode.
*/
@@ -345,8 +319,7 @@ void __cpuinit setup_local_APIC (void)
value = apic_read(APIC_LVR);
- if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
- __error_in_apic_c();
+ BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
/*
* Double-check whether this APIC is really registered.
@@ -399,32 +372,8 @@ void __cpuinit setup_local_APIC (void)
*/
value |= APIC_SPIV_APIC_ENABLED;
- /*
- * Some unknown Intel IO/APIC (or APIC) errata is biting us with
- * certain networking cards. If high frequency interrupts are
- * happening on a particular IOAPIC pin, plus the IOAPIC routing
- * entry is masked/unmasked at a high rate as well then sooner or
- * later IOAPIC line gets 'stuck', no more interrupts are received
- * from the device. If focus CPU is disabled then the hang goes
- * away, oh well :-(
- *
- * [ This bug can be reproduced easily with a level-triggered
- * PCI Ne2000 networking cards and PII/PIII processors, dual
- * BX chipset. ]
- */
- /*
- * Actually disabling the focus CPU check just makes the hang less
- * frequent as it makes the interrupt distributon model be more
- * like LRU than MRU (the short-term load is more even across CPUs).
- * See also the comment in end_level_ioapic_irq(). --macro
- */
-#if 1
- /* Enable focus processor (bit==0) */
- value &= ~APIC_SPIV_FOCUS_DISABLED;
-#else
- /* Disable focus processor (bit==1) */
- value |= APIC_SPIV_FOCUS_DISABLED;
-#endif
+ /* We always use processor focus */
+
/*
* Set spurious IRQ vector
*/
@@ -442,7 +391,7 @@ void __cpuinit setup_local_APIC (void)
* TODO: set up through-local-APIC from through-I/O-APIC? --macro
*/
value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
- if (!smp_processor_id() && (pic_mode || !value)) {
+ if (!smp_processor_id() && !value) {
value = APIC_DM_EXTINT;
apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id());
} else {
@@ -479,8 +428,7 @@ void __cpuinit setup_local_APIC (void)
}
nmi_watchdog_default();
- if (nmi_watchdog == NMI_LOCAL_APIC)
- setup_apic_nmi_watchdog();
+ setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
}
@@ -527,8 +475,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
- local_save_flags(flags);
- local_irq_disable();
+ local_irq_save(flags);
disable_local_APIC();
local_irq_restore(flags);
return 0;
@@ -606,18 +553,24 @@ static void apic_pm_activate(void) { }
static int __init apic_set_verbosity(char *str)
{
+ if (str == NULL) {
+ skip_ioapic_setup = 0;
+ ioapic_force = 1;
+ return 0;
+ }
if (strcmp("debug", str) == 0)
apic_verbosity = APIC_DEBUG;
else if (strcmp("verbose", str) == 0)
apic_verbosity = APIC_VERBOSE;
- else
+ else {
printk(KERN_WARNING "APIC Verbosity level %s not recognised"
- " use apic=verbose or apic=debug", str);
+ " use apic=verbose or apic=debug\n", str);
+ return -EINVAL;
+ }
- return 1;
+ return 0;
}
-
-__setup("apic=", apic_set_verbosity);
+early_param("apic", apic_set_verbosity);
/*
* Detect and enable local APICs on non-SMP boards.
@@ -638,6 +591,40 @@ static int __init detect_init_APIC (void)
return 0;
}
+#ifdef CONFIG_X86_IO_APIC
+static struct resource * __init ioapic_setup_resources(void)
+{
+#define IOAPIC_RESOURCE_NAME_SIZE 11
+ unsigned long n;
+ struct resource *res;
+ char *mem;
+ int i;
+
+ if (nr_ioapics <= 0)
+ return NULL;
+
+ n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
+ n *= nr_ioapics;
+
+ res = alloc_bootmem(n);
+
+ if (!res)
+ return NULL;
+
+ memset(res, 0, n);
+ mem = (void *)&res[nr_ioapics];
+
+ for (i = 0; i < nr_ioapics; i++) {
+ res[i].name = mem;
+ res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
+ mem += IOAPIC_RESOURCE_NAME_SIZE;
+ }
+
+ return res;
+}
+#endif
+
void __init init_apic_mappings(void)
{
unsigned long apic_phys;
@@ -654,19 +641,26 @@ void __init init_apic_mappings(void)
apic_phys = mp_lapic_addr;
set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+ apic_mapped = 1;
apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
+ /* Put local APIC into the resource map. */
+ lapic_resource.start = apic_phys;
+ lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
+ insert_resource(&iomem_resource, &lapic_resource);
+
/*
* Fetch the APIC ID of the BSP in case we have a
* default configuration (or the MP table is broken).
*/
boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
-#ifdef CONFIG_X86_IO_APIC
{
unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
int i;
+ struct resource *ioapic_res;
+ ioapic_res = ioapic_setup_resources();
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
ioapic_phys = mp_ioapics[i].mpc_apicaddr;
@@ -678,9 +672,15 @@ void __init init_apic_mappings(void)
apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
__fix_to_virt(idx), ioapic_phys);
idx++;
+
+ if (ioapic_res) {
+ ioapic_res->start = ioapic_phys;
+ ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
+ insert_resource(&iomem_resource, ioapic_res);
+ ioapic_res++;
+ }
}
}
-#endif
}
/*
@@ -951,7 +951,7 @@ void smp_local_timer_interrupt(struct pt_regs *regs)
* We take the 'long' return path, and there every subsystem
* grabs the appropriate locks (kernel lock/ irq lock).
*
- * we might want to decouple profiling from the 'long path',
+ * We might want to decouple profiling from the 'long path',
* and do the profiling totally in assembly.
*
* Currently this isn't too much of an issue (performance wise),
@@ -1123,19 +1123,15 @@ int __init APIC_init_uniprocessor (void)
verify_local_APIC();
- connect_bsp_APIC();
-
phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
setup_local_APIC();
-#ifdef CONFIG_X86_IO_APIC
if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
+ setup_IO_APIC();
else
nr_ioapics = 0;
-#endif
setup_boot_APIC_clock();
check_nmi_watchdog();
return 0;
@@ -1144,14 +1140,17 @@ int __init APIC_init_uniprocessor (void)
static __init int setup_disableapic(char *str)
{
disable_apic = 1;
- return 1;
-}
+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+ return 0;
+}
+early_param("disableapic", setup_disableapic);
+/* same as disableapic, for compatibility */
static __init int setup_nolapic(char *str)
{
- disable_apic = 1;
- return 1;
+ return setup_disableapic(str);
}
+early_param("nolapic", setup_nolapic);
static __init int setup_noapictimer(char *str)
{
@@ -1184,11 +1183,5 @@ static __init int setup_apicpmtimer(char *s)
}
__setup("apicpmtimer", setup_apicpmtimer);
-/* dummy parsing: see setup.c */
-
-__setup("disableapic", setup_disableapic);
-__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */
-
__setup("noapictimer", setup_noapictimer);
-/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
index d8d5750d610..3525f884af8 100644
--- a/arch/x86_64/kernel/crash.c
+++ b/arch/x86_64/kernel/crash.c
@@ -23,6 +23,7 @@
#include <asm/nmi.h>
#include <asm/hw_irq.h>
#include <asm/mach_apic.h>
+#include <asm/kdebug.h>
/* This keeps a track of which one is crashing cpu. */
static int crashing_cpu;
@@ -68,7 +69,7 @@ static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
* for the data I pass, and I need tags
* on the data to indicate what information I have
* squirrelled away. ELF notes happen to provide
- * all of that that no need to invent something new.
+ * all of that, no need to invent something new.
*/
buf = (u32*)per_cpu_ptr(crash_notes, cpu);
@@ -95,15 +96,25 @@ static void crash_save_self(struct pt_regs *regs)
#ifdef CONFIG_SMP
static atomic_t waiting_for_crash_ipi;
-static int crash_nmi_callback(struct pt_regs *regs, int cpu)
+static int crash_nmi_callback(struct notifier_block *self,
+ unsigned long val, void *data)
{
+ struct pt_regs *regs;
+ int cpu;
+
+ if (val != DIE_NMI_IPI)
+ return NOTIFY_OK;
+
+ regs = ((struct die_args *)data)->regs;
+ cpu = raw_smp_processor_id();
+
/*
* Don't do anything if this handler is invoked on crashing cpu.
* Otherwise, system will completely hang. Crashing cpu can get
* an NMI if system was initially booted with nmi_watchdog parameter.
*/
if (cpu == crashing_cpu)
- return 1;
+ return NOTIFY_STOP;
local_irq_disable();
crash_save_this_cpu(regs, cpu);
@@ -127,12 +138,17 @@ static void smp_send_nmi_allbutself(void)
* cpu hotplug shouldn't matter.
*/
+static struct notifier_block crash_nmi_nb = {
+ .notifier_call = crash_nmi_callback,
+};
+
static void nmi_shootdown_cpus(void)
{
unsigned long msecs;
atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
- set_nmi_callback(crash_nmi_callback);
+ if (register_die_notifier(&crash_nmi_nb))
+ return; /* return what? */
/*
* Ensure the new callback function is set before sending
@@ -178,9 +194,7 @@ void machine_crash_shutdown(struct pt_regs *regs)
if(cpu_has_apic)
disable_local_APIC();
-#if defined(CONFIG_X86_IO_APIC)
disable_IO_APIC();
-#endif
crash_save_self(regs);
}
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index d6d7f731f6f..b3f0908668e 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -16,6 +16,7 @@
#include <linux/string.h>
#include <linux/kexec.h>
#include <linux/module.h>
+#include <linux/mm.h>
#include <asm/pgtable.h>
#include <asm/page.h>
@@ -24,6 +25,8 @@
#include <asm/bootsetup.h>
#include <asm/sections.h>
+struct e820map e820 __initdata;
+
/*
* PFN of last memory page.
*/
@@ -40,7 +43,7 @@ unsigned long end_pfn_map;
/*
* Last pfn which the user wants to use.
*/
-unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
+static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
extern struct resource code_resource, data_resource;
@@ -69,12 +72,7 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
return 1;
}
#endif
- /* kernel code + 640k memory hole (later should not be needed, but
- be paranoid for now) */
- if (last >= 640*1024 && addr < 1024*1024) {
- *addrp = 1024*1024;
- return 1;
- }
+ /* kernel code */
if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
*addrp = __pa_symbol(&_end);
return 1;
@@ -164,59 +162,14 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsi
return -1UL;
}
-/*
- * Free bootmem based on the e820 table for a node.
- */
-void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
-{
- int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long last, addr;
-
- if (ei->type != E820_RAM ||
- ei->addr+ei->size <= start ||
- ei->addr >= end)
- continue;
-
- addr = round_up(ei->addr, PAGE_SIZE);
- if (addr < start)
- addr = start;
-
- last = round_down(ei->addr + ei->size, PAGE_SIZE);
- if (last >= end)
- last = end;
-
- if (last > addr && last-addr >= PAGE_SIZE)
- free_bootmem_node(pgdat, addr, last-addr);
- }
-}
-
/*
* Find the highest page frame number we have available
*/
unsigned long __init e820_end_of_ram(void)
{
- int i;
unsigned long end_pfn = 0;
+ end_pfn = find_max_pfn_with_active_regions();
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long start, end;
-
- start = round_up(ei->addr, PAGE_SIZE);
- end = round_down(ei->addr + ei->size, PAGE_SIZE);
- if (start >= end)
- continue;
- if (ei->type == E820_RAM) {
- if (end > end_pfn<<PAGE_SHIFT)
- end_pfn = end>>PAGE_SHIFT;
- } else {
- if (end > end_pfn_map<<PAGE_SHIFT)
- end_pfn_map = end>>PAGE_SHIFT;
- }
- }
-
if (end_pfn > end_pfn_map)
end_pfn_map = end_pfn;
if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
@@ -226,43 +179,10 @@ unsigned long __init e820_end_of_ram(void)
if (end_pfn > end_pfn_map)
end_pfn = end_pfn_map;
+ printk("end_pfn_map = %lu\n", end_pfn_map);
return end_pfn;
}
-/*
- * Compute how much memory is missing in a range.
- * Unlike the other functions in this file the arguments are in page numbers.
- */
-unsigned long __init
-e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned long ram = 0;
- unsigned long start = start_pfn << PAGE_SHIFT;
- unsigned long end = end_pfn << PAGE_SHIFT;
- int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long last, addr;
-
- if (ei->type != E820_RAM ||
- ei->addr+ei->size <= start ||
- ei->addr >= end)
- continue;
-
- addr = round_up(ei->addr, PAGE_SIZE);
- if (addr < start)
- addr = start;
-
- last = round_down(ei->addr + ei->size, PAGE_SIZE);
- if (last >= end)
- last = end;
-
- if (last > addr)
- ram += last - addr;
- }
- return ((end - start) - ram) >> PAGE_SHIFT;
-}
-
/*
* Mark e820 reserved areas as busy for the resource manager.
*/
@@ -297,6 +217,96 @@ void __init e820_reserve_resources(void)
}
}
+/* Mark pages corresponding to given address range as nosave */
+static void __init
+e820_mark_nosave_range(unsigned long start, unsigned long end)
+{
+ unsigned long pfn, max_pfn;
+
+ if (start >= end)
+ return;
+
+ printk("Nosave address range: %016lx - %016lx\n", start, end);
+ max_pfn = end >> PAGE_SHIFT;
+ for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++)
+ if (pfn_valid(pfn))
+ SetPageNosave(pfn_to_page(pfn));
+}
+
+/*
+ * Find the ranges of physical addresses that do not correspond to
+ * e820 RAM areas and mark the corresponding pages as nosave for software
+ * suspend and suspend to RAM.
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(void)
+{
+ int i;
+ unsigned long paddr;
+
+ paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
+ for (i = 1; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (paddr < ei->addr)
+ e820_mark_nosave_range(paddr,
+ round_up(ei->addr, PAGE_SIZE));
+
+ paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
+ if (ei->type != E820_RAM)
+ e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE),
+ paddr);
+
+ if (paddr >= (end_pfn << PAGE_SHIFT))
+ break;
+ }
+}
+
+/* Walk the e820 map and register active regions within a node */
+void __init
+e820_register_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ int i;
+ unsigned long ei_startpfn, ei_endpfn;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
+ ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE)
+ >> PAGE_SHIFT;
+
+ /* Skip map entries smaller than a page */
+ if (ei_startpfn > ei_endpfn)
+ continue;
+
+ /* Check if end_pfn_map should be updated */
+ if (ei->type != E820_RAM && ei_endpfn > end_pfn_map)
+ end_pfn_map = ei_endpfn;
+
+ /* Skip if map is outside the node */
+ if (ei->type != E820_RAM ||
+ ei_endpfn <= start_pfn ||
+ ei_startpfn >= end_pfn)
+ continue;
+
+ /* Check for overlaps */
+ if (ei_startpfn < start_pfn)
+ ei_startpfn = start_pfn;
+ if (ei_endpfn > end_pfn)
+ ei_endpfn = end_pfn;
+
+ /* Obey end_user_pfn to save on memmap */
+ if (ei_startpfn >= end_user_pfn)
+ continue;
+ if (ei_endpfn > end_user_pfn)
+ ei_endpfn = end_user_pfn;
+
+ add_active_range(nid, ei_startpfn, ei_endpfn);
+ }
+}
+
/*
* Add a memory region to the kernel e820 map.
*/
@@ -517,13 +527,6 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
* If we're lucky and live on a modern system, the setup code
* will have given us a memory map that we can use to properly
* set up memory. If we aren't, we'll fake a memory map.
- *
- * We check to see that the memory map contains at least 2 elements
- * before we'll use it, because the detection code in setup.S may
- * not be perfect and most every PC known to man has two memory
- * regions: one from 0 to 640k, and one from 1mb up. (The IBM
- * thinkpad 560x, for example, does not cooperate with the memory
- * detection code.)
*/
static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
{
@@ -541,34 +544,19 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
if (start > end)
return -1;
- /*
- * Some BIOSes claim RAM in the 640k - 1M region.
- * Not right. Fix it up.
- *
- * This should be removed on Hammer which is supposed to not
- * have non e820 covered ISA mappings there, but I had some strange
- * problems so it stays for now. -AK
- */
- if (type == E820_RAM) {
- if (start < 0x100000ULL && end > 0xA0000ULL) {
- if (start < 0xA0000ULL)
- add_memory_region(start, 0xA0000ULL-start, type);
- if (end <= 0x100000ULL)
- continue;
- start = 0x100000ULL;
- size = end - start;
- }
- }
-
add_memory_region(start, size, type);
} while (biosmap++,--nr_map);
return 0;
}
-void __init setup_memory_region(void)
+void early_panic(char *msg)
{
- char *who = "BIOS-e820";
+ early_printk(msg);
+ panic(msg);
+}
+void __init setup_memory_region(void)
+{
/*
* Try to copy the BIOS-supplied E820-map.
*
@@ -576,51 +564,70 @@ void __init setup_memory_region(void)
* the next section from 1mb->appropriate_mem_k
*/
sanitize_e820_map(E820_MAP, &E820_MAP_NR);
- if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
- unsigned long mem_size;
-
- /* compare results from other methods and take the greater */
- if (ALT_MEM_K < EXT_MEM_K) {
- mem_size = EXT_MEM_K;
- who = "BIOS-88";
- } else {
- mem_size = ALT_MEM_K;
- who = "BIOS-e801";
- }
-
- e820.nr_map = 0;
- add_memory_region(0, LOWMEMSIZE(), E820_RAM);
- add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
- }
+ if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0)
+ early_panic("Cannot find a valid memory map");
printk(KERN_INFO "BIOS-provided physical RAM map:\n");
- e820_print_map(who);
+ e820_print_map("BIOS-e820");
}
-void __init parse_memopt(char *p, char **from)
-{
- end_user_pfn = memparse(p, from);
+static int __init parse_memopt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ end_user_pfn = memparse(p, &p);
end_user_pfn >>= PAGE_SHIFT;
+ return 0;
}
+early_param("mem", parse_memopt);
+
+static int userdef __initdata;
-void __init parse_memmapopt(char *p, char **from)
+static int __init parse_memmap_opt(char *p)
{
+ char *oldp;
unsigned long long start_at, mem_size;
- mem_size = memparse(p, from);
- p = *from;
+ if (!strcmp(p, "exactmap")) {
+#ifdef CONFIG_CRASH_DUMP
+ /* If we are doing a crash dump, we
+ * still need to know the real mem
+ * size before original memory map is
+ * reset.
+ */
+ saved_max_pfn = e820_end_of_ram();
+#endif
+ end_pfn_map = 0;
+ e820.nr_map = 0;
+ userdef = 1;
+ return 0;
+ }
+
+ oldp = p;
+ mem_size = memparse(p, &p);
+ if (p == oldp)
+ return -EINVAL;
if (*p == '@') {
- start_at = memparse(p+1, from);
+ start_at = memparse(p+1, &p);
add_memory_region(start_at, mem_size, E820_RAM);
} else if (*p == '#') {
- start_at = memparse(p+1, from);
+ start_at = memparse(p+1, &p);
add_memory_region(start_at, mem_size, E820_ACPI);
} else if (*p == '$') {
- start_at = memparse(p+1, from);
+ start_at = memparse(p+1, &p);
add_memory_region(start_at, mem_size, E820_RESERVED);
} else {
end_user_pfn = (mem_size >> PAGE_SHIFT);
}
- p = *from;
+ return *p == '\0' ? 0 : -EINVAL;
+}
+early_param("memmap", parse_memmap_opt);
+
+void finish_e820_parsing(void)
+{
+ if (userdef) {
+ printk(KERN_INFO "user-defined physical RAM map:\n");
+ e820_print_map("user");
+ }
}
unsigned long pci_mem_start = 0xaeedbabe;
diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c
new file mode 100644
index 00000000000..208e38a372c
--- /dev/null
+++ b/arch/x86_64/kernel/early-quirks.c
@@ -0,0 +1,122 @@
+/* Various workarounds for chipset bugs.
+ This code runs very early and can't use the regular PCI subsystem
+ The entries are keyed to PCI bridges which usually identify chipsets
+ uniquely.
+ This is only for whole classes of chipsets with specific problems which
+ need early invasive action (e.g. before the timers are initialized).
+ Most PCI device specific workarounds can be done later and should be
+ in standard PCI quirks
+ Mainboard specific bugs should be handled by DMI entries.
+ CPU specific bugs in setup.c */
+
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/pci_ids.h>
+#include <asm/pci-direct.h>
+#include <asm/proto.h>
+#include <asm/dma.h>
+
+static void via_bugs(void)
+{
+#ifdef CONFIG_IOMMU
+ if ((end_pfn > MAX_DMA32_PFN || force_iommu) &&
+ !iommu_aperture_allowed) {
+ printk(KERN_INFO
+ "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n");
+ iommu_aperture_disabled = 1;
+ }
+#endif
+}
+
+#ifdef CONFIG_ACPI
+
+static int nvidia_hpet_detected __initdata;
+
+static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
+{
+ nvidia_hpet_detected = 1;
+ return 0;
+}
+#endif
+
+static void nvidia_bugs(void)
+{
+#ifdef CONFIG_ACPI
+ /*
+ * All timer overrides on Nvidia are
+ * wrong unless HPET is enabled.
+ */
+ nvidia_hpet_detected = 0;
+ acpi_table_parse(ACPI_HPET, nvidia_hpet_check);
+ if (nvidia_hpet_detected == 0) {
+ acpi_skip_timer_override = 1;
+ printk(KERN_INFO "Nvidia board "
+ "detected. Ignoring ACPI "
+ "timer override.\n");
+ }
+#endif
+ /* RED-PEN skip them on mptables too? */
+
+}
+
+static void ati_bugs(void)
+{
+#if 1 /* for testing */
+ printk("ATI board detected\n");
+#endif
+ /* No bugs right now */
+}
+
+struct chipset {
+ u16 vendor;
+ void (*f)(void);
+};
+
+static struct chipset early_qrk[] = {
+ { PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
+ { PCI_VENDOR_ID_VIA, via_bugs },
+ { PCI_VENDOR_ID_ATI, ati_bugs },
+ {}
+};
+
+void __init early_quirks(void)
+{
+ int num, slot, func;
+
+ if (!early_pci_allowed())
+ return;
+
+ /* Poor man's PCI discovery */
+ for (num = 0; num < 32; num++) {
+ for (slot = 0; slot < 32; slot++) {
+ for (func = 0; func < 8; func++) {
+ u32 class;
+ u32 vendor;
+ u8 type;
+ int i;
+ class = read_pci_config(num,slot,func,
+ PCI_CLASS_REVISION);
+ if (class == 0xffffffff)
+ break;
+
+ if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
+ continue;
+
+ vendor = read_pci_config(num, slot, func,
+ PCI_VENDOR_ID);
+ vendor &= 0xffff;
+
+ for (i = 0; early_qrk[i].f; i++)
+ if (early_qrk[i].vendor == vendor) {
+ early_qrk[i].f();
+ return;
+ }
+
+ type = read_pci_config_byte(num, slot, func,
+ PCI_HEADER_TYPE);
+ if (!(type & 0x80))
+ break;
+ }
+ }
+ }
+}
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index 140051e07fa..e22ecd54870 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -215,20 +215,16 @@ void early_printk(const char *fmt, ...)
static int __initdata keep_early;
-int __init setup_early_printk(char *opt)
+static int __init setup_early_printk(char *buf)
{
- char *space;
- char buf[256];
+ if (!buf)
+ return 0;
if (early_console_initialized)
- return 1;
-
- strlcpy(buf,opt,sizeof(buf));
- space = strchr(buf, ' ');
- if (space)
- *space = 0;
+ return 0;
+ early_console_initialized = 1;
- if (strstr(buf,"keep"))
+ if (!strcmp(buf,"keep"))
keep_early = 1;
if (!strncmp(buf, "serial", 6)) {
@@ -248,11 +244,12 @@ int __init setup_early_printk(char *opt)
early_console = &simnow_console;
keep_early = 1;
}
- early_console_initialized = 1;
register_console(early_console);
return 0;
}
+early_param("earlyprintk", setup_early_printk);
+
void __init disable_early_printk(void)
{
if (!early_console_initialized || !early_console)
@@ -266,4 +263,3 @@ void __init disable_early_printk(void)
}
}
-__setup("earlyprintk=", setup_early_printk);
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index aa8d8939abc..2802524104f 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -4,8 +4,6 @@
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
* Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
- *
- * $Id$
*/
/*
@@ -22,15 +20,25 @@
* at the top of the kernel process stack.
* - partial stack frame: partially saved registers upto R11.
* - full stack frame: Like partial stack frame, but all register saved.
- *
- * TODO:
- * - schedule it carefully for the final hardware.
+ *
+ * Some macro usage:
+ * - CFI macros are used to generate dwarf2 unwind information for better
+ * backtraces. They don't change any code.
+ * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
+ * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
+ * There are unfortunately lots of special cases where some registers
+ * not touched. The macro is a big mess that should be cleaned up.
+ * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
+ * Gives a full stack frame.
+ * - ENTRY/END Define functions in the symbol table.
+ * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
+ * frame that is otherwise undefined after a SYSCALL
+ * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
+ * - errorentry/paranoidentry/zeroentry - Define exception entry points.
*/
-#define ASSEMBLY 1
#include <linux/linkage.h>
#include <asm/segment.h>
-#include <asm/smp.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/dwarf2.h>
@@ -115,6 +123,7 @@
.macro CFI_DEFAULT_STACK start=1
.if \start
CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,SS+8
.else
CFI_DEF_CFA_OFFSET SS+8
@@ -146,6 +155,10 @@
/* rdi: prev */
ENTRY(ret_from_fork)
CFI_DEFAULT_STACK
+ push kernel_eflags(%rip)
+ CFI_ADJUST_CFA_OFFSET 4
+ popf # reset kernel eflags
+ CFI_ADJUST_CFA_OFFSET -4
call schedule_tail
GET_THREAD_INFO(%rcx)
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
@@ -199,6 +212,7 @@ END(ret_from_fork)
ENTRY(system_call)
CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,PDA_STACKOFFSET
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
@@ -316,6 +330,7 @@ END(system_call)
*/
ENTRY(int_ret_from_sys_call)
CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,SS+8-ARGOFFSET
/*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
CFI_REL_OFFSET rsp,RSP-ARGOFFSET
@@ -476,6 +491,7 @@ END(stub_rt_sigreturn)
*/
.macro _frame ref
CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,SS+8-\ref
/*CFI_REL_OFFSET ss,SS-\ref*/
CFI_REL_OFFSET rsp,RSP-\ref
@@ -511,7 +527,12 @@ END(stub_rt_sigreturn)
testl $3,CS(%rdi)
je 1f
swapgs
-1: incl %gs:pda_irqcount # RED-PEN should check preempt count
+ /* irqcount is used to check if a CPU is already on an interrupt
+ stack or not. While this is essentially redundant with preempt_count
+ it is a little cheaper to use a separate counter in the PDA
+ (short of moving irq_enter into assembly, which would be too
+ much work) */
+1: incl %gs:pda_irqcount
cmoveq %gs:pda_irqstackptr,%rsp
push %rbp # backlink for old unwinder
/*
@@ -619,8 +640,7 @@ retint_signal:
#ifdef CONFIG_PREEMPT
/* Returning to kernel space. Check if we need preemption */
/* rcx: threadinfo. interrupts off. */
- .p2align
-retint_kernel:
+ENTRY(retint_kernel)
cmpl $0,threadinfo_preempt_count(%rcx)
jnz retint_restore_args
bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
@@ -679,7 +699,6 @@ ENTRY(call_function_interrupt)
END(call_function_interrupt)
#endif
-#ifdef CONFIG_X86_LOCAL_APIC
ENTRY(apic_timer_interrupt)
apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
END(apic_timer_interrupt)
@@ -691,7 +710,6 @@ END(error_interrupt)
ENTRY(spurious_interrupt)
apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
END(spurious_interrupt)
-#endif
/*
* Exception entry points.
@@ -768,7 +786,9 @@ paranoid_exit\trace:
testl $3,CS(%rsp)
jnz paranoid_userspace\trace
paranoid_swapgs\trace:
+ .if \trace
TRACE_IRQS_IRETQ 0
+ .endif
swapgs
paranoid_restore\trace:
RESTORE_ALL 8
@@ -814,7 +834,7 @@ paranoid_schedule\trace:
* Exception entry point. This expects an error code/orig_rax on the stack
* and the exception handler in %rax.
*/
-ENTRY(error_entry)
+KPROBE_ENTRY(error_entry)
_frame RDI
/* rdi slot contains rax, oldrax contains error code */
cld
@@ -898,7 +918,7 @@ error_kernelspace:
cmpq $gs_change,RIP(%rsp)
je error_swapgs
jmp error_sti
-END(error_entry)
+KPROBE_END(error_entry)
/* Reload gs selector with exception handling */
/* edi: new selector */
@@ -1020,8 +1040,7 @@ ENDPROC(execve)
KPROBE_ENTRY(page_fault)
errorentry do_page_fault
-END(page_fault)
- .previous .text
+KPROBE_END(page_fault)
ENTRY(coprocessor_error)
zeroentry do_coprocessor_error
@@ -1042,8 +1061,7 @@ KPROBE_ENTRY(debug)
CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_debug, DEBUG_STACK
paranoidexit
-END(debug)
- .previous .text
+KPROBE_END(debug)
/* runs on exception stack */
KPROBE_ENTRY(nmi)
@@ -1057,8 +1075,7 @@ KPROBE_ENTRY(nmi)
jmp paranoid_exit1
CFI_ENDPROC
#endif
-END(nmi)
- .previous .text
+KPROBE_END(nmi)
KPROBE_ENTRY(int3)
INTR_FRAME
@@ -1067,8 +1084,7 @@ KPROBE_ENTRY(int3)
paranoidentry do_int3, DEBUG_STACK
jmp paranoid_exit1
CFI_ENDPROC
-END(int3)
- .previous .text
+KPROBE_END(int3)
ENTRY(overflow)
zeroentry do_overflow
@@ -1116,8 +1132,7 @@ END(stack_segment)
KPROBE_ENTRY(general_protection)
errorentry do_general_protection
-END(general_protection)
- .previous .text
+KPROBE_END(general_protection)
ENTRY(alignment_check)
errorentry do_alignment_check
diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c
index 3020917546d..cdb90e671b8 100644
--- a/arch/x86_64/kernel/genapic_cluster.c
+++ b/arch/x86_64/kernel/genapic_cluster.c
@@ -118,7 +118,6 @@ struct genapic apic_cluster = {
.name = "clustered",
.int_delivery_mode = dest_Fixed,
.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
- .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED,
.target_cpus = cluster_target_cpus,
.apic_id_registered = cluster_apic_id_registered,
.init_apic_ldr = cluster_init_apic_ldr,
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index eb86d374813..50ad153eaac 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -49,8 +49,7 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
unsigned long cfg;
unsigned long flags;
- local_save_flags(flags);
- local_irq_disable();
+ local_irq_save(flags);
/*
* Wait for idle.
@@ -121,7 +120,6 @@ struct genapic apic_flat = {
.name = "flat",
.int_delivery_mode = dest_LowestPrio,
.int_dest_mode = (APIC_DEST_LOGICAL != 0),
- .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
.target_cpus = flat_target_cpus,
.apic_id_registered = flat_apic_id_registered,
.init_apic_ldr = flat_init_apic_ldr,
@@ -180,7 +178,6 @@ struct genapic apic_physflat = {
.name = "physical flat",
.int_delivery_mode = dest_Fixed,
.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
- .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED,
.target_cpus = physflat_target_cpus,
.apic_id_registered = flat_apic_id_registered,
.init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index c9739ca81d0..1e6f8087067 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -5,8 +5,6 @@
* Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
* Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
* Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
- *
- * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
*/
@@ -187,12 +185,15 @@ startup_64:
/* Finally jump to run C code and to be on real kernel address
* Since we are running on identity-mapped space we have to jump
- * to the full 64bit address , this is only possible as indirect
- * jump
+ * to the full 64bit address, this is only possible as indirect
+ * jump. In addition we need to ensure %cs is set so we make this
+ * a far return.
*/
movq initial_code(%rip),%rax
- pushq $0 # fake return address
- jmp *%rax
+ pushq $0 # fake return address to stop unwinder
+ pushq $__KERNEL_CS # set correct cs
+ pushq %rax # target address in negative space
+ lretq
/* SMP bootup changes these two */
.align 8
@@ -371,7 +372,7 @@ ENTRY(cpu_gdt_table)
.quad 0,0 /* TSS */
.quad 0,0 /* LDT */
.quad 0,0,0 /* three TLS descriptors */
- .quad 0 /* unused */
+ .quad 0x0000f40000000000 /* node/CPU stored in limit */
gdt_end:
/* asm/segment.h:GDT_ENTRIES must match this */
/* This should be a multiple of the cache line size */
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index 36647ce6aec..9561eb3c5b5 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -45,38 +45,16 @@ static void __init copy_bootdata(char *real_mode_data)
new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
if (!new_data) {
if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
- printk("so old bootloader that it does not support commandline?!\n");
return;
}
new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
- printk("old bootloader convention, maybe loadlin?\n");
}
command_line = (char *) ((u64)(new_data));
memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
- printk("Bootdata ok (command line is %s)\n", saved_command_line);
-}
-
-static void __init setup_boot_cpu_data(void)
-{
- unsigned int dummy, eax;
-
- /* get vendor info */
- cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
- (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
- (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
- (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
-
- /* get cpu type */
- cpuid(1, &eax, &dummy, &dummy,
- (unsigned int *) &boot_cpu_data.x86_capability);
- boot_cpu_data.x86 = (eax >> 8) & 0xf;
- boot_cpu_data.x86_model = (eax >> 4) & 0xf;
- boot_cpu_data.x86_mask = eax & 0xf;
}
void __init x86_64_start_kernel(char * real_mode_data)
{
- char *s;
int i;
for (i = 0; i < 256; i++)
@@ -84,10 +62,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
asm volatile("lidt %0" :: "m" (idt_descr));
clear_bss();
- /*
- * This must be called really, really early:
- */
- lockdep_init();
+ early_printk("Kernel alive\n");
/*
* switch to init_level4_pgt from boot_level4_pgt
@@ -103,22 +78,5 @@ void __init x86_64_start_kernel(char * real_mode_data)
#ifdef CONFIG_SMP
cpu_set(0, cpu_online_map);
#endif
- s = strstr(saved_command_line, "earlyprintk=");
- if (s != NULL)
- setup_early_printk(strchr(s, '=') + 1);
-#ifdef CONFIG_NUMA
- s = strstr(saved_command_line, "numa=");
- if (s != NULL)
- numa_setup(s+5);
-#endif
-#ifdef CONFIG_X86_IO_APIC
- if (strstr(saved_command_line, "disableapic"))
- disable_apic = 1;
-#endif
- /* You need early console to see that */
- if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
- panic("Kernel too big for kernel mapping\n");
-
- setup_boot_cpu_data();
start_kernel();
}
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 0434b1f8e3d..2dd51f364ea 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -55,7 +55,6 @@
*/
BUILD_16_IRQS(0x0)
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* The IO-APIC gives us many more interrupt sources. Most of these
* are unused but an SMP system is supposed to have enough memory ...
@@ -75,8 +74,6 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)
BUILD_15_IRQS(0xe)
#endif
-#endif
-
#undef BUILD_16_IRQS
#undef BUILD_15_IRQS
#undef BI
@@ -100,7 +97,6 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)
void (*interrupt[NR_IRQS])(void) = {
IRQLIST_16(0x0),
-#ifdef CONFIG_X86_IO_APIC
IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3),
IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
@@ -110,7 +106,6 @@ void (*interrupt[NR_IRQS])(void) = {
, IRQLIST_15(0xe)
#endif
-#endif
};
#undef IRQ
@@ -128,6 +123,8 @@ void (*interrupt[NR_IRQS])(void) = {
DEFINE_SPINLOCK(i8259A_lock);
+static int i8259A_auto_eoi;
+
static void end_8259A_irq (unsigned int irq)
{
if (irq > 256) {
@@ -341,6 +338,8 @@ void init_8259A(int auto_eoi)
{
unsigned long flags;
+ i8259A_auto_eoi = auto_eoi;
+
spin_lock_irqsave(&i8259A_lock, flags);
outb(0xff, 0x21); /* mask all of 8259A-1 */
@@ -399,7 +398,7 @@ static void save_ELCR(char *trigger)
static int i8259A_resume(struct sys_device *dev)
{
- init_8259A(0);
+ init_8259A(i8259A_auto_eoi);
restore_ELCR(irq_trigger);
return 0;
}
@@ -453,9 +452,7 @@ void __init init_ISA_irqs (void)
{
int i;
-#ifdef CONFIG_X86_LOCAL_APIC
init_bsp_APIC();
-#endif
init_8259A(0);
for (i = 0; i < NR_IRQS; i++) {
@@ -581,14 +578,12 @@ void __init init_IRQ(void)
set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-#ifdef CONFIG_X86_LOCAL_APIC
/* self generated IPI for local APIC timer */
set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
/* IPI vectors for APIC spurious and error interrupts */
set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-#endif
/*
* Set the clock to HZ Hz, we already have a valid
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 924a4a33295..0491019d4c8 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -48,7 +48,7 @@ int sis_apic_bug; /* not actually supported, dummy for compile */
static int no_timer_check;
-int disable_timer_pin_1 __initdata;
+static int disable_timer_pin_1 __initdata;
int timer_over_8254 __initdata = 0;
@@ -111,6 +111,33 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
FINAL; \
}
+union entry_union {
+ struct { u32 w1, w2; };
+ struct IO_APIC_route_entry entry;
+};
+
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+ union entry_union eu;
+ unsigned long flags;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ return eu.entry;
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ unsigned long flags;
+ union entry_union eu;
+ eu.entry = e;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
#ifdef CONFIG_SMP
static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
{
@@ -196,13 +223,9 @@ static void unmask_IO_APIC_irq (unsigned int irq)
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
- unsigned long flags;
/* Check delivery_mode to be sure we're not clearing an SMI pin */
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
- *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry = ioapic_read_entry(apic, pin);
if (entry.delivery_mode == dest_SMI)
return;
/*
@@ -210,10 +233,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
*/
memset(&entry, 0, sizeof(entry));
entry.mask = 1;
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
- io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry);
}
static void clear_IO_APIC (void)
@@ -225,14 +245,6 @@ static void clear_IO_APIC (void)
clear_IO_APIC_pin(apic, pin);
}
-/*
- * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
- * specific CPU-side IRQs.
- */
-
-#define MAX_PIRQS 8
-static int pirq_entries [MAX_PIRQS];
-static int pirqs_enabled;
int skip_ioapic_setup;
int ioapic_force;
@@ -241,18 +253,17 @@ int ioapic_force;
static int __init disable_ioapic_setup(char *str)
{
skip_ioapic_setup = 1;
- return 1;
+ return 0;
}
+early_param("noapic", disable_ioapic_setup);
-static int __init enable_ioapic_setup(char *str)
+/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
+static int __init disable_timer_pin_setup(char *arg)
{
- ioapic_force = 1;
- skip_ioapic_setup = 0;
+ disable_timer_pin_1 = 1;
return 1;
}
-
-__setup("noapic", disable_ioapic_setup);
-__setup("apic", enable_ioapic_setup);
+__setup("disable_timer_pin_1", disable_timer_pin_setup);
static int __init setup_disable_8254_timer(char *s)
{
@@ -268,135 +279,6 @@ static int __init setup_enable_8254_timer(char *s)
__setup("disable_8254_timer", setup_disable_8254_timer);
__setup("enable_8254_timer", setup_enable_8254_timer);
-#include <asm/pci-direct.h>
-#include <linux/pci_ids.h>
-#include <linux/pci.h>
-
-
-#ifdef CONFIG_ACPI
-
-static int nvidia_hpet_detected __initdata;
-
-static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
-{
- nvidia_hpet_detected = 1;
- return 0;
-}
-#endif
-
-/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
- off. Check for an Nvidia or VIA PCI bridge and turn it off.
- Use pci direct infrastructure because this runs before the PCI subsystem.
-
- Can be overwritten with "apic"
-
- And another hack to disable the IOMMU on VIA chipsets.
-
- ... and others. Really should move this somewhere else.
-
- Kludge-O-Rama. */
-void __init check_ioapic(void)
-{
- int num,slot,func;
- /* Poor man's PCI discovery */
- for (num = 0; num < 32; num++) {
- for (slot = 0; slot < 32; slot++) {
- for (func = 0; func < 8; func++) {
- u32 class;
- u32 vendor;
- u8 type;
- class = read_pci_config(num,slot,func,
- PCI_CLASS_REVISION);
- if (class == 0xffffffff)
- break;
-
- if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
- continue;
-
- vendor = read_pci_config(num, slot, func,
- PCI_VENDOR_ID);
- vendor &= 0xffff;
- switch (vendor) {
- case PCI_VENDOR_ID_VIA:
-#ifdef CONFIG_IOMMU
- if ((end_pfn > MAX_DMA32_PFN ||
- force_iommu) &&
- !iommu_aperture_allowed) {
- printk(KERN_INFO
- "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n");
- iommu_aperture_disabled = 1;
- }
-#endif
- return;
- case PCI_VENDOR_ID_NVIDIA:
-#ifdef CONFIG_ACPI
- /*
- * All timer overrides on Nvidia are
- * wrong unless HPET is enabled.
- */
- nvidia_hpet_detected = 0;
- acpi_table_parse(ACPI_HPET,
- nvidia_hpet_check);
- if (nvidia_hpet_detected == 0) {
- acpi_skip_timer_override = 1;
- printk(KERN_INFO "Nvidia board "
- "detected. Ignoring ACPI "
- "timer override.\n");
- }
-#endif
- /* RED-PEN skip them on mptables too? */
- return;
-
- /* This should be actually default, but
- for 2.6.16 let's do it for ATI only where
- it's really needed. */
- case PCI_VENDOR_ID_ATI:
- if (timer_over_8254 == 1) {
- timer_over_8254 = 0;
- printk(KERN_INFO
- "ATI board detected. Disabling timer routing over 8254.\n");
- }
- return;
- }
-
-
- /* No multi-function device? */
- type = read_pci_config_byte(num,slot,func,
- PCI_HEADER_TYPE);
- if (!(type & 0x80))
- break;
- }
- }
- }
-}
-
-static int __init ioapic_pirq_setup(char *str)
-{
- int i, max;
- int ints[MAX_PIRQS+1];
-
- get_options(str, ARRAY_SIZE(ints), ints);
-
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
-
- pirqs_enabled = 1;
- apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
- max = MAX_PIRQS;
- if (ints[0] < MAX_PIRQS)
- max = ints[0];
-
- for (i = 0; i < max; i++) {
- apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
- /*
- * PIRQs are mapped upside down, usually.
- */
- pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
- }
- return 1;
-}
-
-__setup("pirq=", ioapic_pirq_setup);
/*
* Find the IRQ entry number of a certain pin.
@@ -425,9 +307,7 @@ static int __init find_isa_irq_pin(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].mpc_srcbus;
- if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
+ if (test_bit(lbus, mp_bus_not_pci) &&
(mp_irqs[i].mpc_irqtype == type) &&
(mp_irqs[i].mpc_srcbusirq == irq))
@@ -443,9 +323,7 @@ static int __init find_isa_irq_apic(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].mpc_srcbus;
- if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
+ if (test_bit(lbus, mp_bus_not_pci) &&
(mp_irqs[i].mpc_irqtype == type) &&
(mp_irqs[i].mpc_srcbusirq == irq))
break;
@@ -485,7 +363,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
break;
- if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
+ if (!test_bit(lbus, mp_bus_not_pci) &&
!mp_irqs[i].mpc_irqtype &&
(bus == lbus) &&
(slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
@@ -508,27 +386,6 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
return best_guess;
}
-/*
- * EISA Edge/Level control register, ELCR
- */
-static int EISA_ELCR(unsigned int irq)
-{
- if (irq < 16) {
- unsigned int port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
- }
- apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
- return 0;
-}
-
-/* EISA interrupts are always polarity zero and can be edge or level
- * trigger depending on the ELCR value. If an interrupt is listed as
- * EISA conforming in the MP table, that means its trigger type must
- * be read in from the ELCR */
-
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
-#define default_EISA_polarity(idx) (0)
-
/* ISA interrupts are always polarity zero edge triggered,
* when listed as conforming in the MP table. */
@@ -541,12 +398,6 @@ static int EISA_ELCR(unsigned int irq)
#define default_PCI_trigger(idx) (1)
#define default_PCI_polarity(idx) (1)
-/* MCA interrupts are always polarity zero level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_MCA_trigger(idx) (1)
-#define default_MCA_polarity(idx) (0)
-
static int __init MPBIOS_polarity(int idx)
{
int bus = mp_irqs[idx].mpc_srcbus;
@@ -558,38 +409,11 @@ static int __init MPBIOS_polarity(int idx)
switch (mp_irqs[idx].mpc_irqflag & 3)
{
case 0: /* conforms, ie. bus-type dependent polarity */
- {
- switch (mp_bus_id_to_type[bus])
- {
- case MP_BUS_ISA: /* ISA pin */
- {
- polarity = default_ISA_polarity(idx);
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- polarity = default_EISA_polarity(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- polarity = default_PCI_polarity(idx);
- break;
- }
- case MP_BUS_MCA: /* MCA pin */
- {
- polarity = default_MCA_polarity(idx);
- break;
- }
- default:
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- }
+ if (test_bit(bus, mp_bus_not_pci))
+ polarity = default_ISA_polarity(idx);
+ else
+ polarity = default_PCI_polarity(idx);
break;
- }
case 1: /* high active */
{
polarity = 0;
@@ -627,38 +451,11 @@ static int MPBIOS_trigger(int idx)
switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
{
case 0: /* conforms, ie. bus-type dependent */
- {
- switch (mp_bus_id_to_type[bus])
- {
- case MP_BUS_ISA: /* ISA pin */
- {
- trigger = default_ISA_trigger(idx);
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- trigger = default_EISA_trigger(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- trigger = default_PCI_trigger(idx);
- break;
- }
- case MP_BUS_MCA: /* MCA pin */
- {
- trigger = default_MCA_trigger(idx);
- break;
- }
- default:
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- }
+ if (test_bit(bus, mp_bus_not_pci))
+ trigger = default_ISA_trigger(idx);
+ else
+ trigger = default_PCI_trigger(idx);
break;
- }
case 1: /* edge */
{
trigger = 0;
@@ -764,49 +561,17 @@ static int pin_2_irq(int idx, int apic, int pin)
if (mp_irqs[idx].mpc_dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
- switch (mp_bus_id_to_type[bus])
- {
- case MP_BUS_ISA: /* ISA pin */
- case MP_BUS_EISA:
- case MP_BUS_MCA:
- {
- irq = mp_irqs[idx].mpc_srcbusirq;
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- /*
- * PCI IRQs are mapped in order
- */
- i = irq = 0;
- while (i < apic)
- irq += nr_ioapic_registers[i++];
- irq += pin;
- irq = gsi_irq_sharing(irq);
- break;
- }
- default:
- {
- printk(KERN_ERR "unknown bus type %d.\n",bus);
- irq = 0;
- break;
- }
- }
- BUG_ON(irq >= NR_IRQS);
-
- /*
- * PCI IRQ command line redirection. Yes, limits are hardcoded.
- */
- if ((pin >= 16) && (pin <= 23)) {
- if (pirq_entries[pin-16] != -1) {
- if (!pirq_entries[pin-16]) {
- apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
- } else {
- irq = pirq_entries[pin-16];
- apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
- pin-16, irq);
- }
- }
+ if (test_bit(bus, mp_bus_not_pci)) {
+ irq = mp_irqs[idx].mpc_srcbusirq;
+ } else {
+ /*
+ * PCI IRQs are mapped in order
+ */
+ i = irq = 0;
+ while (i < apic)
+ irq += nr_ioapic_registers[i++];
+ irq += pin;
+ irq = gsi_irq_sharing(irq);
}
BUG_ON(irq >= NR_IRQS);
return irq;
@@ -943,9 +708,9 @@ static void __init setup_IO_APIC_irqs(void)
if (!apic && (irq < 16))
disable_8259A_irq(irq);
}
+ ioapic_write_entry(apic, pin, entry);
+
spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
- io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
set_native_irq_info(irq, TARGET_CPUS);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -1083,10 +848,7 @@ void __apicdebuginit print_IO_APIC(void)
for (i = 0; i <= reg_01.bits.entries; i++) {
struct IO_APIC_route_entry entry;
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
- *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry = ioapic_read_entry(apic, i);
printk(KERN_DEBUG " %02x %03X %02X ",
i,
@@ -1281,9 +1043,6 @@ static void __init enable_IO_APIC(void)
irq_2_pin[i].pin = -1;
irq_2_pin[i].next = 0;
}
- if (!pirqs_enabled)
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
/*
* The number of IO-APIC IRQ registers (== #pins):
@@ -1299,11 +1058,7 @@ static void __init enable_IO_APIC(void)
/* See if any of the pins is in ExtINT mode */
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
struct IO_APIC_route_entry entry;
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
- *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
+ entry = ioapic_read_entry(apic, pin);
/* If the interrupt line is enabled and in ExtInt mode
* I have found the pin where the i8259 is connected.
@@ -1355,7 +1110,6 @@ void disable_IO_APIC(void)
*/
if (ioapic_i8259.pin != -1) {
struct IO_APIC_route_entry entry;
- unsigned long flags;
memset(&entry, 0, sizeof(entry));
entry.mask = 0; /* Enabled */
@@ -1372,84 +1126,13 @@ void disable_IO_APIC(void)
/*
* Add it to the IO-APIC irq-routing table:
*/
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
- *(((int *)&entry)+1));
- io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
- *(((int *)&entry)+0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
disconnect_bsp_APIC(ioapic_i8259.pin != -1);
}
/*
- * function to set the IO-APIC physical IDs based on the
- * values stored in the MPC table.
- *
- * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
- */
-
-static void __init setup_ioapic_ids_from_mpc (void)
-{
- union IO_APIC_reg_00 reg_00;
- int apic;
- int i;
- unsigned char old_id;
- unsigned long flags;
-
- /*
- * Set the IOAPIC ID to the value stored in the MPC table.
- */
- for (apic = 0; apic < nr_ioapics; apic++) {
-
- /* Read the register 0 value */
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- old_id = mp_ioapics[apic].mpc_apicid;
-
-
- printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
-
-
- /*
- * We need to adjust the IRQ routing table
- * if the ID changed.
- */
- if (old_id != mp_ioapics[apic].mpc_apicid)
- for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_dstapic == old_id)
- mp_irqs[i].mpc_dstapic
- = mp_ioapics[apic].mpc_apicid;
-
- /*
- * Read the right value from the MPC table and
- * write it into the ID register.
- */
- apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic].mpc_apicid);
-
- reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0, reg_00.raw);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- /*
- * Sanity check
- */
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
- printk("could not set ID!\n");
- else
- apic_printk(APIC_VERBOSE," ok.\n");
- }
-}
-
-/*
* There is a nasty bug in some older SMP boards, their mptable lies
* about the timer IRQ. We do the following to work around the situation:
*
@@ -1964,11 +1647,6 @@ void __init setup_IO_APIC(void)
apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
- /*
- * Set up the IO-APIC IRQ routing table.
- */
- if (!acpi_ioapic)
- setup_ioapic_ids_from_mpc();
sync_Arb_IDs();
setup_IO_APIC_irqs();
init_IO_APIC_traps();
@@ -1987,17 +1665,12 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
{
struct IO_APIC_route_entry *entry;
struct sysfs_ioapic_data *data;
- unsigned long flags;
int i;
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
- spin_lock_irqsave(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
- *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
- *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
- }
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
+ *entry = ioapic_read_entry(dev->id, i);
return 0;
}
@@ -2019,11 +1692,9 @@ static int ioapic_resume(struct sys_device *dev)
reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
- io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
- io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
- }
spin_unlock_irqrestore(&ioapic_lock, flags);
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
+ ioapic_write_entry(dev->id, i, entry[i]);
return 0;
}
@@ -2077,19 +1748,6 @@ device_initcall(ioapic_init_sysfs);
#define IO_APIC_MAX_ID 0xFE
-int __init io_apic_get_version (int ioapic)
-{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return reg_01.bits.version;
-}
-
-
int __init io_apic_get_redir_entries (int ioapic)
{
union IO_APIC_reg_01 reg_01;
@@ -2148,10 +1806,10 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
if (!ioapic && (irq < 16))
disable_8259A_irq(irq);
+ ioapic_write_entry(ioapic, pin, entry);
+
spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
- io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
- set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
+ set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
spin_unlock_irqrestore(&ioapic_lock, flags);
return 0;
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
index b81614970ec..fe063d3cfe4 100644
--- a/arch/x86_64/kernel/ioport.c
+++ b/arch/x86_64/kernel/ioport.c
@@ -56,6 +56,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
memset(bitmap, 0xff, IO_BITMAP_BYTES);
t->io_bitmap_ptr = bitmap;
+ set_thread_flag(TIF_IO_BITMAP);
}
/*
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 5221a53e90c..b3677e6ccc6 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -20,11 +20,6 @@
#include <asm/idle.h>
atomic_t irq_err_count;
-#ifdef CONFIG_X86_IO_APIC
-#ifdef APIC_MISMATCH_DEBUG
-atomic_t irq_mis_count;
-#endif
-#endif
#ifdef CONFIG_DEBUG_STACKOVERFLOW
/*
@@ -92,18 +87,11 @@ skip:
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
seq_putc(p, '\n');
-#ifdef CONFIG_X86_LOCAL_APIC
seq_printf(p, "LOC: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
seq_putc(p, '\n');
-#endif
seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
-#ifdef CONFIG_X86_IO_APIC
-#ifdef APIC_MISMATCH_DEBUG
- seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
-#endif
-#endif
}
return 0;
}
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
index 106076b370f..0497e3bd5bf 100644
--- a/arch/x86_64/kernel/machine_kexec.c
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -15,6 +15,15 @@
#include <asm/mmu_context.h>
#include <asm/io.h>
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+static u64 kexec_pgd[512] PAGE_ALIGNED;
+static u64 kexec_pud0[512] PAGE_ALIGNED;
+static u64 kexec_pmd0[512] PAGE_ALIGNED;
+static u64 kexec_pte0[512] PAGE_ALIGNED;
+static u64 kexec_pud1[512] PAGE_ALIGNED;
+static u64 kexec_pmd1[512] PAGE_ALIGNED;
+static u64 kexec_pte1[512] PAGE_ALIGNED;
+
static void init_level2_page(pmd_t *level2p, unsigned long addr)
{
unsigned long end_addr;
@@ -144,32 +153,19 @@ static void load_segments(void)
);
}
-typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
- unsigned long control_code_buffer,
- unsigned long start_address,
- unsigned long pgtable) ATTRIB_NORET;
-
-extern const unsigned char relocate_new_kernel[];
-extern const unsigned long relocate_new_kernel_size;
-
int machine_kexec_prepare(struct kimage *image)
{
- unsigned long start_pgtable, control_code_buffer;
+ unsigned long start_pgtable;
int result;
/* Calculate the offsets */
start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
- control_code_buffer = start_pgtable + PAGE_SIZE;
/* Setup the identity mapped 64bit page table */
result = init_pgtable(image, start_pgtable);
if (result)
return result;
- /* Place the code in the reboot code buffer */
- memcpy(__va(control_code_buffer), relocate_new_kernel,
- relocate_new_kernel_size);
-
return 0;
}
@@ -184,28 +180,34 @@ void machine_kexec_cleanup(struct kimage *image)
*/
NORET_TYPE void machine_kexec(struct kimage *image)
{
- unsigned long page_list;
- unsigned long control_code_buffer;
- unsigned long start_pgtable;
- relocate_new_kernel_t rnk;
+ unsigned long page_list[PAGES_NR];
+ void *control_page;
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
- /* Calculate the offsets */
- page_list = image->head;
- start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
- control_code_buffer = start_pgtable + PAGE_SIZE;
-
- /* Set the low half of the page table to my identity mapped
- * page table for kexec. Leave the high half pointing at the
- * kernel pages. Don't bother to flush the global pages
- * as that will happen when I fully switch to my identity mapped
- * page table anyway.
- */
- memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
- __flush_tlb();
-
+ control_page = page_address(image->control_code_page) + PAGE_SIZE;
+ memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+ page_list[PA_CONTROL_PAGE] = __pa(control_page);
+ page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+ page_list[PA_PGD] = __pa(kexec_pgd);
+ page_list[VA_PGD] = (unsigned long)kexec_pgd;
+ page_list[PA_PUD_0] = __pa(kexec_pud0);
+ page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
+ page_list[PA_PMD_0] = __pa(kexec_pmd0);
+ page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
+ page_list[PA_PTE_0] = __pa(kexec_pte0);
+ page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
+ page_list[PA_PUD_1] = __pa(kexec_pud1);
+ page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
+ page_list[PA_PMD_1] = __pa(kexec_pmd1);
+ page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
+ page_list[PA_PTE_1] = __pa(kexec_pte1);
+ page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+
+ page_list[PA_TABLE_PAGE] =
+ (unsigned long)__pa(page_address(image->control_code_page));
/* The segment registers are funny things, they have both a
* visible and an invisible part. Whenever the visible part is
@@ -222,7 +224,36 @@ NORET_TYPE void machine_kexec(struct kimage *image)
*/
set_gdt(phys_to_virt(0),0);
set_idt(phys_to_virt(0),0);
+
/* now call it */
- rnk = (relocate_new_kernel_t) control_code_buffer;
- (*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
+ relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+ image->start);
}
+
+/* crashkernel=size@addr specifies the location to reserve for
+ * a crash kernel. By reserving this memory we guarantee
+ * that linux never set's it up as a DMA target.
+ * Useful for holding code to do something appropriate
+ * after a kernel panic.
+ */
+static int __init setup_crashkernel(char *arg)
+{
+ unsigned long size, base;
+ char *p;
+ if (!arg)
+ return -EINVAL;
+ size = memparse(arg, &p);
+ if (arg == p)
+ return -EINVAL;
+ if (*p == '@') {
+ base = memparse(p+1, &p);
+ /* FIXME: Do I want a sanity check to validate the
+ * memory range? Yes you do, but it's too early for
+ * e820 -AK */
+ crashk_res.start = base;
+ crashk_res.end = base + size - 1;
+ }
+ return 0;
+}
+early_param("crashkernel", setup_crashkernel);
+
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 4e017fb30fb..bbea88801d8 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -182,7 +182,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
goto out2;
memset(&m, 0, sizeof(struct mce));
- m.cpu = safe_smp_processor_id();
+ m.cpu = smp_processor_id();
rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
if (!(m.mcgstatus & MCG_STATUS_RIPV))
kill_it = 1;
@@ -274,6 +274,33 @@ void do_machine_check(struct pt_regs * regs, long error_code)
atomic_dec(&mce_entry);
}
+#ifdef CONFIG_X86_MCE_INTEL
+/***
+ * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
+ * @cpu: The CPU on which the event occured.
+ * @status: Event status information
+ *
+ * This function should be called by the thermal interrupt after the
+ * event has been processed and the decision was made to log the event
+ * further.
+ *
+ * The status parameter will be saved to the 'status' field of 'struct mce'
+ * and historically has been the register value of the
+ * MSR_IA32_THERMAL_STATUS (Intel) msr.
+ */
+void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+{
+ struct mce m;
+
+ memset(&m, 0, sizeof(m));
+ m.cpu = cpu;
+ m.bank = MCE_THERMAL_BANK;
+ m.status = status;
+ rdtscll(m.tsc);
+ mce_log(&m);
+}
+#endif /* CONFIG_X86_MCE_INTEL */
+
/*
* Periodic polling timer for "silent" machine check errors.
*/
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
index 8f533d2c40c..6551505d8a2 100644
--- a/arch/x86_64/kernel/mce_intel.c
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -11,36 +11,21 @@
#include <asm/mce.h>
#include <asm/hw_irq.h>
#include <asm/idle.h>
-
-static DEFINE_PER_CPU(unsigned long, next_check);
+#include <asm/therm_throt.h>
asmlinkage void smp_thermal_interrupt(void)
{
- struct mce m;
+ __u64 msr_val;
ack_APIC_irq();
exit_idle();
irq_enter();
- if (time_before(jiffies, __get_cpu_var(next_check)))
- goto done;
-
- __get_cpu_var(next_check) = jiffies + HZ*300;
- memset(&m, 0, sizeof(m));
- m.cpu = smp_processor_id();
- m.bank = MCE_THERMAL_BANK;
- rdtscll(m.tsc);
- rdmsrl(MSR_IA32_THERM_STATUS, m.status);
- if (m.status & 0x1) {
- printk(KERN_EMERG
- "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu);
- add_taint(TAINT_MACHINE_CHECK);
- } else {
- printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu);
- }
- mce_log(&m);
-done:
+ rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
+ if (therm_throt_process(msr_val & 1))
+ mce_log_therm_throt_event(smp_processor_id(), msr_val);
+
irq_exit();
}
@@ -92,6 +77,9 @@ static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
cpu, tm2 ? "TM2" : "TM1");
+
+ /* enable thermal throttle processing */
+ atomic_set(&therm_throt_en, 1);
return;
}
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index a1ab4197f8a..b8d53dfa993 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -41,8 +41,7 @@ int acpi_found_madt;
* Various Linux-internal data structures created from the
* MP-table.
*/
-unsigned char apic_version [MAX_APICS];
-unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
static int mp_current_pci_id = 0;
@@ -56,7 +55,6 @@ struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
int mp_irq_entries;
int nr_ioapics;
-int pic_mode;
unsigned long mp_lapic_addr = 0;
@@ -71,19 +69,6 @@ unsigned disabled_cpus __initdata;
/* Bitmask of physically existing CPUs */
physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
-/* ACPI MADT entry parsing functions */
-#ifdef CONFIG_ACPI
-extern struct acpi_boot_flags acpi_boot;
-#ifdef CONFIG_X86_LOCAL_APIC
-extern int acpi_parse_lapic (acpi_table_entry_header *header);
-extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
-extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
-#endif /*CONFIG_X86_LOCAL_APIC*/
-#ifdef CONFIG_X86_IO_APIC
-extern int acpi_parse_ioapic (acpi_table_entry_header *header);
-#endif /*CONFIG_X86_IO_APIC*/
-#endif /*CONFIG_ACPI*/
-
u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
@@ -108,24 +93,20 @@ static int __init mpf_checksum(unsigned char *mp, int len)
static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
{
int cpu;
- unsigned char ver;
cpumask_t tmp_map;
+ char *bootup_cpu = "";
if (!(m->mpc_cpuflag & CPU_ENABLED)) {
disabled_cpus++;
return;
}
-
- printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
- m->mpc_apicid,
- (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
- (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
- m->mpc_apicver);
-
if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
- Dprintk(" Bootup CPU\n");
+ bootup_cpu = " (Bootup-CPU)";
boot_cpu_id = m->mpc_apicid;
}
+
+ printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
+
if (num_processors >= NR_CPUS) {
printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
" Processor ignored.\n", NR_CPUS);
@@ -136,24 +117,7 @@ static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
cpus_complement(tmp_map, cpu_present_map);
cpu = first_cpu(tmp_map);
-#if MAX_APICS < 255
- if ((int)m->mpc_apicid > MAX_APICS) {
- printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
- m->mpc_apicid, MAX_APICS);
- return;
- }
-#endif
- ver = m->mpc_apicver;
-
physid_set(m->mpc_apicid, phys_cpu_present_map);
- /*
- * Validate version
- */
- if (ver == 0x0) {
- printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
- ver = 0x10;
- }
- apic_version[m->mpc_apicid] = ver;
if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
/*
* bios_cpu_apicid is required to have processors listed
@@ -178,37 +142,42 @@ static void __init MP_bus_info (struct mpc_config_bus *m)
Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
if (strncmp(str, "ISA", 3) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
- } else if (strncmp(str, "EISA", 4) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
+ set_bit(m->mpc_busid, mp_bus_not_pci);
} else if (strncmp(str, "PCI", 3) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
+ clear_bit(m->mpc_busid, mp_bus_not_pci);
mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
mp_current_pci_id++;
- } else if (strncmp(str, "MCA", 3) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
} else {
printk(KERN_ERR "Unknown bustype %s\n", str);
}
}
+static int bad_ioapic(unsigned long address)
+{
+ if (nr_ioapics >= MAX_IO_APICS) {
+ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+ }
+ if (!address) {
+ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+ " found in table, skipping!\n");
+ return 1;
+ }
+ return 0;
+}
+
static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
{
if (!(m->mpc_flags & MPC_APIC_USABLE))
return;
- printk("I/O APIC #%d Version %d at 0x%X.\n",
- m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
- MAX_IO_APICS, nr_ioapics);
- panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
- }
- if (!m->mpc_apicaddr) {
- printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
- " found in MP table, skipping!\n");
+ printk("I/O APIC #%d at 0x%X.\n",
+ m->mpc_apicid, m->mpc_apicaddr);
+
+ if (bad_ioapic(m->mpc_apicaddr))
return;
- }
+
mp_ioapics[nr_ioapics] = *m;
nr_ioapics++;
}
@@ -232,19 +201,6 @@ static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
- /*
- * Well it seems all SMP boards in existence
- * use ExtINT/LVT1 == LINT0 and
- * NMI/LVT2 == LINT1 - the following check
- * will show us if this assumptions is false.
- * Until then we do not have to add baggage.
- */
- if ((m->mpc_irqtype == mp_ExtINT) &&
- (m->mpc_destapiclint != 0))
- BUG();
- if ((m->mpc_irqtype == mp_NMI) &&
- (m->mpc_destapiclint != 1))
- BUG();
}
/*
@@ -258,7 +214,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
unsigned char *mpt=((unsigned char *)mpc)+count;
if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
- printk("SMP mptable: bad signature [%c%c%c%c]!\n",
+ printk("MPTABLE: bad signature [%c%c%c%c]!\n",
mpc->mpc_signature[0],
mpc->mpc_signature[1],
mpc->mpc_signature[2],
@@ -266,31 +222,31 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
return 0;
}
if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
- printk("SMP mptable: checksum error!\n");
+ printk("MPTABLE: checksum error!\n");
return 0;
}
if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
- printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
+ printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
mpc->mpc_spec);
return 0;
}
if (!mpc->mpc_lapic) {
- printk(KERN_ERR "SMP mptable: null local APIC address!\n");
+ printk(KERN_ERR "MPTABLE: null local APIC address!\n");
return 0;
}
memcpy(str,mpc->mpc_oem,8);
- str[8]=0;
- printk(KERN_INFO "OEM ID: %s ",str);
+ str[8] = 0;
+ printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
memcpy(str,mpc->mpc_productid,12);
- str[12]=0;
- printk("Product ID: %s ",str);
+ str[12] = 0;
+ printk("MPTABLE: Product ID: %s ",str);
- printk("APIC at: 0x%X\n",mpc->mpc_lapic);
+ printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
/* save the local APIC address, it might be non-default */
if (!acpi_lapic)
- mp_lapic_addr = mpc->mpc_lapic;
+ mp_lapic_addr = mpc->mpc_lapic;
/*
* Now process the configuration blocks.
@@ -302,7 +258,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
struct mpc_config_processor *m=
(struct mpc_config_processor *)mpt;
if (!acpi_lapic)
- MP_processor_info(m);
+ MP_processor_info(m);
mpt += sizeof(*m);
count += sizeof(*m);
break;
@@ -321,8 +277,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
struct mpc_config_ioapic *m=
(struct mpc_config_ioapic *)mpt;
MP_ioapic_info(m);
- mpt+=sizeof(*m);
- count+=sizeof(*m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
break;
}
case MP_INTSRC:
@@ -331,8 +287,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
(struct mpc_config_intsrc *)mpt;
MP_intsrc_info(m);
- mpt+=sizeof(*m);
- count+=sizeof(*m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
break;
}
case MP_LINTSRC:
@@ -340,15 +296,15 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
struct mpc_config_lintsrc *m=
(struct mpc_config_lintsrc *)mpt;
MP_lintsrc_info(m);
- mpt+=sizeof(*m);
- count+=sizeof(*m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
break;
}
}
}
clustered_apic_check();
if (!num_processors)
- printk(KERN_ERR "SMP mptable: no processors registered!\n");
+ printk(KERN_ERR "MPTABLE: no processors registered!\n");
return num_processors;
}
@@ -444,13 +400,10 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
* 2 CPUs, numbered 0 & 1.
*/
processor.mpc_type = MP_PROCESSOR;
- /* Either an integrated APIC or a discrete 82489DX. */
- processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ processor.mpc_apicver = 0;
processor.mpc_cpuflag = CPU_ENABLED;
- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
- (boot_cpu_data.x86_model << 4) |
- boot_cpu_data.x86_mask;
- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+ processor.mpc_cpufeature = 0;
+ processor.mpc_featureflag = 0;
processor.mpc_reserved[0] = 0;
processor.mpc_reserved[1] = 0;
for (i = 0; i < 2; i++) {
@@ -469,14 +422,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
case 5:
memcpy(bus.mpc_bustype, "ISA ", 6);
break;
- case 2:
- case 6:
- case 3:
- memcpy(bus.mpc_bustype, "EISA ", 6);
- break;
- case 4:
- case 7:
- memcpy(bus.mpc_bustype, "MCA ", 6);
}
MP_bus_info(&bus);
if (mpc_default_type > 4) {
@@ -487,7 +432,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
ioapic.mpc_type = MP_IOAPIC;
ioapic.mpc_apicid = 2;
- ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ ioapic.mpc_apicver = 0;
ioapic.mpc_flags = MPC_APIC_USABLE;
ioapic.mpc_apicaddr = 0xFEC00000;
MP_ioapic_info(&ioapic);
@@ -530,13 +475,6 @@ void __init get_smp_config (void)
printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
- if (mpf->mpf_feature2 & (1<<7)) {
- printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
- pic_mode = 1;
- } else {
- printk(KERN_INFO " Virtual Wire compatibility mode.\n");
- pic_mode = 0;
- }
/*
* Now see if we need to read further.
@@ -616,7 +554,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length)
return 0;
}
-void __init find_intel_smp (void)
+void __init find_smp_config(void)
{
unsigned int address;
@@ -633,9 +571,7 @@ void __init find_intel_smp (void)
smp_scan_config(0xF0000,0x10000))
return;
/*
- * If it is an SMP machine we should know now, unless the
- * configuration is in an EISA/MCA bus machine with an
- * extended bios data area.
+ * If it is an SMP machine we should know now.
*
* there is a real-mode segmented pointer pointing to the
* 4K EBDA area at 0x40E, calculate and scan it here.
@@ -656,69 +592,41 @@ void __init find_intel_smp (void)
printk(KERN_INFO "No mptable found.\n");
}
-/*
- * - Intel MP Configuration Table
- */
-void __init find_smp_config (void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
- find_intel_smp();
-#endif
-}
-
-
/* --------------------------------------------------------------------------
ACPI-based MP Configuration
-------------------------------------------------------------------------- */
#ifdef CONFIG_ACPI
-void __init mp_register_lapic_address (
- u64 address)
+void __init mp_register_lapic_address(u64 address)
{
mp_lapic_addr = (unsigned long) address;
-
set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
-
if (boot_cpu_id == -1U)
boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
-
- Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
}
-
-void __cpuinit mp_register_lapic (
- u8 id,
- u8 enabled)
+void __cpuinit mp_register_lapic (u8 id, u8 enabled)
{
struct mpc_config_processor processor;
int boot_cpu = 0;
- if (id >= MAX_APICS) {
- printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
- id, MAX_APICS);
- return;
- }
-
- if (id == boot_cpu_physical_apicid)
+ if (id == boot_cpu_id)
boot_cpu = 1;
processor.mpc_type = MP_PROCESSOR;
processor.mpc_apicid = id;
- processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
+ processor.mpc_apicver = 0;
processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+ processor.mpc_cpufeature = 0;
+ processor.mpc_featureflag = 0;
processor.mpc_reserved[0] = 0;
processor.mpc_reserved[1] = 0;
MP_processor_info(&processor);
}
-#ifdef CONFIG_X86_IO_APIC
-
#define MP_ISA_BUS 0
#define MP_MAX_IOAPIC_PIN 127
@@ -729,11 +637,9 @@ static struct mp_ioapic_routing {
u32 pin_programmed[4];
} mp_ioapic_routing[MAX_IO_APICS];
-
-static int mp_find_ioapic (
- int gsi)
+static int mp_find_ioapic(int gsi)
{
- int i = 0;
+ int i = 0;
/* Find the IOAPIC that manages this GSI. */
for (i = 0; i < nr_ioapics; i++) {
@@ -743,28 +649,15 @@ static int mp_find_ioapic (
}
printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
-
return -1;
}
-
-void __init mp_register_ioapic (
- u8 id,
- u32 address,
- u32 gsi_base)
+void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
{
- int idx = 0;
+ int idx = 0;
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
- }
- if (!address) {
- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
- " found in MADT table, skipping!\n");
+ if (bad_ioapic(address))
return;
- }
idx = nr_ioapics++;
@@ -774,7 +667,7 @@ void __init mp_register_ioapic (
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
mp_ioapics[idx].mpc_apicid = id;
- mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
+ mp_ioapics[idx].mpc_apicver = 0;
/*
* Build basic IRQ lookup table to facilitate gsi->io_apic lookups
@@ -785,21 +678,15 @@ void __init mp_register_ioapic (
mp_ioapic_routing[idx].gsi_end = gsi_base +
io_apic_get_redir_entries(idx);
- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
"GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
- mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
+ mp_ioapics[idx].mpc_apicaddr,
mp_ioapic_routing[idx].gsi_start,
mp_ioapic_routing[idx].gsi_end);
-
- return;
}
-
-void __init mp_override_legacy_irq (
- u8 bus_irq,
- u8 polarity,
- u8 trigger,
- u32 gsi)
+void __init
+mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
{
struct mpc_config_intsrc intsrc;
int ioapic = -1;
@@ -837,22 +724,18 @@ void __init mp_override_legacy_irq (
mp_irqs[mp_irq_entries] = intsrc;
if (++mp_irq_entries == MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!\n");
-
- return;
}
-
-void __init mp_config_acpi_legacy_irqs (void)
+void __init mp_config_acpi_legacy_irqs(void)
{
struct mpc_config_intsrc intsrc;
- int i = 0;
- int ioapic = -1;
+ int i = 0;
+ int ioapic = -1;
/*
* Fabricate the legacy ISA bus (bus #31).
*/
- mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
- Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+ set_bit(MP_ISA_BUS, mp_bus_not_pci);
/*
* Locate the IOAPIC that manages the ISA IRQs (0-15).
@@ -905,24 +788,22 @@ void __init mp_config_acpi_legacy_irqs (void)
if (++mp_irq_entries == MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!\n");
}
-
- return;
}
#define MAX_GSI_NUM 4096
int mp_register_gsi(u32 gsi, int triggering, int polarity)
{
- int ioapic = -1;
- int ioapic_pin = 0;
- int idx, bit = 0;
- static int pci_irq = 16;
+ int ioapic = -1;
+ int ioapic_pin = 0;
+ int idx, bit = 0;
+ static int pci_irq = 16;
/*
* Mapping between Global System Interrupts, which
* represent all possible interrupts, to the IRQs
* assigned to actual devices.
*/
- static int gsi_to_irq[MAX_GSI_NUM];
+ static int gsi_to_irq[MAX_GSI_NUM];
if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
return gsi;
@@ -996,6 +877,4 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
return gsi;
}
-
-#endif /*CONFIG_X86_IO_APIC*/
#endif /*CONFIG_ACPI*/
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 5baa0c726e9..7af9cb3e2d9 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -28,71 +28,142 @@
#include <asm/mce.h>
#include <asm/intel_arch_perfmon.h>
-/*
- * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
- * - it may be reserved by some other driver, or not
- * - when not reserved by some other driver, it may be used for
- * the NMI watchdog, or not
- *
- * This is maintained separately from nmi_active because the NMI
- * watchdog may also be driven from the I/O APIC timer.
+int unknown_nmi_panic;
+int nmi_watchdog_enabled;
+int panic_on_unrecovered_nmi;
+
+/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
+ * evtsel_nmi_owner tracks the ownership of the event selection
+ * - different performance counters/ event selection may be reserved for
+ * different subsystems this reservation system just tries to coordinate
+ * things a little
*/
-static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
-static unsigned int lapic_nmi_owner;
-#define LAPIC_NMI_WATCHDOG (1<<0)
-#define LAPIC_NMI_RESERVED (1<<1)
+static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner);
+static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]);
+
+/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
+ * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
+ */
+#define NMI_MAX_COUNTER_BITS 66
/* nmi_active:
- * +1: the lapic NMI watchdog is active, but can be disabled
- * 0: the lapic NMI watchdog has not been set up, and cannot
+ * >0: the lapic NMI watchdog is active, but can be disabled
+ * <0: the lapic NMI watchdog has not been set up, and cannot
* be enabled
- * -1: the lapic NMI watchdog is disabled, but can be enabled
+ * 0: the lapic NMI watchdog is disabled, but can be enabled
*/
-int nmi_active; /* oprofile uses this */
+atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
int panic_on_timeout;
unsigned int nmi_watchdog = NMI_DEFAULT;
static unsigned int nmi_hz = HZ;
-static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
-static unsigned int nmi_p4_cccr_val;
-/* Note that these events don't tick when the CPU idles. This means
- the frequency varies with CPU load. */
+struct nmi_watchdog_ctlblk {
+ int enabled;
+ u64 check_bit;
+ unsigned int cccr_msr;
+ unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
+ unsigned int evntsel_msr; /* the MSR to select the events to handle */
+};
+static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
-#define K7_EVNTSEL_ENABLE (1 << 22)
-#define K7_EVNTSEL_INT (1 << 20)
-#define K7_EVNTSEL_OS (1 << 17)
-#define K7_EVNTSEL_USR (1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
-#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+/* local prototypes */
+static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
-#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
-#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
+{
+ /* returns the bit offset of the performance counter register */
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ return (msr - MSR_K7_PERFCTR0);
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return (msr - MSR_ARCH_PERFMON_PERFCTR0);
+ else
+ return (msr - MSR_P4_BPU_PERFCTR0);
+ }
+ return 0;
+}
-#define MSR_P4_MISC_ENABLE 0x1A0
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
-#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
-#define MSR_P4_PERFCTR0 0x300
-#define MSR_P4_CCCR0 0x360
-#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
-#define P4_ESCR_OS (1<<3)
-#define P4_ESCR_USR (1<<2)
-#define P4_CCCR_OVF_PMI0 (1<<26)
-#define P4_CCCR_OVF_PMI1 (1<<27)
-#define P4_CCCR_THRESHOLD(N) ((N)<<20)
-#define P4_CCCR_COMPLEMENT (1<<19)
-#define P4_CCCR_COMPARE (1<<18)
-#define P4_CCCR_REQUIRED (3<<16)
-#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
-#define P4_CCCR_ENABLE (1<<12)
-/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
- CRU_ESCR0 (with any non-null event selector) through a complemented
- max threshold. [IA32-Vol3, Section 14.9.9] */
-#define MSR_P4_IQ_COUNTER0 0x30C
-#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
-#define P4_NMI_IQ_CCCR0 \
- (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
- P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
+{
+ /* returns the bit offset of the event selection register */
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ return (msr - MSR_K7_EVNTSEL0);
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
+ else
+ return (msr - MSR_P4_BSU_ESCR0);
+ }
+ return 0;
+}
+
+/* checks for a bit availability (hack for oprofile) */
+int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
+{
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
+}
+
+/* checks the an msr for availability */
+int avail_to_resrv_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
+}
+
+int reserve_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
+ return 1;
+ return 0;
+}
+
+void release_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
+}
+
+int reserve_evntsel_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_evntsel_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)))
+ return 1;
+ return 0;
+}
+
+void release_evntsel_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_evntsel_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner));
+}
static __cpuinit inline int nmi_known_cpu(void)
{
@@ -109,7 +180,7 @@ static __cpuinit inline int nmi_known_cpu(void)
}
/* Run after command line and cpu_init init, but before all other checks */
-void __cpuinit nmi_watchdog_default(void)
+void nmi_watchdog_default(void)
{
if (nmi_watchdog != NMI_DEFAULT)
return;
@@ -145,6 +216,12 @@ int __init check_nmi_watchdog (void)
int *counts;
int cpu;
+ if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
+ return 0;
+
+ if (!atomic_read(&nmi_active))
+ return 0;
+
counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
if (!counts)
return -1;
@@ -162,26 +239,43 @@ int __init check_nmi_watchdog (void)
mdelay((10*1000)/nmi_hz); // wait 10 ticks
for_each_online_cpu(cpu) {
+ if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
+ continue;
if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
- endflag = 1;
printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
cpu,
counts[cpu],
cpu_pda(cpu)->__nmi_count);
- nmi_active = 0;
- lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
- nmi_perfctr_msr = 0;
- kfree(counts);
- return -1;
+ per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
+ atomic_dec(&nmi_active);
}
}
+ if (!atomic_read(&nmi_active)) {
+ kfree(counts);
+ atomic_set(&nmi_active, -1);
+ return -1;
+ }
endflag = 1;
printk("OK.\n");
/* now that we know it works we can reduce NMI frequency to
something more reasonable; makes a difference in some configs */
- if (nmi_watchdog == NMI_LOCAL_APIC)
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
nmi_hz = 1;
+ /*
+ * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
+ * are writable, with higher bits sign extending from bit 31.
+ * So, we can only program the counter with 31 bit values and
+ * 32nd bit should be 1, for 33.. to be 1.
+ * Find the appropriate nmi_hz
+ */
+ if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
+ ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
+ nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1;
+ }
+ }
kfree(counts);
return 0;
@@ -201,91 +295,65 @@ int __init setup_nmi_watchdog(char *str)
get_option(&str, &nmi);
- if (nmi >= NMI_INVALID)
+ if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
return 0;
+
+ if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0))
+ return 0; /* no lapic support */
nmi_watchdog = nmi;
return 1;
}
__setup("nmi_watchdog=", setup_nmi_watchdog);
-static void disable_intel_arch_watchdog(void);
-
static void disable_lapic_nmi_watchdog(void)
{
- if (nmi_active <= 0)
+ BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
+
+ if (atomic_read(&nmi_active) <= 0)
return;
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- wrmsr(MSR_K7_EVNTSEL0, 0, 0);
- break;
- case X86_VENDOR_INTEL:
- if (boot_cpu_data.x86 == 15) {
- wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
- wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
- } else if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- disable_intel_arch_watchdog();
- }
- break;
- }
- nmi_active = -1;
- /* tell do_nmi() and others that we're not active any more */
- nmi_watchdog = 0;
-}
-static void enable_lapic_nmi_watchdog(void)
-{
- if (nmi_active < 0) {
- nmi_watchdog = NMI_LOCAL_APIC;
- touch_nmi_watchdog();
- setup_apic_nmi_watchdog();
- }
+ on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
+
+ BUG_ON(atomic_read(&nmi_active) != 0);
}
-int reserve_lapic_nmi(void)
+static void enable_lapic_nmi_watchdog(void)
{
- unsigned int old_owner;
+ BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
- spin_lock(&lapic_nmi_owner_lock);
- old_owner = lapic_nmi_owner;
- lapic_nmi_owner |= LAPIC_NMI_RESERVED;
- spin_unlock(&lapic_nmi_owner_lock);
- if (old_owner & LAPIC_NMI_RESERVED)
- return -EBUSY;
- if (old_owner & LAPIC_NMI_WATCHDOG)
- disable_lapic_nmi_watchdog();
- return 0;
-}
+ /* are we already enabled */
+ if (atomic_read(&nmi_active) != 0)
+ return;
-void release_lapic_nmi(void)
-{
- unsigned int new_owner;
+ /* are we lapic aware */
+ if (nmi_known_cpu() <= 0)
+ return;
- spin_lock(&lapic_nmi_owner_lock);
- new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
- lapic_nmi_owner = new_owner;
- spin_unlock(&lapic_nmi_owner_lock);
- if (new_owner & LAPIC_NMI_WATCHDOG)
- enable_lapic_nmi_watchdog();
+ on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
+ touch_nmi_watchdog();
}
void disable_timer_nmi_watchdog(void)
{
- if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0))
+ BUG_ON(nmi_watchdog != NMI_IO_APIC);
+
+ if (atomic_read(&nmi_active) <= 0)
return;
disable_irq(0);
- unset_nmi_callback();
- nmi_active = -1;
- nmi_watchdog = NMI_NONE;
+ on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
+
+ BUG_ON(atomic_read(&nmi_active) != 0);
}
void enable_timer_nmi_watchdog(void)
{
- if (nmi_active < 0) {
- nmi_watchdog = NMI_IO_APIC;
+ BUG_ON(nmi_watchdog != NMI_IO_APIC);
+
+ if (atomic_read(&nmi_active) == 0) {
touch_nmi_watchdog();
- nmi_active = 1;
+ on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
enable_irq(0);
}
}
@@ -296,15 +364,20 @@ static int nmi_pm_active; /* nmi_active before suspend */
static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
{
- nmi_pm_active = nmi_active;
- disable_lapic_nmi_watchdog();
+ /* only CPU0 goes here, other CPUs should be offline */
+ nmi_pm_active = atomic_read(&nmi_active);
+ stop_apic_nmi_watchdog(NULL);
+ BUG_ON(atomic_read(&nmi_active) != 0);
return 0;
}
static int lapic_nmi_resume(struct sys_device *dev)
{
- if (nmi_pm_active > 0)
- enable_lapic_nmi_watchdog();
+ /* only CPU0 goes here, other CPUs should be offline */
+ if (nmi_pm_active > 0) {
+ setup_apic_nmi_watchdog(NULL);
+ touch_nmi_watchdog();
+ }
return 0;
}
@@ -323,7 +396,13 @@ static int __init init_lapic_nmi_sysfs(void)
{
int error;
- if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC)
+ /* should really be a BUG_ON but b/c this is an
+ * init call, it just doesn't work. -dcz
+ */
+ if (nmi_watchdog != NMI_LOCAL_APIC)
+ return 0;
+
+ if ( atomic_read(&nmi_active) < 0 )
return 0;
error = sysdev_class_register(&nmi_sysclass);
@@ -341,74 +420,209 @@ late_initcall(init_lapic_nmi_sysfs);
* Original code written by Keith Owens.
*/
-static void clear_msr_range(unsigned int base, unsigned int n)
-{
- unsigned int i;
+/* Note that these events don't tick when the CPU idles. This means
+ the frequency varies with CPU load. */
- for(i = 0; i < n; ++i)
- wrmsr(base+i, 0, 0);
-}
+#define K7_EVNTSEL_ENABLE (1 << 22)
+#define K7_EVNTSEL_INT (1 << 20)
+#define K7_EVNTSEL_OS (1 << 17)
+#define K7_EVNTSEL_USR (1 << 16)
+#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
+#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
-static void setup_k7_watchdog(void)
+static int setup_k7_watchdog(void)
{
- int i;
+ unsigned int perfctr_msr, evntsel_msr;
unsigned int evntsel;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- nmi_perfctr_msr = MSR_K7_PERFCTR0;
+ perfctr_msr = MSR_K7_PERFCTR0;
+ evntsel_msr = MSR_K7_EVNTSEL0;
+ if (!reserve_perfctr_nmi(perfctr_msr))
+ goto fail;
- for(i = 0; i < 4; ++i) {
- /* Simulator may not support it */
- if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) {
- nmi_perfctr_msr = 0;
- return;
- }
- wrmsrl(MSR_K7_PERFCTR0+i, 0UL);
- }
+ if (!reserve_evntsel_nmi(evntsel_msr))
+ goto fail1;
+
+ /* Simulator may not support it */
+ if (checking_wrmsrl(evntsel_msr, 0UL))
+ goto fail2;
+ wrmsrl(perfctr_msr, 0UL);
evntsel = K7_EVNTSEL_INT
| K7_EVNTSEL_OS
| K7_EVNTSEL_USR
| K7_NMI_EVENT;
- wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
- wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz));
+ /* setup the timer */
+ wrmsr(evntsel_msr, evntsel, 0);
+ wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= K7_EVNTSEL_ENABLE;
- wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+ wrmsr(evntsel_msr, evntsel, 0);
+
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; //unused
+ wd->check_bit = 1ULL<<63;
+ return 1;
+fail2:
+ release_evntsel_nmi(evntsel_msr);
+fail1:
+ release_perfctr_nmi(perfctr_msr);
+fail:
+ return 0;
}
-static void disable_intel_arch_watchdog(void)
+static void stop_k7_watchdog(void)
{
- unsigned ebx;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- /*
- * Check whether the Architectural PerfMon supports
- * Unhalted Core Cycles Event or not.
- * NOTE: Corresponding bit = 0 in ebp indicates event present.
+ wrmsr(wd->evntsel_msr, 0, 0);
+
+ release_evntsel_nmi(wd->evntsel_msr);
+ release_perfctr_nmi(wd->perfctr_msr);
+}
+
+/* Note that these events don't tick when the CPU idles. This means
+ the frequency varies with CPU load. */
+
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
+#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
+#define P4_ESCR_OS (1<<3)
+#define P4_ESCR_USR (1<<2)
+#define P4_CCCR_OVF_PMI0 (1<<26)
+#define P4_CCCR_OVF_PMI1 (1<<27)
+#define P4_CCCR_THRESHOLD(N) ((N)<<20)
+#define P4_CCCR_COMPLEMENT (1<<19)
+#define P4_CCCR_COMPARE (1<<18)
+#define P4_CCCR_REQUIRED (3<<16)
+#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
+#define P4_CCCR_ENABLE (1<<12)
+#define P4_CCCR_OVF (1<<31)
+/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+ CRU_ESCR0 (with any non-null event selector) through a complemented
+ max threshold. [IA32-Vol3, Section 14.9.9] */
+
+static int setup_p4_watchdog(void)
+{
+ unsigned int perfctr_msr, evntsel_msr, cccr_msr;
+ unsigned int evntsel, cccr_val;
+ unsigned int misc_enable, dummy;
+ unsigned int ht_num;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
+ if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
+ return 0;
+
+#ifdef CONFIG_SMP
+ /* detect which hyperthread we are on */
+ if (smp_num_siblings == 2) {
+ unsigned int ebx, apicid;
+
+ ebx = cpuid_ebx(1);
+ apicid = (ebx >> 24) & 0xff;
+ ht_num = apicid & 1;
+ } else
+#endif
+ ht_num = 0;
+
+ /* performance counters are shared resources
+ * assign each hyperthread its own set
+ * (re-use the ESCR0 register, seems safe
+ * and keeps the cccr_val the same)
*/
- ebx = cpuid_ebx(10);
- if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
- wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0);
+ if (!ht_num) {
+ /* logical cpu 0 */
+ perfctr_msr = MSR_P4_IQ_PERFCTR0;
+ evntsel_msr = MSR_P4_CRU_ESCR0;
+ cccr_msr = MSR_P4_IQ_CCCR0;
+ cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
+ } else {
+ /* logical cpu 1 */
+ perfctr_msr = MSR_P4_IQ_PERFCTR1;
+ evntsel_msr = MSR_P4_CRU_ESCR0;
+ cccr_msr = MSR_P4_IQ_CCCR1;
+ cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
+ }
+
+ if (!reserve_perfctr_nmi(perfctr_msr))
+ goto fail;
+
+ if (!reserve_evntsel_nmi(evntsel_msr))
+ goto fail1;
+
+ evntsel = P4_ESCR_EVENT_SELECT(0x3F)
+ | P4_ESCR_OS
+ | P4_ESCR_USR;
+
+ cccr_val |= P4_CCCR_THRESHOLD(15)
+ | P4_CCCR_COMPLEMENT
+ | P4_CCCR_COMPARE
+ | P4_CCCR_REQUIRED;
+
+ wrmsr(evntsel_msr, evntsel, 0);
+ wrmsr(cccr_msr, cccr_val, 0);
+ wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ cccr_val |= P4_CCCR_ENABLE;
+ wrmsr(cccr_msr, cccr_val, 0);
+
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = cccr_msr;
+ wd->check_bit = 1ULL<<39;
+ return 1;
+fail1:
+ release_perfctr_nmi(perfctr_msr);
+fail:
+ return 0;
+}
+
+static void stop_p4_watchdog(void)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ wrmsr(wd->cccr_msr, 0, 0);
+ wrmsr(wd->evntsel_msr, 0, 0);
+
+ release_evntsel_nmi(wd->evntsel_msr);
+ release_perfctr_nmi(wd->perfctr_msr);
}
+#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
+#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+
static int setup_intel_arch_watchdog(void)
{
+ unsigned int ebx;
+ union cpuid10_eax eax;
+ unsigned int unused;
+ unsigned int perfctr_msr, evntsel_msr;
unsigned int evntsel;
- unsigned ebx;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
/*
* Check whether the Architectural PerfMon supports
* Unhalted Core Cycles Event or not.
- * NOTE: Corresponding bit = 0 in ebp indicates event present.
+ * NOTE: Corresponding bit = 0 in ebx indicates event present.
*/
- ebx = cpuid_ebx(10);
- if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
- return 0;
+ cpuid(10, &(eax.full), &ebx, &unused, &unused);
+ if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
+ (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+ goto fail;
+
+ perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
+ evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
- nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
+ if (!reserve_perfctr_nmi(perfctr_msr))
+ goto fail;
- clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2);
- clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2);
+ if (!reserve_evntsel_nmi(evntsel_msr))
+ goto fail1;
+
+ wrmsrl(perfctr_msr, 0UL);
evntsel = ARCH_PERFMON_EVENTSEL_INT
| ARCH_PERFMON_EVENTSEL_OS
@@ -416,84 +630,122 @@ static int setup_intel_arch_watchdog(void)
| ARCH_PERFMON_NMI_EVENT_SEL
| ARCH_PERFMON_NMI_EVENT_UMASK;
- wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
- wrmsrl(MSR_ARCH_PERFMON_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz));
+ /* setup the timer */
+ wrmsr(evntsel_msr, evntsel, 0);
+ wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
+
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
+ wrmsr(evntsel_msr, evntsel, 0);
+
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; //unused
+ wd->check_bit = 1ULL << (eax.split.bit_width - 1);
return 1;
+fail1:
+ release_perfctr_nmi(perfctr_msr);
+fail:
+ return 0;
}
-
-static int setup_p4_watchdog(void)
+static void stop_intel_arch_watchdog(void)
{
- unsigned int misc_enable, dummy;
+ unsigned int ebx;
+ union cpuid10_eax eax;
+ unsigned int unused;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
- if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
- return 0;
+ /*
+ * Check whether the Architectural PerfMon supports
+ * Unhalted Core Cycles Event or not.
+ * NOTE: Corresponding bit = 0 in ebx indicates event present.
+ */
+ cpuid(10, &(eax.full), &ebx, &unused, &unused);
+ if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
+ (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+ return;
- nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
- nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
-#ifdef CONFIG_SMP
- if (smp_num_siblings == 2)
- nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
-#endif
+ wrmsr(wd->evntsel_msr, 0, 0);
- if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
- clear_msr_range(0x3F1, 2);
- /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
- docs doesn't fully define it, so leave it alone for now. */
- if (boot_cpu_data.x86_model >= 0x3) {
- /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
- clear_msr_range(0x3A0, 26);
- clear_msr_range(0x3BC, 3);
- } else {
- clear_msr_range(0x3A0, 31);
- }
- clear_msr_range(0x3C0, 6);
- clear_msr_range(0x3C8, 6);
- clear_msr_range(0x3E0, 2);
- clear_msr_range(MSR_P4_CCCR0, 18);
- clear_msr_range(MSR_P4_PERFCTR0, 18);
-
- wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
- wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
- Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz * 1000UL / nmi_hz));
- wrmsrl(MSR_P4_IQ_COUNTER0, -((u64)cpu_khz * 1000 / nmi_hz));
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
- return 1;
+ release_evntsel_nmi(wd->evntsel_msr);
+ release_perfctr_nmi(wd->perfctr_msr);
}
-void setup_apic_nmi_watchdog(void)
+void setup_apic_nmi_watchdog(void *unused)
{
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 != 15)
- return;
- if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
- return;
- setup_k7_watchdog();
- break;
- case X86_VENDOR_INTEL:
- if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- if (!setup_intel_arch_watchdog())
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ /* only support LOCAL and IO APICs for now */
+ if ((nmi_watchdog != NMI_LOCAL_APIC) &&
+ (nmi_watchdog != NMI_IO_APIC))
+ return;
+
+ if (wd->enabled == 1)
+ return;
+
+ /* cheap hack to support suspend/resume */
+ /* if cpu0 is not active neither should the other cpus */
+ if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
+ return;
+
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
return;
- } else if (boot_cpu_data.x86 == 15) {
+ if (!setup_k7_watchdog())
+ return;
+ break;
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ if (!setup_intel_arch_watchdog())
+ return;
+ break;
+ }
if (!setup_p4_watchdog())
return;
- } else {
+ break;
+ default:
return;
}
+ }
+ wd->enabled = 1;
+ atomic_inc(&nmi_active);
+}
+
+void stop_apic_nmi_watchdog(void *unused)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- break;
+ /* only support LOCAL and IO APICs for now */
+ if ((nmi_watchdog != NMI_LOCAL_APIC) &&
+ (nmi_watchdog != NMI_IO_APIC))
+ return;
- default:
+ if (wd->enabled == 0)
return;
+
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
+ return;
+ stop_k7_watchdog();
+ break;
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ stop_intel_arch_watchdog();
+ break;
+ }
+ stop_p4_watchdog();
+ break;
+ default:
+ return;
+ }
}
- lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
- nmi_active = 1;
+ wd->enabled = 0;
+ atomic_dec(&nmi_active);
}
/*
@@ -526,93 +778,109 @@ void touch_nmi_watchdog (void)
touch_softlockup_watchdog();
}
-void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
+int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
{
int sum;
int touched = 0;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+ u64 dummy;
+ int rc=0;
+
+ /* check for other users first */
+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+ == NOTIFY_STOP) {
+ rc = 1;
+ touched = 1;
+ }
sum = read_pda(apic_timer_irqs);
if (__get_cpu_var(nmi_touch)) {
__get_cpu_var(nmi_touch) = 0;
touched = 1;
}
+
#ifdef CONFIG_X86_MCE
/* Could check oops_in_progress here too, but it's safer
not too */
if (atomic_read(&mce_entry) > 0)
touched = 1;
#endif
+ /* if the apic timer isn't firing, this cpu isn't doing much */
if (!touched && __get_cpu_var(last_irq_sum) == sum) {
/*
* Ayiee, looks like this CPU is stuck ...
* wait a few IRQs (5 seconds) before doing the oops ...
*/
local_inc(&__get_cpu_var(alert_counter));
- if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) {
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP) {
- local_set(&__get_cpu_var(alert_counter), 0);
- return;
- }
- die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs);
- }
+ if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
+ die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs,
+ panic_on_timeout);
} else {
__get_cpu_var(last_irq_sum) = sum;
local_set(&__get_cpu_var(alert_counter), 0);
}
- if (nmi_perfctr_msr) {
- if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
- /*
- * P4 quirks:
- * - An overflown perfctr will assert its interrupt
- * until the OVF flag in its CCCR is cleared.
- * - LVTPC is masked on interrupt and must be
- * unmasked by the LVTPC handler.
- */
- wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- } else if (nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
- /*
- * For Intel based architectural perfmon
- * - LVTPC is masked on interrupt and must be
- * unmasked by the LVTPC handler.
+
+ /* see if the nmi watchdog went off */
+ if (wd->enabled) {
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ rdmsrl(wd->perfctr_msr, dummy);
+ if (dummy & wd->check_bit){
+ /* this wasn't a watchdog timer interrupt */
+ goto done;
+ }
+
+ /* only Intel uses the cccr msr */
+ if (wd->cccr_msr != 0) {
+ /*
+ * P4 quirks:
+ * - An overflown perfctr will assert its interrupt
+ * until the OVF flag in its CCCR is cleared.
+ * - LVTPC is masked on interrupt and must be
+ * unmasked by the LVTPC handler.
+ */
+ rdmsrl(wd->cccr_msr, dummy);
+ dummy &= ~P4_CCCR_OVF;
+ wrmsrl(wd->cccr_msr, dummy);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
+ /*
+ * ArchPerfom/Core Duo needs to re-unmask
+ * the apic vector
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ }
+ /* start the cycle over again */
+ wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
+ rc = 1;
+ } else if (nmi_watchdog == NMI_IO_APIC) {
+ /* don't know how to accurately check for this.
+ * just assume it was a watchdog timer interrupt
+ * This matches the old behaviour.
*/
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- }
- wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
+ rc = 1;
+ } else
+ printk(KERN_WARNING "Unknown enabled NMI hardware?!\n");
}
+done:
+ return rc;
}
-static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu)
-{
- return 0;
-}
-
-static nmi_callback_t nmi_callback = dummy_nmi_callback;
-
asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
{
- int cpu = safe_smp_processor_id();
-
nmi_enter();
add_pda(__nmi_count,1);
- if (!rcu_dereference(nmi_callback)(regs, cpu))
- default_do_nmi(regs);
+ default_do_nmi(regs);
nmi_exit();
}
-void set_nmi_callback(nmi_callback_t callback)
+int do_nmi_callback(struct pt_regs * regs, int cpu)
{
- vmalloc_sync_all();
- rcu_assign_pointer(nmi_callback, callback);
-}
-EXPORT_SYMBOL_GPL(set_nmi_callback);
-
-void unset_nmi_callback(void)
-{
- nmi_callback = dummy_nmi_callback;
+#ifdef CONFIG_SYSCTL
+ if (unknown_nmi_panic)
+ return unknown_nmi_panic_callback(regs, cpu);
+#endif
+ return 0;
}
-EXPORT_SYMBOL_GPL(unset_nmi_callback);
#ifdef CONFIG_SYSCTL
@@ -621,36 +889,42 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
unsigned char reason = get_nmi_reason();
char buf[64];
- if (!(reason & 0xc0)) {
- sprintf(buf, "NMI received for unknown reason %02x\n", reason);
- die_nmi(buf,regs);
- }
+ sprintf(buf, "NMI received for unknown reason %02x\n", reason);
+ die_nmi(buf, regs, 1); /* Always panic here */
return 0;
}
/*
- * proc handler for /proc/sys/kernel/unknown_nmi_panic
+ * proc handler for /proc/sys/kernel/nmi
*/
-int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file,
+int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
void __user *buffer, size_t *length, loff_t *ppos)
{
int old_state;
- old_state = unknown_nmi_panic;
+ nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
+ old_state = nmi_watchdog_enabled;
proc_dointvec(table, write, file, buffer, length, ppos);
- if (!!old_state == !!unknown_nmi_panic)
+ if (!!old_state == !!nmi_watchdog_enabled)
return 0;
- if (unknown_nmi_panic) {
- if (reserve_lapic_nmi() < 0) {
- unknown_nmi_panic = 0;
- return -EBUSY;
- } else {
- set_nmi_callback(unknown_nmi_panic_callback);
- }
+ if (atomic_read(&nmi_active) < 0) {
+ printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
+ return -EIO;
+ }
+
+ /* if nmi_watchdog is not set yet, then set it */
+ nmi_watchdog_default();
+
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ if (nmi_watchdog_enabled)
+ enable_lapic_nmi_watchdog();
+ else
+ disable_lapic_nmi_watchdog();
} else {
- release_lapic_nmi();
- unset_nmi_callback();
+ printk( KERN_WARNING
+ "NMI watchdog doesn't know what hardware to touch\n");
+ return -EIO;
}
return 0;
}
@@ -659,8 +933,12 @@ int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file
EXPORT_SYMBOL(nmi_active);
EXPORT_SYMBOL(nmi_watchdog);
-EXPORT_SYMBOL(reserve_lapic_nmi);
-EXPORT_SYMBOL(release_lapic_nmi);
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
+EXPORT_SYMBOL(reserve_perfctr_nmi);
+EXPORT_SYMBOL(release_perfctr_nmi);
+EXPORT_SYMBOL(reserve_evntsel_nmi);
+EXPORT_SYMBOL(release_evntsel_nmi);
EXPORT_SYMBOL(disable_timer_nmi_watchdog);
EXPORT_SYMBOL(enable_timer_nmi_watchdog);
EXPORT_SYMBOL(touch_nmi_watchdog);
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 146924ba5df..cfb09b07ae9 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -86,7 +86,8 @@
#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */
#define MAX_NUM_CHASSIS 8 /* max number of chassis */
-#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) /* max dev->bus->number */
+/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */
+#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2)
#define PHBS_PER_CALGARY 4
/* register offsets in Calgary's internal register space */
@@ -111,31 +112,49 @@ static const unsigned long phb_offsets[] = {
0xB000 /* PHB3 */
};
-static char bus_to_phb[MAX_PHB_BUS_NUM];
-void* tce_table_kva[MAX_PHB_BUS_NUM];
unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
static int translate_empty_slots __read_mostly = 0;
static int calgary_detected __read_mostly = 0;
-/*
- * the bitmap of PHBs the user requested that we disable
- * translation on.
- */
-static DECLARE_BITMAP(translation_disabled, MAX_PHB_BUS_NUM);
+struct calgary_bus_info {
+ void *tce_space;
+ unsigned char translation_disabled;
+ signed char phbid;
+};
+
+static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
static void tce_cache_blast(struct iommu_table *tbl);
/* enable this to stress test the chip's TCE cache */
#ifdef CONFIG_IOMMU_DEBUG
-static inline void tce_cache_blast_stress(struct iommu_table *tbl)
+int debugging __read_mostly = 1;
+
+static inline unsigned long verify_bit_range(unsigned long* bitmap,
+ int expected, unsigned long start, unsigned long end)
{
- tce_cache_blast(tbl);
+ unsigned long idx = start;
+
+ BUG_ON(start >= end);
+
+ while (idx < end) {
+ if (!!test_bit(idx, bitmap) != expected)
+ return idx;
+ ++idx;
+ }
+
+ /* all bits have the expected value */
+ return ~0UL;
}
-#else
-static inline void tce_cache_blast_stress(struct iommu_table *tbl)
+#else /* debugging is disabled */
+int debugging __read_mostly = 0;
+
+static inline unsigned long verify_bit_range(unsigned long* bitmap,
+ int expected, unsigned long start, unsigned long end)
{
+ return ~0UL;
}
-#endif /* BLAST_TCE_CACHE_ON_UNMAP */
+#endif /* CONFIG_IOMMU_DEBUG */
static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
{
@@ -149,7 +168,7 @@ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
static inline int translate_phb(struct pci_dev* dev)
{
- int disabled = test_bit(dev->bus->number, translation_disabled);
+ int disabled = bus_info[dev->bus->number].translation_disabled;
return !disabled;
}
@@ -158,6 +177,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
{
unsigned long index;
unsigned long end;
+ unsigned long badbit;
index = start_addr >> PAGE_SHIFT;
@@ -169,14 +189,15 @@ static void iommu_range_reserve(struct iommu_table *tbl,
if (end > tbl->it_size) /* don't go off the table */
end = tbl->it_size;
- while (index < end) {
- if (test_bit(index, tbl->it_map))
+ badbit = verify_bit_range(tbl->it_map, 0, index, end);
+ if (badbit != ~0UL) {
+ if (printk_ratelimit())
printk(KERN_ERR "Calgary: entry already allocated at "
"0x%lx tbl %p dma 0x%lx npages %u\n",
- index, tbl, start_addr, npages);
- ++index;
+ badbit, tbl, start_addr, npages);
}
- set_bit_string(tbl->it_map, start_addr >> PAGE_SHIFT, npages);
+
+ set_bit_string(tbl->it_map, index, npages);
}
static unsigned long iommu_range_alloc(struct iommu_table *tbl,
@@ -243,7 +264,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
unsigned int npages)
{
unsigned long entry;
- unsigned long i;
+ unsigned long badbit;
entry = dma_addr >> PAGE_SHIFT;
@@ -251,16 +272,15 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
tce_free(tbl, entry, npages);
- for (i = 0; i < npages; ++i) {
- if (!test_bit(entry + i, tbl->it_map))
+ badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
+ if (badbit != ~0UL) {
+ if (printk_ratelimit())
printk(KERN_ERR "Calgary: bit is off at 0x%lx "
"tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
- entry + i, tbl, dma_addr, entry, npages);
+ badbit, tbl, dma_addr, entry, npages);
}
__clear_bit_string(tbl->it_map, entry, npages);
-
- tce_cache_blast_stress(tbl);
}
static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -454,7 +474,7 @@ static struct dma_mapping_ops calgary_dma_ops = {
static inline int busno_to_phbid(unsigned char num)
{
- return bus_to_phb[num];
+ return bus_info[num].phbid;
}
static inline unsigned long split_queue_offset(unsigned char num)
@@ -631,6 +651,10 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
if (ret)
return ret;
+ tbl = dev->sysdata;
+ tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
+ tce_free(tbl, 0, tbl->it_size);
+
calgary_reserve_regions(dev);
/* set TARs for each PHB */
@@ -654,11 +678,12 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
return 0;
}
-static void __init calgary_free_tar(struct pci_dev *dev)
+static void __init calgary_free_bus(struct pci_dev *dev)
{
u64 val64;
struct iommu_table *tbl = dev->sysdata;
void __iomem *target;
+ unsigned int bitmapsz;
target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
val64 = be64_to_cpu(readq(target));
@@ -666,8 +691,15 @@ static void __init calgary_free_tar(struct pci_dev *dev)
writeq(cpu_to_be64(val64), target);
readq(target); /* flush */
+ bitmapsz = tbl->it_size / BITS_PER_BYTE;
+ free_pages((unsigned long)tbl->it_map, get_order(bitmapsz));
+ tbl->it_map = NULL;
+
kfree(tbl);
dev->sysdata = NULL;
+
+ /* Can't free bootmem allocated memory after system is up :-( */
+ bus_info[dev->bus->number].tce_space = NULL;
}
static void calgary_watchdog(unsigned long data)
@@ -772,12 +804,11 @@ static inline unsigned int __init locate_register_space(struct pci_dev *dev)
return address;
}
-static int __init calgary_init_one_nontraslated(struct pci_dev *dev)
+static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
{
+ pci_dev_get(dev);
dev->sysdata = NULL;
dev->bus->self = dev;
-
- return 0;
}
static int __init calgary_init_one(struct pci_dev *dev)
@@ -798,6 +829,7 @@ static int __init calgary_init_one(struct pci_dev *dev)
if (ret)
goto iounmap;
+ pci_dev_get(dev);
dev->bus->self = dev;
calgary_enable_translation(dev);
@@ -824,10 +856,9 @@ static int __init calgary_init(void)
calgary_init_one_nontraslated(dev);
continue;
}
- if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) {
- pci_dev_put(dev);
+ if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots)
continue;
- }
+
ret = calgary_init_one(dev);
if (ret)
goto error;
@@ -840,15 +871,18 @@ error:
dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM,
PCI_DEVICE_ID_IBM_CALGARY,
dev);
+ if (!dev)
+ break;
if (!translate_phb(dev)) {
pci_dev_put(dev);
continue;
}
- if (!tce_table_kva[dev->bus->number] && !translate_empty_slots)
+ if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots)
continue;
+
calgary_disable_translation(dev);
- calgary_free_tar(dev);
- pci_dev_put(dev);
+ calgary_free_bus(dev);
+ pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
}
return ret;
@@ -890,13 +924,15 @@ void __init detect_calgary(void)
if (swiotlb || no_iommu || iommu_detected)
return;
+ if (!early_pci_allowed())
+ return;
+
specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
int dev;
-
- tce_table_kva[bus] = NULL;
- bus_to_phb[bus] = -1;
+ struct calgary_bus_info *info = &bus_info[bus];
+ info->phbid = -1;
if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY)
continue;
@@ -907,12 +943,9 @@ void __init detect_calgary(void)
*/
phb = (phb + 1) % PHBS_PER_CALGARY;
- if (test_bit(bus, translation_disabled)) {
- printk(KERN_INFO "Calgary: translation is disabled for "
- "PHB 0x%x\n", bus);
- /* skip this phb, don't allocate a tbl for it */
+ if (info->translation_disabled)
continue;
- }
+
/*
* Scan the slots of the PCI bus to see if there is a device present.
* The parent bus will be the zero-ith device, so start at 1.
@@ -923,8 +956,8 @@ void __init detect_calgary(void)
tbl = alloc_tce_table();
if (!tbl)
goto cleanup;
- tce_table_kva[bus] = tbl;
- bus_to_phb[bus] = phb;
+ info->tce_space = tbl;
+ info->phbid = phb;
calgary_found = 1;
break;
}
@@ -934,15 +967,20 @@ void __init detect_calgary(void)
if (calgary_found) {
iommu_detected = 1;
calgary_detected = 1;
- printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. "
- "TCE table spec is %d.\n", specified_table_size);
+ printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
+ printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
+ "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
+ debugging ? "enabled" : "disabled");
}
return;
cleanup:
- for (--bus; bus >= 0; --bus)
- if (tce_table_kva[bus])
- free_tce_table(tce_table_kva[bus]);
+ for (--bus; bus >= 0; --bus) {
+ struct calgary_bus_info *info = &bus_info[bus];
+
+ if (info->tce_space)
+ free_tce_table(info->tce_space);
+ }
}
int __init calgary_iommu_init(void)
@@ -1016,7 +1054,7 @@ static int __init calgary_parse_options(char *p)
if (bridge < MAX_PHB_BUS_NUM) {
printk(KERN_INFO "Calgary: disabling "
"translation for PHB 0x%x\n", bridge);
- set_bit(bridge, translation_disabled);
+ bus_info[bridge].translation_disabled = 1;
}
}
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index 9c44f4f2433..f8d857453f8 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -170,8 +170,20 @@ void dma_free_coherent(struct device *dev, size_t size,
}
EXPORT_SYMBOL(dma_free_coherent);
+static int forbid_dac __read_mostly;
+
int dma_supported(struct device *dev, u64 mask)
{
+#ifdef CONFIG_PCI
+ if (mask > 0xffffffff && forbid_dac > 0) {
+
+
+
+ printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", dev->bus_id);
+ return 0;
+ }
+#endif
+
if (dma_ops->dma_supported)
return dma_ops->dma_supported(dev, mask);
@@ -231,56 +243,66 @@ EXPORT_SYMBOL(dma_set_mask);
allowed overwrite iommu off workarounds for specific chipsets.
soft Use software bounce buffering (default for Intel machines)
noaperture Don't touch the aperture for AGP.
+ allowdac Allow DMA >4GB
+ nodac Forbid DMA >4GB
+ panic Force panic when IOMMU overflows
*/
__init int iommu_setup(char *p)
{
- iommu_merge = 1;
-
- while (*p) {
- if (!strncmp(p,"off",3))
- no_iommu = 1;
- /* gart_parse_options has more force support */
- if (!strncmp(p,"force",5))
- force_iommu = 1;
- if (!strncmp(p,"noforce",7)) {
- iommu_merge = 0;
- force_iommu = 0;
- }
-
- if (!strncmp(p, "biomerge",8)) {
- iommu_bio_merge = 4096;
- iommu_merge = 1;
- force_iommu = 1;
- }
- if (!strncmp(p, "panic",5))
- panic_on_overflow = 1;
- if (!strncmp(p, "nopanic",7))
- panic_on_overflow = 0;
- if (!strncmp(p, "merge",5)) {
- iommu_merge = 1;
- force_iommu = 1;
- }
- if (!strncmp(p, "nomerge",7))
- iommu_merge = 0;
- if (!strncmp(p, "forcesac",8))
- iommu_sac_force = 1;
+ iommu_merge = 1;
+
+ if (!p)
+ return -EINVAL;
+
+ while (*p) {
+ if (!strncmp(p,"off",3))
+ no_iommu = 1;
+ /* gart_parse_options has more force support */
+ if (!strncmp(p,"force",5))
+ force_iommu = 1;
+ if (!strncmp(p,"noforce",7)) {
+ iommu_merge = 0;
+ force_iommu = 0;
+ }
+
+ if (!strncmp(p, "biomerge",8)) {
+ iommu_bio_merge = 4096;
+ iommu_merge = 1;
+ force_iommu = 1;
+ }
+ if (!strncmp(p, "panic",5))
+ panic_on_overflow = 1;
+ if (!strncmp(p, "nopanic",7))
+ panic_on_overflow = 0;
+ if (!strncmp(p, "merge",5)) {
+ iommu_merge = 1;
+ force_iommu = 1;
+ }
+ if (!strncmp(p, "nomerge",7))
+ iommu_merge = 0;
+ if (!strncmp(p, "forcesac",8))
+ iommu_sac_force = 1;
+ if (!strncmp(p, "allowdac", 8))
+ forbid_dac = 0;
+ if (!strncmp(p, "nodac", 5))
+ forbid_dac = -1;
#ifdef CONFIG_SWIOTLB
- if (!strncmp(p, "soft",4))
- swiotlb = 1;
+ if (!strncmp(p, "soft",4))
+ swiotlb = 1;
#endif
#ifdef CONFIG_IOMMU
- gart_parse_options(p);
+ gart_parse_options(p);
#endif
- p += strcspn(p, ",");
- if (*p == ',')
- ++p;
- }
- return 1;
+ p += strcspn(p, ",");
+ if (*p == ',')
+ ++p;
+ }
+ return 0;
}
-__setup("iommu=", iommu_setup);
+early_param("iommu", iommu_setup);
void __init pci_iommu_alloc(void)
{
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 6d3e61baf7a..16261a8a330 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -239,8 +239,6 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
{
unsigned long phys_mem, bus;
- BUG_ON(dir == DMA_NONE);
-
if (!dev)
dev = &fallback_dev;
@@ -383,7 +381,6 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
unsigned long pages = 0;
int need = 0, nextneed;
- BUG_ON(dir == DMA_NONE);
if (nents == 0)
return 0;
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index aad7609d8e9..df09ab05a1b 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -59,7 +59,6 @@ int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
{
int i;
- BUG_ON(direction == DMA_NONE);
for (i = 0; i < nents; i++ ) {
struct scatterlist *s = &sg[i];
BUG_ON(!s->page);
diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c
index 6a55f87ba97..697f0aa794b 100644
--- a/arch/x86_64/kernel/pci-swiotlb.c
+++ b/arch/x86_64/kernel/pci-swiotlb.c
@@ -3,7 +3,8 @@
#include <linux/pci.h>
#include <linux/cache.h>
#include <linux/module.h>
-#include <asm/dma-mapping.h>
+#include <linux/dma-mapping.h>
+
#include <asm/proto.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index bb6745d13b8..458006ae19f 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -80,25 +80,25 @@ void idle_notifier_unregister(struct notifier_block *n)
}
EXPORT_SYMBOL(idle_notifier_unregister);
-enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
-static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
-
void enter_idle(void)
{
- __get_cpu_var(idle_state) = CPU_IDLE;
+ write_pda(isidle, 1);
atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
- __get_cpu_var(idle_state) = CPU_NOT_IDLE;
+ if (read_pda(isidle) == 0)
+ return;
+ write_pda(isidle, 0);
atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
/* Called from interrupts to signify idle end */
void exit_idle(void)
{
- if (current->pid | read_pda(irqcount))
+ /* idle loop has pid 0 */
+ if (current->pid)
return;
__exit_idle();
}
@@ -220,6 +220,9 @@ void cpu_idle (void)
play_dead();
enter_idle();
idle();
+ /* In many cases the interrupt that ended idle
+ has already called exit_idle. But some idle
+ loops can be woken up without interrupt. */
__exit_idle();
}
@@ -350,6 +353,7 @@ void exit_thread(void)
kfree(t->io_bitmap_ptr);
t->io_bitmap_ptr = NULL;
+ clear_thread_flag(TIF_IO_BITMAP);
/*
* Careful, clear this in the TSS too:
*/
@@ -369,6 +373,7 @@ void flush_thread(void)
if (t->flags & _TIF_IA32)
current_thread_info()->status |= TS_COMPAT;
}
+ t->flags &= ~_TIF_DEBUG;
tsk->thread.debugreg0 = 0;
tsk->thread.debugreg1 = 0;
@@ -461,7 +466,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
asm("mov %%es,%0" : "=m" (p->thread.es));
asm("mov %%ds,%0" : "=m" (p->thread.ds));
- if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
+ if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
p->thread.io_bitmap_max = 0;
@@ -469,6 +474,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
}
memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
IO_BITMAP_BYTES);
+ set_tsk_thread_flag(p, TIF_IO_BITMAP);
}
/*
@@ -498,6 +504,40 @@ out:
*/
#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
+static inline void __switch_to_xtra(struct task_struct *prev_p,
+ struct task_struct *next_p,
+ struct tss_struct *tss)
+{
+ struct thread_struct *prev, *next;
+
+ prev = &prev_p->thread,
+ next = &next_p->thread;
+
+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+ loaddebug(next, 0);
+ loaddebug(next, 1);
+ loaddebug(next, 2);
+ loaddebug(next, 3);
+ /* no 4 and 5 */
+ loaddebug(next, 6);
+ loaddebug(next, 7);
+ }
+
+ if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
+ /*
+ * Copy the relevant range of the IO bitmap.
+ * Normally this is 128 bytes or less:
+ */
+ memcpy(tss->io_bitmap, next->io_bitmap_ptr,
+ max(prev->io_bitmap_max, next->io_bitmap_max));
+ } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
+ /*
+ * Clear any possible leftover bits:
+ */
+ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+ }
+}
+
/*
* switch_to(x,y) should switch tasks from x to y.
*
@@ -515,6 +555,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ /* we're going to use this soon, after a few expensive things */
+ if (next_p->fpu_counter>5)
+ prefetch(&next->i387.fxsave);
+
/*
* Reload esp0, LDT and the page table pointer:
*/
@@ -583,41 +627,29 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
And the AMD workaround requires it to be after DS reload. */
unlazy_fpu(prev_p);
write_pda(kernelstack,
- task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
-
+ (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+#ifdef CONFIG_CC_STACKPROTECTOR
+ write_pda(stack_canary, next_p->stack_canary);
/*
- * Now maybe reload the debug registers
+ * Build time only check to make sure the stack_canary is at
+ * offset 40 in the pda; this is a gcc ABI requirement
*/
- if (unlikely(next->debugreg7)) {
- loaddebug(next, 0);
- loaddebug(next, 1);
- loaddebug(next, 2);
- loaddebug(next, 3);
- /* no 4 and 5 */
- loaddebug(next, 6);
- loaddebug(next, 7);
- }
-
+ BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
+#endif
- /*
- * Handle the IO bitmap
- */
- if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
- if (next->io_bitmap_ptr)
- /*
- * Copy the relevant range of the IO bitmap.
- * Normally this is 128 bytes or less:
- */
- memcpy(tss->io_bitmap, next->io_bitmap_ptr,
- max(prev->io_bitmap_max, next->io_bitmap_max));
- else {
- /*
- * Clear any possible leftover bits:
- */
- memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
- }
- }
+ /*
+ * Now maybe reload the debug registers and handle I/O bitmaps
+ */
+ if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
+ || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
+ __switch_to_xtra(prev_p, next_p, tss);
+ /* If the task has used fpu the last 5 timeslices, just do a full
+ * restore of the math state immediately to avoid the trap; the
+ * chances of needing FPU soon are obviously high now
+ */
+ if (next_p->fpu_counter>5)
+ math_state_restore();
return prev_p;
}
@@ -834,7 +866,7 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
unsigned long arch_align_stack(unsigned long sp)
{
- if (randomize_va_space)
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
sp -= get_random_int() % 8192;
return sp & ~0xf;
}
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index 2d50024c9f3..addc14af0c5 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -116,17 +116,17 @@ unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *r
return addr;
}
-static int is_at_popf(struct task_struct *child, struct pt_regs *regs)
+static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
{
int i, copied;
- unsigned char opcode[16];
+ unsigned char opcode[15];
unsigned long addr = convert_rip_to_linear(child, regs);
copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
for (i = 0; i < copied; i++) {
switch (opcode[i]) {
- /* popf */
- case 0x9d:
+ /* popf and iret */
+ case 0x9d: case 0xcf:
return 1;
/* CHECKME: 64 65 */
@@ -138,14 +138,17 @@ static int is_at_popf(struct task_struct *child, struct pt_regs *regs)
case 0x26: case 0x2e:
case 0x36: case 0x3e:
case 0x64: case 0x65:
- case 0xf0: case 0xf2: case 0xf3:
+ case 0xf2: case 0xf3:
continue;
- /* REX prefixes */
case 0x40 ... 0x4f:
+ if (regs->cs != __USER_CS)
+ /* 32-bit mode: register increment */
+ return 0;
+ /* 64-bit mode: REX prefix */
continue;
- /* CHECKME: f0, f2, f3 */
+ /* CHECKME: f2, f3 */
/*
* pushf: NOTE! We should probably not let
@@ -186,10 +189,8 @@ static void set_singlestep(struct task_struct *child)
* ..but if TF is changed by the instruction we will trace,
* don't mark it as being "us" that set it, so that we
* won't clear it by hand later.
- *
- * AK: this is not enough, LAHF and IRET can change TF in user space too.
*/
- if (is_at_popf(child, regs))
+ if (is_setting_trap_flag(child, regs))
return;
child->ptrace |= PT_DTRACE;
@@ -420,9 +421,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
break;
if (i == 4) {
- child->thread.debugreg7 = data;
+ child->thread.debugreg7 = data;
+ if (data)
+ set_tsk_thread_flag(child, TIF_DEBUG);
+ else
+ clear_tsk_thread_flag(child, TIF_DEBUG);
ret = 0;
- }
+ }
break;
}
break;
diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S
index d24fa9b72a2..14e95872c6a 100644
--- a/arch/x86_64/kernel/relocate_kernel.S
+++ b/arch/x86_64/kernel/relocate_kernel.S
@@ -7,31 +7,169 @@
*/
#include <linux/linkage.h>
+#include <asm/page.h>
+#include <asm/kexec.h>
- /*
- * Must be relocatable PIC code callable as a C function, that once
- * it starts can not use the previous processes stack.
- */
- .globl relocate_new_kernel
+/*
+ * Must be relocatable PIC code callable as a C function
+ */
+
+#define PTR(x) (x << 3)
+#define PAGE_ALIGNED (1 << PAGE_SHIFT)
+#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
+
+ .text
+ .align PAGE_ALIGNED
.code64
+ .globl relocate_kernel
+relocate_kernel:
+ /* %rdi indirection_page
+ * %rsi page_list
+ * %rdx start address
+ */
+
+ /* map the control page at its virtual address */
+
+ movq $0x0000ff8000000000, %r10 /* mask */
+ mov $(39 - 3), %cl /* bits to shift */
+ movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
+
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PGD)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_PUD_0)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
+ shrq $9, %r10
+ sub $9, %cl
+
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PUD_0)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_PMD_0)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
+ shrq $9, %r10
+ sub $9, %cl
+
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PMD_0)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_PTE_0)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
+ shrq $9, %r10
+ sub $9, %cl
+
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PTE_0)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
+ /* identity map the control page at its physical address */
+
+ movq $0x0000ff8000000000, %r10 /* mask */
+ mov $(39 - 3), %cl /* bits to shift */
+ movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
+
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PGD)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_PUD_1)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
+ shrq $9, %r10
+ sub $9, %cl
+
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PUD_1)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_PMD_1)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
+ shrq $9, %r10
+ sub $9, %cl
+
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PMD_1)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_PTE_1)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
+ shrq $9, %r10
+ sub $9, %cl
+
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PTE_1)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
relocate_new_kernel:
- /* %rdi page_list
- * %rsi reboot_code_buffer
+ /* %rdi indirection_page
+ * %rsi page_list
* %rdx start address
- * %rcx page_table
- * %r8 arg5
- * %r9 arg6
*/
/* zero out flags, and disable interrupts */
pushq $0
popfq
- /* set a new stack at the bottom of our page... */
- lea 4096(%rsi), %rsp
+ /* get physical address of control page now */
+ /* this is impossible after page table switch */
+ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
+
+ /* get physical address of page table now too */
+ movq PTR(PA_TABLE_PAGE)(%rsi), %rcx
- /* store the parameters back on the stack */
- pushq %rdx /* store the start address */
+ /* switch to new set of page tables */
+ movq PTR(PA_PGD)(%rsi), %r9
+ movq %r9, %cr3
+
+ /* setup a new stack at the end of the physical control page */
+ lea 4096(%r8), %rsp
+
+ /* jump to identity mapped page */
+ addq $(identity_mapped - relocate_kernel), %r8
+ pushq %r8
+ ret
+
+identity_mapped:
+ /* store the start address on the stack */
+ pushq %rdx
/* Set cr0 to a known state:
* 31 1 == Paging enabled
@@ -136,8 +274,3 @@ relocate_new_kernel:
xorq %r15, %r15
ret
-relocate_new_kernel_end:
-
- .globl relocate_new_kernel_size
-relocate_new_kernel_size:
- .quad relocate_new_kernel_end - relocate_new_kernel
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 34afad70482..fc944b5e8f4 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -74,16 +74,6 @@ EXPORT_SYMBOL(boot_cpu_data);
unsigned long mmu_cr4_features;
-int acpi_disabled;
-EXPORT_SYMBOL(acpi_disabled);
-#ifdef CONFIG_ACPI
-extern int __initdata acpi_ht;
-extern acpi_interrupt_flags acpi_sci_flags;
-int __initdata acpi_force = 0;
-#endif
-
-int acpi_numa __initdata;
-
/* Boot loader ID as an integer, for the benefit of proc_dointvec */
int bootloader_type;
@@ -107,7 +97,6 @@ struct sys_desc_table_struct {
struct edid_info edid_info;
EXPORT_SYMBOL_GPL(edid_info);
-struct e820map e820;
extern int root_mountflags;
@@ -134,9 +123,6 @@ struct resource standard_io_resources[] = {
.flags = IORESOURCE_BUSY | IORESOURCE_IO }
};
-#define STANDARD_IO_RESOURCES \
- (sizeof standard_io_resources / sizeof standard_io_resources[0])
-
#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
struct resource data_resource = {
@@ -183,9 +169,6 @@ static struct resource adapter_rom_resources[] = {
.flags = IORESOURCE_ROM }
};
-#define ADAPTER_ROM_RESOURCES \
- (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
-
static struct resource video_rom_resource = {
.name = "Video ROM",
.start = 0xc0000,
@@ -256,7 +239,8 @@ static void __init probe_roms(void)
}
/* check for adapter roms on 2k boundaries */
- for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper;
+ start += 2048) {
rom = isa_bus_to_virt(start);
if (!romsignature(rom))
continue;
@@ -276,185 +260,22 @@ static void __init probe_roms(void)
}
}
-/* Check for full argument with no trailing characters */
-static int fullarg(char *p, char *arg)
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel. This option will be passed
+ * by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
{
- int l = strlen(arg);
- return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l]));
+ char *end;
+ if (!arg)
+ return -EINVAL;
+ elfcorehdr_addr = memparse(arg, &end);
+ return end > arg ? 0 : -EINVAL;
}
-
-static __init void parse_cmdline_early (char ** cmdline_p)
-{
- char c = ' ', *to = command_line, *from = COMMAND_LINE;
- int len = 0;
- int userdef = 0;
-
- for (;;) {
- if (c != ' ')
- goto next_char;
-
-#ifdef CONFIG_SMP
- /*
- * If the BIOS enumerates physical processors before logical,
- * maxcpus=N at enumeration-time can be used to disable HT.
- */
- else if (!memcmp(from, "maxcpus=", 8)) {
- extern unsigned int maxcpus;
-
- maxcpus = simple_strtoul(from + 8, NULL, 0);
- }
-#endif
-#ifdef CONFIG_ACPI
- /* "acpi=off" disables both ACPI table parsing and interpreter init */
- if (fullarg(from,"acpi=off"))
- disable_acpi();
-
- if (fullarg(from, "acpi=force")) {
- /* add later when we do DMI horrors: */
- acpi_force = 1;
- acpi_disabled = 0;
- }
-
- /* acpi=ht just means: do ACPI MADT parsing
- at bootup, but don't enable the full ACPI interpreter */
- if (fullarg(from, "acpi=ht")) {
- if (!acpi_force)
- disable_acpi();
- acpi_ht = 1;
- }
- else if (fullarg(from, "pci=noacpi"))
- acpi_disable_pci();
- else if (fullarg(from, "acpi=noirq"))
- acpi_noirq_set();
-
- else if (fullarg(from, "acpi_sci=edge"))
- acpi_sci_flags.trigger = 1;
- else if (fullarg(from, "acpi_sci=level"))
- acpi_sci_flags.trigger = 3;
- else if (fullarg(from, "acpi_sci=high"))
- acpi_sci_flags.polarity = 1;
- else if (fullarg(from, "acpi_sci=low"))
- acpi_sci_flags.polarity = 3;
-
- /* acpi=strict disables out-of-spec workarounds */
- else if (fullarg(from, "acpi=strict")) {
- acpi_strict = 1;
- }
-#ifdef CONFIG_X86_IO_APIC
- else if (fullarg(from, "acpi_skip_timer_override"))
- acpi_skip_timer_override = 1;
-#endif
+early_param("elfcorehdr", setup_elfcorehdr);
#endif
- if (fullarg(from, "disable_timer_pin_1"))
- disable_timer_pin_1 = 1;
- if (fullarg(from, "enable_timer_pin_1"))
- disable_timer_pin_1 = -1;
-
- if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
- clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
- disable_apic = 1;
- }
-
- if (fullarg(from, "noapic"))
- skip_ioapic_setup = 1;
-
- if (fullarg(from,"apic")) {
- skip_ioapic_setup = 0;
- ioapic_force = 1;
- }
-
- if (!memcmp(from, "mem=", 4))
- parse_memopt(from+4, &from);
-
- if (!memcmp(from, "memmap=", 7)) {
- /* exactmap option is for used defined memory */
- if (!memcmp(from+7, "exactmap", 8)) {
-#ifdef CONFIG_CRASH_DUMP
- /* If we are doing a crash dump, we
- * still need to know the real mem
- * size before original memory map is
- * reset.
- */
- saved_max_pfn = e820_end_of_ram();
-#endif
- from += 8+7;
- end_pfn_map = 0;
- e820.nr_map = 0;
- userdef = 1;
- }
- else {
- parse_memmapopt(from+7, &from);
- userdef = 1;
- }
- }
-
-#ifdef CONFIG_NUMA
- if (!memcmp(from, "numa=", 5))
- numa_setup(from+5);
-#endif
-
- if (!memcmp(from,"iommu=",6)) {
- iommu_setup(from+6);
- }
-
- if (fullarg(from,"oops=panic"))
- panic_on_oops = 1;
-
- if (!memcmp(from, "noexec=", 7))
- nonx_setup(from + 7);
-
-#ifdef CONFIG_KEXEC
- /* crashkernel=size@addr specifies the location to reserve for
- * a crash kernel. By reserving this memory we guarantee
- * that linux never set's it up as a DMA target.
- * Useful for holding code to do something appropriate
- * after a kernel panic.
- */
- else if (!memcmp(from, "crashkernel=", 12)) {
- unsigned long size, base;
- size = memparse(from+12, &from);
- if (*from == '@') {
- base = memparse(from+1, &from);
- /* FIXME: Do I want a sanity check
- * to validate the memory range?
- */
- crashk_res.start = base;
- crashk_res.end = base + size - 1;
- }
- }
-#endif
-
-#ifdef CONFIG_PROC_VMCORE
- /* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel. This option will be passed
- * by kexec loader to the capture kernel.
- */
- else if(!memcmp(from, "elfcorehdr=", 11))
- elfcorehdr_addr = memparse(from+11, &from);
-#endif
-
-#ifdef CONFIG_HOTPLUG_CPU
- else if (!memcmp(from, "additional_cpus=", 16))
- setup_additional_cpus(from+16);
-#endif
-
- next_char:
- c = *(from++);
- if (!c)
- break;
- if (COMMAND_LINE_SIZE <= ++len)
- break;
- *(to++) = c;
- }
- if (userdef) {
- printk(KERN_INFO "user-defined physical RAM map:\n");
- e820_print_map("user");
- }
- *to = '\0';
- *cmdline_p = command_line;
-}
-
#ifndef CONFIG_NUMA
static void __init
contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
@@ -466,7 +287,8 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n",bootmap_size);
bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
- e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
+ e820_register_active_regions(0, start_pfn, end_pfn);
+ free_bootmem_with_active_regions(0, end_pfn);
reserve_bootmem(bootmap, bootmap_size);
}
#endif
@@ -521,6 +343,8 @@ static void discover_ebda(void)
void __init setup_arch(char **cmdline_p)
{
+ printk(KERN_INFO "Command line: %s\n", saved_command_line);
+
ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
screen_info = SCREEN_INFO;
edid_info = EDID_INFO;
@@ -547,16 +371,22 @@ void __init setup_arch(char **cmdline_p)
data_resource.start = virt_to_phys(&_etext);
data_resource.end = virt_to_phys(&_edata)-1;
- parse_cmdline_early(cmdline_p);
-
early_identify_cpu(&boot_cpu_data);
+ strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
+ parse_early_param();
+
+ finish_e820_parsing();
+
+ e820_register_active_regions(0, 0, -1UL);
/*
* partially used pages are not usable - thus
* we are rounding upwards:
*/
end_pfn = e820_end_of_ram();
- num_physpages = end_pfn; /* for pfn_valid */
+ num_physpages = end_pfn;
check_efer();
@@ -576,6 +406,14 @@ void __init setup_arch(char **cmdline_p)
acpi_boot_table_init();
#endif
+ /* How many end-of-memory variables you have, grandma! */
+ max_low_pfn = end_pfn;
+ max_pfn = end_pfn;
+ high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
+
+ /* Remove active ranges so rediscovery with NUMA-awareness happens */
+ remove_all_active_ranges();
+
#ifdef CONFIG_ACPI_NUMA
/*
* Parse SRAT to discover nodes.
@@ -625,12 +463,10 @@ void __init setup_arch(char **cmdline_p)
*/
acpi_reserve_bootmem();
#endif
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* Find and reserve possible boot-time SMP configuration:
*/
find_smp_config();
-#endif
#ifdef CONFIG_BLK_DEV_INITRD
if (LOADER_TYPE && INITRD_START) {
if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
@@ -657,7 +493,9 @@ void __init setup_arch(char **cmdline_p)
paging_init();
- check_ioapic();
+#ifdef CONFIG_PCI
+ early_quirks();
+#endif
/*
* set this early, so we dont allocate cpu0
@@ -674,14 +512,12 @@ void __init setup_arch(char **cmdline_p)
init_cpu_to_node();
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* get boot-time SMP configuration:
*/
if (smp_found_config)
get_smp_config();
init_apic_mappings();
-#endif
/*
* Request address space for all standard RAM and ROM resources
@@ -689,13 +525,14 @@ void __init setup_arch(char **cmdline_p)
*/
probe_roms();
e820_reserve_resources();
+ e820_mark_nosave_regions();
request_resource(&iomem_resource, &video_ram_resource);
{
unsigned i;
/* request I/O space for devices used on all i[345]86 PCs */
- for (i = 0; i < STANDARD_IO_RESOURCES; i++)
+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
request_resource(&ioport_resource, &standard_io_resources[i]);
}
@@ -838,7 +675,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
#endif
}
-static void __init init_amd(struct cpuinfo_x86 *c)
+static void __cpuinit init_amd(struct cpuinfo_x86 *c)
{
unsigned level;
@@ -894,6 +731,12 @@ static void __init init_amd(struct cpuinfo_x86 *c)
/* Fix cpuid4 emulation for more */
num_cache_leaves = 3;
+
+ /* When there is only one core no need to synchronize RDTSC */
+ if (num_possible_cpus() == 1)
+ set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+ else
+ clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
}
static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -975,8 +818,7 @@ static void srat_detect_node(void)
node = first_node(node_online_map);
numa_set_node(cpu, node);
- if (acpi_numa > 0)
- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+ printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
#endif
}
@@ -1010,6 +852,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
(c->x86 == 0x6 && c->x86_model >= 0x0e))
set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
+ if (c->x86 == 6)
+ set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
c->x86_max_cores = intel_num_cpu_cores(c);
@@ -1228,8 +1072,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
/* Intel-defined (#2) */
"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
- "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
+ NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* VIA/Cyrix/Centaur-defined */
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 417de564456..8c4b80fe71a 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -24,7 +24,7 @@
#include <asm/proto.h>
#include <asm/sections.h>
-char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
+char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
@@ -46,8 +46,10 @@ Control non executable mappings for 64bit processes.
on Enable(default)
off Disable
*/
-int __init nonx_setup(char *str)
+static int __init nonx_setup(char *str)
{
+ if (!str)
+ return -EINVAL;
if (!strncmp(str, "on", 2)) {
__supported_pte_mask |= _PAGE_NX;
do_not_nx = 0;
@@ -55,9 +57,9 @@ int __init nonx_setup(char *str)
do_not_nx = 1;
__supported_pte_mask &= ~_PAGE_NX;
}
- return 1;
+ return 0;
}
-__setup("noexec=", nonx_setup); /* parsed early actually */
+early_param("noexec", nonx_setup);
int force_personality32 = 0;
@@ -93,12 +95,9 @@ void __init setup_per_cpu_areas(void)
#endif
/* Copy section for each CPU (we discard the original) */
- size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
-#ifdef CONFIG_MODULES
- if (size < PERCPU_ENOUGH_ROOM)
- size = PERCPU_ENOUGH_ROOM;
-#endif
+ size = PERCPU_ENOUGH_ROOM;
+ printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
for_each_cpu_mask (i, cpu_possible_map) {
char *ptr;
@@ -122,7 +121,10 @@ void pda_init(int cpu)
/* Setup up data that may be needed in __get_free_pages early */
asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
+ /* Memory clobbers used to order PDA accessed */
+ mb();
wrmsrl(MSR_GS_BASE, pda);
+ mb();
pda->cpunumber = cpu;
pda->irqcount = -1;
@@ -178,6 +180,8 @@ void __cpuinit check_efer(void)
}
}
+unsigned long kernel_eflags;
+
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
@@ -235,28 +239,17 @@ void __cpuinit cpu_init (void)
* set up and load the per-CPU TSS
*/
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+ static const unsigned int order[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+ [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+ };
if (cpu) {
- static const unsigned int order[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
- [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
- };
-
estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
if (!estacks)
panic("Cannot allocate exception stack %ld %d\n",
v, cpu);
}
- switch (v + 1) {
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- case DEBUG_STACK:
- cpu_pda(cpu)->debugstack = (unsigned long)estacks;
- estacks += DEBUG_STKSZ;
- break;
-#endif
- default:
- estacks += EXCEPTION_STKSZ;
- break;
- }
+ estacks += PAGE_SIZE << order[v];
orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
}
@@ -290,4 +283,6 @@ void __cpuinit cpu_init (void)
set_debugreg(0UL, 7);
fpu_init();
+
+ raw_local_save_flags(kernel_eflags);
}
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 28161170fb0..49ec324cd14 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -38,37 +38,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
sigset_t *set, struct pt_regs * regs);
asmlinkage long
-sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs)
-{
- sigset_t saveset, newset;
-
- /* XXX: Don't preclude handling different sized sigset_t's. */
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
-
- if (copy_from_user(&newset, unewset, sizeof(newset)))
- return -EFAULT;
- sigdelsetmask(&newset, ~_BLOCKABLE);
-
- spin_lock_irq(&current->sighand->siglock);
- saveset = current->blocked;
- current->blocked = newset;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-#ifdef DEBUG_SIG
- printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n",
- saveset, newset, regs, regs->rip);
-#endif
- regs->rax = -EINTR;
- while (1) {
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- if (do_signal(regs, &saveset))
- return -EINTR;
- }
-}
-
-asmlinkage long
sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
struct pt_regs *regs)
{
@@ -308,11 +277,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
#endif
/* Set up registers for signal handler */
- {
- struct exec_domain *ed = current_thread_info()->exec_domain;
- if (unlikely(ed && ed->signal_invmap && sig < 32))
- sig = ed->signal_invmap[sig];
- }
regs->rdi = sig;
/* In case the signal handler was declared without prototypes */
regs->rax = 0;
@@ -341,11 +305,11 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
current->comm, current->pid, frame, regs->rip, frame->pretcode);
#endif
- return 1;
+ return 0;
give_sigsegv:
force_sigsegv(sig, current);
- return 0;
+ return -EFAULT;
}
/*
@@ -408,7 +372,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
#endif
ret = setup_rt_frame(sig, ka, info, oldset, regs);
- if (ret) {
+ if (ret == 0) {
spin_lock_irq(&current->sighand->siglock);
sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
@@ -425,11 +389,12 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
* want to handle. Thus you cannot kill init even with a SIGKILL even by
* mistake.
*/
-int do_signal(struct pt_regs *regs, sigset_t *oldset)
+static void do_signal(struct pt_regs *regs)
{
struct k_sigaction ka;
siginfo_t info;
int signr;
+ sigset_t *oldset;
/*
* We want the common case to go fast, which
@@ -438,9 +403,11 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
* if so.
*/
if (!user_mode(regs))
- return 1;
+ return;
- if (!oldset)
+ if (test_thread_flag(TIF_RESTORE_SIGMASK))
+ oldset = &current->saved_sigmask;
+ else
oldset = &current->blocked;
signr = get_signal_to_deliver(&info, &ka, regs, NULL);
@@ -454,30 +421,46 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
set_debugreg(current->thread.debugreg7, 7);
/* Whee! Actually deliver the signal. */
- return handle_signal(signr, &info, &ka, oldset, regs);
+ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
+ /* a signal was successfully delivered; the saved
+ * sigmask will have been stored in the signal frame,
+ * and will be restored by sigreturn, so we can simply
+ * clear the TIF_RESTORE_SIGMASK flag */
+ clear_thread_flag(TIF_RESTORE_SIGMASK);
+ }
+ return;
}
/* Did we come from a system call? */
if ((long)regs->orig_rax >= 0) {
/* Restart the system call - no handlers present */
long res = regs->rax;
- if (res == -ERESTARTNOHAND ||
- res == -ERESTARTSYS ||
- res == -ERESTARTNOINTR) {
+ switch (res) {
+ case -ERESTARTNOHAND:
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
regs->rax = regs->orig_rax;
regs->rip -= 2;
- }
- if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) {
+ break;
+ case -ERESTART_RESTARTBLOCK:
regs->rax = test_thread_flag(TIF_IA32) ?
__NR_ia32_restart_syscall :
__NR_restart_syscall;
regs->rip -= 2;
+ break;
}
}
- return 0;
+
+ /* if there's no signal to deliver, we just put the saved sigmask
+ back. */
+ if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
+ clear_thread_flag(TIF_RESTORE_SIGMASK);
+ sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
+ }
}
-void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags)
+void
+do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
#ifdef DEBUG_SIG
printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n",
@@ -491,8 +474,8 @@ void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_
}
/* deal with pending signal delivery */
- if (thread_info_flags & _TIF_SIGPENDING)
- do_signal(regs,oldset);
+ if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
+ do_signal(regs);
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 06af6ca6012..4f67697f503 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -522,26 +522,3 @@ asmlinkage void smp_call_function_interrupt(void)
}
}
-int safe_smp_processor_id(void)
-{
- unsigned apicid, i;
-
- if (disable_apic)
- return 0;
-
- apicid = hard_smp_processor_id();
- if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid)
- return apicid;
-
- for (i = 0; i < NR_CPUS; ++i) {
- if (x86_cpu_to_apicid[i] == apicid)
- return i;
- }
-
- /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI,
- * or called too early. Either way, we must be CPU 0. */
- if (x86_cpu_to_apicid[0] == BAD_APICID)
- return 0;
-
- return 0; /* Should not happen */
-}
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 975380207b4..7b7a6870288 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -46,9 +46,10 @@
#include <linux/bootmem.h>
#include <linux/thread_info.h>
#include <linux/module.h>
-
#include <linux/delay.h>
#include <linux/mc146818rtc.h>
+#include <linux/smp.h>
+
#include <asm/mtrr.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
@@ -1090,7 +1091,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
/*
* Switch from PIC to APIC mode.
*/
- connect_bsp_APIC();
setup_local_APIC();
if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
@@ -1175,12 +1175,9 @@ int __cpuinit __cpu_up(unsigned int cpu)
void __init smp_cpus_done(unsigned int max_cpus)
{
smp_cleanup_boot();
-
-#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
-#endif
-
check_nmi_watchdog();
+ time_init_gtod();
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -1233,6 +1230,8 @@ int __cpu_disable(void)
if (cpu == 0)
return -EBUSY;
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ stop_apic_nmi_watchdog(NULL);
clear_local_APIC();
/*
@@ -1272,11 +1271,11 @@ void __cpu_die(unsigned int cpu)
printk(KERN_ERR "CPU %u didn't die...\n", cpu);
}
-__init int setup_additional_cpus(char *s)
+static __init int setup_additional_cpus(char *s)
{
- return get_option(&s, &additional_cpus);
+ return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
}
-__setup("additional_cpus=", setup_additional_cpus);
+early_param("additional_cpus", setup_additional_cpus);
#else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c
index 32cf55eb9af..6026b31d037 100644
--- a/arch/x86_64/kernel/stacktrace.c
+++ b/arch/x86_64/kernel/stacktrace.c
@@ -7,215 +7,49 @@
*/
#include <linux/sched.h>
#include <linux/stacktrace.h>
+#include <linux/module.h>
+#include <asm/stacktrace.h>
-#include <asm/smp.h>
-
-static inline int
-in_range(unsigned long start, unsigned long addr, unsigned long end)
+static void save_stack_warning(void *data, char *msg)
{
- return addr >= start && addr <= end;
}
-static unsigned long
-get_stack_end(struct task_struct *task, unsigned long stack)
+static void
+save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
{
- unsigned long stack_start, stack_end, flags;
- int i, cpu;
-
- /*
- * The most common case is that we are in the task stack:
- */
- stack_start = (unsigned long)task->thread_info;
- stack_end = stack_start + THREAD_SIZE;
-
- if (in_range(stack_start, stack, stack_end))
- return stack_end;
-
- /*
- * We are in an interrupt if irqstackptr is set:
- */
- raw_local_irq_save(flags);
- cpu = safe_smp_processor_id();
- stack_end = (unsigned long)cpu_pda(cpu)->irqstackptr;
-
- if (stack_end) {
- stack_start = stack_end & ~(IRQSTACKSIZE-1);
- if (in_range(stack_start, stack, stack_end))
- goto out_restore;
- /*
- * We get here if we are in an IRQ context but we
- * are also in an exception stack.
- */
- }
-
- /*
- * Iterate over all exception stacks, and figure out whether
- * 'stack' is in one of them:
- */
- for (i = 0; i < N_EXCEPTION_STACKS; i++) {
- /*
- * set 'end' to the end of the exception stack.
- */
- stack_end = per_cpu(init_tss, cpu).ist[i];
- stack_start = stack_end - EXCEPTION_STKSZ;
-
- /*
- * Is 'stack' above this exception frame's end?
- * If yes then skip to the next frame.
- */
- if (stack >= stack_end)
- continue;
- /*
- * Is 'stack' above this exception frame's start address?
- * If yes then we found the right frame.
- */
- if (stack >= stack_start)
- goto out_restore;
-
- /*
- * If this is a debug stack, and if it has a larger size than
- * the usual exception stacks, then 'stack' might still
- * be within the lower portion of the debug stack:
- */
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- if (i == DEBUG_STACK - 1 && stack >= stack_end - DEBUG_STKSZ) {
- /*
- * Black magic. A large debug stack is composed of
- * multiple exception stack entries, which we
- * iterate through now. Dont look:
- */
- do {
- stack_end -= EXCEPTION_STKSZ;
- stack_start -= EXCEPTION_STKSZ;
- } while (stack < stack_start);
-
- goto out_restore;
- }
-#endif
- }
- /*
- * Ok, 'stack' is not pointing to any of the system stacks.
- */
- stack_end = 0;
-
-out_restore:
- raw_local_irq_restore(flags);
-
- return stack_end;
}
-
-/*
- * Save stack-backtrace addresses into a stack_trace buffer:
- */
-static inline unsigned long
-save_context_stack(struct stack_trace *trace, unsigned int skip,
- unsigned long stack, unsigned long stack_end)
+static int save_stack_stack(void *data, char *name)
{
- unsigned long addr;
-
-#ifdef CONFIG_FRAME_POINTER
- unsigned long prev_stack = 0;
+ struct stack_trace *trace = (struct stack_trace *)data;
+ return trace->all_contexts ? 0 : -1;
+}
- while (in_range(prev_stack, stack, stack_end)) {
- pr_debug("stack: %p\n", (void *)stack);
- addr = (unsigned long)(((unsigned long *)stack)[1]);
- pr_debug("addr: %p\n", (void *)addr);
- if (!skip)
- trace->entries[trace->nr_entries++] = addr-1;
- else
- skip--;
- if (trace->nr_entries >= trace->max_entries)
- break;
- if (!addr)
- return 0;
- /*
- * Stack frames must go forwards (otherwise a loop could
- * happen if the stackframe is corrupted), so we move
- * prev_stack forwards:
- */
- prev_stack = stack;
- stack = (unsigned long)(((unsigned long *)stack)[0]);
- }
- pr_debug("invalid: %p\n", (void *)stack);
-#else
- while (stack < stack_end) {
- addr = ((unsigned long *)stack)[0];
- stack += sizeof(long);
- if (__kernel_text_address(addr)) {
- if (!skip)
- trace->entries[trace->nr_entries++] = addr-1;
- else
- skip--;
- if (trace->nr_entries >= trace->max_entries)
- break;
- }
+static void save_stack_address(void *data, unsigned long addr)
+{
+ struct stack_trace *trace = (struct stack_trace *)data;
+ if (trace->skip > 0) {
+ trace->skip--;
+ return;
}
-#endif
- return stack;
+ if (trace->nr_entries < trace->max_entries - 1)
+ trace->entries[trace->nr_entries++] = addr;
}
-#define MAX_STACKS 10
+static struct stacktrace_ops save_stack_ops = {
+ .warning = save_stack_warning,
+ .warning_symbol = save_stack_warning_symbol,
+ .stack = save_stack_stack,
+ .address = save_stack_address,
+};
/*
* Save stack-backtrace addresses into a stack_trace buffer.
- * If all_contexts is set, all contexts (hardirq, softirq and process)
- * are saved. If not set then only the current context is saved.
*/
-void save_stack_trace(struct stack_trace *trace,
- struct task_struct *task, int all_contexts,
- unsigned int skip)
+void save_stack_trace(struct stack_trace *trace, struct task_struct *task)
{
- unsigned long stack = (unsigned long)&stack;
- int i, nr_stacks = 0, stacks_done[MAX_STACKS];
-
- WARN_ON(trace->nr_entries || !trace->max_entries);
-
- if (!task)
- task = current;
-
- pr_debug("task: %p, ti: %p\n", task, task->thread_info);
-
- if (!task || task == current) {
- /* Grab rbp right from our regs: */
- asm ("mov %%rbp, %0" : "=r" (stack));
- pr_debug("rbp: %p\n", (void *)stack);
- } else {
- /* rbp is the last reg pushed by switch_to(): */
- stack = task->thread.rsp;
- pr_debug("other task rsp: %p\n", (void *)stack);
- stack = (unsigned long)(((unsigned long *)stack)[0]);
- pr_debug("other task rbp: %p\n", (void *)stack);
- }
-
- while (1) {
- unsigned long stack_end = get_stack_end(task, stack);
-
- pr_debug("stack: %p\n", (void *)stack);
- pr_debug("stack end: %p\n", (void *)stack_end);
-
- /*
- * Invalid stack addres?
- */
- if (!stack_end)
- return;
- /*
- * Were we in this stack already? (recursion)
- */
- for (i = 0; i < nr_stacks; i++)
- if (stacks_done[i] == stack_end)
- return;
- stacks_done[nr_stacks] = stack_end;
-
- stack = save_context_stack(trace, skip, stack, stack_end);
- if (!all_contexts || !stack ||
- trace->nr_entries >= trace->max_entries)
- return;
- trace->entries[trace->nr_entries++] = ULONG_MAX;
- if (trace->nr_entries >= trace->max_entries)
- return;
- if (++nr_stacks >= MAX_STACKS)
- return;
- }
+ dump_trace(task, NULL, NULL, &save_stack_ops, trace);
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
}
+EXPORT_SYMBOL(save_stack_trace);
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
index 320b6fb00cc..bfbe00763c6 100644
--- a/arch/x86_64/kernel/suspend_asm.S
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -54,7 +54,7 @@ ENTRY(restore_image)
movq %rcx, %cr3;
movq %rax, %cr4; # turn PGE back on
- movq pagedir_nosave(%rip), %rdx
+ movq restore_pblist(%rip), %rdx
loop:
testq %rdx, %rdx
jz done
diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c
index 5530dda3f27..cbabfdf78e0 100644
--- a/arch/x86_64/kernel/tce.c
+++ b/arch/x86_64/kernel/tce.c
@@ -1,4 +1,6 @@
/*
+ * This file manages the translation entries for the IBM Calgary IOMMU.
+ *
* Derived from arch/powerpc/platforms/pseries/iommu.c
*
* Copyright (C) IBM Corporation, 2006
@@ -104,14 +106,6 @@ static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
/* set the tce table size - measured in entries */
tbl->it_size = table_size_to_number_of_entries(specified_table_size);
- tbl->it_base = (unsigned long)tce_table_kva[dev->bus->number];
- if (!tbl->it_base) {
- printk(KERN_ERR "Calgary: iommu_table_setparms: "
- "no table allocated?!\n");
- ret = -ENOMEM;
- goto done;
- }
-
/*
* number of bytes needed for the bitmap size in number of
* entries; we need one bit per entry
@@ -160,8 +154,6 @@ int build_tce_table(struct pci_dev *dev, void __iomem *bbar)
if (ret)
goto free_tbl;
- tce_free(tbl, 0, tbl->it_size);
-
tbl->bbar = bbar;
/*
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 7a9b1822418..557e92af7be 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -24,6 +24,8 @@
#include <linux/device.h>
#include <linux/sysdev.h>
#include <linux/bcd.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
#include <linux/kallsyms.h>
#include <linux/acpi.h>
#ifdef CONFIG_ACPI
@@ -39,9 +41,7 @@
#include <asm/sections.h>
#include <linux/cpufreq.h>
#include <linux/hpet.h>
-#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/apic.h>
-#endif
#ifdef CONFIG_CPU_FREQ
static void cpufreq_delayed_get(void);
@@ -49,7 +49,7 @@ static void cpufreq_delayed_get(void);
extern void i8254_timer_resume(void);
extern int using_apic_timer;
-static char *time_init_gtod(void);
+static char *timename = NULL;
DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);
@@ -77,7 +77,6 @@ unsigned long long monotonic_base;
struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
-unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
struct timespec __xtime __section_xtime;
struct timezone __sys_tz __section_sys_tz;
@@ -119,7 +118,7 @@ unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
void do_gettimeofday(struct timeval *tv)
{
- unsigned long seq, t;
+ unsigned long seq;
unsigned int sec, usec;
do {
@@ -136,10 +135,7 @@ void do_gettimeofday(struct timeval *tv)
be found. Note when you fix it here you need to do the same
in arch/x86_64/kernel/vsyscall.c and export all needed
variables in vmlinux.lds. -AK */
-
- t = (jiffies - wall_jiffies) * USEC_PER_TICK +
- do_gettimeoffset();
- usec += t;
+ usec += do_gettimeoffset();
} while (read_seqretry(&xtime_lock, seq));
@@ -165,8 +161,7 @@ int do_settimeofday(struct timespec *tv)
write_seqlock_irq(&xtime_lock);
- nsec -= do_gettimeoffset() * NSEC_PER_USEC +
- (jiffies - wall_jiffies) * NSEC_PER_TICK;
+ nsec -= do_gettimeoffset() * NSEC_PER_USEC;
wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
@@ -187,20 +182,15 @@ unsigned long profile_pc(struct pt_regs *regs)
{
unsigned long pc = instruction_pointer(regs);
- /* Assume the lock function has either no stack frame or only a single
- word. This checks if the address on the stack looks like a kernel
- text address.
- There is a small window for false hits, but in that case the tick
- is just accounted to the spinlock function.
- Better would be to write these functions in assembler again
- and check exactly. */
+ /* Assume the lock function has either no stack frame or a copy
+ of eflags from PUSHF
+ Eflags always has bits 22 and up cleared unlike kernel addresses. */
if (!user_mode(regs) && in_lock_functions(pc)) {
- char *v = *(char **)regs->rsp;
- if ((v >= _stext && v <= _etext) ||
- (v >= _sinittext && v <= _einittext) ||
- (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
- return (unsigned long)v;
- return ((unsigned long *)regs->rsp)[1];
+ unsigned long *sp = (unsigned long *)regs->rsp;
+ if (sp[0] >> 22)
+ return sp[0];
+ if (sp[1] >> 22)
+ return sp[1];
}
return pc;
}
@@ -281,6 +271,7 @@ static void set_rtc_mmss(unsigned long nowtime)
* Note: This function is required to return accurate
* time even in the absence of multiple timer ticks.
*/
+static inline unsigned long long cycles_2_ns(unsigned long long cyc);
unsigned long long monotonic_clock(void)
{
unsigned long seq;
@@ -305,8 +296,7 @@ unsigned long long monotonic_clock(void)
base = monotonic_base;
} while (read_seqretry(&xtime_lock, seq));
this_offset = get_cycles_sync();
- /* FIXME: 1000 or 1000000? */
- offset = (this_offset - last_offset)*1000 / cpu_khz;
+ offset = cycles_2_ns(this_offset - last_offset);
}
return base + offset;
}
@@ -410,8 +400,7 @@ void main_timer_handler(struct pt_regs *regs)
offset %= USEC_PER_TICK;
}
- /* FIXME: 1000 or 1000000? */
- monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz;
+ monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc);
vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot;
@@ -421,16 +410,16 @@ void main_timer_handler(struct pt_regs *regs)
(((long) offset << US_SCALE) / vxtime.tsc_quot) - 1;
}
- if (lost > 0) {
+ if (lost > 0)
handle_lost_ticks(lost, regs);
- jiffies += lost;
- }
+ else
+ lost = 0;
/*
* Do the timer stuff.
*/
- do_timer(regs);
+ do_timer(lost + 1);
#ifndef CONFIG_SMP
update_process_times(user_mode(regs));
#endif
@@ -441,12 +430,8 @@ void main_timer_handler(struct pt_regs *regs)
* have to call the local interrupt handler.
*/
-#ifndef CONFIG_X86_LOCAL_APIC
- profile_tick(CPU_PROFILING, regs);
-#else
if (!using_apic_timer)
smp_local_timer_interrupt(regs);
-#endif
/*
* If we have an externally synchronized Linux clock, then update CMOS clock
@@ -470,10 +455,8 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
if (apic_runs_main_timer > 1)
return IRQ_HANDLED;
main_timer_handler(regs);
-#ifdef CONFIG_X86_LOCAL_APIC
if (using_apic_timer)
smp_send_timer_broadcast_ipi();
-#endif
return IRQ_HANDLED;
}
@@ -893,11 +876,17 @@ static struct irqaction irq0 = {
timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL
};
-void __init time_init(void)
+static int __cpuinit
+time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu)
{
- char *timename;
- char *gtod;
+ unsigned cpu = (unsigned long) hcpu;
+ if (action == CPU_ONLINE)
+ vsyscall_set_cpu(cpu);
+ return NOTIFY_DONE;
+}
+void __init time_init(void)
+{
if (nohpet)
vxtime.hpet_address = 0;
@@ -931,18 +920,17 @@ void __init time_init(void)
}
vxtime.mode = VXTIME_TSC;
- gtod = time_init_gtod();
-
- printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n",
- vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod);
- printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
- cpu_khz / 1000, cpu_khz % 1000);
vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz;
vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
vxtime.last_tsc = get_cycles_sync();
+ set_cyc2ns_scale(cpu_khz);
setup_irq(0, &irq0);
+ hotcpu_notifier(time_cpu_notifier, 0);
+ time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id());
- set_cyc2ns_scale(cpu_khz);
+#ifndef CONFIG_SMP
+ time_init_gtod();
+#endif
}
/*
@@ -973,12 +961,18 @@ __cpuinit int unsynchronized_tsc(void)
/*
* Decide what mode gettimeofday should use.
*/
-__init static char *time_init_gtod(void)
+void time_init_gtod(void)
{
char *timetype;
if (unsynchronized_tsc())
notsc = 1;
+
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
+ vgetcpu_mode = VGETCPU_RDTSCP;
+ else
+ vgetcpu_mode = VGETCPU_LSL;
+
if (vxtime.hpet_address && notsc) {
timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
if (hpet_use_timer)
@@ -1001,7 +995,16 @@ __init static char *time_init_gtod(void)
timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC";
vxtime.mode = VXTIME_TSC;
}
- return timetype;
+
+ printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n",
+ vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype);
+ printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
+ cpu_khz / 1000, cpu_khz % 1000);
+ vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz;
+ vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
+ vxtime.last_tsc = get_cycles_sync();
+
+ set_cyc2ns_scale(cpu_khz);
}
__setup("report_lost_ticks", time_setup);
@@ -1031,8 +1034,16 @@ static int timer_resume(struct sys_device *dev)
unsigned long flags;
unsigned long sec;
unsigned long ctime = get_cmos_time();
- unsigned long sleep_length = (ctime - sleep_start) * HZ;
+ long sleep_length = (ctime - sleep_start) * HZ;
+ if (sleep_length < 0) {
+ printk(KERN_WARNING "Time skew detected in timer resume!\n");
+ /* The time after the resume must not be earlier than the time
+ * before the suspend or some nasty things will happen
+ */
+ sleep_length = 0;
+ ctime = sleep_start;
+ }
if (vxtime.hpet_address)
hpet_reenable();
else
@@ -1055,7 +1066,6 @@ static int timer_resume(struct sys_device *dev)
vxtime.last_tsc = get_cycles_sync();
write_sequnlock_irqrestore(&xtime_lock,flags);
jiffies += sleep_length;
- wall_jiffies += sleep_length;
monotonic_base += sleep_length * (NSEC_PER_SEC/HZ);
touch_softlockup_watchdog();
return 0;
@@ -1148,23 +1158,25 @@ int hpet_rtc_timer_init(void)
hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
local_irq_save(flags);
+
cnt = hpet_readl(HPET_COUNTER);
cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
hpet_writel(cnt, HPET_T1_CMP);
hpet_t1_cmp = cnt;
- local_irq_restore(flags);
cfg = hpet_readl(HPET_T1_CFG);
cfg &= ~HPET_TN_PERIODIC;
cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
hpet_writel(cfg, HPET_T1_CFG);
+ local_irq_restore(flags);
+
return 1;
}
static void hpet_rtc_timer_reinit(void)
{
- unsigned int cfg, cnt;
+ unsigned int cfg, cnt, ticks_per_int, lost_ints;
if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
cfg = hpet_readl(HPET_T1_CFG);
@@ -1179,10 +1191,33 @@ static void hpet_rtc_timer_reinit(void)
hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
/* It is more accurate to use the comparator value than current count.*/
- cnt = hpet_t1_cmp;
- cnt += hpet_tick*HZ/hpet_rtc_int_freq;
- hpet_writel(cnt, HPET_T1_CMP);
- hpet_t1_cmp = cnt;
+ ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
+ hpet_t1_cmp += ticks_per_int;
+ hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+ /*
+ * If the interrupt handler was delayed too long, the write above tries
+ * to schedule the next interrupt in the past and the hardware would
+ * not interrupt until the counter had wrapped around.
+ * So we have to check that the comparator wasn't set to a past time.
+ */
+ cnt = hpet_readl(HPET_COUNTER);
+ if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
+ lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
+ /* Make sure that, even with the time needed to execute
+ * this code, the next scheduled interrupt has been moved
+ * back to the future: */
+ lost_ints++;
+
+ hpet_t1_cmp += lost_ints * ticks_per_int;
+ hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+ if (PIE_on)
+ PIE_count += lost_ints;
+
+ printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
+ hpet_rtc_int_freq);
+ }
}
/*
diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S
index 23a03eb91fc..c79b99a9e2f 100644
--- a/arch/x86_64/kernel/trampoline.S
+++ b/arch/x86_64/kernel/trampoline.S
@@ -64,7 +64,7 @@ idt_48:
.word 0, 0 # idt base = 0L
gdt_48:
- .short __KERNEL32_CS + 7 # gdt limit
+ .short GDT_ENTRIES*8 - 1 # gdt limit
.long cpu_gdt_table-__START_KERNEL_map
.globl trampoline_end
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index b1249774d1e..01f2a8d254c 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -45,6 +45,7 @@
#include <asm/pda.h>
#include <asm/proto.h>
#include <asm/nmi.h>
+#include <asm/stacktrace.h>
asmlinkage void divide_error(void);
asmlinkage void debug(void);
@@ -142,7 +143,7 @@ void printk_address(unsigned long address)
#endif
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
- unsigned *usedp, const char **idp)
+ unsigned *usedp, char **idp)
{
static char ids[][8] = {
[DEBUG_STACK - 1] = "#DB",
@@ -161,26 +162,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
* 'stack' is in one of them:
*/
for (k = 0; k < N_EXCEPTION_STACKS; k++) {
- unsigned long end;
-
- /*
- * set 'end' to the end of the exception stack.
- */
- switch (k + 1) {
- /*
- * TODO: this block is not needed i think, because
- * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK]
- * properly too.
- */
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- case DEBUG_STACK:
- end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
- break;
-#endif
- default:
- end = per_cpu(orig_ist, cpu).ist[k];
- break;
- }
+ unsigned long end = per_cpu(orig_ist, cpu).ist[k];
/*
* Is 'stack' above this exception frame's end?
* If yes then skip to the next frame.
@@ -234,13 +216,19 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
return NULL;
}
-static int show_trace_unwind(struct unwind_frame_info *info, void *context)
+struct ops_and_data {
+ struct stacktrace_ops *ops;
+ void *data;
+};
+
+static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
{
+ struct ops_and_data *oad = (struct ops_and_data *)context;
int n = 0;
while (unwind(info) == 0 && UNW_PC(info)) {
n++;
- printk_address(UNW_PC(info));
+ oad->ops->address(oad->data, UNW_PC(info));
if (arch_unw_user_mode(info))
break;
}
@@ -254,45 +242,53 @@ static int show_trace_unwind(struct unwind_frame_info *info, void *context)
* severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
*/
-void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack)
+void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack,
+ struct stacktrace_ops *ops, void *data)
{
- const unsigned cpu = safe_smp_processor_id();
+ const unsigned cpu = smp_processor_id();
unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
unsigned used = 0;
- printk("\nCall Trace:\n");
-
if (!tsk)
tsk = current;
if (call_trace >= 0) {
int unw_ret = 0;
struct unwind_frame_info info;
+ struct ops_and_data oad = { .ops = ops, .data = data };
if (regs) {
if (unwind_init_frame_info(&info, tsk, regs) == 0)
- unw_ret = show_trace_unwind(&info, NULL);
+ unw_ret = dump_trace_unwind(&info, &oad);
} else if (tsk == current)
- unw_ret = unwind_init_running(&info, show_trace_unwind, NULL);
+ unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
else {
if (unwind_init_blocked(&info, tsk) == 0)
- unw_ret = show_trace_unwind(&info, NULL);
+ unw_ret = dump_trace_unwind(&info, &oad);
}
if (unw_ret > 0) {
if (call_trace == 1 && !arch_unw_user_mode(&info)) {
- print_symbol("DWARF2 unwinder stuck at %s\n",
+ ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
UNW_PC(&info));
if ((long)UNW_SP(&info) < 0) {
- printk("Leftover inexact backtrace:\n");
+ ops->warning(data, "Leftover inexact backtrace:\n");
stack = (unsigned long *)UNW_SP(&info);
+ if (!stack)
+ return;
} else
- printk("Full inexact backtrace again:\n");
+ ops->warning(data, "Full inexact backtrace again:\n");
} else if (call_trace >= 1)
return;
else
- printk("Full inexact backtrace again:\n");
+ ops->warning(data, "Full inexact backtrace again:\n");
} else
- printk("Inexact backtrace:\n");
+ ops->warning(data, "Inexact backtrace:\n");
+ }
+ if (!stack) {
+ unsigned long dummy;
+ stack = &dummy;
+ if (tsk && tsk != current)
+ stack = (unsigned long *)tsk->thread.rsp;
}
/*
@@ -303,7 +299,9 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
#define HANDLE_STACK(cond) \
do while (cond) { \
unsigned long addr = *stack++; \
- if (kernel_text_address(addr)) { \
+ if (oops_in_progress ? \
+ __kernel_text_address(addr) : \
+ kernel_text_address(addr)) { \
/* \
* If the address is either in the text segment of the \
* kernel, or in the region which contains vmalloc'ed \
@@ -312,7 +310,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
* down the cause of the crash will be able to figure \
* out the call path that was taken. \
*/ \
- printk_address(addr); \
+ ops->address(data, addr); \
} \
} while (0)
@@ -321,16 +319,17 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
* current stack address. If the stacks consist of nested
* exceptions
*/
- for ( ; ; ) {
- const char *id;
+ for (;;) {
+ char *id;
unsigned long *estack_end;
estack_end = in_exception_stack(cpu, (unsigned long)stack,
&used, &id);
if (estack_end) {
- printk(" <%s>", id);
+ if (ops->stack(data, id) < 0)
+ break;
HANDLE_STACK (stack < estack_end);
- printk(" <EOE>");
+ ops->stack(data, "<EOE>");
/*
* We link to the next stack via the
* second-to-last pointer (index -2 to end) in the
@@ -345,7 +344,8 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
(IRQSTACKSIZE - 64) / sizeof(*irqstack);
if (stack >= irqstack && stack < irqstack_end) {
- printk(" <IRQ>");
+ if (ops->stack(data, "IRQ") < 0)
+ break;
HANDLE_STACK (stack < irqstack_end);
/*
* We link to the next stack (which would be
@@ -354,7 +354,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
*/
stack = (unsigned long *) (irqstack_end[-1]);
irqstack_end = NULL;
- printk(" <EOI>");
+ ops->stack(data, "EOI");
continue;
}
}
@@ -362,19 +362,57 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
}
/*
- * This prints the process stack:
+ * This handles the process stack:
*/
HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
#undef HANDLE_STACK
+}
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ print_symbol(msg, symbol);
+ printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+ printk("%s\n", msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+ printk(" <%s> ", name);
+ return 0;
+}
+
+static void print_trace_address(void *data, unsigned long addr)
+{
+ printk_address(addr);
+}
+
+static struct stacktrace_ops print_trace_ops = {
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
+};
+void
+show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
+{
+ printk("\nCall Trace:\n");
+ dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
printk("\n");
}
-static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp)
+static void
+_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
{
unsigned long *stack;
int i;
- const int cpu = safe_smp_processor_id();
+ const int cpu = smp_processor_id();
unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
@@ -428,7 +466,7 @@ void show_registers(struct pt_regs *regs)
int i;
int in_kernel = !user_mode(regs);
unsigned long rsp;
- const int cpu = safe_smp_processor_id();
+ const int cpu = smp_processor_id();
struct task_struct *cur = cpu_pda(cpu)->pcurrent;
rsp = regs->rsp;
@@ -503,9 +541,11 @@ static unsigned int die_nest_count;
unsigned __kprobes long oops_begin(void)
{
- int cpu = safe_smp_processor_id();
+ int cpu = smp_processor_id();
unsigned long flags;
+ oops_enter();
+
/* racy, but better than risking deadlock. */
local_irq_save(flags);
if (!spin_trylock(&die_lock)) {
@@ -534,6 +574,7 @@ void __kprobes oops_end(unsigned long flags)
spin_unlock_irqrestore(&die_lock, flags);
if (panic_on_oops)
panic("Fatal exception");
+ oops_exit();
}
void __kprobes __die(const char * str, struct pt_regs * regs, long err)
@@ -570,7 +611,7 @@ void die(const char * str, struct pt_regs * regs, long err)
do_exit(SIGSEGV);
}
-void __kprobes die_nmi(char *str, struct pt_regs *regs)
+void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
{
unsigned long flags = oops_begin();
@@ -578,13 +619,12 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs)
* We are in trouble anyway, lets at least try
* to get a message out.
*/
- printk(str, safe_smp_processor_id());
+ printk(str, smp_processor_id());
show_registers(regs);
if (kexec_should_crash(current))
crash_kexec(regs);
- if (panic_on_timeout || panic_on_oops)
- panic("nmi watchdog");
- printk("console shuts up ...\n");
+ if (do_panic || panic_on_oops)
+ panic("Non maskable interrupt");
oops_end(flags);
nmi_exit();
local_irq_enable();
@@ -730,8 +770,15 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
static __kprobes void
mem_parity_error(unsigned char reason, struct pt_regs * regs)
{
- printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
- printk("You probably have a hardware problem with your RAM chips\n");
+ printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
+ reason);
+ printk(KERN_EMERG "You probably have a hardware problem with your "
+ "RAM chips\n");
+
+ if (panic_on_unrecovered_nmi)
+ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
/* Clear and disable the memory parity error line. */
reason = (reason & 0xf) | 4;
@@ -754,9 +801,15 @@ io_check_error(unsigned char reason, struct pt_regs * regs)
static __kprobes void
unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
-{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
- printk("Dazed and confused, but trying to continue\n");
- printk("Do you have a strange power saving mode enabled?\n");
+{
+ printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
+ reason);
+ printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+
+ if (panic_on_unrecovered_nmi)
+ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
}
/* Runs on IST stack. This code must keep interrupts off all the time.
@@ -776,17 +829,15 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
== NOTIFY_STOP)
return;
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* Ok, so this is none of the documented NMI sources,
* so it must be the NMI watchdog.
*/
- if (nmi_watchdog > 0) {
- nmi_watchdog_tick(regs,reason);
+ if (nmi_watchdog_tick(regs,reason))
return;
- }
-#endif
- unknown_nmi_error(reason, regs);
+ if (!do_nmi_callback(regs,cpu))
+ unknown_nmi_error(reason, regs);
+
return;
}
if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
@@ -1071,6 +1122,7 @@ asmlinkage void math_state_restore(void)
init_fpu(me);
restore_fpu_checking(&me->thread.i387.fxsave);
task_thread_info(me)->status |= TS_USEDFPU;
+ me->fpu_counter++;
}
void __init trap_init(void)
@@ -1109,24 +1161,30 @@ void __init trap_init(void)
}
-/* Actual parsing is done early in setup.c. */
-static int __init oops_dummy(char *s)
+static int __init oops_setup(char *s)
{
- panic_on_oops = 1;
- return 1;
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "panic"))
+ panic_on_oops = 1;
+ return 0;
}
-__setup("oops=", oops_dummy);
+early_param("oops", oops_setup);
static int __init kstack_setup(char *s)
{
+ if (!s)
+ return -EINVAL;
kstack_depth_to_print = simple_strtoul(s,NULL,0);
- return 1;
+ return 0;
}
-__setup("kstack=", kstack_setup);
+early_param("kstack", kstack_setup);
#ifdef CONFIG_STACK_UNWIND
static int __init call_trace_setup(char *s)
{
+ if (!s)
+ return -EINVAL;
if (strcmp(s, "old") == 0)
call_trace = -1;
else if (strcmp(s, "both") == 0)
@@ -1135,7 +1193,7 @@ static int __init call_trace_setup(char *s)
call_trace = 1;
else if (strcmp(s, "new") == 0)
call_trace = 2;
- return 1;
+ return 0;
}
-__setup("call_trace=", call_trace_setup);
+early_param("call_trace", call_trace_setup);
#endif
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 7c4de31471d..b9df2ab6529 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -13,6 +13,12 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
OUTPUT_ARCH(i386:x86-64)
ENTRY(phys_startup_64)
jiffies_64 = jiffies;
+PHDRS {
+ text PT_LOAD FLAGS(5); /* R_E */
+ data PT_LOAD FLAGS(7); /* RWE */
+ user PT_LOAD FLAGS(7); /* RWE */
+ note PT_NOTE FLAGS(4); /* R__ */
+}
SECTIONS
{
. = __START_KERNEL;
@@ -31,7 +37,7 @@ SECTIONS
KPROBES_TEXT
*(.fixup)
*(.gnu.warning)
- } = 0x9090
+ } :text = 0x9090
/* out-of-line lock text */
.text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
@@ -57,17 +63,10 @@ SECTIONS
.data : AT(ADDR(.data) - LOAD_OFFSET) {
*(.data)
CONSTRUCTORS
- }
+ } :data
_edata = .; /* End of data section */
- __bss_start = .; /* BSS */
- .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
- *(.bss.page_aligned)
- *(.bss)
- }
- __bss_stop = .;
-
. = ALIGN(PAGE_SIZE);
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
.data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
@@ -89,7 +88,7 @@ SECTIONS
#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
. = VSYSCALL_ADDR;
- .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) }
+ .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
__vsyscall_0 = VSYSCALL_VIRT_ADDR;
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
@@ -99,8 +98,8 @@ SECTIONS
.vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
vxtime = VVIRT(.vxtime);
- .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) }
- wall_jiffies = VVIRT(.wall_jiffies);
+ .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
+ vgetcpu_mode = VVIRT(.vgetcpu_mode);
.sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) }
sys_tz = VVIRT(.sys_tz);
@@ -132,7 +131,7 @@ SECTIONS
. = ALIGN(8192); /* init_task */
.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
*(.data.init_task)
- }
+ } :data
. = ALIGN(4096);
.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
@@ -207,14 +206,12 @@ SECTIONS
__initramfs_start = .;
.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
__initramfs_end = .;
- /* temporary here to work around NR_CPUS. If you see this comment in 2.6.17+
- complain */
- . = ALIGN(4096);
- __init_end = .;
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
__per_cpu_start = .;
.data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
__per_cpu_end = .;
+ . = ALIGN(4096);
+ __init_end = .;
. = ALIGN(4096);
__nosave_begin = .;
@@ -222,6 +219,13 @@ SECTIONS
. = ALIGN(4096);
__nosave_end = .;
+ __bss_start = .; /* BSS */
+ .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+ *(.bss.page_aligned)
+ *(.bss)
+ }
+ __bss_stop = .;
+
_end = . ;
/* Sections to be discarded */
diff --git a/arch/x86_64/kernel/vsmp.c b/arch/x86_64/kernel/vsmp.c
index 92f70c74965..044e852bd25 100644
--- a/arch/x86_64/kernel/vsmp.c
+++ b/arch/x86_64/kernel/vsmp.c
@@ -20,6 +20,9 @@ static int __init vsmp_init(void)
void *address;
unsigned int cap, ctl;
+ if (!early_pci_allowed())
+ return 0;
+
/* Check if we are running on a ScaleMP vSMP box */
if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) ||
(read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index f603037df16..a98b460af6a 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -26,6 +26,7 @@
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
+#include <linux/getcpu.h>
#include <asm/vsyscall.h>
#include <asm/pgtable.h>
@@ -33,11 +34,15 @@
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
+#include <asm/topology.h>
#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
+int __vgetcpu_mode __section_vgetcpu_mode;
#include <asm/unistd.h>
@@ -61,8 +66,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
sequence = read_seqbegin(&__xtime_lock);
sec = __xtime.tv_sec;
- usec = (__xtime.tv_nsec / 1000) +
- (__jiffies - __wall_jiffies) * (1000000 / HZ);
+ usec = __xtime.tv_nsec / 1000;
if (__vxtime.mode != VXTIME_HPET) {
t = get_cycles_sync();
@@ -72,7 +76,8 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
__vxtime.tsc_quot) >> 32;
/* See comment in x86_64 do_gettimeofday. */
} else {
- usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
+ usec += ((readl((void __iomem *)
+ fix_to_virt(VSYSCALL_HPET) + 0xf0) -
__vxtime.last) * __vxtime.quot) >> 32;
}
} while (read_seqretry(&__xtime_lock, sequence));
@@ -127,9 +132,46 @@ time_t __vsyscall(1) vtime(time_t *t)
return __xtime.tv_sec;
}
-long __vsyscall(2) venosys_0(void)
+/* Fast way to get current CPU and node.
+ This helps to do per node and per CPU caches in user space.
+ The result is not guaranteed without CPU affinity, but usually
+ works out because the scheduler tries to keep a thread on the same
+ CPU.
+
+ tcache must point to a two element sized long array.
+ All arguments can be NULL. */
+long __vsyscall(2)
+vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
- return -ENOSYS;
+ unsigned int dummy, p;
+ unsigned long j = 0;
+
+ /* Fast cache - only recompute value once per jiffies and avoid
+ relatively costly rdtscp/cpuid otherwise.
+ This works because the scheduler usually keeps the process
+ on the same CPU and this syscall doesn't guarantee its
+ results anyways.
+ We do this here because otherwise user space would do it on
+ its own in a likely inferior way (no access to jiffies).
+ If you don't like it pass NULL. */
+ if (tcache && tcache->blob[0] == (j = __jiffies)) {
+ p = tcache->blob[1];
+ } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
+ /* Load per CPU data from RDTSCP */
+ rdtscp(dummy, dummy, p);
+ } else {
+ /* Load per CPU data from GDT */
+ asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+ }
+ if (tcache) {
+ tcache->blob[0] = j;
+ tcache->blob[1] = p;
+ }
+ if (cpu)
+ *cpu = p & 0xfff;
+ if (node)
+ *node = p >> 12;
+ return 0;
}
long __vsyscall(3) venosys_1(void)
@@ -149,7 +191,8 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
extern u16 vsysc1, vsysc2;
- u16 *map1, *map2;
+ u16 __iomem *map1;
+ u16 __iomem *map2;
int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
if (!write)
return ret;
@@ -164,11 +207,11 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
goto out;
}
if (!sysctl_vsyscall) {
- *map1 = SYSCALL;
- *map2 = SYSCALL;
+ writew(SYSCALL, map1);
+ writew(SYSCALL, map2);
} else {
- *map1 = NOP2;
- *map2 = NOP2;
+ writew(NOP2, map1);
+ writew(NOP2, map2);
}
iounmap(map2);
out:
@@ -200,6 +243,43 @@ static ctl_table kernel_root_table2[] = {
#endif
+static void __cpuinit write_rdtscp_cb(void *info)
+{
+ write_rdtscp_aux((unsigned long)info);
+}
+
+void __cpuinit vsyscall_set_cpu(int cpu)
+{
+ unsigned long *d;
+ unsigned long node = 0;
+#ifdef CONFIG_NUMA
+ node = cpu_to_node[cpu];
+#endif
+ if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) {
+ void *info = (void *)((node << 12) | cpu);
+ /* Can happen on preemptive kernel */
+ if (get_cpu() == cpu)
+ write_rdtscp_cb(info);
+#ifdef CONFIG_SMP
+ else {
+ /* the notifier is unfortunately not executed on the
+ target CPU */
+ smp_call_function_single(cpu,write_rdtscp_cb,info,0,1);
+ }
+#endif
+ put_cpu();
+ }
+
+ /* Store cpu number in limit so that it can be loaded quickly
+ in user space in vgetcpu.
+ 12 bits for the CPU and 8 bits for the node. */
+ d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
+ *d = 0x0f40000000000ULL;
+ *d |= cpu;
+ *d |= (node & 0xf) << 12;
+ *d |= (node >> 4) << 48;
+}
+
static void __init map_vsyscall(void)
{
extern char __vsyscall_0;
@@ -214,6 +294,7 @@ static int __init vsyscall_init(void)
VSYSCALL_ADDR(__NR_vgettimeofday)));
BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
+ BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
map_vsyscall();
#ifdef CONFIG_SYSCTL
register_sysctl_table(kernel_root_table2, 0);
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 370952c4ff2..c3454af5e3a 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -29,6 +29,7 @@ EXPORT_SYMBOL(__put_user_8);
EXPORT_SYMBOL(copy_user_generic);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_to_user);
+EXPORT_SYMBOL(__copy_from_user_inatomic);
EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);