summaryrefslogtreecommitdiffstats
path: root/arch/i386/kernel
diff options
context:
space:
mode:
authorSteven Whitehouse <swhiteho@redhat.com>2006-09-28 08:29:59 -0400
committerSteven Whitehouse <swhiteho@redhat.com>2006-09-28 08:29:59 -0400
commit185a257f2f73bcd89050ad02da5bedbc28fc43fa (patch)
tree5e32586114534ed3f2165614cba3d578f5d87307 /arch/i386/kernel
parent3f1a9aaeffd8d1cbc5ab9776c45cbd66af1c9699 (diff)
parenta77c64c1a641950626181b4857abb701d8f38ccc (diff)
Merge branch 'master' into gfs2
Diffstat (limited to 'arch/i386/kernel')
-rw-r--r--arch/i386/kernel/Makefile3
-rw-r--r--arch/i386/kernel/acpi/Makefile2
-rw-r--r--arch/i386/kernel/acpi/boot.c181
-rw-r--r--arch/i386/kernel/acpi/earlyquirk.c6
-rw-r--r--arch/i386/kernel/apic.c31
-rw-r--r--arch/i386/kernel/apm.c26
-rw-r--r--arch/i386/kernel/cpu/amd.c7
-rw-r--r--arch/i386/kernel/cpu/centaur.c24
-rw-r--r--arch/i386/kernel/cpu/common.c8
-rw-r--r--arch/i386/kernel/cpu/cpu.h2
-rw-r--r--arch/i386/kernel/cpu/cyrix.c42
-rw-r--r--arch/i386/kernel/cpu/intel.c3
-rw-r--r--arch/i386/kernel/cpu/mcheck/Makefile2
-rw-r--r--arch/i386/kernel/cpu/mcheck/p4.c26
-rw-r--r--arch/i386/kernel/cpu/mcheck/therm_throt.c180
-rw-r--r--arch/i386/kernel/cpu/mtrr/generic.c4
-rw-r--r--arch/i386/kernel/cpu/nexgen.c9
-rw-r--r--arch/i386/kernel/cpu/proc.c4
-rw-r--r--arch/i386/kernel/cpu/rise.c4
-rw-r--r--arch/i386/kernel/cpu/transmeta.c7
-rw-r--r--arch/i386/kernel/cpu/umc.c7
-rw-r--r--arch/i386/kernel/crash.c22
-rw-r--r--arch/i386/kernel/efi_stub.S1
-rw-r--r--arch/i386/kernel/entry.S110
-rw-r--r--arch/i386/kernel/head.S67
-rw-r--r--arch/i386/kernel/i8259.c6
-rw-r--r--arch/i386/kernel/io_apic.c125
-rw-r--r--arch/i386/kernel/machine_kexec.c140
-rw-r--r--arch/i386/kernel/mca.c8
-rw-r--r--arch/i386/kernel/microcode.c774
-rw-r--r--arch/i386/kernel/mpparse.c70
-rw-r--r--arch/i386/kernel/nmi.c940
-rw-r--r--arch/i386/kernel/process.c14
-rw-r--r--arch/i386/kernel/ptrace.c10
-rw-r--r--arch/i386/kernel/reboot.c12
-rw-r--r--arch/i386/kernel/relocate_kernel.S162
-rw-r--r--arch/i386/kernel/semaphore.c134
-rw-r--r--arch/i386/kernel/setup.c408
-rw-r--r--arch/i386/kernel/smp.c66
-rw-r--r--arch/i386/kernel/smpboot.c25
-rw-r--r--arch/i386/kernel/srat.c102
-rw-r--r--arch/i386/kernel/stacktrace.c98
-rw-r--r--arch/i386/kernel/syscall_table.S1
-rw-r--r--arch/i386/kernel/time.c73
-rw-r--r--arch/i386/kernel/time_hpet.c37
-rw-r--r--arch/i386/kernel/topology.c21
-rw-r--r--arch/i386/kernel/traps.c242
-rw-r--r--arch/i386/kernel/tsc.c2
-rw-r--r--arch/i386/kernel/vmlinux.lds.S12
49 files changed, 2584 insertions, 1676 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 5427a842e84..1a884b6e6e5 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -4,7 +4,7 @@
extra-y := head.o init_task.o vmlinux.lds
-obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \
+obj-y := process.o signal.o entry.o traps.o irq.o \
ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
pci-dma.o i386_ksyms.o i387.o bootflag.o \
quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
@@ -81,4 +81,5 @@ $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
$(call if_changed,syscall)
k8-y += ../../x86_64/kernel/k8.o
+stacktrace-y += ../../x86_64/kernel/stacktrace.o
diff --git a/arch/i386/kernel/acpi/Makefile b/arch/i386/kernel/acpi/Makefile
index 7e9ac99354f..7f7be01f44e 100644
--- a/arch/i386/kernel/acpi/Makefile
+++ b/arch/i386/kernel/acpi/Makefile
@@ -1,5 +1,7 @@
obj-$(CONFIG_ACPI) += boot.o
+ifneq ($(CONFIG_PCI),)
obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o
+endif
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index ee003bc0e8b..1aaea6ab8c4 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -26,9 +26,12 @@
#include <linux/init.h>
#include <linux/acpi.h>
#include <linux/efi.h>
+#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/dmi.h>
#include <linux/irq.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
#include <asm/pgtable.h>
#include <asm/io_apic.h>
@@ -36,11 +39,17 @@
#include <asm/io.h>
#include <asm/mpspec.h>
-#ifdef CONFIG_X86_64
+static int __initdata acpi_force = 0;
-extern void __init clustered_apic_check(void);
+#ifdef CONFIG_ACPI
+int acpi_disabled = 0;
+#else
+int acpi_disabled = 1;
+#endif
+EXPORT_SYMBOL(acpi_disabled);
+
+#ifdef CONFIG_X86_64
-extern int gsi_irq_sharing(int gsi);
#include <asm/proto.h>
static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
@@ -506,16 +515,76 @@ EXPORT_SYMBOL(acpi_register_gsi);
#ifdef CONFIG_ACPI_HOTPLUG_CPU
int acpi_map_lsapic(acpi_handle handle, int *pcpu)
{
- /* TBD */
- return -EINVAL;
+ struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+ union acpi_object *obj;
+ struct acpi_table_lapic *lapic;
+ cpumask_t tmp_map, new_map;
+ u8 physid;
+ int cpu;
+
+ if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
+ return -EINVAL;
+
+ if (!buffer.length || !buffer.pointer)
+ return -EINVAL;
+
+ obj = buffer.pointer;
+ if (obj->type != ACPI_TYPE_BUFFER ||
+ obj->buffer.length < sizeof(*lapic)) {
+ kfree(buffer.pointer);
+ return -EINVAL;
+ }
+
+ lapic = (struct acpi_table_lapic *)obj->buffer.pointer;
+
+ if ((lapic->header.type != ACPI_MADT_LAPIC) ||
+ (!lapic->flags.enabled)) {
+ kfree(buffer.pointer);
+ return -EINVAL;
+ }
+
+ physid = lapic->id;
+
+ kfree(buffer.pointer);
+ buffer.length = ACPI_ALLOCATE_BUFFER;
+ buffer.pointer = NULL;
+
+ tmp_map = cpu_present_map;
+ mp_register_lapic(physid, lapic->flags.enabled);
+
+ /*
+ * If mp_register_lapic successfully generates a new logical cpu
+ * number, then the following will get us exactly what was mapped
+ */
+ cpus_andnot(new_map, cpu_present_map, tmp_map);
+ if (cpus_empty(new_map)) {
+ printk ("Unable to map lapic to logical cpu number\n");
+ return -EINVAL;
+ }
+
+ cpu = first_cpu(new_map);
+
+ *pcpu = cpu;
+ return 0;
}
EXPORT_SYMBOL(acpi_map_lsapic);
int acpi_unmap_lsapic(int cpu)
{
- /* TBD */
- return -EINVAL;
+ int i;
+
+ for_each_possible_cpu(i) {
+ if (x86_acpiid_to_apicid[i] == x86_cpu_to_apicid[cpu]) {
+ x86_acpiid_to_apicid[i] = -1;
+ break;
+ }
+ }
+ x86_cpu_to_apicid[cpu] = -1;
+ cpu_clear(cpu, cpu_present_map);
+ num_processors--;
+
+ return (0);
}
EXPORT_SYMBOL(acpi_unmap_lsapic);
@@ -579,6 +648,8 @@ static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size)
static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
{
struct acpi_table_hpet *hpet_tbl;
+ struct resource *hpet_res;
+ resource_size_t res_start;
if (!phys || !size)
return -EINVAL;
@@ -594,12 +665,26 @@ static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
"memory.\n");
return -1;
}
+
+#define HPET_RESOURCE_NAME_SIZE 9
+ hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
+ if (hpet_res) {
+ memset(hpet_res, 0, sizeof(*hpet_res));
+ hpet_res->name = (void *)&hpet_res[1];
+ hpet_res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE,
+ "HPET %u", hpet_tbl->number);
+ hpet_res->end = (1 * 1024) - 1;
+ }
+
#ifdef CONFIG_X86_64
vxtime.hpet_address = hpet_tbl->addr.addrl |
((long)hpet_tbl->addr.addrh << 32);
printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
hpet_tbl->id, vxtime.hpet_address);
+
+ res_start = vxtime.hpet_address;
#else /* X86 */
{
extern unsigned long hpet_address;
@@ -607,9 +692,17 @@ static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
hpet_address = hpet_tbl->addr.addrl;
printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
hpet_tbl->id, hpet_address);
+
+ res_start = hpet_address;
}
#endif /* X86 */
+ if (hpet_res) {
+ hpet_res->start = res_start;
+ hpet_res->end += res_start;
+ insert_resource(&iomem_resource, hpet_res);
+ }
+
return 0;
}
#else
@@ -860,8 +953,6 @@ static void __init acpi_process_madt(void)
return;
}
-extern int acpi_force;
-
#ifdef __i386__
static int __init disable_acpi_irq(struct dmi_system_id *d)
@@ -1163,3 +1254,75 @@ int __init acpi_boot_init(void)
return 0;
}
+
+static int __init parse_acpi(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ /* "acpi=off" disables both ACPI table parsing and interpreter */
+ if (strcmp(arg, "off") == 0) {
+ disable_acpi();
+ }
+ /* acpi=force to over-ride black-list */
+ else if (strcmp(arg, "force") == 0) {
+ acpi_force = 1;
+ acpi_ht = 1;
+ acpi_disabled = 0;
+ }
+ /* acpi=strict disables out-of-spec workarounds */
+ else if (strcmp(arg, "strict") == 0) {
+ acpi_strict = 1;
+ }
+ /* Limit ACPI just to boot-time to enable HT */
+ else if (strcmp(arg, "ht") == 0) {
+ if (!acpi_force)
+ disable_acpi();
+ acpi_ht = 1;
+ }
+ /* "acpi=noirq" disables ACPI interrupt routing */
+ else if (strcmp(arg, "noirq") == 0) {
+ acpi_noirq_set();
+ } else {
+ /* Core will printk when we return error. */
+ return -EINVAL;
+ }
+ return 0;
+}
+early_param("acpi", parse_acpi);
+
+/* FIXME: Using pci= for an ACPI parameter is a travesty. */
+static int __init parse_pci(char *arg)
+{
+ if (arg && strcmp(arg, "noacpi") == 0)
+ acpi_disable_pci();
+ return 0;
+}
+early_param("pci", parse_pci);
+
+#ifdef CONFIG_X86_IO_APIC
+static int __init parse_acpi_skip_timer_override(char *arg)
+{
+ acpi_skip_timer_override = 1;
+ return 0;
+}
+early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override);
+#endif /* CONFIG_X86_IO_APIC */
+
+static int __init setup_acpi_sci(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "edge"))
+ acpi_sci_flags.trigger = 1;
+ else if (!strcmp(s, "level"))
+ acpi_sci_flags.trigger = 3;
+ else if (!strcmp(s, "high"))
+ acpi_sci_flags.polarity = 1;
+ else if (!strcmp(s, "low"))
+ acpi_sci_flags.polarity = 3;
+ else
+ return -EINVAL;
+ return 0;
+}
+early_param("acpi_sci", setup_acpi_sci);
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index 1649a175a20..fe799b11ac0 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -48,7 +48,11 @@ void __init check_acpi_pci(void)
int num, slot, func;
/* Assume the machine supports type 1. If not it will
- always read ffffffff and should not have any side effect. */
+ always read ffffffff and should not have any side effect.
+ Actually a few buggy systems can machine check. Allow the user
+ to disable it by command line option at least -AK */
+ if (!early_pci_allowed())
+ return;
/* Poor man's PCI discovery */
for (num = 0; num < 32; num++) {
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 8c844d07862..90faae5c5d3 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -52,7 +52,18 @@ static cpumask_t timer_bcast_ipi;
/*
* Knob to control our willingness to enable the local APIC.
*/
-int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
+static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
+
+static inline void lapic_disable(void)
+{
+ enable_local_apic = -1;
+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+}
+
+static inline void lapic_enable(void)
+{
+ enable_local_apic = 1;
+}
/*
* Debug level
@@ -586,8 +597,7 @@ void __devinit setup_local_APIC(void)
printk("No ESR for 82489DX.\n");
}
- if (nmi_watchdog == NMI_LOCAL_APIC)
- setup_apic_nmi_watchdog();
+ setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
}
@@ -1373,3 +1383,18 @@ int __init APIC_init_uniprocessor (void)
return 0;
}
+
+static int __init parse_lapic(char *arg)
+{
+ lapic_enable();
+ return 0;
+}
+early_param("lapic", parse_lapic);
+
+static int __init parse_nolapic(char *arg)
+{
+ lapic_disable();
+ return 0;
+}
+early_param("nolapic", parse_nolapic);
+
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 8591f2fa920..ff9ce4b5eaa 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -1154,9 +1154,11 @@ out:
static void set_time(void)
{
+ struct timespec ts;
if (got_clock_diff) { /* Must know time zone in order to set clock */
- xtime.tv_sec = get_cmos_time() + clock_cmos_diff;
- xtime.tv_nsec = 0;
+ ts.tv_sec = get_cmos_time() + clock_cmos_diff;
+ ts.tv_nsec = 0;
+ do_settimeofday(&ts);
}
}
@@ -1232,13 +1234,8 @@ static int suspend(int vetoable)
restore_processor_state();
local_irq_disable();
- write_seqlock(&xtime_lock);
- spin_lock(&i8253_lock);
- reinit_timer();
set_time();
-
- spin_unlock(&i8253_lock);
- write_sequnlock(&xtime_lock);
+ reinit_timer();
if (err == APM_NO_ERROR)
err = APM_SUCCESS;
@@ -1365,9 +1362,7 @@ static void check_events(void)
ignore_bounce = 1;
if ((event != APM_NORMAL_RESUME)
|| (ignore_normal_resume == 0)) {
- write_seqlock_irq(&xtime_lock);
set_time();
- write_sequnlock_irq(&xtime_lock);
device_resume();
pm_send_all(PM_RESUME, (void *)0);
queue_event(event, NULL);
@@ -1383,9 +1378,7 @@ static void check_events(void)
break;
case APM_UPDATE_TIME:
- write_seqlock_irq(&xtime_lock);
set_time();
- write_sequnlock_irq(&xtime_lock);
break;
case APM_CRITICAL_SUSPEND:
@@ -2339,6 +2332,7 @@ static int __init apm_init(void)
ret = kernel_thread(apm, NULL, CLONE_KERNEL | SIGCHLD);
if (ret < 0) {
printk(KERN_ERR "apm: disabled - Unable to start kernel thread.\n");
+ remove_proc_entry("apm", NULL);
return -ENOMEM;
}
@@ -2348,7 +2342,13 @@ static int __init apm_init(void)
return 0;
}
- misc_register(&apm_device);
+ /*
+ * Note we don't actually care if the misc_device cannot be registered.
+ * this driver can do its job without it, even if userspace can't
+ * control it. just log the error
+ */
+ if (misc_register(&apm_device))
+ printk(KERN_WARNING "apm: Could not register misc device.\n");
if (HZ != 100)
idle_period = (idle_period * HZ) / 100;
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index e6a2d6b80cd..e4758095d87 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -22,7 +22,7 @@
extern void vide(void);
__asm__(".align 4\nvide: ret");
-static void __init init_amd(struct cpuinfo_x86 *c)
+static void __cpuinit init_amd(struct cpuinfo_x86 *c)
{
u32 l, h;
int mbytes = num_physpages >> (20-PAGE_SHIFT);
@@ -246,7 +246,7 @@ static void __init init_amd(struct cpuinfo_x86 *c)
num_cache_leaves = 3;
}
-static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
+static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
{
/* AMD errata T13 (order #21922) */
if ((c->x86 == 6)) {
@@ -259,7 +259,7 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
return size;
}
-static struct cpu_dev amd_cpu_dev __initdata = {
+static struct cpu_dev amd_cpu_dev __cpuinitdata = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
.c_models = {
@@ -275,7 +275,6 @@ static struct cpu_dev amd_cpu_dev __initdata = {
},
},
.c_init = init_amd,
- .c_identify = generic_identify,
.c_size_cache = amd_size_cache,
};
diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c
index bd75629dd26..8c25047975c 100644
--- a/arch/i386/kernel/cpu/centaur.c
+++ b/arch/i386/kernel/cpu/centaur.c
@@ -9,7 +9,7 @@
#ifdef CONFIG_X86_OOSTORE
-static u32 __init power2(u32 x)
+static u32 __cpuinit power2(u32 x)
{
u32 s=1;
while(s<=x)
@@ -22,7 +22,7 @@ static u32 __init power2(u32 x)
* Set up an actual MCR
*/
-static void __init centaur_mcr_insert(int reg, u32 base, u32 size, int key)
+static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key)
{
u32 lo, hi;
@@ -40,7 +40,7 @@ static void __init centaur_mcr_insert(int reg, u32 base, u32 size, int key)
* Shortcut: We know you can't put 4Gig of RAM on a winchip
*/
-static u32 __init ramtop(void) /* 16388 */
+static u32 __cpuinit ramtop(void) /* 16388 */
{
int i;
u32 top = 0;
@@ -91,7 +91,7 @@ static u32 __init ramtop(void) /* 16388 */
* Compute a set of MCR's to give maximum coverage
*/
-static int __init centaur_mcr_compute(int nr, int key)
+static int __cpuinit centaur_mcr_compute(int nr, int key)
{
u32 mem = ramtop();
u32 root = power2(mem);
@@ -166,7 +166,7 @@ static int __init centaur_mcr_compute(int nr, int key)
return ct;
}
-static void __init centaur_create_optimal_mcr(void)
+static void __cpuinit centaur_create_optimal_mcr(void)
{
int i;
/*
@@ -189,7 +189,7 @@ static void __init centaur_create_optimal_mcr(void)
wrmsr(MSR_IDT_MCR0+i, 0, 0);
}
-static void __init winchip2_create_optimal_mcr(void)
+static void __cpuinit winchip2_create_optimal_mcr(void)
{
u32 lo, hi;
int i;
@@ -227,7 +227,7 @@ static void __init winchip2_create_optimal_mcr(void)
* Handle the MCR key on the Winchip 2.
*/
-static void __init winchip2_unprotect_mcr(void)
+static void __cpuinit winchip2_unprotect_mcr(void)
{
u32 lo, hi;
u32 key;
@@ -239,7 +239,7 @@ static void __init winchip2_unprotect_mcr(void)
wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
}
-static void __init winchip2_protect_mcr(void)
+static void __cpuinit winchip2_protect_mcr(void)
{
u32 lo, hi;
@@ -257,7 +257,7 @@ static void __init winchip2_protect_mcr(void)
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
-static void __init init_c3(struct cpuinfo_x86 *c)
+static void __cpuinit init_c3(struct cpuinfo_x86 *c)
{
u32 lo, hi;
@@ -303,7 +303,7 @@ static void __init init_c3(struct cpuinfo_x86 *c)
display_cacheinfo(c);
}
-static void __init init_centaur(struct cpuinfo_x86 *c)
+static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
{
enum {
ECX8=1<<1,
@@ -442,7 +442,7 @@ static void __init init_centaur(struct cpuinfo_x86 *c)
}
}
-static unsigned int centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size)
+static unsigned int __cpuinit centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size)
{
/* VIA C3 CPUs (670-68F) need further shifting. */
if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
@@ -457,7 +457,7 @@ static unsigned int centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size
return size;
}
-static struct cpu_dev centaur_cpu_dev __initdata = {
+static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
.c_vendor = "Centaur",
.c_ident = { "CentaurHauls" },
.c_init = init_centaur,
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 70c87de582c..2799baaadf4 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -36,7 +36,7 @@ struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
extern int disable_pse;
-static void default_init(struct cpuinfo_x86 * c)
+static void __cpuinit default_init(struct cpuinfo_x86 * c)
{
/* Not much we can do here... */
/* Check if at least it has cpuid */
@@ -49,7 +49,7 @@ static void default_init(struct cpuinfo_x86 * c)
}
}
-static struct cpu_dev default_cpu = {
+static struct cpu_dev __cpuinitdata default_cpu = {
.c_init = default_init,
.c_vendor = "Unknown",
};
@@ -265,7 +265,7 @@ static void __init early_cpu_detect(void)
}
}
-void __cpuinit generic_identify(struct cpuinfo_x86 * c)
+static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
{
u32 tfms, xlvl;
int ebx;
@@ -675,7 +675,7 @@ old_gdt:
#endif
/* Clear %fs and %gs. */
- asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
+ asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
/* Clear all 6 debug registers: */
set_debugreg(0, 0);
diff --git a/arch/i386/kernel/cpu/cpu.h b/arch/i386/kernel/cpu/cpu.h
index 5a1d4f163e8..2f6432cef6f 100644
--- a/arch/i386/kernel/cpu/cpu.h
+++ b/arch/i386/kernel/cpu/cpu.h
@@ -24,7 +24,5 @@ extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM];
extern int get_model_name(struct cpuinfo_x86 *c);
extern void display_cacheinfo(struct cpuinfo_x86 *c);
-extern void generic_identify(struct cpuinfo_x86 * c);
-
extern void early_intel_workaround(struct cpuinfo_x86 *c);
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index f03b7f94c30..c0c3b59de32 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -12,7 +12,7 @@
/*
* Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
*/
-static void __init do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
{
unsigned char ccr2, ccr3;
unsigned long flags;
@@ -52,25 +52,25 @@ static void __init do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
* Actually since bugs.h doesn't even reference this perhaps someone should
* fix the documentation ???
*/
-static unsigned char Cx86_dir0_msb __initdata = 0;
+static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
-static char Cx86_model[][9] __initdata = {
+static char Cx86_model[][9] __cpuinitdata = {
"Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
"M II ", "Unknown"
};
-static char Cx486_name[][5] __initdata = {
+static char Cx486_name[][5] __cpuinitdata = {
"SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
"SRx2", "DRx2"
};
-static char Cx486S_name[][4] __initdata = {
+static char Cx486S_name[][4] __cpuinitdata = {
"S", "S2", "Se", "S2e"
};
-static char Cx486D_name[][4] __initdata = {
+static char Cx486D_name[][4] __cpuinitdata = {
"DX", "DX2", "?", "?", "?", "DX4"
};
-static char Cx86_cb[] __initdata = "?.5x Core/Bus Clock";
-static char cyrix_model_mult1[] __initdata = "12??43";
-static char cyrix_model_mult2[] __initdata = "12233445";
+static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
+static char cyrix_model_mult1[] __cpuinitdata = "12??43";
+static char cyrix_model_mult2[] __cpuinitdata = "12233445";
/*
* Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -82,7 +82,7 @@ static char cyrix_model_mult2[] __initdata = "12233445";
extern void calibrate_delay(void) __init;
-static void __init check_cx686_slop(struct cpuinfo_x86 *c)
+static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
{
unsigned long flags;
@@ -107,7 +107,7 @@ static void __init check_cx686_slop(struct cpuinfo_x86 *c)
}
-static void __init set_cx86_reorder(void)
+static void __cpuinit set_cx86_reorder(void)
{
u8 ccr3;
@@ -122,7 +122,7 @@ static void __init set_cx86_reorder(void)
setCx86(CX86_CCR3, ccr3);
}
-static void __init set_cx86_memwb(void)
+static void __cpuinit set_cx86_memwb(void)
{
u32 cr0;
@@ -137,7 +137,7 @@ static void __init set_cx86_memwb(void)
setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 );
}
-static void __init set_cx86_inc(void)
+static void __cpuinit set_cx86_inc(void)
{
unsigned char ccr3;
@@ -158,7 +158,7 @@ static void __init set_cx86_inc(void)
* Configure later MediaGX and/or Geode processor.
*/
-static void __init geode_configure(void)
+static void __cpuinit geode_configure(void)
{
unsigned long flags;
u8 ccr3, ccr4;
@@ -184,14 +184,14 @@ static void __init geode_configure(void)
#ifdef CONFIG_PCI
-static struct pci_device_id __initdata cyrix_55x0[] = {
+static struct pci_device_id __cpuinitdata cyrix_55x0[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) },
{ PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) },
{ },
};
#endif
-static void __init init_cyrix(struct cpuinfo_x86 *c)
+static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
{
unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
char *buf = c->x86_model_id;
@@ -346,7 +346,7 @@ static void __init init_cyrix(struct cpuinfo_x86 *c)
/*
* Handle National Semiconductor branded processors
*/
-static void __init init_nsc(struct cpuinfo_x86 *c)
+static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
{
/* There may be GX1 processors in the wild that are branded
* NSC and not Cyrix.
@@ -394,7 +394,7 @@ static inline int test_cyrix_52div(void)
return (unsigned char) (test >> 8) == 0x02;
}
-static void cyrix_identify(struct cpuinfo_x86 * c)
+static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c)
{
/* Detect Cyrix with disabled CPUID */
if ( c->x86 == 4 && test_cyrix_52div() ) {
@@ -427,10 +427,9 @@ static void cyrix_identify(struct cpuinfo_x86 * c)
local_irq_restore(flags);
}
}
- generic_identify(c);
}
-static struct cpu_dev cyrix_cpu_dev __initdata = {
+static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
.c_vendor = "Cyrix",
.c_ident = { "CyrixInstead" },
.c_init = init_cyrix,
@@ -453,11 +452,10 @@ static int __init cyrix_exit_cpu(void)
late_initcall(cyrix_exit_cpu);
-static struct cpu_dev nsc_cpu_dev __initdata = {
+static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
.c_vendor = "NSC",
.c_ident = { "Geode by NSC" },
.c_init = init_nsc,
- .c_identify = generic_identify,
};
int __init nsc_init_cpu(void)
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 5a2e270924b..94a95aa5227 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -198,7 +198,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
}
-static unsigned int intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
+static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
{
/* Intel PIII Tualatin. This comes in two flavours.
* One has 256kb of cache, the other 512. We have no way
@@ -263,7 +263,6 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = {
},
},
.c_init = init_intel,
- .c_identify = generic_identify,
.c_size_cache = intel_size_cache,
};
diff --git a/arch/i386/kernel/cpu/mcheck/Makefile b/arch/i386/kernel/cpu/mcheck/Makefile
index 30808f3d671..f1ebe1c1c17 100644
--- a/arch/i386/kernel/cpu/mcheck/Makefile
+++ b/arch/i386/kernel/cpu/mcheck/Makefile
@@ -1,2 +1,2 @@
-obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o
+obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o therm_throt.o
obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
index b95f1b3d53a..504434a4601 100644
--- a/arch/i386/kernel/cpu/mcheck/p4.c
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -13,6 +13,8 @@
#include <asm/msr.h>
#include <asm/apic.h>
+#include <asm/therm_throt.h>
+
#include "mce.h"
/* as supported by the P4/Xeon family */
@@ -44,25 +46,12 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs)
/* P4/Xeon Thermal transition interrupt handler */
static void intel_thermal_interrupt(struct pt_regs *regs)
{
- u32 l, h;
- unsigned int cpu = smp_processor_id();
- static unsigned long next[NR_CPUS];
+ __u64 msr_val;
ack_APIC_irq();
- if (time_after(next[cpu], jiffies))
- return;
-
- next[cpu] = jiffies + HZ*5;
- rdmsr(MSR_IA32_THERM_STATUS, l, h);
- if (l & 0x1) {
- printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
- printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n",
- cpu);
- add_taint(TAINT_MACHINE_CHECK);
- } else {
- printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
- }
+ rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
+ therm_throt_process(msr_val & 0x1);
}
/* Thermal interrupt handler for this CPU setup */
@@ -122,10 +111,13 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
rdmsr (MSR_IA32_MISC_ENABLE, l, h);
wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
-
+
l = apic_read (APIC_LVTTHMR);
apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
+
+ /* enable thermal throttle processing */
+ atomic_set(&therm_throt_en, 1);
return;
}
#endif /* CONFIG_X86_MCE_P4THERMAL */
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
new file mode 100644
index 00000000000..4f43047de40
--- /dev/null
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -0,0 +1,180 @@
+/*
+ * linux/arch/i386/kerne/cpu/mcheck/therm_throt.c
+ *
+ * Thermal throttle event support code (such as syslog messaging and rate
+ * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
+ * This allows consistent reporting of CPU thermal throttle events.
+ *
+ * Maintains a counter in /sys that keeps track of the number of thermal
+ * events, such that the user knows how bad the thermal problem might be
+ * (since the logging to syslog and mcelog is rate limited).
+ *
+ * Author: Dmitriy Zavin (dmitriyz@google.com)
+ *
+ * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
+ * Inspired by Ross Biro's and Al Borchers' counter code.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+#include <asm/cpu.h>
+#include <linux/notifier.h>
+#include <asm/therm_throt.h>
+
+/* How long to wait between reporting thermal events */
+#define CHECK_INTERVAL (300 * HZ)
+
+static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
+static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
+atomic_t therm_throt_en = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+#define define_therm_throt_sysdev_one_ro(_name) \
+ static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
+
+#define define_therm_throt_sysdev_show_func(name) \
+static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
+ char *buf) \
+{ \
+ unsigned int cpu = dev->id; \
+ ssize_t ret; \
+ \
+ preempt_disable(); /* CPU hotplug */ \
+ if (cpu_online(cpu)) \
+ ret = sprintf(buf, "%lu\n", \
+ per_cpu(thermal_throttle_##name, cpu)); \
+ else \
+ ret = 0; \
+ preempt_enable(); \
+ \
+ return ret; \
+}
+
+define_therm_throt_sysdev_show_func(count);
+define_therm_throt_sysdev_one_ro(count);
+
+static struct attribute *thermal_throttle_attrs[] = {
+ &attr_count.attr,
+ NULL
+};
+
+static struct attribute_group thermal_throttle_attr_group = {
+ .attrs = thermal_throttle_attrs,
+ .name = "thermal_throttle"
+};
+#endif /* CONFIG_SYSFS */
+
+/***
+ * therm_throt_process - Process thermal throttling event from interrupt
+ * @curr: Whether the condition is current or not (boolean), since the
+ * thermal interrupt normally gets called both when the thermal
+ * event begins and once the event has ended.
+ *
+ * This function is called by the thermal interrupt after the
+ * IRQ has been acknowledged.
+ *
+ * It will take care of rate limiting and printing messages to the syslog.
+ *
+ * Returns: 0 : Event should NOT be further logged, i.e. still in
+ * "timeout" from previous log message.
+ * 1 : Event should be logged further, and a message has been
+ * printed to the syslog.
+ */
+int therm_throt_process(int curr)
+{
+ unsigned int cpu = smp_processor_id();
+ __u64 tmp_jiffs = get_jiffies_64();
+
+ if (curr)
+ __get_cpu_var(thermal_throttle_count)++;
+
+ if (time_before64(tmp_jiffs, __get_cpu_var(next_check)))
+ return 0;
+
+ __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL;
+
+ /* if we just entered the thermal event */
+ if (curr) {
+ printk(KERN_CRIT "CPU%d: Temperature above threshold, "
+ "cpu clock throttled (total events = %lu)\n", cpu,
+ __get_cpu_var(thermal_throttle_count));
+
+ add_taint(TAINT_MACHINE_CHECK);
+ } else {
+ printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu);
+ }
+
+ return 1;
+}
+
+#ifdef CONFIG_SYSFS
+/* Add/Remove thermal_throttle interface for CPU device */
+static __cpuinit int thermal_throttle_add_dev(struct sys_device * sys_dev)
+{
+ sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static __cpuinit int thermal_throttle_remove_dev(struct sys_device * sys_dev)
+{
+ sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+ return 0;
+}
+
+/* Mutex protecting device creation against CPU hotplug */
+static DEFINE_MUTEX(therm_cpu_lock);
+
+/* Get notified when a cpu comes on/off. Be hotplug friendly. */
+static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ struct sys_device *sys_dev;
+
+ sys_dev = get_cpu_sysdev(cpu);
+ mutex_lock(&therm_cpu_lock);
+ switch (action) {
+ case CPU_ONLINE:
+ thermal_throttle_add_dev(sys_dev);
+ break;
+ case CPU_DEAD:
+ thermal_throttle_remove_dev(sys_dev);
+ break;
+ }
+ mutex_unlock(&therm_cpu_lock);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block thermal_throttle_cpu_notifier =
+{
+ .notifier_call = thermal_throttle_cpu_callback,
+};
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static __init int thermal_throttle_init_device(void)
+{
+ unsigned int cpu = 0;
+
+ if (!atomic_read(&therm_throt_en))
+ return 0;
+
+ register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
+
+#ifdef CONFIG_HOTPLUG_CPU
+ mutex_lock(&therm_cpu_lock);
+#endif
+ /* connect live CPUs to sysfs */
+ for_each_online_cpu(cpu)
+ thermal_throttle_add_dev(get_cpu_sysdev(cpu));
+#ifdef CONFIG_HOTPLUG_CPU
+ mutex_unlock(&therm_cpu_lock);
+#endif
+
+ return 0;
+}
+
+device_initcall(thermal_throttle_init_device);
+#endif /* CONFIG_SYSFS */
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index 169ac8e0db6..0b61eed8bbd 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -243,7 +243,7 @@ static DEFINE_SPINLOCK(set_atomicity_lock);
* has been called.
*/
-static void prepare_set(void)
+static void prepare_set(void) __acquires(set_atomicity_lock)
{
unsigned long cr0;
@@ -274,7 +274,7 @@ static void prepare_set(void)
mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi);
}
-static void post_set(void)
+static void post_set(void) __releases(set_atomicity_lock)
{
/* Flush TLBs (no need to flush caches - they are disabled) */
__flush_tlb();
diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c
index ad87fa58058..8bf23cc80c6 100644
--- a/arch/i386/kernel/cpu/nexgen.c
+++ b/arch/i386/kernel/cpu/nexgen.c
@@ -10,7 +10,7 @@
* to have CPUID. (Thanks to Herbert Oppmann)
*/
-static int __init deep_magic_nexgen_probe(void)
+static int __cpuinit deep_magic_nexgen_probe(void)
{
int ret;
@@ -27,21 +27,20 @@ static int __init deep_magic_nexgen_probe(void)
return ret;
}
-static void __init init_nexgen(struct cpuinfo_x86 * c)
+static void __cpuinit init_nexgen(struct cpuinfo_x86 * c)
{
c->x86_cache_size = 256; /* A few had 1 MB... */
}
-static void __init nexgen_identify(struct cpuinfo_x86 * c)
+static void __cpuinit nexgen_identify(struct cpuinfo_x86 * c)
{
/* Detect NexGen with old hypercode */
if ( deep_magic_nexgen_probe() ) {
strcpy(c->x86_vendor_id, "NexGenDriven");
}
- generic_identify(c);
}
-static struct cpu_dev nexgen_cpu_dev __initdata = {
+static struct cpu_dev nexgen_cpu_dev __cpuinitdata = {
.c_vendor = "Nexgen",
.c_ident = { "NexGenDriven" },
.c_models = {
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index f54a15268ed..76aac088a32 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -46,8 +46,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
/* Intel-defined (#2) */
"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
- "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
+ NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* VIA/Cyrix/Centaur-defined */
diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c
index d08d5a2811c..9317f741498 100644
--- a/arch/i386/kernel/cpu/rise.c
+++ b/arch/i386/kernel/cpu/rise.c
@@ -5,7 +5,7 @@
#include "cpu.h"
-static void __init init_rise(struct cpuinfo_x86 *c)
+static void __cpuinit init_rise(struct cpuinfo_x86 *c)
{
printk("CPU: Rise iDragon");
if (c->x86_model > 2)
@@ -28,7 +28,7 @@ static void __init init_rise(struct cpuinfo_x86 *c)
set_bit(X86_FEATURE_CX8, c->x86_capability);
}
-static struct cpu_dev rise_cpu_dev __initdata = {
+static struct cpu_dev rise_cpu_dev __cpuinitdata = {
.c_vendor = "Rise",
.c_ident = { "RiseRiseRise" },
.c_models = {
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
index 7214c9b577a..4056fb7d2cd 100644
--- a/arch/i386/kernel/cpu/transmeta.c
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -5,7 +5,7 @@
#include <asm/msr.h>
#include "cpu.h"
-static void __init init_transmeta(struct cpuinfo_x86 *c)
+static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
{
unsigned int cap_mask, uk, max, dummy;
unsigned int cms_rev1, cms_rev2;
@@ -85,10 +85,9 @@ static void __init init_transmeta(struct cpuinfo_x86 *c)
#endif
}
-static void __init transmeta_identify(struct cpuinfo_x86 * c)
+static void __cpuinit transmeta_identify(struct cpuinfo_x86 * c)
{
u32 xlvl;
- generic_identify(c);
/* Transmeta-defined flags: level 0x80860001 */
xlvl = cpuid_eax(0x80860000);
@@ -98,7 +97,7 @@ static void __init transmeta_identify(struct cpuinfo_x86 * c)
}
}
-static struct cpu_dev transmeta_cpu_dev __initdata = {
+static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
.c_vendor = "Transmeta",
.c_ident = { "GenuineTMx86", "TransmetaCPU" },
.c_init = init_transmeta,
diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c
index 2cd988f6dc5..1bf3f87e9c5 100644
--- a/arch/i386/kernel/cpu/umc.c
+++ b/arch/i386/kernel/cpu/umc.c
@@ -5,12 +5,8 @@
/* UMC chips appear to be only either 386 or 486, so no special init takes place.
*/
-static void __init init_umc(struct cpuinfo_x86 * c)
-{
-
-}
-static struct cpu_dev umc_cpu_dev __initdata = {
+static struct cpu_dev umc_cpu_dev __cpuinitdata = {
.c_vendor = "UMC",
.c_ident = { "UMC UMC UMC" },
.c_models = {
@@ -21,7 +17,6 @@ static struct cpu_dev umc_cpu_dev __initdata = {
}
},
},
- .c_init = init_umc,
};
int __init umc_init_cpu(void)
diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 5b96f038367..67d297dc100 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -22,6 +22,8 @@
#include <asm/nmi.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
+#include <asm/kdebug.h>
+
#include <mach_ipi.h>
@@ -93,16 +95,25 @@ static void crash_save_self(struct pt_regs *regs)
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static atomic_t waiting_for_crash_ipi;
-static int crash_nmi_callback(struct pt_regs *regs, int cpu)
+static int crash_nmi_callback(struct notifier_block *self,
+ unsigned long val, void *data)
{
+ struct pt_regs *regs;
struct pt_regs fixed_regs;
+ int cpu;
+
+ if (val != DIE_NMI_IPI)
+ return NOTIFY_OK;
+
+ regs = ((struct die_args *)data)->regs;
+ cpu = raw_smp_processor_id();
/* Don't do anything if this handler is invoked on crashing cpu.
* Otherwise, system will completely hang. Crashing cpu can get
* an NMI if system was initially booted with nmi_watchdog parameter.
*/
if (cpu == crashing_cpu)
- return 1;
+ return NOTIFY_STOP;
local_irq_disable();
if (!user_mode_vm(regs)) {
@@ -125,13 +136,18 @@ static void smp_send_nmi_allbutself(void)
send_IPI_allbutself(NMI_VECTOR);
}
+static struct notifier_block crash_nmi_nb = {
+ .notifier_call = crash_nmi_callback,
+};
+
static void nmi_shootdown_cpus(void)
{
unsigned long msecs;
atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
/* Would it be better to replace the trap vector here? */
- set_nmi_callback(crash_nmi_callback);
+ if (register_die_notifier(&crash_nmi_nb))
+ return; /* return what? */
/* Ensure the new callback function is set before sending
* out the NMI
*/
diff --git a/arch/i386/kernel/efi_stub.S b/arch/i386/kernel/efi_stub.S
index d3ee73a3eee..ef00bb77d7e 100644
--- a/arch/i386/kernel/efi_stub.S
+++ b/arch/i386/kernel/efi_stub.S
@@ -7,7 +7,6 @@
#include <linux/linkage.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
/*
* efi_call_phys(void *, ...) is a function with variable parameters.
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 87f9f60b803..5a63d6fdb70 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -76,8 +76,15 @@ DF_MASK = 0x00000400
NT_MASK = 0x00004000
VM_MASK = 0x00020000
+/* These are replaces for paravirtualization */
+#define DISABLE_INTERRUPTS cli
+#define ENABLE_INTERRUPTS sti
+#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
+#define INTERRUPT_RETURN iret
+#define GET_CR0_INTO_EAX movl %cr0, %eax
+
#ifdef CONFIG_PREEMPT
-#define preempt_stop cli; TRACE_IRQS_OFF
+#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF
#else
#define preempt_stop
#define resume_kernel restore_nocheck
@@ -176,18 +183,21 @@ VM_MASK = 0x00020000
#define RING0_INT_FRAME \
CFI_STARTPROC simple;\
+ CFI_SIGNAL_FRAME;\
CFI_DEF_CFA esp, 3*4;\
/*CFI_OFFSET cs, -2*4;*/\
CFI_OFFSET eip, -3*4
#define RING0_EC_FRAME \
CFI_STARTPROC simple;\
+ CFI_SIGNAL_FRAME;\
CFI_DEF_CFA esp, 4*4;\
/*CFI_OFFSET cs, -2*4;*/\
CFI_OFFSET eip, -3*4
#define RING0_PTREGS_FRAME \
CFI_STARTPROC simple;\
+ CFI_SIGNAL_FRAME;\
CFI_DEF_CFA esp, OLDESP-EBX;\
/*CFI_OFFSET cs, CS-OLDESP;*/\
CFI_OFFSET eip, EIP-OLDESP;\
@@ -233,10 +243,11 @@ ret_from_intr:
check_userspace:
movl EFLAGS(%esp), %eax # mix EFLAGS and CS
movb CS(%esp), %al
- testl $(VM_MASK | 3), %eax
- jz resume_kernel
+ andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
+ cmpl $USER_RPL, %eax
+ jb resume_kernel # not returning to v8086 or userspace
ENTRY(resume_userspace)
- cli # make sure we don't miss an interrupt
+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
movl TI_flags(%ebp), %ecx
@@ -247,7 +258,7 @@ ENTRY(resume_userspace)
#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
- cli
+ DISABLE_INTERRUPTS
cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
jnz restore_nocheck
need_resched:
@@ -267,6 +278,7 @@ need_resched:
# sysenter call handler stub
ENTRY(sysenter_entry)
CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
CFI_DEF_CFA esp, 0
CFI_REGISTER esp, ebp
movl TSS_sysenter_esp0(%esp),%esp
@@ -275,7 +287,7 @@ sysenter_past_esp:
* No need to follow this irqs on/off section: the syscall
* disabled irqs and here we enable it straight after entry:
*/
- sti
+ ENABLE_INTERRUPTS
pushl $(__USER_DS)
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET ss, 0*/
@@ -320,7 +332,7 @@ sysenter_past_esp:
jae syscall_badsys
call *sys_call_table(,%eax,4)
movl %eax,EAX(%esp)
- cli
+ DISABLE_INTERRUPTS
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx
@@ -330,8 +342,7 @@ sysenter_past_esp:
movl OLDESP(%esp), %ecx
xorl %ebp,%ebp
TRACE_IRQS_ON
- sti
- sysexit
+ ENABLE_INTERRUPTS_SYSEXIT
CFI_ENDPROC
@@ -356,7 +367,7 @@ syscall_call:
call *sys_call_table(,%eax,4)
movl %eax,EAX(%esp) # store the return value
syscall_exit:
- cli # make sure we don't miss an interrupt
+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
@@ -371,8 +382,8 @@ restore_all:
# See comments in process.c:copy_thread() for details.
movb OLDSS(%esp), %ah
movb CS(%esp), %al
- andl $(VM_MASK | (4 << 8) | 3), %eax
- cmpl $((4 << 8) | 3), %eax
+ andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
CFI_REMEMBER_STATE
je ldt_ss # returning to user-space with LDT SS
restore_nocheck:
@@ -381,11 +392,11 @@ restore_nocheck_notrace:
RESTORE_REGS
addl $4, %esp
CFI_ADJUST_CFA_OFFSET -4
-1: iret
+1: INTERRUPT_RETURN
.section .fixup,"ax"
iret_exc:
TRACE_IRQS_ON
- sti
+ ENABLE_INTERRUPTS
pushl $0 # no error code
pushl $do_iret_error
jmp error_code
@@ -409,7 +420,7 @@ ldt_ss:
* dosemu and wine happy. */
subl $8, %esp # reserve space for switch16 pointer
CFI_ADJUST_CFA_OFFSET 8
- cli
+ DISABLE_INTERRUPTS
TRACE_IRQS_OFF
movl %esp, %eax
/* Set up the 16bit stack frame with switch32 pointer on top,
@@ -419,7 +430,7 @@ ldt_ss:
TRACE_IRQS_IRET
RESTORE_REGS
lss 20+4(%esp), %esp # switch to 16bit stack
-1: iret
+1: INTERRUPT_RETURN
.section __ex_table,"a"
.align 4
.long 1b,iret_exc
@@ -434,7 +445,7 @@ work_pending:
jz work_notifysig
work_resched:
call schedule
- cli # make sure we don't miss an interrupt
+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
@@ -490,7 +501,7 @@ syscall_exit_work:
testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
jz work_pending
TRACE_IRQS_ON
- sti # could let do_syscall_trace() call
+ ENABLE_INTERRUPTS # could let do_syscall_trace() call
# schedule() instead
movl %esp, %eax
movl $1, %edx
@@ -591,11 +602,9 @@ ENTRY(name) \
/* The include is where all of the SMP etc. interrupts come from */
#include "entry_arch.h"
-ENTRY(divide_error)
- RING0_INT_FRAME
- pushl $0 # no error code
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_divide_error
+KPROBE_ENTRY(page_fault)
+ RING0_EC_FRAME
+ pushl $do_page_fault
CFI_ADJUST_CFA_OFFSET 4
ALIGN
error_code:
@@ -645,6 +654,7 @@ error_code:
call *%edi
jmp ret_from_exception
CFI_ENDPROC
+KPROBE_END(page_fault)
ENTRY(coprocessor_error)
RING0_INT_FRAME
@@ -669,7 +679,7 @@ ENTRY(device_not_available)
pushl $-1 # mark this as an int
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
- movl %cr0, %eax
+ GET_CR0_INTO_EAX
testl $0x4, %eax # EM (math emulation bit)
jne device_not_available_emulate
preempt_stop
@@ -702,9 +712,15 @@ device_not_available_emulate:
jne ok; \
label: \
movl TSS_sysenter_esp0+offset(%esp),%esp; \
+ CFI_DEF_CFA esp, 0; \
+ CFI_UNDEFINED eip; \
pushfl; \
+ CFI_ADJUST_CFA_OFFSET 4; \
pushl $__KERNEL_CS; \
- pushl $sysenter_past_esp
+ CFI_ADJUST_CFA_OFFSET 4; \
+ pushl $sysenter_past_esp; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ CFI_REL_OFFSET eip, 0
KPROBE_ENTRY(debug)
RING0_INT_FRAME
@@ -720,7 +736,8 @@ debug_stack_correct:
call do_debug
jmp ret_from_exception
CFI_ENDPROC
- .previous .text
+KPROBE_END(debug)
+
/*
* NMI is doubly nasty. It can happen _while_ we're handling
* a debug fault, and the debug fault hasn't yet been able to
@@ -729,7 +746,7 @@ debug_stack_correct:
* check whether we got an NMI on the debug path where the debug
* fault happened on the sysenter path.
*/
-ENTRY(nmi)
+KPROBE_ENTRY(nmi)
RING0_INT_FRAME
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
@@ -754,6 +771,7 @@ ENTRY(nmi)
cmpl $sysenter_entry,12(%esp)
je nmi_debug_stack_check
nmi_stack_correct:
+ /* We have a RING0_INT_FRAME here */
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
@@ -764,9 +782,12 @@ nmi_stack_correct:
CFI_ENDPROC
nmi_stack_fixup:
+ RING0_INT_FRAME
FIX_STACK(12,nmi_stack_correct, 1)
jmp nmi_stack_correct
+
nmi_debug_stack_check:
+ /* We have a RING0_INT_FRAME here */
cmpw $__KERNEL_CS,16(%esp)
jne nmi_stack_correct
cmpl $debug,(%esp)
@@ -777,8 +798,10 @@ nmi_debug_stack_check:
jmp nmi_stack_correct
nmi_16bit_stack:
- RING0_INT_FRAME
- /* create the pointer to lss back */
+ /* We have a RING0_INT_FRAME here.
+ *
+ * create the pointer to lss back
+ */
pushl %ss
CFI_ADJUST_CFA_OFFSET 4
pushl %esp
@@ -799,12 +822,13 @@ nmi_16bit_stack:
call do_nmi
RESTORE_REGS
lss 12+4(%esp), %esp # back to 16bit stack
-1: iret
+1: INTERRUPT_RETURN
CFI_ENDPROC
.section __ex_table,"a"
.align 4
.long 1b,iret_exc
.previous
+KPROBE_END(nmi)
KPROBE_ENTRY(int3)
RING0_INT_FRAME
@@ -816,7 +840,7 @@ KPROBE_ENTRY(int3)
call do_int3
jmp ret_from_exception
CFI_ENDPROC
- .previous .text
+KPROBE_END(int3)
ENTRY(overflow)
RING0_INT_FRAME
@@ -881,7 +905,7 @@ KPROBE_ENTRY(general_protection)
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
- .previous .text
+KPROBE_END(general_protection)
ENTRY(alignment_check)
RING0_EC_FRAME
@@ -890,13 +914,14 @@ ENTRY(alignment_check)
jmp error_code
CFI_ENDPROC
-KPROBE_ENTRY(page_fault)
- RING0_EC_FRAME
- pushl $do_page_fault
+ENTRY(divide_error)
+ RING0_INT_FRAME
+ pushl $0 # no error code
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_divide_error
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
- .previous .text
#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
@@ -949,6 +974,19 @@ ENTRY(arch_unwind_init_running)
ENDPROC(arch_unwind_init_running)
#endif
+ENTRY(kernel_thread_helper)
+ pushl $0 # fake return address for unwinder
+ CFI_STARTPROC
+ movl %edx,%eax
+ push %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ call *%ebx
+ push %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ call do_exit
+ CFI_ENDPROC
+ENDPROC(kernel_thread_helper)
+
.section .rodata,"a"
#include "syscall_table.S"
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index a6b8bd89aa2..be9d883c62c 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -371,8 +371,65 @@ rp_sidt:
addl $8,%edi
dec %ecx
jne rp_sidt
+
+.macro set_early_handler handler,trapno
+ lea \handler,%edx
+ movl $(__KERNEL_CS << 16),%eax
+ movw %dx,%ax
+ movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
+ lea idt_table,%edi
+ movl %eax,8*\trapno(%edi)
+ movl %edx,8*\trapno+4(%edi)
+.endm
+
+ set_early_handler handler=early_divide_err,trapno=0
+ set_early_handler handler=early_illegal_opcode,trapno=6
+ set_early_handler handler=early_protection_fault,trapno=13
+ set_early_handler handler=early_page_fault,trapno=14
+
ret
+early_divide_err:
+ xor %edx,%edx
+ pushl $0 /* fake errcode */
+ jmp early_fault
+
+early_illegal_opcode:
+ movl $6,%edx
+ pushl $0 /* fake errcode */
+ jmp early_fault
+
+early_protection_fault:
+ movl $13,%edx
+ jmp early_fault
+
+early_page_fault:
+ movl $14,%edx
+ jmp early_fault
+
+early_fault:
+ cld
+#ifdef CONFIG_PRINTK
+ movl $(__KERNEL_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
+ cmpl $2,early_recursion_flag
+ je hlt_loop
+ incl early_recursion_flag
+ movl %cr2,%eax
+ pushl %eax
+ pushl %edx /* trapno */
+ pushl $fault_msg
+#ifdef CONFIG_EARLY_PRINTK
+ call early_printk
+#else
+ call printk
+#endif
+#endif
+hlt_loop:
+ hlt
+ jmp hlt_loop
+
/* This is the default interrupt "handler" :-) */
ALIGN
ignore_int:
@@ -386,6 +443,9 @@ ignore_int:
movl $(__KERNEL_DS),%eax
movl %eax,%ds
movl %eax,%es
+ cmpl $2,early_recursion_flag
+ je hlt_loop
+ incl early_recursion_flag
pushl 16(%esp)
pushl 24(%esp)
pushl 32(%esp)
@@ -431,9 +491,16 @@ ENTRY(stack_start)
ready: .byte 0
+early_recursion_flag:
+ .long 0
+
int_msg:
.asciz "Unknown interrupt or fault at EIP %p %p %p\n"
+fault_msg:
+ .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n"
+ .asciz "Stack: %p %p %p %p %p %p %p %p\n"
+
/*
* The IDT and GDT 'descriptors' are a strange 48-bit object
* only used by the lidt and lgdt instructions. They are not
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index d4756d154f4..ea5f4e7958d 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -45,6 +45,8 @@ static void end_8259A_irq (unsigned int irq)
#define shutdown_8259A_irq disable_8259A_irq
+static int i8259A_auto_eoi;
+
static void mask_and_ack_8259A(unsigned int);
unsigned int startup_8259A_irq(unsigned int irq)
@@ -253,7 +255,7 @@ static void save_ELCR(char *trigger)
static int i8259A_resume(struct sys_device *dev)
{
- init_8259A(0);
+ init_8259A(i8259A_auto_eoi);
restore_ELCR(irq_trigger);
return 0;
}
@@ -301,6 +303,8 @@ void init_8259A(int auto_eoi)
{
unsigned long flags;
+ i8259A_auto_eoi = auto_eoi;
+
spin_lock_irqsave(&i8259A_lock, flags);
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 4fb32c551fe..fd0df75cfbd 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -40,6 +40,7 @@
#include <asm/nmi.h>
#include <mach_apic.h>
+#include <mach_apicdef.h>
#include "io_ports.h"
@@ -65,7 +66,7 @@ int sis_apic_bug = -1;
*/
int nr_ioapic_registers[MAX_IO_APICS];
-int disable_timer_pin_1 __initdata;
+static int disable_timer_pin_1 __initdata;
/*
* Rough estimation of how many shared IRQs there are, can
@@ -93,6 +94,34 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
#define vector_to_irq(vector) (vector)
#endif
+
+union entry_union {
+ struct { u32 w1, w2; };
+ struct IO_APIC_route_entry entry;
+};
+
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+ union entry_union eu;
+ unsigned long flags;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ return eu.entry;
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ unsigned long flags;
+ union entry_union eu;
+ eu.entry = e;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
* shared ISA-space IRQs, so we have to support them. We are super
@@ -200,13 +229,9 @@ static void unmask_IO_APIC_irq (unsigned int irq)
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
- unsigned long flags;
/* Check delivery_mode to be sure we're not clearing an SMI pin */
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
- *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry = ioapic_read_entry(apic, pin);
if (entry.delivery_mode == dest_SMI)
return;
@@ -215,10 +240,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
*/
memset(&entry, 0, sizeof(entry));
entry.mask = 1;
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
- io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry);
}
static void clear_IO_APIC (void)
@@ -1283,9 +1305,8 @@ static void __init setup_IO_APIC_irqs(void)
if (!apic && (irq < 16))
disable_8259A_irq(irq);
}
+ ioapic_write_entry(apic, pin, entry);
spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
- io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
set_native_irq_info(irq, TARGET_CPUS);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -1301,7 +1322,6 @@ static void __init setup_IO_APIC_irqs(void)
static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
{
struct IO_APIC_route_entry entry;
- unsigned long flags;
memset(&entry,0,sizeof(entry));
@@ -1331,10 +1351,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
/*
* Add it to the IO-APIC irq-routing table:
*/
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
- io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry);
enable_8259A_irq(0);
}
@@ -1444,10 +1461,7 @@ void __init print_IO_APIC(void)
for (i = 0; i <= reg_01.bits.entries; i++) {
struct IO_APIC_route_entry entry;
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
- *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry = ioapic_read_entry(apic, i);
printk(KERN_DEBUG " %02x %03X %02X ",
i,
@@ -1666,10 +1680,7 @@ static void __init enable_IO_APIC(void)
/* See if any of the pins is in ExtINT mode */
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
struct IO_APIC_route_entry entry;
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
- *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry = ioapic_read_entry(apic, pin);
/* If the interrupt line is enabled and in ExtInt mode
@@ -1726,7 +1737,6 @@ void disable_IO_APIC(void)
*/
if (ioapic_i8259.pin != -1) {
struct IO_APIC_route_entry entry;
- unsigned long flags;
memset(&entry, 0, sizeof(entry));
entry.mask = 0; /* Enabled */
@@ -1743,12 +1753,7 @@ void disable_IO_APIC(void)
/*
* Add it to the IO-APIC irq-routing table:
*/
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
- *(((int *)&entry)+1));
- io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
- *(((int *)&entry)+0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
disconnect_bsp_APIC(ioapic_i8259.pin != -1);
}
@@ -2213,17 +2218,13 @@ static inline void unlock_ExtINT_logic(void)
int apic, pin, i;
struct IO_APIC_route_entry entry0, entry1;
unsigned char save_control, save_freq_select;
- unsigned long flags;
pin = find_isa_irq_pin(8, mp_INT);
apic = find_isa_irq_apic(8, mp_INT);
if (pin == -1)
return;
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
- *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry0 = ioapic_read_entry(apic, pin);
clear_IO_APIC_pin(apic, pin);
memset(&entry1, 0, sizeof(entry1));
@@ -2236,10 +2237,7 @@ static inline void unlock_ExtINT_logic(void)
entry1.trigger = 0;
entry1.vector = 0;
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
- io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry1);
save_control = CMOS_READ(RTC_CONTROL);
save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
@@ -2258,10 +2256,7 @@ static inline void unlock_ExtINT_logic(void)
CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
clear_IO_APIC_pin(apic, pin);
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
- io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry0);
}
int timer_uses_ioapic_pin_0;
@@ -2461,17 +2456,12 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
{
struct IO_APIC_route_entry *entry;
struct sysfs_ioapic_data *data;
- unsigned long flags;
int i;
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
- spin_lock_irqsave(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
- *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
- *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
- }
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ entry[i] = ioapic_read_entry(dev->id, i);
return 0;
}
@@ -2493,11 +2483,9 @@ static int ioapic_resume(struct sys_device *dev)
reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
- io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
- io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
- }
spin_unlock_irqrestore(&ioapic_lock, flags);
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ ioapic_write_entry(dev->id, i, entry[i]);
return 0;
}
@@ -2694,9 +2682,8 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
if (!ioapic && (irq < 16))
disable_8259A_irq(irq);
+ ioapic_write_entry(ioapic, pin, entry);
spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
- io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2704,3 +2691,25 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
}
#endif /* CONFIG_ACPI */
+
+static int __init parse_disable_timer_pin_1(char *arg)
+{
+ disable_timer_pin_1 = 1;
+ return 0;
+}
+early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
+
+static int __init parse_enable_timer_pin_1(char *arg)
+{
+ disable_timer_pin_1 = -1;
+ return 0;
+}
+early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
+
+static int __init parse_noapic(char *arg)
+{
+ /* disable IO-APIC */
+ disable_ioapic_setup();
+ return 0;
+}
+early_param("noapic", parse_noapic);
diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c
index 6b1ae6ba76f..91966bafb3d 100644
--- a/arch/i386/kernel/machine_kexec.c
+++ b/arch/i386/kernel/machine_kexec.c
@@ -9,6 +9,7 @@
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/delay.h>
+#include <linux/init.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
@@ -20,70 +21,13 @@
#include <asm/system.h>
#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
-
-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define L2_ATTR (_PAGE_PRESENT)
-
-#define LEVEL0_SIZE (1UL << 12UL)
-
-#ifndef CONFIG_X86_PAE
-#define LEVEL1_SIZE (1UL << 22UL)
-static u32 pgtable_level1[1024] PAGE_ALIGNED;
-
-static void identity_map_page(unsigned long address)
-{
- unsigned long level1_index, level2_index;
- u32 *pgtable_level2;
-
- /* Find the current page table */
- pgtable_level2 = __va(read_cr3());
-
- /* Find the indexes of the physical address to identity map */
- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
- level2_index = address / LEVEL1_SIZE;
-
- /* Identity map the page table entry */
- pgtable_level1[level1_index] = address | L0_ATTR;
- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
-
- /* Flush the tlb so the new mapping takes effect.
- * Global tlb entries are not flushed but that is not an issue.
- */
- load_cr3(pgtable_level2);
-}
-
-#else
-#define LEVEL1_SIZE (1UL << 21UL)
-#define LEVEL2_SIZE (1UL << 30UL)
-static u64 pgtable_level1[512] PAGE_ALIGNED;
-static u64 pgtable_level2[512] PAGE_ALIGNED;
-
-static void identity_map_page(unsigned long address)
-{
- unsigned long level1_index, level2_index, level3_index;
- u64 *pgtable_level3;
-
- /* Find the current page table */
- pgtable_level3 = __va(read_cr3());
-
- /* Find the indexes of the physical address to identity map */
- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
- level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
- level3_index = address / LEVEL2_SIZE;
-
- /* Identity map the page table entry */
- pgtable_level1[level1_index] = address | L0_ATTR;
- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
- set_64bit(&pgtable_level3[level3_index],
- __pa(pgtable_level2) | L2_ATTR);
-
- /* Flush the tlb so the new mapping takes effect.
- * Global tlb entries are not flushed but that is not an issue.
- */
- load_cr3(pgtable_level3);
-}
+static u32 kexec_pgd[1024] PAGE_ALIGNED;
+#ifdef CONFIG_X86_PAE
+static u32 kexec_pmd0[1024] PAGE_ALIGNED;
+static u32 kexec_pmd1[1024] PAGE_ALIGNED;
#endif
+static u32 kexec_pte0[1024] PAGE_ALIGNED;
+static u32 kexec_pte1[1024] PAGE_ALIGNED;
static void set_idt(void *newidt, __u16 limit)
{
@@ -127,16 +71,6 @@ static void load_segments(void)
#undef __STR
}
-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
- unsigned long indirection_page,
- unsigned long reboot_code_buffer,
- unsigned long start_address,
- unsigned int has_pae) ATTRIB_NORET;
-
-extern const unsigned char relocate_new_kernel[];
-extern void relocate_new_kernel_end(void);
-extern const unsigned int relocate_new_kernel_size;
-
/*
* A architecture hook called to validate the
* proposed image and prepare the control pages
@@ -169,25 +103,29 @@ void machine_kexec_cleanup(struct kimage *image)
*/
NORET_TYPE void machine_kexec(struct kimage *image)
{
- unsigned long page_list;
- unsigned long reboot_code_buffer;
-
- relocate_new_kernel_t rnk;
+ unsigned long page_list[PAGES_NR];
+ void *control_page;
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
- /* Compute some offsets */
- reboot_code_buffer = page_to_pfn(image->control_code_page)
- << PAGE_SHIFT;
- page_list = image->head;
-
- /* Set up an identity mapping for the reboot_code_buffer */
- identity_map_page(reboot_code_buffer);
-
- /* copy it out */
- memcpy((void *)reboot_code_buffer, relocate_new_kernel,
- relocate_new_kernel_size);
+ control_page = page_address(image->control_code_page);
+ memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+ page_list[PA_CONTROL_PAGE] = __pa(control_page);
+ page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+ page_list[PA_PGD] = __pa(kexec_pgd);
+ page_list[VA_PGD] = (unsigned long)kexec_pgd;
+#ifdef CONFIG_X86_PAE
+ page_list[PA_PMD_0] = __pa(kexec_pmd0);
+ page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
+ page_list[PA_PMD_1] = __pa(kexec_pmd1);
+ page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
+#endif
+ page_list[PA_PTE_0] = __pa(kexec_pte0);
+ page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
+ page_list[PA_PTE_1] = __pa(kexec_pte1);
+ page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
/* The segment registers are funny things, they have both a
* visible and an invisible part. Whenever the visible part is
@@ -206,6 +144,28 @@ NORET_TYPE void machine_kexec(struct kimage *image)
set_idt(phys_to_virt(0),0);
/* now call it */
- rnk = (relocate_new_kernel_t) reboot_code_buffer;
- (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
+ relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+ image->start, cpu_has_pae);
+}
+
+/* crashkernel=size@addr specifies the location to reserve for
+ * a crash kernel. By reserving this memory we guarantee
+ * that linux never sets it up as a DMA target.
+ * Useful for holding code to do something appropriate
+ * after a kernel panic.
+ */
+static int __init parse_crashkernel(char *arg)
+{
+ unsigned long size, base;
+ size = memparse(arg, &arg);
+ if (*arg == '@') {
+ base = memparse(arg+1, &arg);
+ /* FIXME: Do I want a sanity check
+ * to validate the memory range?
+ */
+ crashk_res.start = base;
+ crashk_res.end = base + size - 1;
+ }
+ return 0;
}
+early_param("crashkernel", parse_crashkernel);
diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c
index cd5456f14af..eb57a851789 100644
--- a/arch/i386/kernel/mca.c
+++ b/arch/i386/kernel/mca.c
@@ -42,6 +42,7 @@
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mca.h>
+#include <linux/kprobes.h>
#include <asm/system.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
@@ -414,7 +415,8 @@ subsys_initcall(mca_init);
/*--------------------------------------------------------------------*/
-static void mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
+static __kprobes void
+mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
{
int slot = mca_dev->slot;
@@ -444,7 +446,7 @@ static void mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
/*--------------------------------------------------------------------*/
-static int mca_handle_nmi_callback(struct device *dev, void *data)
+static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
{
struct mca_device *mca_dev = to_mca_device(dev);
unsigned char pos5;
@@ -462,7 +464,7 @@ static int mca_handle_nmi_callback(struct device *dev, void *data)
return 0;
}
-void mca_handle_nmi(void)
+void __kprobes mca_handle_nmi(void)
{
/* First try - scan the various adapters and see if a specific
* adapter was responsible for the error.
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index 40b44cc0d14..9b9479768d5 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -2,6 +2,7 @@
* Intel CPU Microcode Update Driver for Linux
*
* Copyright (C) 2000-2004 Tigran Aivazian
+ * 2006 Shaohua Li <shaohua.li@intel.com>
*
* This driver allows to upgrade microcode on Intel processors
* belonging to IA-32 family - PentiumPro, Pentium II,
@@ -82,6 +83,9 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
#include <asm/msr.h>
#include <asm/uaccess.h>
@@ -91,9 +95,6 @@ MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
MODULE_LICENSE("GPL");
-static int verbose;
-module_param(verbose, int, 0644);
-
#define MICROCODE_VERSION "1.14a"
#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
@@ -120,55 +121,40 @@ static DEFINE_SPINLOCK(microcode_update_lock);
/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
static DEFINE_MUTEX(microcode_mutex);
-static void __user *user_buffer; /* user area microcode data buffer */
-static unsigned int user_buffer_size; /* it's size */
-
-typedef enum mc_error_code {
- MC_SUCCESS = 0,
- MC_IGNORED = 1,
- MC_NOTFOUND = 2,
- MC_MARKED = 3,
- MC_ALLOCATED = 4,
-} mc_error_code_t;
-
static struct ucode_cpu_info {
+ int valid;
unsigned int sig;
- unsigned int pf, orig_pf;
+ unsigned int pf;
unsigned int rev;
- unsigned int cksum;
- mc_error_code_t err;
microcode_t *mc;
} ucode_cpu_info[NR_CPUS];
-
-static int microcode_open (struct inode *unused1, struct file *unused2)
-{
- return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
-}
-static void collect_cpu_info (void *unused)
+static void collect_cpu_info(int cpu_num)
{
- int cpu_num = smp_processor_id();
struct cpuinfo_x86 *c = cpu_data + cpu_num;
struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
unsigned int val[2];
- uci->sig = uci->pf = uci->rev = uci->cksum = 0;
- uci->err = MC_NOTFOUND;
+ /* We should bind the task to the CPU */
+ BUG_ON(raw_smp_processor_id() != cpu_num);
+ uci->pf = uci->rev = 0;
uci->mc = NULL;
+ uci->valid = 1;
if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
cpu_has(c, X86_FEATURE_IA64)) {
- printk(KERN_ERR "microcode: CPU%d not a capable Intel processor\n", cpu_num);
+ printk(KERN_ERR "microcode: CPU%d not a capable Intel "
+ "processor\n", cpu_num);
+ uci->valid = 0;
return;
- } else {
- uci->sig = cpuid_eax(0x00000001);
+ }
- if ((c->x86_model >= 5) || (c->x86 > 6)) {
- /* get processor flags from MSR 0x17 */
- rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- uci->pf = 1 << ((val[1] >> 18) & 7);
- }
- uci->orig_pf = uci->pf;
+ uci->sig = cpuid_eax(0x00000001);
+
+ if ((c->x86_model >= 5) || (c->x86 > 6)) {
+ /* get processor flags from MSR 0x17 */
+ rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ uci->pf = 1 << ((val[1] >> 18) & 7);
}
wrmsr(MSR_IA32_UCODE_REV, 0, 0);
@@ -180,218 +166,159 @@ static void collect_cpu_info (void *unused)
uci->sig, uci->pf, uci->rev);
}
-static inline void mark_microcode_update (int cpu_num, microcode_header_t *mc_header, int sig, int pf, int cksum)
+static inline int microcode_update_match(int cpu_num,
+ microcode_header_t *mc_header, int sig, int pf)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- pr_debug("Microcode Found.\n");
- pr_debug(" Header Revision 0x%x\n", mc_header->hdrver);
- pr_debug(" Loader Revision 0x%x\n", mc_header->ldrver);
- pr_debug(" Revision 0x%x \n", mc_header->rev);
- pr_debug(" Date %x/%x/%x\n",
- ((mc_header->date >> 24 ) & 0xff),
- ((mc_header->date >> 16 ) & 0xff),
- (mc_header->date & 0xFFFF));
- pr_debug(" Signature 0x%x\n", sig);
- pr_debug(" Type 0x%x Family 0x%x Model 0x%x Stepping 0x%x\n",
- ((sig >> 12) & 0x3),
- ((sig >> 8) & 0xf),
- ((sig >> 4) & 0xf),
- ((sig & 0xf)));
- pr_debug(" Processor Flags 0x%x\n", pf);
- pr_debug(" Checksum 0x%x\n", cksum);
-
- if (mc_header->rev < uci->rev) {
- if (uci->err == MC_NOTFOUND) {
- uci->err = MC_IGNORED;
- uci->cksum = mc_header->rev;
- } else if (uci->err == MC_IGNORED && uci->cksum < mc_header->rev)
- uci->cksum = mc_header->rev;
- } else if (mc_header->rev == uci->rev) {
- if (uci->err < MC_MARKED) {
- /* notify the caller of success on this cpu */
- uci->err = MC_SUCCESS;
- }
- } else if (uci->err != MC_ALLOCATED || mc_header->rev > uci->mc->hdr.rev) {
- pr_debug("microcode: CPU%d found a matching microcode update with "
- " revision 0x%x (current=0x%x)\n", cpu_num, mc_header->rev, uci->rev);
- uci->cksum = cksum;
- uci->pf = pf; /* keep the original mc pf for cksum calculation */
- uci->err = MC_MARKED; /* found the match */
- for_each_online_cpu(cpu_num) {
- if (ucode_cpu_info + cpu_num != uci
- && ucode_cpu_info[cpu_num].mc == uci->mc) {
- uci->mc = NULL;
- break;
- }
- }
- if (uci->mc != NULL) {
- vfree(uci->mc);
- uci->mc = NULL;
- }
- }
- return;
+ if (!sigmatch(sig, uci->sig, pf, uci->pf)
+ || mc_header->rev <= uci->rev)
+ return 0;
+ return 1;
}
-static int find_matching_ucodes (void)
+static int microcode_sanity_check(void *mc)
{
- int cursor = 0;
- int error = 0;
-
- while (cursor + MC_HEADER_SIZE < user_buffer_size) {
- microcode_header_t mc_header;
- void *newmc = NULL;
- int i, sum, cpu_num, allocated_flag, total_size, data_size, ext_table_size;
+ microcode_header_t *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ struct extended_signature *ext_sig;
+ unsigned long total_size, data_size, ext_table_size;
+ int sum, orig_sum, ext_sigcount = 0, i;
+
+ total_size = get_totalsize(mc_header);
+ data_size = get_datasize(mc_header);
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ printk(KERN_ERR "microcode: error! "
+ "Bad data size in microcode data file\n");
+ return -EINVAL;
+ }
- if (copy_from_user(&mc_header, user_buffer + cursor, MC_HEADER_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read user data\n");
- error = -EFAULT;
- goto out;
+ if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+ printk(KERN_ERR "microcode: error! "
+ "Unknown microcode update format\n");
+ return -EINVAL;
+ }
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ if ((ext_table_size < EXT_HEADER_SIZE)
+ || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ printk(KERN_ERR "microcode: error! "
+ "Small exttable size in microcode data file\n");
+ return -EINVAL;
}
-
- total_size = get_totalsize(&mc_header);
- if ((cursor + total_size > user_buffer_size) || (total_size < DEFAULT_UCODE_TOTALSIZE)) {
- printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
- error = -EINVAL;
- goto out;
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ printk(KERN_ERR "microcode: error! "
+ "Bad exttable size in microcode data file\n");
+ return -EFAULT;
}
+ ext_sigcount = ext_header->count;
+ }
- data_size = get_datasize(&mc_header);
- if ((data_size + MC_HEADER_SIZE > total_size) || (data_size < DEFAULT_UCODE_DATASIZE)) {
- printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
- error = -EINVAL;
- goto out;
+ /* check extended table checksum */
+ if (ext_table_size) {
+ int ext_table_sum = 0;
+ int *ext_tablep = (int *)ext_header;
+
+ i = ext_table_size / DWSIZE;
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+ if (ext_table_sum) {
+ printk(KERN_WARNING "microcode: aborting, "
+ "bad extended signature table checksum\n");
+ return -EINVAL;
}
+ }
- if (mc_header.ldrver != 1 || mc_header.hdrver != 1) {
- printk(KERN_ERR "microcode: error! Unknown microcode update format\n");
- error = -EINVAL;
- goto out;
+ /* calculate the checksum */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / DWSIZE;
+ while (i--)
+ orig_sum += ((int *)mc)[i];
+ if (orig_sum) {
+ printk(KERN_ERR "microcode: aborting, bad checksum\n");
+ return -EINVAL;
+ }
+ if (!ext_table_size)
+ return 0;
+ /* check extended signature checksum */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (struct extended_signature *)((void *)ext_header
+ + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i);
+ sum = orig_sum
+ - (mc_header->sig + mc_header->pf + mc_header->cksum)
+ + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ printk(KERN_ERR "microcode: aborting, bad checksum\n");
+ return -EINVAL;
}
+ }
+ return 0;
+}
- for_each_online_cpu(cpu_num) {
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
- if (sigmatch(mc_header.sig, uci->sig, mc_header.pf, uci->orig_pf))
- mark_microcode_update(cpu_num, &mc_header, mc_header.sig, mc_header.pf, mc_header.cksum);
- }
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ * return < 0 - error
+ */
+static int get_maching_microcode(void *mc, int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ microcode_header_t *mc_header = mc;
+ struct extended_sigtable *ext_header;
+ unsigned long total_size = get_totalsize(mc_header);
+ int ext_sigcount, i;
+ struct extended_signature *ext_sig;
+ void *new_mc;
+
+ if (microcode_update_match(cpu, mc_header,
+ mc_header->sig, mc_header->pf))
+ goto find;
+
+ if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
+ return 0;
+
+ ext_header = (struct extended_sigtable *)(mc +
+ get_datasize(mc_header) + MC_HEADER_SIZE);
+ ext_sigcount = ext_header->count;
+ ext_sig = (struct extended_signature *)((void *)ext_header
+ + EXT_HEADER_SIZE);
+ for (i = 0; i < ext_sigcount; i++) {
+ if (microcode_update_match(cpu, mc_header,
+ ext_sig->sig, ext_sig->pf))
+ goto find;
+ ext_sig++;
+ }
+ return 0;
+find:
+ pr_debug("microcode: CPU %d found a matching microcode update with"
+ " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
+ new_mc = vmalloc(total_size);
+ if (!new_mc) {
+ printk(KERN_ERR "microcode: error! Can not allocate memory\n");
+ return -ENOMEM;
+ }
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- struct extended_sigtable ext_header;
- struct extended_signature ext_sig;
- int ext_sigcount;
+ /* free previous update file */
+ vfree(uci->mc);
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
- printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
- error = -EINVAL;
- goto out;
- }
- if (copy_from_user(&ext_header, user_buffer + cursor
- + MC_HEADER_SIZE + data_size, EXT_HEADER_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read user data\n");
- error = -EFAULT;
- goto out;
- }
- if (ext_table_size != exttable_size(&ext_header)) {
- printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
- error = -EFAULT;
- goto out;
- }
-
- ext_sigcount = ext_header.count;
-
- for (i = 0; i < ext_sigcount; i++) {
- if (copy_from_user(&ext_sig, user_buffer + cursor + MC_HEADER_SIZE + data_size + EXT_HEADER_SIZE
- + EXT_SIGNATURE_SIZE * i, EXT_SIGNATURE_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read user data\n");
- error = -EFAULT;
- goto out;
- }
- for_each_online_cpu(cpu_num) {
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
- if (sigmatch(ext_sig.sig, uci->sig, ext_sig.pf, uci->orig_pf)) {
- mark_microcode_update(cpu_num, &mc_header, ext_sig.sig, ext_sig.pf, ext_sig.cksum);
- }
- }
- }
- }
- /* now check if any cpu has matched */
- allocated_flag = 0;
- sum = 0;
- for_each_online_cpu(cpu_num) {
- if (ucode_cpu_info[cpu_num].err == MC_MARKED) {
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- if (!allocated_flag) {
- allocated_flag = 1;
- newmc = vmalloc(total_size);
- if (!newmc) {
- printk(KERN_ERR "microcode: error! Can not allocate memory\n");
- error = -ENOMEM;
- goto out;
- }
- if (copy_from_user(newmc + MC_HEADER_SIZE,
- user_buffer + cursor + MC_HEADER_SIZE,
- total_size - MC_HEADER_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read user data\n");
- vfree(newmc);
- error = -EFAULT;
- goto out;
- }
- memcpy(newmc, &mc_header, MC_HEADER_SIZE);
- /* check extended table checksum */
- if (ext_table_size) {
- int ext_table_sum = 0;
- int * ext_tablep = (((void *) newmc) + MC_HEADER_SIZE + data_size);
- i = ext_table_size / DWSIZE;
- while (i--) ext_table_sum += ext_tablep[i];
- if (ext_table_sum) {
- printk(KERN_WARNING "microcode: aborting, bad extended signature table checksum\n");
- vfree(newmc);
- error = -EINVAL;
- goto out;
- }
- }
-
- /* calculate the checksum */
- i = (MC_HEADER_SIZE + data_size) / DWSIZE;
- while (i--) sum += ((int *)newmc)[i];
- sum -= (mc_header.sig + mc_header.pf + mc_header.cksum);
- }
- ucode_cpu_info[cpu_num].mc = newmc;
- ucode_cpu_info[cpu_num].err = MC_ALLOCATED; /* mc updated */
- if (sum + uci->sig + uci->pf + uci->cksum != 0) {
- printk(KERN_ERR "microcode: CPU%d aborting, bad checksum\n", cpu_num);
- error = -EINVAL;
- goto out;
- }
- }
- }
- cursor += total_size; /* goto the next update patch */
- } /* end of while */
-out:
- return error;
+ memcpy(new_mc, mc, total_size);
+ uci->mc = new_mc;
+ return 1;
}
-static void do_update_one (void * unused)
+static void apply_microcode(int cpu)
{
unsigned long flags;
unsigned int val[2];
- int cpu_num = smp_processor_id();
+ int cpu_num = raw_smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- if (uci->mc == NULL) {
- if (verbose) {
- if (uci->err == MC_SUCCESS)
- printk(KERN_INFO "microcode: CPU%d already at revision 0x%x\n",
- cpu_num, uci->rev);
- else
- printk(KERN_INFO "microcode: No new microcode data for CPU%d\n", cpu_num);
- }
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu_num != cpu);
+
+ if (uci->mc == NULL)
return;
- }
/* serialize access to the physical write to MSR 0x79 */
spin_lock_irqsave(&microcode_update_lock, flags);
@@ -408,68 +335,107 @@ static void do_update_one (void * unused)
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
- /* notify the caller of success on this cpu */
- uci->err = MC_SUCCESS;
spin_unlock_irqrestore(&microcode_update_lock, flags);
- printk(KERN_INFO "microcode: CPU%d updated from revision "
+ if (val[1] != uci->mc->hdr.rev) {
+ printk(KERN_ERR "microcode: CPU%d updated from revision "
+ "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
+ return;
+ }
+ pr_debug("microcode: CPU%d updated from revision "
"0x%x to 0x%x, date = %08x \n",
cpu_num, uci->rev, val[1], uci->mc->hdr.date);
- return;
+ uci->rev = val[1];
}
-static int do_microcode_update (void)
-{
- int i, error;
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+static void __user *user_buffer; /* user area microcode data buffer */
+static unsigned int user_buffer_size; /* it's size */
- if (on_each_cpu(collect_cpu_info, NULL, 1, 1) != 0) {
- printk(KERN_ERR "microcode: Error! Could not run on all processors\n");
- error = -EIO;
- goto out;
+static long get_next_ucode(void **mc, long offset)
+{
+ microcode_header_t mc_header;
+ unsigned long total_size;
+
+ /* No more data */
+ if (offset >= user_buffer_size)
+ return 0;
+ if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
+ printk(KERN_ERR "microcode: error! Can not read user data\n");
+ return -EFAULT;
}
-
- if ((error = find_matching_ucodes())) {
- printk(KERN_ERR "microcode: Error in the microcode data\n");
- goto out_free;
+ total_size = get_totalsize(&mc_header);
+ if (offset + total_size > user_buffer_size) {
+ printk(KERN_ERR "microcode: error! Bad total size in microcode "
+ "data file\n");
+ return -EINVAL;
}
-
- if (on_each_cpu(do_update_one, NULL, 1, 1) != 0) {
- printk(KERN_ERR "microcode: Error! Could not run on all processors\n");
- error = -EIO;
+ *mc = vmalloc(total_size);
+ if (!*mc)
+ return -ENOMEM;
+ if (copy_from_user(*mc, user_buffer + offset, total_size)) {
+ printk(KERN_ERR "microcode: error! Can not read user data\n");
+ vfree(*mc);
+ return -EFAULT;
}
+ return offset + total_size;
+}
+
+static int do_microcode_update (void)
+{
+ long cursor = 0;
+ int error = 0;
+ void *new_mc;
+ int cpu;
+ cpumask_t old;
+
+ old = current->cpus_allowed;
-out_free:
- for_each_online_cpu(i) {
- if (ucode_cpu_info[i].mc) {
- int j;
- void *tmp = ucode_cpu_info[i].mc;
- vfree(tmp);
- for_each_online_cpu(j) {
- if (ucode_cpu_info[j].mc == tmp)
- ucode_cpu_info[j].mc = NULL;
- }
+ while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
+ error = microcode_sanity_check(new_mc);
+ if (error)
+ goto out;
+ /*
+ * It's possible the data file has multiple matching ucode,
+ * lets keep searching till the latest version
+ */
+ for_each_online_cpu(cpu) {
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (!uci->valid)
+ continue;
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ error = get_maching_microcode(new_mc, cpu);
+ if (error < 0)
+ goto out;
+ if (error == 1)
+ apply_microcode(cpu);
}
- if (ucode_cpu_info[i].err == MC_IGNORED && verbose)
- printk(KERN_WARNING "microcode: CPU%d not 'upgrading' to earlier revision"
- " 0x%x (current=0x%x)\n", i, ucode_cpu_info[i].cksum, ucode_cpu_info[i].rev);
+ vfree(new_mc);
}
out:
+ if (cursor > 0)
+ vfree(new_mc);
+ if (cursor < 0)
+ error = cursor;
+ set_cpus_allowed(current, old);
return error;
}
+static int microcode_open (struct inode *unused1, struct file *unused2)
+{
+ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+}
+
static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
{
ssize_t ret;
- if (len < DEFAULT_UCODE_TOTALSIZE) {
- printk(KERN_ERR "microcode: not enough data\n");
- return -EINVAL;
- }
-
if ((len >> PAGE_SHIFT) > num_physpages) {
printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
return -EINVAL;
}
+ lock_cpu_hotplug();
mutex_lock(&microcode_mutex);
user_buffer = (void __user *) buf;
@@ -480,6 +446,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
ret = (ssize_t)len;
mutex_unlock(&microcode_mutex);
+ unlock_cpu_hotplug();
return ret;
}
@@ -496,7 +463,7 @@ static struct miscdevice microcode_dev = {
.fops = &microcode_fops,
};
-static int __init microcode_init (void)
+static int __init microcode_dev_init (void)
{
int error;
@@ -508,6 +475,280 @@ static int __init microcode_init (void)
return error;
}
+ return 0;
+}
+
+static void __exit microcode_dev_exit (void)
+{
+ misc_deregister(&microcode_dev);
+}
+
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+#else
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while(0)
+#endif
+
+static long get_next_ucode_from_buffer(void **mc, void *buf,
+ unsigned long size, long offset)
+{
+ microcode_header_t *mc_header;
+ unsigned long total_size;
+
+ /* No more data */
+ if (offset >= size)
+ return 0;
+ mc_header = (microcode_header_t *)(buf + offset);
+ total_size = get_totalsize(mc_header);
+
+ if (offset + total_size > size) {
+ printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
+ return -EINVAL;
+ }
+
+ *mc = vmalloc(total_size);
+ if (!*mc) {
+ printk(KERN_ERR "microcode: error! Can not allocate memory\n");
+ return -ENOMEM;
+ }
+ memcpy(*mc, buf + offset, total_size);
+ return offset + total_size;
+}
+
+/* fake device for request_firmware */
+static struct platform_device *microcode_pdev;
+
+static int cpu_request_microcode(int cpu)
+{
+ char name[30];
+ struct cpuinfo_x86 *c = cpu_data + cpu;
+ const struct firmware *firmware;
+ void *buf;
+ unsigned long size;
+ long offset = 0;
+ int error;
+ void *mc;
+
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu != raw_smp_processor_id());
+ sprintf(name,"intel-ucode/%02x-%02x-%02x",
+ c->x86, c->x86_model, c->x86_mask);
+ error = request_firmware(&firmware, name, &microcode_pdev->dev);
+ if (error) {
+ pr_debug("ucode data file %s load failed\n", name);
+ return error;
+ }
+ buf = (void *)firmware->data;
+ size = firmware->size;
+ while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
+ > 0) {
+ error = microcode_sanity_check(mc);
+ if (error)
+ break;
+ error = get_maching_microcode(mc, cpu);
+ if (error < 0)
+ break;
+ /*
+ * It's possible the data file has multiple matching ucode,
+ * lets keep searching till the latest version
+ */
+ if (error == 1) {
+ apply_microcode(cpu);
+ error = 0;
+ }
+ vfree(mc);
+ }
+ if (offset > 0)
+ vfree(mc);
+ if (offset < 0)
+ error = offset;
+ release_firmware(firmware);
+
+ return error;
+}
+
+static void microcode_init_cpu(int cpu)
+{
+ cpumask_t old;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ old = current->cpus_allowed;
+
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ mutex_lock(&microcode_mutex);
+ collect_cpu_info(cpu);
+ if (uci->valid)
+ cpu_request_microcode(cpu);
+ mutex_unlock(&microcode_mutex);
+ set_cpus_allowed(current, old);
+}
+
+static void microcode_fini_cpu(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ mutex_lock(&microcode_mutex);
+ uci->valid = 0;
+ vfree(uci->mc);
+ uci->mc = NULL;
+ mutex_unlock(&microcode_mutex);
+}
+
+static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+ char *end;
+ unsigned long val = simple_strtoul(buf, &end, 0);
+ int err = 0;
+ int cpu = dev->id;
+
+ if (end == buf)
+ return -EINVAL;
+ if (val == 1) {
+ cpumask_t old;
+
+ old = current->cpus_allowed;
+
+ lock_cpu_hotplug();
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
+
+ mutex_lock(&microcode_mutex);
+ if (uci->valid)
+ err = cpu_request_microcode(cpu);
+ mutex_unlock(&microcode_mutex);
+ unlock_cpu_hotplug();
+ set_cpus_allowed(current, old);
+ }
+ if (err)
+ return err;
+ return sz;
+}
+
+static ssize_t version_show(struct sys_device *dev, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->rev);
+}
+
+static ssize_t pf_show(struct sys_device *dev, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->pf);
+}
+
+static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
+static SYSDEV_ATTR(version, 0400, version_show, NULL);
+static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
+
+static struct attribute *mc_default_attrs[] = {
+ &attr_reload.attr,
+ &attr_version.attr,
+ &attr_processor_flags.attr,
+ NULL
+};
+
+static struct attribute_group mc_attr_group = {
+ .attrs = mc_default_attrs,
+ .name = "microcode",
+};
+
+static int mc_sysdev_add(struct sys_device *sys_dev)
+{
+ int cpu = sys_dev->id;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (!cpu_online(cpu))
+ return 0;
+ pr_debug("Microcode:CPU %d added\n", cpu);
+ memset(uci, 0, sizeof(*uci));
+ sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
+
+ microcode_init_cpu(cpu);
+ return 0;
+}
+
+static int mc_sysdev_remove(struct sys_device *sys_dev)
+{
+ int cpu = sys_dev->id;
+
+ if (!cpu_online(cpu))
+ return 0;
+ pr_debug("Microcode:CPU %d removed\n", cpu);
+ microcode_fini_cpu(cpu);
+ sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ return 0;
+}
+
+static int mc_sysdev_resume(struct sys_device *dev)
+{
+ int cpu = dev->id;
+
+ if (!cpu_online(cpu))
+ return 0;
+ pr_debug("Microcode:CPU %d resumed\n", cpu);
+ /* only CPU 0 will apply ucode here */
+ apply_microcode(0);
+ return 0;
+}
+
+static struct sysdev_driver mc_sysdev_driver = {
+ .add = mc_sysdev_add,
+ .remove = mc_sysdev_remove,
+ .resume = mc_sysdev_resume,
+};
+
+#ifdef CONFIG_HOTPLUG_CPU
+static __cpuinit int
+mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ struct sys_device *sys_dev;
+
+ sys_dev = get_cpu_sysdev(cpu);
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ mc_sysdev_add(sys_dev);
+ break;
+ case CPU_DOWN_PREPARE:
+ mc_sysdev_remove(sys_dev);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block mc_cpu_notifier = {
+ .notifier_call = mc_cpu_callback,
+};
+#endif
+
+static int __init microcode_init (void)
+{
+ int error;
+
+ error = microcode_dev_init();
+ if (error)
+ return error;
+ microcode_pdev = platform_device_register_simple("microcode", -1,
+ NULL, 0);
+ if (IS_ERR(microcode_pdev)) {
+ microcode_dev_exit();
+ return PTR_ERR(microcode_pdev);
+ }
+
+ lock_cpu_hotplug();
+ error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+ unlock_cpu_hotplug();
+ if (error) {
+ microcode_dev_exit();
+ platform_device_unregister(microcode_pdev);
+ return error;
+ }
+
+ register_hotcpu_notifier(&mc_cpu_notifier);
+
printk(KERN_INFO
"IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
return 0;
@@ -515,9 +756,16 @@ static int __init microcode_init (void)
static void __exit microcode_exit (void)
{
- misc_deregister(&microcode_dev);
+ microcode_dev_exit();
+
+ unregister_hotcpu_notifier(&mc_cpu_notifier);
+
+ lock_cpu_hotplug();
+ sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+ unlock_cpu_hotplug();
+
+ platform_device_unregister(microcode_pdev);
}
module_init(microcode_init)
module_exit(microcode_exit)
-MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index a70b5fa0ef0..442aaf8c77e 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -30,6 +30,7 @@
#include <asm/io_apic.h>
#include <mach_apic.h>
+#include <mach_apicdef.h>
#include <mach_mpparse.h>
#include <bios_ebda.h>
@@ -68,7 +69,7 @@ unsigned int def_to_bigsmp = 0;
/* Processor that is doing the boot up */
unsigned int boot_cpu_physical_apicid = -1U;
/* Internal processor count */
-static unsigned int __devinitdata num_processors;
+unsigned int __cpuinitdata num_processors;
/* Bitmask of physically existing CPUs */
physid_mask_t phys_cpu_present_map;
@@ -228,12 +229,14 @@ static void __init MP_bus_info (struct mpc_config_bus *m)
mpc_oem_bus_info(m, str, translation_table[mpc_record]);
+#if MAX_MP_BUSSES < 256
if (m->mpc_busid >= MAX_MP_BUSSES) {
printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
" is too large, max. supported is %d\n",
m->mpc_busid, str, MAX_MP_BUSSES - 1);
return;
}
+#endif
if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
@@ -293,19 +296,6 @@ static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
- /*
- * Well it seems all SMP boards in existence
- * use ExtINT/LVT1 == LINT0 and
- * NMI/LVT2 == LINT1 - the following check
- * will show us if this assumptions is false.
- * Until then we do not have to add baggage.
- */
- if ((m->mpc_irqtype == mp_ExtINT) &&
- (m->mpc_destapiclint != 0))
- BUG();
- if ((m->mpc_irqtype == mp_NMI) &&
- (m->mpc_destapiclint != 1))
- BUG();
}
#ifdef CONFIG_X86_NUMAQ
@@ -822,8 +812,7 @@ int es7000_plat;
#ifdef CONFIG_ACPI
-void __init mp_register_lapic_address (
- u64 address)
+void __init mp_register_lapic_address(u64 address)
{
mp_lapic_addr = (unsigned long) address;
@@ -835,13 +824,10 @@ void __init mp_register_lapic_address (
Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
}
-
-void __devinit mp_register_lapic (
- u8 id,
- u8 enabled)
+void __devinit mp_register_lapic (u8 id, u8 enabled)
{
struct mpc_config_processor processor;
- int boot_cpu = 0;
+ int boot_cpu = 0;
if (MAX_APICS - id <= 0) {
printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
@@ -878,11 +864,9 @@ static struct mp_ioapic_routing {
u32 pin_programmed[4];
} mp_ioapic_routing[MAX_IO_APICS];
-
-static int mp_find_ioapic (
- int gsi)
+static int mp_find_ioapic (int gsi)
{
- int i = 0;
+ int i = 0;
/* Find the IOAPIC that manages this GSI. */
for (i = 0; i < nr_ioapics; i++) {
@@ -895,15 +879,11 @@ static int mp_find_ioapic (
return -1;
}
-
-void __init mp_register_ioapic (
- u8 id,
- u32 address,
- u32 gsi_base)
+void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
{
- int idx = 0;
- int tmpid;
+ int idx = 0;
+ int tmpid;
if (nr_ioapics >= MAX_IO_APICS) {
printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
@@ -949,16 +929,10 @@ void __init mp_register_ioapic (
mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
mp_ioapic_routing[idx].gsi_base,
mp_ioapic_routing[idx].gsi_end);
-
- return;
}
-
-void __init mp_override_legacy_irq (
- u8 bus_irq,
- u8 polarity,
- u8 trigger,
- u32 gsi)
+void __init
+mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
{
struct mpc_config_intsrc intsrc;
int ioapic = -1;
@@ -996,15 +970,13 @@ void __init mp_override_legacy_irq (
mp_irqs[mp_irq_entries] = intsrc;
if (++mp_irq_entries == MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!\n");
-
- return;
}
void __init mp_config_acpi_legacy_irqs (void)
{
struct mpc_config_intsrc intsrc;
- int i = 0;
- int ioapic = -1;
+ int i = 0;
+ int ioapic = -1;
/*
* Fabricate the legacy ISA bus (bus #31).
@@ -1073,12 +1045,12 @@ void __init mp_config_acpi_legacy_irqs (void)
#define MAX_GSI_NUM 4096
-int mp_register_gsi (u32 gsi, int triggering, int polarity)
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
{
- int ioapic = -1;
- int ioapic_pin = 0;
- int idx, bit = 0;
- static int pci_irq = 16;
+ int ioapic = -1;
+ int ioapic_pin = 0;
+ int idx, bit = 0;
+ static int pci_irq = 16;
/*
* Mapping between Global System Interrups, which
* represent all possible interrupts, and IRQs
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index acb351478e4..dbda706fdd1 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -21,83 +21,174 @@
#include <linux/sysdev.h>
#include <linux/sysctl.h>
#include <linux/percpu.h>
+#include <linux/dmi.h>
+#include <linux/kprobes.h>
#include <asm/smp.h>
#include <asm/nmi.h>
+#include <asm/kdebug.h>
#include <asm/intel_arch_perfmon.h>
#include "mach_traps.h"
-unsigned int nmi_watchdog = NMI_NONE;
-extern int unknown_nmi_panic;
-static unsigned int nmi_hz = HZ;
-static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
-static unsigned int nmi_p4_cccr_val;
-extern void show_registers(struct pt_regs *regs);
+/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
+ * evtsel_nmi_owner tracks the ownership of the event selection
+ * - different performance counters/ event selection may be reserved for
+ * different subsystems this reservation system just tries to coordinate
+ * things a little
+ */
+static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
+static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
-/*
- * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
- * - it may be reserved by some other driver, or not
- * - when not reserved by some other driver, it may be used for
- * the NMI watchdog, or not
- *
- * This is maintained separately from nmi_active because the NMI
- * watchdog may also be driven from the I/O APIC timer.
+/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
+ * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
*/
-static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
-static unsigned int lapic_nmi_owner;
-#define LAPIC_NMI_WATCHDOG (1<<0)
-#define LAPIC_NMI_RESERVED (1<<1)
+#define NMI_MAX_COUNTER_BITS 66
/* nmi_active:
- * +1: the lapic NMI watchdog is active, but can be disabled
- * 0: the lapic NMI watchdog has not been set up, and cannot
+ * >0: the lapic NMI watchdog is active, but can be disabled
+ * <0: the lapic NMI watchdog has not been set up, and cannot
* be enabled
- * -1: the lapic NMI watchdog is disabled, but can be enabled
+ * 0: the lapic NMI watchdog is disabled, but can be enabled
*/
-int nmi_active;
+atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
-#define K7_EVNTSEL_ENABLE (1 << 22)
-#define K7_EVNTSEL_INT (1 << 20)
-#define K7_EVNTSEL_OS (1 << 17)
-#define K7_EVNTSEL_USR (1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
-#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+unsigned int nmi_watchdog = NMI_DEFAULT;
+static unsigned int nmi_hz = HZ;
-#define P6_EVNTSEL0_ENABLE (1 << 22)
-#define P6_EVNTSEL_INT (1 << 20)
-#define P6_EVNTSEL_OS (1 << 17)
-#define P6_EVNTSEL_USR (1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
-#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
+struct nmi_watchdog_ctlblk {
+ int enabled;
+ u64 check_bit;
+ unsigned int cccr_msr;
+ unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
+ unsigned int evntsel_msr; /* the MSR to select the events to handle */
+};
+static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
-#define MSR_P4_MISC_ENABLE 0x1A0
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
-#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
-#define MSR_P4_PERFCTR0 0x300
-#define MSR_P4_CCCR0 0x360
-#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
-#define P4_ESCR_OS (1<<3)
-#define P4_ESCR_USR (1<<2)
-#define P4_CCCR_OVF_PMI0 (1<<26)
-#define P4_CCCR_OVF_PMI1 (1<<27)
-#define P4_CCCR_THRESHOLD(N) ((N)<<20)
-#define P4_CCCR_COMPLEMENT (1<<19)
-#define P4_CCCR_COMPARE (1<<18)
-#define P4_CCCR_REQUIRED (3<<16)
-#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
-#define P4_CCCR_ENABLE (1<<12)
-/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
- CRU_ESCR0 (with any non-null event selector) through a complemented
- max threshold. [IA32-Vol3, Section 14.9.9] */
-#define MSR_P4_IQ_COUNTER0 0x30C
-#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
-#define P4_NMI_IQ_CCCR0 \
- (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
- P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
+/* local prototypes */
+static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
-#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
-#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+extern void show_registers(struct pt_regs *regs);
+extern int unknown_nmi_panic;
+
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
+{
+ /* returns the bit offset of the performance counter register */
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ return (msr - MSR_K7_PERFCTR0);
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return (msr - MSR_ARCH_PERFMON_PERFCTR0);
+
+ switch (boot_cpu_data.x86) {
+ case 6:
+ return (msr - MSR_P6_PERFCTR0);
+ case 15:
+ return (msr - MSR_P4_BPU_PERFCTR0);
+ }
+ }
+ return 0;
+}
+
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
+{
+ /* returns the bit offset of the event selection register */
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ return (msr - MSR_K7_EVNTSEL0);
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
+
+ switch (boot_cpu_data.x86) {
+ case 6:
+ return (msr - MSR_P6_EVNTSEL0);
+ case 15:
+ return (msr - MSR_P4_BSU_ESCR0);
+ }
+ }
+ return 0;
+}
+
+/* checks for a bit availability (hack for oprofile) */
+int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
+{
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
+}
+
+/* checks the an msr for availability */
+int avail_to_resrv_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
+}
+
+int reserve_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
+ return 1;
+ return 0;
+}
+
+void release_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
+}
+
+int reserve_evntsel_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_evntsel_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]))
+ return 1;
+ return 0;
+}
+
+void release_evntsel_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_evntsel_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
+}
+
+static __cpuinit inline int nmi_known_cpu(void)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return 1;
+ else
+ return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
+ }
+ return 0;
+}
#ifdef CONFIG_SMP
/* The performance counters used by NMI_LOCAL_APIC don't trigger when
@@ -125,7 +216,18 @@ static int __init check_nmi_watchdog(void)
unsigned int *prev_nmi_count;
int cpu;
- if (nmi_watchdog == NMI_NONE)
+ /* Enable NMI watchdog for newer systems.
+ Actually it should be safe for most systems before 2004 too except
+ for some IBM systems that corrupt registers when NMI happens
+ during SMM. Unfortunately we don't have more exact information
+ on these and use this coarse check. */
+ if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004)
+ nmi_watchdog = NMI_LOCAL_APIC;
+
+ if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
+ return 0;
+
+ if (!atomic_read(&nmi_active))
return 0;
prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
@@ -149,25 +251,45 @@ static int __init check_nmi_watchdog(void)
if (!cpu_isset(cpu, cpu_callin_map))
continue;
#endif
+ if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
+ continue;
if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
- endflag = 1;
printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
cpu,
prev_nmi_count[cpu],
nmi_count(cpu));
- nmi_active = 0;
- lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
- kfree(prev_nmi_count);
- return -1;
+ per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
+ atomic_dec(&nmi_active);
}
}
+ if (!atomic_read(&nmi_active)) {
+ kfree(prev_nmi_count);
+ atomic_set(&nmi_active, -1);
+ return -1;
+ }
endflag = 1;
printk("OK.\n");
/* now that we know it works we can reduce NMI frequency to
something more reasonable; makes a difference in some configs */
- if (nmi_watchdog == NMI_LOCAL_APIC)
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
nmi_hz = 1;
+ /*
+ * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
+ * are writable, with higher bits sign extending from bit 31.
+ * So, we can only program the counter with 31 bit values and
+ * 32nd bit should be 1, for 33.. to be 1.
+ * Find the appropriate nmi_hz
+ */
+ if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
+ ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
+ u64 count = (u64)cpu_khz * 1000;
+ do_div(count, 0x7fffffffUL);
+ nmi_hz = count + 1;
+ }
+ }
kfree(prev_nmi_count);
return 0;
@@ -181,124 +303,70 @@ static int __init setup_nmi_watchdog(char *str)
get_option(&str, &nmi);
- if (nmi >= NMI_INVALID)
+ if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
return 0;
- if (nmi == NMI_NONE)
- nmi_watchdog = nmi;
/*
* If any other x86 CPU has a local APIC, then
* please test the NMI stuff there and send me the
* missing bits. Right now Intel P6/P4 and AMD K7 only.
*/
- if ((nmi == NMI_LOCAL_APIC) &&
- (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
- nmi_watchdog = nmi;
- if ((nmi == NMI_LOCAL_APIC) &&
- (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
- (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
- nmi_watchdog = nmi;
- /*
- * We can enable the IO-APIC watchdog
- * unconditionally.
- */
- if (nmi == NMI_IO_APIC) {
- nmi_active = 1;
- nmi_watchdog = nmi;
- }
+ if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0))
+ return 0; /* no lapic support */
+ nmi_watchdog = nmi;
return 1;
}
__setup("nmi_watchdog=", setup_nmi_watchdog);
-static void disable_intel_arch_watchdog(void);
-
static void disable_lapic_nmi_watchdog(void)
{
- if (nmi_active <= 0)
+ BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
+
+ if (atomic_read(&nmi_active) <= 0)
return;
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- wrmsr(MSR_K7_EVNTSEL0, 0, 0);
- break;
- case X86_VENDOR_INTEL:
- if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- disable_intel_arch_watchdog();
- break;
- }
- switch (boot_cpu_data.x86) {
- case 6:
- if (boot_cpu_data.x86_model > 0xd)
- break;
- wrmsr(MSR_P6_EVNTSEL0, 0, 0);
- break;
- case 15:
- if (boot_cpu_data.x86_model > 0x4)
- break;
+ on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
- wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
- wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
- break;
- }
- break;
- }
- nmi_active = -1;
- /* tell do_nmi() and others that we're not active any more */
- nmi_watchdog = 0;
+ BUG_ON(atomic_read(&nmi_active) != 0);
}
static void enable_lapic_nmi_watchdog(void)
{
- if (nmi_active < 0) {
- nmi_watchdog = NMI_LOCAL_APIC;
- setup_apic_nmi_watchdog();
- }
-}
+ BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-int reserve_lapic_nmi(void)
-{
- unsigned int old_owner;
-
- spin_lock(&lapic_nmi_owner_lock);
- old_owner = lapic_nmi_owner;
- lapic_nmi_owner |= LAPIC_NMI_RESERVED;
- spin_unlock(&lapic_nmi_owner_lock);
- if (old_owner & LAPIC_NMI_RESERVED)
- return -EBUSY;
- if (old_owner & LAPIC_NMI_WATCHDOG)
- disable_lapic_nmi_watchdog();
- return 0;
-}
+ /* are we already enabled */
+ if (atomic_read(&nmi_active) != 0)
+ return;
-void release_lapic_nmi(void)
-{
- unsigned int new_owner;
+ /* are we lapic aware */
+ if (nmi_known_cpu() <= 0)
+ return;
- spin_lock(&lapic_nmi_owner_lock);
- new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
- lapic_nmi_owner = new_owner;
- spin_unlock(&lapic_nmi_owner_lock);
- if (new_owner & LAPIC_NMI_WATCHDOG)
- enable_lapic_nmi_watchdog();
+ on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
+ touch_nmi_watchdog();
}
void disable_timer_nmi_watchdog(void)
{
- if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0))
+ BUG_ON(nmi_watchdog != NMI_IO_APIC);
+
+ if (atomic_read(&nmi_active) <= 0)
return;
- unset_nmi_callback();
- nmi_active = -1;
- nmi_watchdog = NMI_NONE;
+ disable_irq(0);
+ on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
+
+ BUG_ON(atomic_read(&nmi_active) != 0);
}
void enable_timer_nmi_watchdog(void)
{
- if (nmi_active < 0) {
- nmi_watchdog = NMI_IO_APIC;
+ BUG_ON(nmi_watchdog != NMI_IO_APIC);
+
+ if (atomic_read(&nmi_active) == 0) {
touch_nmi_watchdog();
- nmi_active = 1;
+ on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
+ enable_irq(0);
}
}
@@ -308,15 +376,20 @@ static int nmi_pm_active; /* nmi_active before suspend */
static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
{
- nmi_pm_active = nmi_active;
- disable_lapic_nmi_watchdog();
+ /* only CPU0 goes here, other CPUs should be offline */
+ nmi_pm_active = atomic_read(&nmi_active);
+ stop_apic_nmi_watchdog(NULL);
+ BUG_ON(atomic_read(&nmi_active) != 0);
return 0;
}
static int lapic_nmi_resume(struct sys_device *dev)
{
- if (nmi_pm_active > 0)
- enable_lapic_nmi_watchdog();
+ /* only CPU0 goes here, other CPUs should be offline */
+ if (nmi_pm_active > 0) {
+ setup_apic_nmi_watchdog(NULL);
+ touch_nmi_watchdog();
+ }
return 0;
}
@@ -336,7 +409,13 @@ static int __init init_lapic_nmi_sysfs(void)
{
int error;
- if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC)
+ /* should really be a BUG_ON but b/c this is an
+ * init call, it just doesn't work. -dcz
+ */
+ if (nmi_watchdog != NMI_LOCAL_APIC)
+ return 0;
+
+ if ( atomic_read(&nmi_active) < 0 )
return 0;
error = sysdev_class_register(&nmi_sysclass);
@@ -354,138 +433,269 @@ late_initcall(init_lapic_nmi_sysfs);
* Original code written by Keith Owens.
*/
-static void clear_msr_range(unsigned int base, unsigned int n)
-{
- unsigned int i;
-
- for(i = 0; i < n; ++i)
- wrmsr(base+i, 0, 0);
-}
-
-static void write_watchdog_counter(const char *descr)
+static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
{
u64 count = (u64)cpu_khz * 1000;
do_div(count, nmi_hz);
if(descr)
Dprintk("setting %s to -0x%08Lx\n", descr, count);
- wrmsrl(nmi_perfctr_msr, 0 - count);
+ wrmsrl(perfctr_msr, 0 - count);
}
-static void setup_k7_watchdog(void)
+/* Note that these events don't tick when the CPU idles. This means
+ the frequency varies with CPU load. */
+
+#define K7_EVNTSEL_ENABLE (1 << 22)
+#define K7_EVNTSEL_INT (1 << 20)
+#define K7_EVNTSEL_OS (1 << 17)
+#define K7_EVNTSEL_USR (1 << 16)
+#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
+#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+
+static int setup_k7_watchdog(void)
{
+ unsigned int perfctr_msr, evntsel_msr;
unsigned int evntsel;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ perfctr_msr = MSR_K7_PERFCTR0;
+ evntsel_msr = MSR_K7_EVNTSEL0;
+ if (!reserve_perfctr_nmi(perfctr_msr))
+ goto fail;
- nmi_perfctr_msr = MSR_K7_PERFCTR0;
+ if (!reserve_evntsel_nmi(evntsel_msr))
+ goto fail1;
- clear_msr_range(MSR_K7_EVNTSEL0, 4);
- clear_msr_range(MSR_K7_PERFCTR0, 4);
+ wrmsrl(perfctr_msr, 0UL);
evntsel = K7_EVNTSEL_INT
| K7_EVNTSEL_OS
| K7_EVNTSEL_USR
| K7_NMI_EVENT;
- wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
- write_watchdog_counter("K7_PERFCTR0");
+ /* setup the timer */
+ wrmsr(evntsel_msr, evntsel, 0);
+ write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= K7_EVNTSEL_ENABLE;
- wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+ wrmsr(evntsel_msr, evntsel, 0);
+
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; //unused
+ wd->check_bit = 1ULL<<63;
+ return 1;
+fail1:
+ release_perfctr_nmi(perfctr_msr);
+fail:
+ return 0;
+}
+
+static void stop_k7_watchdog(void)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ wrmsr(wd->evntsel_msr, 0, 0);
+
+ release_evntsel_nmi(wd->evntsel_msr);
+ release_perfctr_nmi(wd->perfctr_msr);
}
-static void setup_p6_watchdog(void)
+#define P6_EVNTSEL0_ENABLE (1 << 22)
+#define P6_EVNTSEL_INT (1 << 20)
+#define P6_EVNTSEL_OS (1 << 17)
+#define P6_EVNTSEL_USR (1 << 16)
+#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
+#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
+
+static int setup_p6_watchdog(void)
{
+ unsigned int perfctr_msr, evntsel_msr;
unsigned int evntsel;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ perfctr_msr = MSR_P6_PERFCTR0;
+ evntsel_msr = MSR_P6_EVNTSEL0;
+ if (!reserve_perfctr_nmi(perfctr_msr))
+ goto fail;
- nmi_perfctr_msr = MSR_P6_PERFCTR0;
+ if (!reserve_evntsel_nmi(evntsel_msr))
+ goto fail1;
- clear_msr_range(MSR_P6_EVNTSEL0, 2);
- clear_msr_range(MSR_P6_PERFCTR0, 2);
+ wrmsrl(perfctr_msr, 0UL);
evntsel = P6_EVNTSEL_INT
| P6_EVNTSEL_OS
| P6_EVNTSEL_USR
| P6_NMI_EVENT;
- wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
- write_watchdog_counter("P6_PERFCTR0");
+ /* setup the timer */
+ wrmsr(evntsel_msr, evntsel, 0);
+ write_watchdog_counter(perfctr_msr, "P6_PERFCTR0");
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= P6_EVNTSEL0_ENABLE;
- wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
+ wrmsr(evntsel_msr, evntsel, 0);
+
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; //unused
+ wd->check_bit = 1ULL<<39;
+ return 1;
+fail1:
+ release_perfctr_nmi(perfctr_msr);
+fail:
+ return 0;
+}
+
+static void stop_p6_watchdog(void)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ wrmsr(wd->evntsel_msr, 0, 0);
+
+ release_evntsel_nmi(wd->evntsel_msr);
+ release_perfctr_nmi(wd->perfctr_msr);
}
+/* Note that these events don't tick when the CPU idles. This means
+ the frequency varies with CPU load. */
+
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
+#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
+#define P4_ESCR_OS (1<<3)
+#define P4_ESCR_USR (1<<2)
+#define P4_CCCR_OVF_PMI0 (1<<26)
+#define P4_CCCR_OVF_PMI1 (1<<27)
+#define P4_CCCR_THRESHOLD(N) ((N)<<20)
+#define P4_CCCR_COMPLEMENT (1<<19)
+#define P4_CCCR_COMPARE (1<<18)
+#define P4_CCCR_REQUIRED (3<<16)
+#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
+#define P4_CCCR_ENABLE (1<<12)
+#define P4_CCCR_OVF (1<<31)
+/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+ CRU_ESCR0 (with any non-null event selector) through a complemented
+ max threshold. [IA32-Vol3, Section 14.9.9] */
+
static int setup_p4_watchdog(void)
{
+ unsigned int perfctr_msr, evntsel_msr, cccr_msr;
+ unsigned int evntsel, cccr_val;
unsigned int misc_enable, dummy;
+ unsigned int ht_num;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
+ rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
return 0;
- nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
- nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
#ifdef CONFIG_SMP
- if (smp_num_siblings == 2)
- nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
+ /* detect which hyperthread we are on */
+ if (smp_num_siblings == 2) {
+ unsigned int ebx, apicid;
+
+ ebx = cpuid_ebx(1);
+ apicid = (ebx >> 24) & 0xff;
+ ht_num = apicid & 1;
+ } else
#endif
+ ht_num = 0;
- if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
- clear_msr_range(0x3F1, 2);
- /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
- docs doesn't fully define it, so leave it alone for now. */
- if (boot_cpu_data.x86_model >= 0x3) {
- /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
- clear_msr_range(0x3A0, 26);
- clear_msr_range(0x3BC, 3);
+ /* performance counters are shared resources
+ * assign each hyperthread its own set
+ * (re-use the ESCR0 register, seems safe
+ * and keeps the cccr_val the same)
+ */
+ if (!ht_num) {
+ /* logical cpu 0 */
+ perfctr_msr = MSR_P4_IQ_PERFCTR0;
+ evntsel_msr = MSR_P4_CRU_ESCR0;
+ cccr_msr = MSR_P4_IQ_CCCR0;
+ cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
} else {
- clear_msr_range(0x3A0, 31);
+ /* logical cpu 1 */
+ perfctr_msr = MSR_P4_IQ_PERFCTR1;
+ evntsel_msr = MSR_P4_CRU_ESCR0;
+ cccr_msr = MSR_P4_IQ_CCCR1;
+ cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
}
- clear_msr_range(0x3C0, 6);
- clear_msr_range(0x3C8, 6);
- clear_msr_range(0x3E0, 2);
- clear_msr_range(MSR_P4_CCCR0, 18);
- clear_msr_range(MSR_P4_PERFCTR0, 18);
-
- wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
- wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
- write_watchdog_counter("P4_IQ_COUNTER0");
+
+ if (!reserve_perfctr_nmi(perfctr_msr))
+ goto fail;
+
+ if (!reserve_evntsel_nmi(evntsel_msr))
+ goto fail1;
+
+ evntsel = P4_ESCR_EVENT_SELECT(0x3F)
+ | P4_ESCR_OS
+ | P4_ESCR_USR;
+
+ cccr_val |= P4_CCCR_THRESHOLD(15)
+ | P4_CCCR_COMPLEMENT
+ | P4_CCCR_COMPARE
+ | P4_CCCR_REQUIRED;
+
+ wrmsr(evntsel_msr, evntsel, 0);
+ wrmsr(cccr_msr, cccr_val, 0);
+ write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
apic_write(APIC_LVTPC, APIC_DM_NMI);
- wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
+ cccr_val |= P4_CCCR_ENABLE;
+ wrmsr(cccr_msr, cccr_val, 0);
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = cccr_msr;
+ wd->check_bit = 1ULL<<39;
return 1;
+fail1:
+ release_perfctr_nmi(perfctr_msr);
+fail:
+ return 0;
}
-static void disable_intel_arch_watchdog(void)
+static void stop_p4_watchdog(void)
{
- unsigned ebx;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- /*
- * Check whether the Architectural PerfMon supports
- * Unhalted Core Cycles Event or not.
- * NOTE: Corresponding bit = 0 in ebp indicates event present.
- */
- ebx = cpuid_ebx(10);
- if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
- wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0);
+ wrmsr(wd->cccr_msr, 0, 0);
+ wrmsr(wd->evntsel_msr, 0, 0);
+
+ release_evntsel_nmi(wd->evntsel_msr);
+ release_perfctr_nmi(wd->perfctr_msr);
}
+#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
+#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+
static int setup_intel_arch_watchdog(void)
{
+ unsigned int ebx;
+ union cpuid10_eax eax;
+ unsigned int unused;
+ unsigned int perfctr_msr, evntsel_msr;
unsigned int evntsel;
- unsigned ebx;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
/*
* Check whether the Architectural PerfMon supports
* Unhalted Core Cycles Event or not.
- * NOTE: Corresponding bit = 0 in ebp indicates event present.
+ * NOTE: Corresponding bit = 0 in ebx indicates event present.
*/
- ebx = cpuid_ebx(10);
- if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
- return 0;
+ cpuid(10, &(eax.full), &ebx, &unused, &unused);
+ if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
+ (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+ goto fail;
+
+ perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
+ evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
- nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
+ if (!reserve_perfctr_nmi(perfctr_msr))
+ goto fail;
- clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2);
- clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2);
+ if (!reserve_evntsel_nmi(evntsel_msr))
+ goto fail1;
+
+ wrmsrl(perfctr_msr, 0UL);
evntsel = ARCH_PERFMON_EVENTSEL_INT
| ARCH_PERFMON_EVENTSEL_OS
@@ -493,51 +703,145 @@ static int setup_intel_arch_watchdog(void)
| ARCH_PERFMON_NMI_EVENT_SEL
| ARCH_PERFMON_NMI_EVENT_UMASK;
- wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
- write_watchdog_counter("INTEL_ARCH_PERFCTR0");
+ /* setup the timer */
+ wrmsr(evntsel_msr, evntsel, 0);
+ write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0");
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
+ wrmsr(evntsel_msr, evntsel, 0);
+
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; //unused
+ wd->check_bit = 1ULL << (eax.split.bit_width - 1);
return 1;
+fail1:
+ release_perfctr_nmi(perfctr_msr);
+fail:
+ return 0;
}
-void setup_apic_nmi_watchdog (void)
+static void stop_intel_arch_watchdog(void)
{
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
- return;
- setup_k7_watchdog();
- break;
- case X86_VENDOR_INTEL:
- if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- if (!setup_intel_arch_watchdog())
+ unsigned int ebx;
+ union cpuid10_eax eax;
+ unsigned int unused;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * Unhalted Core Cycles Event or not.
+ * NOTE: Corresponding bit = 0 in ebx indicates event present.
+ */
+ cpuid(10, &(eax.full), &ebx, &unused, &unused);
+ if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
+ (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+ return;
+
+ wrmsr(wd->evntsel_msr, 0, 0);
+ release_evntsel_nmi(wd->evntsel_msr);
+ release_perfctr_nmi(wd->perfctr_msr);
+}
+
+void setup_apic_nmi_watchdog (void *unused)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ /* only support LOCAL and IO APICs for now */
+ if ((nmi_watchdog != NMI_LOCAL_APIC) &&
+ (nmi_watchdog != NMI_IO_APIC))
+ return;
+
+ if (wd->enabled == 1)
+ return;
+
+ /* cheap hack to support suspend/resume */
+ /* if cpu0 is not active neither should the other cpus */
+ if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
+ return;
+
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
return;
- break;
- }
- switch (boot_cpu_data.x86) {
- case 6:
- if (boot_cpu_data.x86_model > 0xd)
+ if (!setup_k7_watchdog())
return;
-
- setup_p6_watchdog();
break;
- case 15:
- if (boot_cpu_data.x86_model > 0x4)
- return;
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ if (!setup_intel_arch_watchdog())
+ return;
+ break;
+ }
+ switch (boot_cpu_data.x86) {
+ case 6:
+ if (boot_cpu_data.x86_model > 0xd)
+ return;
+
+ if (!setup_p6_watchdog())
+ return;
+ break;
+ case 15:
+ if (boot_cpu_data.x86_model > 0x4)
+ return;
- if (!setup_p4_watchdog())
+ if (!setup_p4_watchdog())
+ return;
+ break;
+ default:
return;
+ }
break;
default:
return;
}
- break;
- default:
+ }
+ wd->enabled = 1;
+ atomic_inc(&nmi_active);
+}
+
+void stop_apic_nmi_watchdog(void *unused)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ /* only support LOCAL and IO APICs for now */
+ if ((nmi_watchdog != NMI_LOCAL_APIC) &&
+ (nmi_watchdog != NMI_IO_APIC))
+ return;
+
+ if (wd->enabled == 0)
return;
+
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ stop_k7_watchdog();
+ break;
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ stop_intel_arch_watchdog();
+ break;
+ }
+ switch (boot_cpu_data.x86) {
+ case 6:
+ if (boot_cpu_data.x86_model > 0xd)
+ break;
+ stop_p6_watchdog();
+ break;
+ case 15:
+ if (boot_cpu_data.x86_model > 0x4)
+ break;
+ stop_p4_watchdog();
+ break;
+ }
+ break;
+ default:
+ return;
+ }
}
- lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
- nmi_active = 1;
+ wd->enabled = 0;
+ atomic_dec(&nmi_active);
}
/*
@@ -579,7 +883,7 @@ EXPORT_SYMBOL(touch_nmi_watchdog);
extern void die_nmi(struct pt_regs *, const char *msg);
-void nmi_watchdog_tick (struct pt_regs * regs)
+__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
{
/*
@@ -588,11 +892,23 @@ void nmi_watchdog_tick (struct pt_regs * regs)
* smp_processor_id().
*/
unsigned int sum;
+ int touched = 0;
int cpu = smp_processor_id();
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+ u64 dummy;
+ int rc=0;
+
+ /* check for other users first */
+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+ == NOTIFY_STOP) {
+ rc = 1;
+ touched = 1;
+ }
sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
- if (last_irq_sums[cpu] == sum) {
+ /* if the apic timer isn't firing, this cpu isn't doing much */
+ if (!touched && last_irq_sums[cpu] == sum) {
/*
* Ayiee, looks like this CPU is stuck ...
* wait a few IRQs (5 seconds) before doing the oops ...
@@ -607,27 +923,59 @@ void nmi_watchdog_tick (struct pt_regs * regs)
last_irq_sums[cpu] = sum;
alert_counter[cpu] = 0;
}
- if (nmi_perfctr_msr) {
- if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
- /*
- * P4 quirks:
- * - An overflown perfctr will assert its interrupt
- * until the OVF flag in its CCCR is cleared.
- * - LVTPC is masked on interrupt and must be
- * unmasked by the LVTPC handler.
+ /* see if the nmi watchdog went off */
+ if (wd->enabled) {
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ rdmsrl(wd->perfctr_msr, dummy);
+ if (dummy & wd->check_bit){
+ /* this wasn't a watchdog timer interrupt */
+ goto done;
+ }
+
+ /* only Intel P4 uses the cccr msr */
+ if (wd->cccr_msr != 0) {
+ /*
+ * P4 quirks:
+ * - An overflown perfctr will assert its interrupt
+ * until the OVF flag in its CCCR is cleared.
+ * - LVTPC is masked on interrupt and must be
+ * unmasked by the LVTPC handler.
+ */
+ rdmsrl(wd->cccr_msr, dummy);
+ dummy &= ~P4_CCCR_OVF;
+ wrmsrl(wd->cccr_msr, dummy);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ }
+ else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
+ wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
+ /* P6 based Pentium M need to re-unmask
+ * the apic vector but it doesn't hurt
+ * other P6 variant.
+ * ArchPerfom/Core Duo also needs this */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ }
+ /* start the cycle over again */
+ write_watchdog_counter(wd->perfctr_msr, NULL);
+ rc = 1;
+ } else if (nmi_watchdog == NMI_IO_APIC) {
+ /* don't know how to accurately check for this.
+ * just assume it was a watchdog timer interrupt
+ * This matches the old behaviour.
*/
- wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
+ rc = 1;
}
- else if (nmi_perfctr_msr == MSR_P6_PERFCTR0 ||
- nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
- /* Only P6 based Pentium M need to re-unmask
- * the apic vector but it doesn't hurt
- * other P6 variant */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- }
- write_watchdog_counter(NULL);
}
+done:
+ return rc;
+}
+
+int do_nmi_callback(struct pt_regs * regs, int cpu)
+{
+#ifdef CONFIG_SYSCTL
+ if (unknown_nmi_panic)
+ return unknown_nmi_panic_callback(regs, cpu);
+#endif
+ return 0;
}
#ifdef CONFIG_SYSCTL
@@ -637,36 +985,46 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
unsigned char reason = get_nmi_reason();
char buf[64];
- if (!(reason & 0xc0)) {
- sprintf(buf, "NMI received for unknown reason %02x\n", reason);
- die_nmi(regs, buf);
- }
+ sprintf(buf, "NMI received for unknown reason %02x\n", reason);
+ die_nmi(regs, buf);
return 0;
}
/*
- * proc handler for /proc/sys/kernel/unknown_nmi_panic
+ * proc handler for /proc/sys/kernel/nmi
*/
-int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file,
+int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
void __user *buffer, size_t *length, loff_t *ppos)
{
int old_state;
- old_state = unknown_nmi_panic;
+ nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
+ old_state = nmi_watchdog_enabled;
proc_dointvec(table, write, file, buffer, length, ppos);
- if (!!old_state == !!unknown_nmi_panic)
+ if (!!old_state == !!nmi_watchdog_enabled)
return 0;
- if (unknown_nmi_panic) {
- if (reserve_lapic_nmi() < 0) {
- unknown_nmi_panic = 0;
- return -EBUSY;
- } else {
- set_nmi_callback(unknown_nmi_panic_callback);
- }
+ if (atomic_read(&nmi_active) < 0) {
+ printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
+ return -EIO;
+ }
+
+ if (nmi_watchdog == NMI_DEFAULT) {
+ if (nmi_known_cpu() > 0)
+ nmi_watchdog = NMI_LOCAL_APIC;
+ else
+ nmi_watchdog = NMI_IO_APIC;
+ }
+
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ if (nmi_watchdog_enabled)
+ enable_lapic_nmi_watchdog();
+ else
+ disable_lapic_nmi_watchdog();
} else {
- release_lapic_nmi();
- unset_nmi_callback();
+ printk( KERN_WARNING
+ "NMI watchdog doesn't know what hardware to touch\n");
+ return -EIO;
}
return 0;
}
@@ -675,7 +1033,11 @@ int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file,
EXPORT_SYMBOL(nmi_active);
EXPORT_SYMBOL(nmi_watchdog);
-EXPORT_SYMBOL(reserve_lapic_nmi);
-EXPORT_SYMBOL(release_lapic_nmi);
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
+EXPORT_SYMBOL(reserve_perfctr_nmi);
+EXPORT_SYMBOL(release_perfctr_nmi);
+EXPORT_SYMBOL(reserve_evntsel_nmi);
+EXPORT_SYMBOL(release_evntsel_nmi);
EXPORT_SYMBOL(disable_timer_nmi_watchdog);
EXPORT_SYMBOL(enable_timer_nmi_watchdog);
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 8657c739656..8c190ca7ae4 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -37,6 +37,7 @@
#include <linux/kallsyms.h>
#include <linux/ptrace.h>
#include <linux/random.h>
+#include <linux/personality.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -320,15 +321,6 @@ void show_regs(struct pt_regs * regs)
* the "args".
*/
extern void kernel_thread_helper(void);
-__asm__(".section .text\n"
- ".align 4\n"
- "kernel_thread_helper:\n\t"
- "movl %edx,%eax\n\t"
- "pushl %edx\n\t"
- "call *%ebx\n\t"
- "pushl %eax\n\t"
- "call do_exit\n"
- ".previous");
/*
* Create a kernel thread
@@ -346,7 +338,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
regs.xes = __USER_DS;
regs.orig_eax = -1;
regs.eip = (unsigned long) kernel_thread_helper;
- regs.xcs = __KERNEL_CS;
+ regs.xcs = __KERNEL_CS | get_kernel_rpl();
regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
/* Ok, create the new process.. */
@@ -905,7 +897,7 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
unsigned long arch_align_stack(unsigned long sp)
{
- if (randomize_va_space)
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
sp -= get_random_int() % 8192;
return sp & ~0xf;
}
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index d3db03f4085..775f50e9395 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -185,17 +185,17 @@ static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_
return addr;
}
-static inline int is_at_popf(struct task_struct *child, struct pt_regs *regs)
+static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
{
int i, copied;
- unsigned char opcode[16];
+ unsigned char opcode[15];
unsigned long addr = convert_eip_to_linear(child, regs);
copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
for (i = 0; i < copied; i++) {
switch (opcode[i]) {
- /* popf */
- case 0x9d:
+ /* popf and iret */
+ case 0x9d: case 0xcf:
return 1;
/* opcode and address size prefixes */
case 0x66: case 0x67:
@@ -247,7 +247,7 @@ static void set_singlestep(struct task_struct *child)
* don't mark it as being "us" that set it, so that we
* won't clear it by hand later.
*/
- if (is_at_popf(child, regs))
+ if (is_setting_trap_flag(child, regs))
return;
child->ptrace |= PT_DTRACE;
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 54cfeabbc5e..84278e0093a 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -145,14 +145,10 @@ real_mode_gdt_entries [3] =
0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
};
-static struct
-{
- unsigned short size __attribute__ ((packed));
- unsigned long long * base __attribute__ ((packed));
-}
-real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries },
-real_mode_idt = { 0x3ff, NULL },
-no_idt = { 0, NULL };
+static struct Xgt_desc_struct
+real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
+real_mode_idt = { 0x3ff, 0 },
+no_idt = { 0, 0 };
/* This is 16-bit protected mode code to disable paging and the cache,
diff --git a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S
index d312616effa..f151d6fae46 100644
--- a/arch/i386/kernel/relocate_kernel.S
+++ b/arch/i386/kernel/relocate_kernel.S
@@ -7,16 +7,138 @@
*/
#include <linux/linkage.h>
+#include <asm/page.h>
+#include <asm/kexec.h>
+
+/*
+ * Must be relocatable PIC code callable as a C function
+ */
+
+#define PTR(x) (x << 2)
+#define PAGE_ALIGNED (1 << PAGE_SHIFT)
+#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
+#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
+
+ .text
+ .align PAGE_ALIGNED
+ .globl relocate_kernel
+relocate_kernel:
+ movl 8(%esp), %ebp /* list of pages */
+
+#ifdef CONFIG_X86_PAE
+ /* map the control page at its virtual address */
+
+ movl PTR(VA_PGD)(%ebp), %edi
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
+ andl $0xc0000000, %eax
+ shrl $27, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PMD_0)(%ebp), %edx
+ orl $PAE_PGD_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PMD_0)(%ebp), %edi
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x3fe00000, %eax
+ shrl $18, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PTE_0)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PTE_0)(%ebp), %edi
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x001ff000, %eax
+ shrl $9, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ /* identity map the control page at its physical address */
+
+ movl PTR(VA_PGD)(%ebp), %edi
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
+ andl $0xc0000000, %eax
+ shrl $27, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PMD_1)(%ebp), %edx
+ orl $PAE_PGD_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PMD_1)(%ebp), %edi
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x3fe00000, %eax
+ shrl $18, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PTE_1)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PTE_1)(%ebp), %edi
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x001ff000, %eax
+ shrl $9, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+#else
+ /* map the control page at its virtual address */
+
+ movl PTR(VA_PGD)(%ebp), %edi
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
+ andl $0xffc00000, %eax
+ shrl $20, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PTE_0)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PTE_0)(%ebp), %edi
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x003ff000, %eax
+ shrl $10, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ /* identity map the control page at its physical address */
+
+ movl PTR(VA_PGD)(%ebp), %edi
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
+ andl $0xffc00000, %eax
+ shrl $20, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PTE_1)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PTE_1)(%ebp), %edi
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x003ff000, %eax
+ shrl $10, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+#endif
- /*
- * Must be relocatable PIC code callable as a C function, that once
- * it starts can not use the previous processes stack.
- */
- .globl relocate_new_kernel
relocate_new_kernel:
/* read the arguments and say goodbye to the stack */
movl 4(%esp), %ebx /* page_list */
- movl 8(%esp), %ebp /* reboot_code_buffer */
+ movl 8(%esp), %ebp /* list of pages */
movl 12(%esp), %edx /* start address */
movl 16(%esp), %ecx /* cpu_has_pae */
@@ -24,11 +146,26 @@ relocate_new_kernel:
pushl $0
popfl
- /* set a new stack at the bottom of our page... */
- lea 4096(%ebp), %esp
+ /* get physical address of control page now */
+ /* this is impossible after page table switch */
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
- /* store the parameters back on the stack */
- pushl %edx /* store the start address */
+ /* switch to new set of page tables */
+ movl PTR(PA_PGD)(%ebp), %eax
+ movl %eax, %cr3
+
+ /* setup a new stack at the end of the physical control page */
+ lea 4096(%edi), %esp
+
+ /* jump to identity mapped page */
+ movl %edi, %eax
+ addl $(identity_mapped - relocate_kernel), %eax
+ pushl %eax
+ ret
+
+identity_mapped:
+ /* store the start address on the stack */
+ pushl %edx
/* Set cr0 to a known state:
* 31 0 == Paging disabled
@@ -113,8 +250,3 @@ relocate_new_kernel:
xorl %edi, %edi
xorl %ebp, %ebp
ret
-relocate_new_kernel_end:
-
- .globl relocate_new_kernel_size
-relocate_new_kernel_size:
- .long relocate_new_kernel_end - relocate_new_kernel
diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c
deleted file mode 100644
index 98352c374c7..00000000000
--- a/arch/i386/kernel/semaphore.c
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * i386 semaphore implementation.
- *
- * (C) Copyright 1999 Linus Torvalds
- *
- * Portions Copyright 1999 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
- */
-#include <asm/semaphore.h>
-
-/*
- * The semaphore operations have a special calling sequence that
- * allow us to do a simpler in-line version of them. These routines
- * need to convert that sequence back into the C sequence when
- * there is contention on the semaphore.
- *
- * %eax contains the semaphore pointer on entry. Save the C-clobbered
- * registers (%eax, %edx and %ecx) except %eax whish is either a return
- * value or just clobbered..
- */
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __down_failed\n"
-"__down_failed:\n\t"
-#if defined(CONFIG_FRAME_POINTER)
- "pushl %ebp\n\t"
- "movl %esp,%ebp\n\t"
-#endif
- "pushl %edx\n\t"
- "pushl %ecx\n\t"
- "call __down\n\t"
- "popl %ecx\n\t"
- "popl %edx\n\t"
-#if defined(CONFIG_FRAME_POINTER)
- "movl %ebp,%esp\n\t"
- "popl %ebp\n\t"
-#endif
- "ret"
-);
-
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __down_failed_interruptible\n"
-"__down_failed_interruptible:\n\t"
-#if defined(CONFIG_FRAME_POINTER)
- "pushl %ebp\n\t"
- "movl %esp,%ebp\n\t"
-#endif
- "pushl %edx\n\t"
- "pushl %ecx\n\t"
- "call __down_interruptible\n\t"
- "popl %ecx\n\t"
- "popl %edx\n\t"
-#if defined(CONFIG_FRAME_POINTER)
- "movl %ebp,%esp\n\t"
- "popl %ebp\n\t"
-#endif
- "ret"
-);
-
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __down_failed_trylock\n"
-"__down_failed_trylock:\n\t"
-#if defined(CONFIG_FRAME_POINTER)
- "pushl %ebp\n\t"
- "movl %esp,%ebp\n\t"
-#endif
- "pushl %edx\n\t"
- "pushl %ecx\n\t"
- "call __down_trylock\n\t"
- "popl %ecx\n\t"
- "popl %edx\n\t"
-#if defined(CONFIG_FRAME_POINTER)
- "movl %ebp,%esp\n\t"
- "popl %ebp\n\t"
-#endif
- "ret"
-);
-
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __up_wakeup\n"
-"__up_wakeup:\n\t"
- "pushl %edx\n\t"
- "pushl %ecx\n\t"
- "call __up\n\t"
- "popl %ecx\n\t"
- "popl %edx\n\t"
- "ret"
-);
-
-/*
- * rw spinlock fallbacks
- */
-#if defined(CONFIG_SMP)
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __write_lock_failed\n"
-"__write_lock_failed:\n\t"
- LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ",(%eax)\n"
-"1: rep; nop\n\t"
- "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t"
- "jne 1b\n\t"
- LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t"
- "jnz __write_lock_failed\n\t"
- "ret"
-);
-
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __read_lock_failed\n"
-"__read_lock_failed:\n\t"
- LOCK_PREFIX "incl (%eax)\n"
-"1: rep; nop\n\t"
- "cmpl $1,(%eax)\n\t"
- "js 1b\n\t"
- LOCK_PREFIX "decl (%eax)\n\t"
- "js __read_lock_failed\n\t"
- "ret"
-);
-#endif
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index f1682206d30..814cdebf737 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -53,6 +53,7 @@
#include <asm/apic.h>
#include <asm/e820.h>
#include <asm/mpspec.h>
+#include <asm/mmzone.h>
#include <asm/setup.h>
#include <asm/arch_hooks.h>
#include <asm/sections.h>
@@ -89,18 +90,6 @@ EXPORT_SYMBOL(boot_cpu_data);
unsigned long mmu_cr4_features;
-#ifdef CONFIG_ACPI
- int acpi_disabled = 0;
-#else
- int acpi_disabled = 1;
-#endif
-EXPORT_SYMBOL(acpi_disabled);
-
-#ifdef CONFIG_ACPI
-int __initdata acpi_force = 0;
-extern acpi_interrupt_flags acpi_sci_flags;
-#endif
-
/* for MCA, but anyone else can use it if they want */
unsigned int machine_id;
#ifdef CONFIG_MCA
@@ -148,7 +137,6 @@ EXPORT_SYMBOL(ist_info);
struct e820map e820;
extern void early_cpu_init(void);
-extern void generic_apic_probe(char *);
extern int root_mountflags;
unsigned long saved_videomode;
@@ -700,238 +688,150 @@ static inline void copy_edd(void)
}
#endif
-static void __init parse_cmdline_early (char ** cmdline_p)
-{
- char c = ' ', *to = command_line, *from = saved_command_line;
- int len = 0;
- int userdef = 0;
+static int __initdata user_defined_memmap = 0;
- /* Save unparsed command line copy for /proc/cmdline */
- saved_command_line[COMMAND_LINE_SIZE-1] = '\0';
+/*
+ * "mem=nopentium" disables the 4MB page tables.
+ * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
+ * to <mem>, overriding the bios size.
+ * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
+ * <start> to <start>+<mem>, overriding the bios size.
+ *
+ * HPA tells me bootloaders need to parse mem=, so no new
+ * option should be mem= [also see Documentation/i386/boot.txt]
+ */
+static int __init parse_mem(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
- for (;;) {
- if (c != ' ')
- goto next_char;
- /*
- * "mem=nopentium" disables the 4MB page tables.
- * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
- * to <mem>, overriding the bios size.
- * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
- * <start> to <start>+<mem>, overriding the bios size.
- *
- * HPA tells me bootloaders need to parse mem=, so no new
- * option should be mem= [also see Documentation/i386/boot.txt]
+ if (strcmp(arg, "nopentium") == 0) {
+ clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+ disable_pse = 1;
+ } else {
+ /* If the user specifies memory size, we
+ * limit the BIOS-provided memory map to
+ * that size. exactmap can be used to specify
+ * the exact map. mem=number can be used to
+ * trim the existing memory map.
*/
- if (!memcmp(from, "mem=", 4)) {
- if (to != command_line)
- to--;
- if (!memcmp(from+4, "nopentium", 9)) {
- from += 9+4;
- clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
- disable_pse = 1;
- } else {
- /* If the user specifies memory size, we
- * limit the BIOS-provided memory map to
- * that size. exactmap can be used to specify
- * the exact map. mem=number can be used to
- * trim the existing memory map.
- */
- unsigned long long mem_size;
-
- mem_size = memparse(from+4, &from);
- limit_regions(mem_size);
- userdef=1;
- }
- }
-
- else if (!memcmp(from, "memmap=", 7)) {
- if (to != command_line)
- to--;
- if (!memcmp(from+7, "exactmap", 8)) {
-#ifdef CONFIG_CRASH_DUMP
- /* If we are doing a crash dump, we
- * still need to know the real mem
- * size before original memory map is
- * reset.
- */
- find_max_pfn();
- saved_max_pfn = max_pfn;
-#endif
- from += 8+7;
- e820.nr_map = 0;
- userdef = 1;
- } else {
- /* If the user specifies memory size, we
- * limit the BIOS-provided memory map to
- * that size. exactmap can be used to specify
- * the exact map. mem=number can be used to
- * trim the existing memory map.
- */
- unsigned long long start_at, mem_size;
+ unsigned long long mem_size;
- mem_size = memparse(from+7, &from);
- if (*from == '@') {
- start_at = memparse(from+1, &from);
- add_memory_region(start_at, mem_size, E820_RAM);
- } else if (*from == '#') {
- start_at = memparse(from+1, &from);
- add_memory_region(start_at, mem_size, E820_ACPI);
- } else if (*from == '$') {
- start_at = memparse(from+1, &from);
- add_memory_region(start_at, mem_size, E820_RESERVED);
- } else {
- limit_regions(mem_size);
- userdef=1;
- }
- }
- }
-
- else if (!memcmp(from, "noexec=", 7))
- noexec_setup(from + 7);
+ mem_size = memparse(arg, &arg);
+ limit_regions(mem_size);
+ user_defined_memmap = 1;
+ }
+ return 0;
+}
+early_param("mem", parse_mem);
+static int __init parse_memmap(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
-#ifdef CONFIG_X86_SMP
- /*
- * If the BIOS enumerates physical processors before logical,
- * maxcpus=N at enumeration-time can be used to disable HT.
+ if (strcmp(arg, "exactmap") == 0) {
+#ifdef CONFIG_CRASH_DUMP
+ /* If we are doing a crash dump, we
+ * still need to know the real mem
+ * size before original memory map is
+ * reset.
*/
- else if (!memcmp(from, "maxcpus=", 8)) {
- extern unsigned int maxcpus;
-
- maxcpus = simple_strtoul(from + 8, NULL, 0);
- }
+ find_max_pfn();
+ saved_max_pfn = max_pfn;
#endif
-
-#ifdef CONFIG_ACPI
- /* "acpi=off" disables both ACPI table parsing and interpreter */
- else if (!memcmp(from, "acpi=off", 8)) {
- disable_acpi();
- }
-
- /* acpi=force to over-ride black-list */
- else if (!memcmp(from, "acpi=force", 10)) {
- acpi_force = 1;
- acpi_ht = 1;
- acpi_disabled = 0;
- }
-
- /* acpi=strict disables out-of-spec workarounds */
- else if (!memcmp(from, "acpi=strict", 11)) {
- acpi_strict = 1;
- }
-
- /* Limit ACPI just to boot-time to enable HT */
- else if (!memcmp(from, "acpi=ht", 7)) {
- if (!acpi_force)
- disable_acpi();
- acpi_ht = 1;
- }
-
- /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
- else if (!memcmp(from, "pci=noacpi", 10)) {
- acpi_disable_pci();
- }
- /* "acpi=noirq" disables ACPI interrupt routing */
- else if (!memcmp(from, "acpi=noirq", 10)) {
- acpi_noirq_set();
+ e820.nr_map = 0;
+ user_defined_memmap = 1;
+ } else {
+ /* If the user specifies memory size, we
+ * limit the BIOS-provided memory map to
+ * that size. exactmap can be used to specify
+ * the exact map. mem=number can be used to
+ * trim the existing memory map.
+ */
+ unsigned long long start_at, mem_size;
+
+ mem_size = memparse(arg, &arg);
+ if (*arg == '@') {
+ start_at = memparse(arg+1, &arg);
+ add_memory_region(start_at, mem_size, E820_RAM);
+ } else if (*arg == '#') {
+ start_at = memparse(arg+1, &arg);
+ add_memory_region(start_at, mem_size, E820_ACPI);
+ } else if (*arg == '$') {
+ start_at = memparse(arg+1, &arg);
+ add_memory_region(start_at, mem_size, E820_RESERVED);
+ } else {
+ limit_regions(mem_size);
+ user_defined_memmap = 1;
}
+ }
+ return 0;
+}
+early_param("memmap", parse_memmap);
- else if (!memcmp(from, "acpi_sci=edge", 13))
- acpi_sci_flags.trigger = 1;
-
- else if (!memcmp(from, "acpi_sci=level", 14))
- acpi_sci_flags.trigger = 3;
-
- else if (!memcmp(from, "acpi_sci=high", 13))
- acpi_sci_flags.polarity = 1;
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel.
+ */
+static int __init parse_elfcorehdr(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
- else if (!memcmp(from, "acpi_sci=low", 12))
- acpi_sci_flags.polarity = 3;
+ elfcorehdr_addr = memparse(arg, &arg);
+ return 0;
+}
+early_param("elfcorehdr", parse_elfcorehdr);
+#endif /* CONFIG_PROC_VMCORE */
-#ifdef CONFIG_X86_IO_APIC
- else if (!memcmp(from, "acpi_skip_timer_override", 24))
- acpi_skip_timer_override = 1;
+/*
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
+ */
+static int __init parse_highmem(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
- if (!memcmp(from, "disable_timer_pin_1", 19))
- disable_timer_pin_1 = 1;
- if (!memcmp(from, "enable_timer_pin_1", 18))
- disable_timer_pin_1 = -1;
+ highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+ return 0;
+}
+early_param("highmem", parse_highmem);
- /* disable IO-APIC */
- else if (!memcmp(from, "noapic", 6))
- disable_ioapic_setup();
-#endif /* CONFIG_X86_IO_APIC */
-#endif /* CONFIG_ACPI */
+/*
+ * vmalloc=size forces the vmalloc area to be exactly 'size'
+ * bytes. This can be used to increase (or decrease) the
+ * vmalloc area - the default is 128m.
+ */
+static int __init parse_vmalloc(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
-#ifdef CONFIG_X86_LOCAL_APIC
- /* enable local APIC */
- else if (!memcmp(from, "lapic", 5))
- lapic_enable();
+ __VMALLOC_RESERVE = memparse(arg, &arg);
+ return 0;
+}
+early_param("vmalloc", parse_vmalloc);
- /* disable local APIC */
- else if (!memcmp(from, "nolapic", 6))
- lapic_disable();
-#endif /* CONFIG_X86_LOCAL_APIC */
+/*
+ * reservetop=size reserves a hole at the top of the kernel address space which
+ * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
+ * so relocating the fixmap can be done before paging initialization.
+ */
+static int __init parse_reservetop(char *arg)
+{
+ unsigned long address;
-#ifdef CONFIG_KEXEC
- /* crashkernel=size@addr specifies the location to reserve for
- * a crash kernel. By reserving this memory we guarantee
- * that linux never set's it up as a DMA target.
- * Useful for holding code to do something appropriate
- * after a kernel panic.
- */
- else if (!memcmp(from, "crashkernel=", 12)) {
- unsigned long size, base;
- size = memparse(from+12, &from);
- if (*from == '@') {
- base = memparse(from+1, &from);
- /* FIXME: Do I want a sanity check
- * to validate the memory range?
- */
- crashk_res.start = base;
- crashk_res.end = base + size - 1;
- }
- }
-#endif
-#ifdef CONFIG_PROC_VMCORE
- /* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel.
- */
- else if (!memcmp(from, "elfcorehdr=", 11))
- elfcorehdr_addr = memparse(from+11, &from);
-#endif
+ if (!arg)
+ return -EINVAL;
- /*
- * highmem=size forces highmem to be exactly 'size' bytes.
- * This works even on boxes that have no highmem otherwise.
- * This also works to reduce highmem size on bigger boxes.
- */
- else if (!memcmp(from, "highmem=", 8))
- highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
-
- /*
- * vmalloc=size forces the vmalloc area to be exactly 'size'
- * bytes. This can be used to increase (or decrease) the
- * vmalloc area - the default is 128m.
- */
- else if (!memcmp(from, "vmalloc=", 8))
- __VMALLOC_RESERVE = memparse(from+8, &from);
-
- next_char:
- c = *(from++);
- if (!c)
- break;
- if (COMMAND_LINE_SIZE <= ++len)
- break;
- *(to++) = c;
- }
- *to = '\0';
- *cmdline_p = command_line;
- if (userdef) {
- printk(KERN_INFO "user-defined physical RAM map:\n");
- print_memory_map("user");
- }
+ address = memparse(arg, &arg);
+ reserve_top_address(address);
+ return 0;
}
+early_param("reservetop", parse_reservetop);
/*
* Callback for efi_memory_walk.
@@ -1170,6 +1070,14 @@ static unsigned long __init setup_memory(void)
}
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
+ num_physpages = highend_pfn;
+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+ num_physpages = max_low_pfn;
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+#ifdef CONFIG_FLATMEM
+ max_mapnr = num_physpages;
#endif
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(max_low_pfn));
@@ -1181,22 +1089,20 @@ static unsigned long __init setup_memory(void)
void __init zone_sizes_init(void)
{
- unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
- unsigned int max_dma, low;
-
- max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- low = max_low_pfn;
-
- if (low < max_dma)
- zones_size[ZONE_DMA] = low;
- else {
- zones_size[ZONE_DMA] = max_dma;
- zones_size[ZONE_NORMAL] = low - max_dma;
#ifdef CONFIG_HIGHMEM
- zones_size[ZONE_HIGHMEM] = highend_pfn - low;
+ unsigned long max_zone_pfns[MAX_NR_ZONES] = {
+ virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT,
+ max_low_pfn,
+ highend_pfn};
+ add_active_range(0, 0, highend_pfn);
+#else
+ unsigned long max_zone_pfns[MAX_NR_ZONES] = {
+ virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT,
+ max_low_pfn};
+ add_active_range(0, 0, max_low_pfn);
#endif
- }
- free_area_init(zones_size);
+
+ free_area_init_nodes(max_zone_pfns);
}
#else
extern unsigned long __init setup_memory(void);
@@ -1258,7 +1164,7 @@ void __init setup_bootmem_allocator(void)
*/
find_smp_config();
#endif
-
+ numa_kva_reserve();
#ifdef CONFIG_BLK_DEV_INITRD
if (LOADER_TYPE && INITRD_START) {
if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
@@ -1499,17 +1405,15 @@ void __init setup_arch(char **cmdline_p)
data_resource.start = virt_to_phys(_etext);
data_resource.end = virt_to_phys(_edata)-1;
- parse_cmdline_early(cmdline_p);
+ parse_early_param();
-#ifdef CONFIG_EARLY_PRINTK
- {
- char *s = strstr(*cmdline_p, "earlyprintk=");
- if (s) {
- setup_early_printk(strchr(s, '=') + 1);
- printk("early console enabled\n");
- }
+ if (user_defined_memmap) {
+ printk(KERN_INFO "user-defined physical RAM map:\n");
+ print_memory_map("user");
}
-#endif
+
+ strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
max_low_pfn = setup_memory();
@@ -1538,7 +1442,7 @@ void __init setup_arch(char **cmdline_p)
dmi_scan_machine();
#ifdef CONFIG_X86_GENERICARCH
- generic_apic_probe(*cmdline_p);
+ generic_apic_probe();
#endif
if (efi_enabled)
efi_map_memmap();
@@ -1550,9 +1454,11 @@ void __init setup_arch(char **cmdline_p)
acpi_boot_table_init();
#endif
+#ifdef CONFIG_PCI
#ifdef CONFIG_X86_IO_APIC
check_acpi_pci(); /* Checks more than just ACPI actually */
#endif
+#endif
#ifdef CONFIG_ACPI
acpi_boot_init();
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index c10789d7a9d..465188e2d70 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -634,3 +634,69 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
}
}
+/*
+ * this function sends a 'generic call function' IPI to one other CPU
+ * in the system.
+ *
+ * cpu is a standard Linux logical CPU number.
+ */
+static void
+__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ struct call_data_struct data;
+ int cpus = 1;
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ wmb();
+ /* Send a message to all other CPUs and wait for them to respond */
+ send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus)
+ cpu_relax();
+
+ if (!wait)
+ return;
+
+ while (atomic_read(&data.finished) != cpus)
+ cpu_relax();
+}
+
+/*
+ * smp_call_function_single - Run a function on another CPU
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Currently unused.
+ * @wait: If true, wait until function has completed on other CPUs.
+ *
+ * Retrurns 0 on success, else a negative status code.
+ *
+ * Does not return until the remote CPU is nearly ready to execute <func>
+ * or is or has executed.
+ */
+
+int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ /* prevent preemption and reschedule on another processor */
+ int me = get_cpu();
+ if (cpu == me) {
+ WARN_ON(1);
+ put_cpu();
+ return -EBUSY;
+ }
+ spin_lock_bh(&call_lock);
+ __smp_call_function_single(cpu, func, info, nonatomic, wait);
+ spin_unlock_bh(&call_lock);
+ put_cpu();
+ return 0;
+}
+EXPORT_SYMBOL(smp_call_function_single);
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index f948419c888..020d873b7d2 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -177,6 +177,9 @@ static void __devinit smp_store_cpu_info(int id)
*/
if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
+ if (num_possible_cpus() == 1)
+ goto valid_k7;
+
/* Athlon 660/661 is valid. */
if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
goto valid_k7;
@@ -642,9 +645,13 @@ static void map_cpu_to_logical_apicid(void)
{
int cpu = smp_processor_id();
int apicid = logical_smp_processor_id();
+ int node = apicid_to_node(apicid);
+
+ if (!node_online(node))
+ node = first_online_node;
cpu_2_logical_apicid[cpu] = apicid;
- map_cpu_to_node(cpu, apicid_to_node(apicid));
+ map_cpu_to_node(cpu, node);
}
static void unmap_cpu_to_logical_apicid(int cpu)
@@ -1372,7 +1379,8 @@ int __cpu_disable(void)
*/
if (cpu == 0)
return -EBUSY;
-
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ stop_apic_nmi_watchdog(NULL);
clear_local_APIC();
/* Allow any queued timer interrupts to get serviced */
local_irq_enable();
@@ -1486,3 +1494,16 @@ void __init smp_intr_init(void)
/* IPI for generic function call */
set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
}
+
+/*
+ * If the BIOS enumerates physical processors before logical,
+ * maxcpus=N at enumeration-time can be used to disable HT.
+ */
+static int __init parse_maxcpus(char *arg)
+{
+ extern unsigned int maxcpus;
+
+ maxcpus = simple_strtoul(arg, NULL, 0);
+ return 0;
+}
+early_param("maxcpus", parse_maxcpus);
diff --git a/arch/i386/kernel/srat.c b/arch/i386/kernel/srat.c
index b1809c9a089..32413122c4c 100644
--- a/arch/i386/kernel/srat.c
+++ b/arch/i386/kernel/srat.c
@@ -42,7 +42,7 @@
#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
-#define MAX_CHUNKS_PER_NODE 4
+#define MAX_CHUNKS_PER_NODE 3
#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
struct node_memory_chunk_s {
unsigned long start_pfn;
@@ -54,8 +54,6 @@ struct node_memory_chunk_s {
static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
static int num_memory_chunks; /* total number of memory chunks */
-static int zholes_size_init;
-static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
extern void * boot_ioremap(unsigned long, unsigned long);
@@ -135,50 +133,6 @@ static void __init parse_memory_affinity_structure (char *sratp)
"enabled and removable" : "enabled" ) );
}
-#if MAX_NR_ZONES != 4
-#error "MAX_NR_ZONES != 4, chunk_to_zone requires review"
-#endif
-/* Take a chunk of pages from page frame cstart to cend and count the number
- * of pages in each zone, returned via zones[].
- */
-static __init void chunk_to_zones(unsigned long cstart, unsigned long cend,
- unsigned long *zones)
-{
- unsigned long max_dma;
- extern unsigned long max_low_pfn;
-
- int z;
- unsigned long rend;
-
- /* FIXME: MAX_DMA_ADDRESS and max_low_pfn are trying to provide
- * similarly scoped information and should be handled in a consistant
- * manner.
- */
- max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
- /* Split the hole into the zones in which it falls. Repeatedly
- * take the segment in which the remaining hole starts, round it
- * to the end of that zone.
- */
- memset(zones, 0, MAX_NR_ZONES * sizeof(long));
- while (cstart < cend) {
- if (cstart < max_dma) {
- z = ZONE_DMA;
- rend = (cend < max_dma)? cend : max_dma;
-
- } else if (cstart < max_low_pfn) {
- z = ZONE_NORMAL;
- rend = (cend < max_low_pfn)? cend : max_low_pfn;
-
- } else {
- z = ZONE_HIGHMEM;
- rend = cend;
- }
- zones[z] += rend - cstart;
- cstart = rend;
- }
-}
-
/*
* The SRAT table always lists ascending addresses, so can always
* assume that the first "start" address that you see is the real
@@ -223,7 +177,6 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */
memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
- memset(zholes_size, 0, sizeof(zholes_size));
num_memory_chunks = 0;
while (p < end) {
@@ -287,6 +240,7 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
node_read_chunk(chunk->nid, chunk);
+ add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
}
for_each_online_node(nid) {
@@ -395,57 +349,7 @@ int __init get_memcfg_from_srat(void)
return acpi20_parse_srat((struct acpi_table_srat *)header);
}
out_err:
+ remove_all_active_ranges();
printk("failed to get NUMA memory information from SRAT table\n");
return 0;
}
-
-/* For each node run the memory list to determine whether there are
- * any memory holes. For each hole determine which ZONE they fall
- * into.
- *
- * NOTE#1: this requires knowledge of the zone boundries and so
- * _cannot_ be performed before those are calculated in setup_memory.
- *
- * NOTE#2: we rely on the fact that the memory chunks are ordered by
- * start pfn number during setup.
- */
-static void __init get_zholes_init(void)
-{
- int nid;
- int c;
- int first;
- unsigned long end = 0;
-
- for_each_online_node(nid) {
- first = 1;
- for (c = 0; c < num_memory_chunks; c++){
- if (node_memory_chunk[c].nid == nid) {
- if (first) {
- end = node_memory_chunk[c].end_pfn;
- first = 0;
-
- } else {
- /* Record any gap between this chunk
- * and the previous chunk on this node
- * against the zones it spans.
- */
- chunk_to_zones(end,
- node_memory_chunk[c].start_pfn,
- &zholes_size[nid * MAX_NR_ZONES]);
- }
- }
- }
- }
-}
-
-unsigned long * __init get_zholes_size(int nid)
-{
- if (!zholes_size_init) {
- zholes_size_init++;
- get_zholes_init();
- }
- if (nid >= MAX_NUMNODES || !node_online(nid))
- printk("%s: nid = %d is invalid/offline. num_online_nodes = %d",
- __FUNCTION__, nid, num_online_nodes());
- return &zholes_size[nid * MAX_NR_ZONES];
-}
diff --git a/arch/i386/kernel/stacktrace.c b/arch/i386/kernel/stacktrace.c
deleted file mode 100644
index e62a037ab39..00000000000
--- a/arch/i386/kernel/stacktrace.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * arch/i386/kernel/stacktrace.c
- *
- * Stack trace management functions
- *
- * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- */
-#include <linux/sched.h>
-#include <linux/stacktrace.h>
-
-static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
-{
- return p > (void *)tinfo &&
- p < (void *)tinfo + THREAD_SIZE - 3;
-}
-
-/*
- * Save stack-backtrace addresses into a stack_trace buffer:
- */
-static inline unsigned long
-save_context_stack(struct stack_trace *trace, unsigned int skip,
- struct thread_info *tinfo, unsigned long *stack,
- unsigned long ebp)
-{
- unsigned long addr;
-
-#ifdef CONFIG_FRAME_POINTER
- while (valid_stack_ptr(tinfo, (void *)ebp)) {
- addr = *(unsigned long *)(ebp + 4);
- if (!skip)
- trace->entries[trace->nr_entries++] = addr;
- else
- skip--;
- if (trace->nr_entries >= trace->max_entries)
- break;
- /*
- * break out of recursive entries (such as
- * end_of_stack_stop_unwind_function):
- */
- if (ebp == *(unsigned long *)ebp)
- break;
-
- ebp = *(unsigned long *)ebp;
- }
-#else
- while (valid_stack_ptr(tinfo, stack)) {
- addr = *stack++;
- if (__kernel_text_address(addr)) {
- if (!skip)
- trace->entries[trace->nr_entries++] = addr;
- else
- skip--;
- if (trace->nr_entries >= trace->max_entries)
- break;
- }
- }
-#endif
-
- return ebp;
-}
-
-/*
- * Save stack-backtrace addresses into a stack_trace buffer.
- * If all_contexts is set, all contexts (hardirq, softirq and process)
- * are saved. If not set then only the current context is saved.
- */
-void save_stack_trace(struct stack_trace *trace,
- struct task_struct *task, int all_contexts,
- unsigned int skip)
-{
- unsigned long ebp;
- unsigned long *stack = &ebp;
-
- WARN_ON(trace->nr_entries || !trace->max_entries);
-
- if (!task || task == current) {
- /* Grab ebp right from our regs: */
- asm ("movl %%ebp, %0" : "=r" (ebp));
- } else {
- /* ebp is the last reg pushed by switch_to(): */
- ebp = *(unsigned long *) task->thread.esp;
- }
-
- while (1) {
- struct thread_info *context = (struct thread_info *)
- ((unsigned long)stack & (~(THREAD_SIZE - 1)));
-
- ebp = save_context_stack(trace, skip, context, stack, ebp);
- stack = (unsigned long *)context->previous_esp;
- if (!all_contexts || !stack ||
- trace->nr_entries >= trace->max_entries)
- break;
- trace->entries[trace->nr_entries++] = ULONG_MAX;
- if (trace->nr_entries >= trace->max_entries)
- break;
- }
-}
-
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d477539..7e639f78b0b 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,4 @@ ENTRY(sys_call_table)
.long sys_tee /* 315 */
.long sys_vmsplice
.long sys_move_pages
+ .long sys_getcpu
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index edd00f6cee3..86944acfb64 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -130,18 +130,33 @@ static int set_rtc_mmss(unsigned long nowtime)
int timer_ack;
-#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
unsigned long profile_pc(struct pt_regs *regs)
{
unsigned long pc = instruction_pointer(regs);
- if (!user_mode_vm(regs) && in_lock_functions(pc))
+#ifdef CONFIG_SMP
+ if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+#ifdef CONFIG_FRAME_POINTER
return *(unsigned long *)(regs->ebp + 4);
-
+#else
+ unsigned long *sp;
+ if ((regs->xcs & 3) == 0)
+ sp = (unsigned long *)&regs->esp;
+ else
+ sp = (unsigned long *)regs->esp;
+ /* Return address is either directly at stack pointer
+ or above a saved eflags. Eflags has bits 22-31 zero,
+ kernel addresses don't. */
+ if (sp[0] >> 22)
+ return sp[0];
+ if (sp[1] >> 22)
+ return sp[1];
+#endif
+ }
+#endif
return pc;
}
EXPORT_SYMBOL(profile_pc);
-#endif
/*
* This is the same as the above, except we _also_ save the current
@@ -270,16 +285,19 @@ void notify_arch_cmos_timer(void)
mod_timer(&sync_cmos_timer, jiffies + 1);
}
-static long clock_cmos_diff, sleep_start;
+static long clock_cmos_diff;
+static unsigned long sleep_start;
static int timer_suspend(struct sys_device *dev, pm_message_t state)
{
/*
* Estimate time zone so that set_time can update the clock
*/
- clock_cmos_diff = -get_cmos_time();
+ unsigned long ctime = get_cmos_time();
+
+ clock_cmos_diff = -ctime;
clock_cmos_diff += get_seconds();
- sleep_start = get_cmos_time();
+ sleep_start = ctime;
return 0;
}
@@ -287,18 +305,29 @@ static int timer_resume(struct sys_device *dev)
{
unsigned long flags;
unsigned long sec;
- unsigned long sleep_length;
-
+ unsigned long ctime = get_cmos_time();
+ long sleep_length = (ctime - sleep_start) * HZ;
+ struct timespec ts;
+
+ if (sleep_length < 0) {
+ printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n");
+ /* The time after the resume must not be earlier than the time
+ * before the suspend or some nasty things will happen
+ */
+ sleep_length = 0;
+ ctime = sleep_start;
+ }
#ifdef CONFIG_HPET_TIMER
if (is_hpet_enabled())
hpet_reenable();
#endif
setup_pit_timer();
- sec = get_cmos_time() + clock_cmos_diff;
- sleep_length = (get_cmos_time() - sleep_start) * HZ;
+
+ sec = ctime + clock_cmos_diff;
+ ts.tv_sec = sec;
+ ts.tv_nsec = 0;
+ do_settimeofday(&ts);
write_seqlock_irqsave(&xtime_lock, flags);
- xtime.tv_sec = sec;
- xtime.tv_nsec = 0;
jiffies_64 += sleep_length;
wall_jiffies += sleep_length;
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -334,10 +363,11 @@ extern void (*late_time_init)(void);
/* Duplicate of time_init() below, with hpet_enable part added */
static void __init hpet_time_init(void)
{
- xtime.tv_sec = get_cmos_time();
- xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
- set_normalized_timespec(&wall_to_monotonic,
- -xtime.tv_sec, -xtime.tv_nsec);
+ struct timespec ts;
+ ts.tv_sec = get_cmos_time();
+ ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+
+ do_settimeofday(&ts);
if ((hpet_enable() >= 0) && hpet_use_timer) {
printk("Using HPET for base-timer\n");
@@ -349,6 +379,7 @@ static void __init hpet_time_init(void)
void __init time_init(void)
{
+ struct timespec ts;
#ifdef CONFIG_HPET_TIMER
if (is_hpet_capable()) {
/*
@@ -359,10 +390,10 @@ void __init time_init(void)
return;
}
#endif
- xtime.tv_sec = get_cmos_time();
- xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
- set_normalized_timespec(&wall_to_monotonic,
- -xtime.tv_sec, -xtime.tv_nsec);
+ ts.tv_sec = get_cmos_time();
+ ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+
+ do_settimeofday(&ts);
time_init_hook();
}
diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
index 14a1376fedd..6bf14a4e995 100644
--- a/arch/i386/kernel/time_hpet.c
+++ b/arch/i386/kernel/time_hpet.c
@@ -301,23 +301,25 @@ int hpet_rtc_timer_init(void)
hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
local_irq_save(flags);
+
cnt = hpet_readl(HPET_COUNTER);
cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
hpet_writel(cnt, HPET_T1_CMP);
hpet_t1_cmp = cnt;
- local_irq_restore(flags);
cfg = hpet_readl(HPET_T1_CFG);
cfg &= ~HPET_TN_PERIODIC;
cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
hpet_writel(cfg, HPET_T1_CFG);
+ local_irq_restore(flags);
+
return 1;
}
static void hpet_rtc_timer_reinit(void)
{
- unsigned int cfg, cnt;
+ unsigned int cfg, cnt, ticks_per_int, lost_ints;
if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
cfg = hpet_readl(HPET_T1_CFG);
@@ -332,10 +334,33 @@ static void hpet_rtc_timer_reinit(void)
hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
/* It is more accurate to use the comparator value than current count.*/
- cnt = hpet_t1_cmp;
- cnt += hpet_tick*HZ/hpet_rtc_int_freq;
- hpet_writel(cnt, HPET_T1_CMP);
- hpet_t1_cmp = cnt;
+ ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
+ hpet_t1_cmp += ticks_per_int;
+ hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+ /*
+ * If the interrupt handler was delayed too long, the write above tries
+ * to schedule the next interrupt in the past and the hardware would
+ * not interrupt until the counter had wrapped around.
+ * So we have to check that the comparator wasn't set to a past time.
+ */
+ cnt = hpet_readl(HPET_COUNTER);
+ if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
+ lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
+ /* Make sure that, even with the time needed to execute
+ * this code, the next scheduled interrupt has been moved
+ * back to the future: */
+ lost_ints++;
+
+ hpet_t1_cmp += lost_ints * ticks_per_int;
+ hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+ if (PIE_on)
+ PIE_count += lost_ints;
+
+ printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
+ hpet_rtc_int_freq);
+ }
}
/*
diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index e2e281d4bcc..07d6da36a82 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -28,6 +28,7 @@
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/nodemask.h>
+#include <linux/mmzone.h>
#include <asm/cpu.h>
static struct i386_cpu cpu_devices[NR_CPUS];
@@ -55,34 +56,18 @@ EXPORT_SYMBOL(arch_register_cpu);
EXPORT_SYMBOL(arch_unregister_cpu);
#endif /*CONFIG_HOTPLUG_CPU*/
-
-
-#ifdef CONFIG_NUMA
-#include <linux/mmzone.h>
-
static int __init topology_init(void)
{
int i;
+#ifdef CONFIG_NUMA
for_each_online_node(i)
register_one_node(i);
+#endif /* CONFIG_NUMA */
for_each_present_cpu(i)
arch_register_cpu(i);
return 0;
}
-#else /* !CONFIG_NUMA */
-
-static int __init topology_init(void)
-{
- int i;
-
- for_each_present_cpu(i)
- arch_register_cpu(i);
- return 0;
-}
-
-#endif /* CONFIG_NUMA */
-
subsys_initcall(topology_init);
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 7e9edafffd8..a13037fe0ee 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -28,6 +28,7 @@
#include <linux/kprobes.h>
#include <linux/kexec.h>
#include <linux/unwind.h>
+#include <linux/uaccess.h>
#ifdef CONFIG_EISA
#include <linux/ioport.h>
@@ -40,7 +41,6 @@
#include <asm/processor.h>
#include <asm/system.h>
-#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/atomic.h>
#include <asm/debugreg.h>
@@ -51,6 +51,7 @@
#include <asm/smp.h>
#include <asm/arch_hooks.h>
#include <asm/kdebug.h>
+#include <asm/stacktrace.h>
#include <linux/module.h>
@@ -118,26 +119,16 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
p < (void *)tinfo + THREAD_SIZE - 3;
}
-/*
- * Print one address/symbol entries per line.
- */
-static inline void print_addr_and_symbol(unsigned long addr, char *log_lvl)
-{
- printk(" [<%08lx>] ", addr);
-
- print_symbol("%s\n", addr);
-}
-
static inline unsigned long print_context_stack(struct thread_info *tinfo,
unsigned long *stack, unsigned long ebp,
- char *log_lvl)
+ struct stacktrace_ops *ops, void *data)
{
unsigned long addr;
#ifdef CONFIG_FRAME_POINTER
while (valid_stack_ptr(tinfo, (void *)ebp)) {
addr = *(unsigned long *)(ebp + 4);
- print_addr_and_symbol(addr, log_lvl);
+ ops->address(data, addr);
/*
* break out of recursive entries (such as
* end_of_stack_stop_unwind_function):
@@ -150,30 +141,37 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
while (valid_stack_ptr(tinfo, stack)) {
addr = *stack++;
if (__kernel_text_address(addr))
- print_addr_and_symbol(addr, log_lvl);
+ ops->address(data, addr);
}
#endif
return ebp;
}
+struct ops_and_data {
+ struct stacktrace_ops *ops;
+ void *data;
+};
+
static asmlinkage int
-show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
+dump_trace_unwind(struct unwind_frame_info *info, void *data)
{
+ struct ops_and_data *oad = (struct ops_and_data *)data;
int n = 0;
while (unwind(info) == 0 && UNW_PC(info)) {
n++;
- print_addr_and_symbol(UNW_PC(info), log_lvl);
+ oad->ops->address(oad->data, UNW_PC(info));
if (arch_unw_user_mode(info))
break;
}
return n;
}
-static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, char *log_lvl)
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack,
+ struct stacktrace_ops *ops, void *data)
{
- unsigned long ebp;
+ unsigned long ebp = 0;
if (!task)
task = current;
@@ -181,54 +179,116 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (call_trace >= 0) {
int unw_ret = 0;
struct unwind_frame_info info;
+ struct ops_and_data oad = { .ops = ops, .data = data };
if (regs) {
if (unwind_init_frame_info(&info, task, regs) == 0)
- unw_ret = show_trace_unwind(&info, log_lvl);
+ unw_ret = dump_trace_unwind(&info, &oad);
} else if (task == current)
- unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl);
+ unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
else {
if (unwind_init_blocked(&info, task) == 0)
- unw_ret = show_trace_unwind(&info, log_lvl);
+ unw_ret = dump_trace_unwind(&info, &oad);
}
if (unw_ret > 0) {
if (call_trace == 1 && !arch_unw_user_mode(&info)) {
- print_symbol("DWARF2 unwinder stuck at %s\n",
+ ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
UNW_PC(&info));
if (UNW_SP(&info) >= PAGE_OFFSET) {
- printk("Leftover inexact backtrace:\n");
+ ops->warning(data, "Leftover inexact backtrace:\n");
stack = (void *)UNW_SP(&info);
+ if (!stack)
+ return;
+ ebp = UNW_FP(&info);
} else
- printk("Full inexact backtrace again:\n");
+ ops->warning(data, "Full inexact backtrace again:\n");
} else if (call_trace >= 1)
return;
else
- printk("Full inexact backtrace again:\n");
+ ops->warning(data, "Full inexact backtrace again:\n");
} else
- printk("Inexact backtrace:\n");
+ ops->warning(data, "Inexact backtrace:\n");
+ }
+ if (!stack) {
+ unsigned long dummy;
+ stack = &dummy;
+ if (task && task != current)
+ stack = (unsigned long *)task->thread.esp;
}
- if (task == current) {
- /* Grab ebp right from our regs */
- asm ("movl %%ebp, %0" : "=r" (ebp) : );
- } else {
- /* ebp is the last reg pushed by switch_to */
- ebp = *(unsigned long *) task->thread.esp;
+#ifdef CONFIG_FRAME_POINTER
+ if (!ebp) {
+ if (task == current) {
+ /* Grab ebp right from our regs */
+ asm ("movl %%ebp, %0" : "=r" (ebp) : );
+ } else {
+ /* ebp is the last reg pushed by switch_to */
+ ebp = *(unsigned long *) task->thread.esp;
+ }
}
+#endif
while (1) {
struct thread_info *context;
context = (struct thread_info *)
((unsigned long)stack & (~(THREAD_SIZE - 1)));
- ebp = print_context_stack(context, stack, ebp, log_lvl);
+ ebp = print_context_stack(context, stack, ebp, ops, data);
+ /* Should be after the line below, but somewhere
+ in early boot context comes out corrupted and we
+ can't reference it -AK */
+ if (ops->stack(data, "IRQ") < 0)
+ break;
stack = (unsigned long*)context->previous_esp;
if (!stack)
break;
- printk("%s =======================\n", log_lvl);
}
}
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ printk(data);
+ print_symbol(msg, symbol);
+ printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+ printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+ return 0;
+}
+
+/*
+ * Print one address/symbol entries per line.
+ */
+static void print_trace_address(void *data, unsigned long addr)
+{
+ printk("%s [<%08lx>] ", (char *)data, addr);
+ print_symbol("%s\n", addr);
+}
+
+static struct stacktrace_ops print_trace_ops = {
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
+};
+
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long * stack, char *log_lvl)
+{
+ dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
+ printk("%s =======================\n", log_lvl);
+}
-void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack)
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long * stack)
{
show_trace_log_lvl(task, regs, stack, "");
}
@@ -291,8 +351,9 @@ void show_registers(struct pt_regs *regs)
ss = regs->xss & 0xffff;
}
print_modules();
- printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n"
- "EFLAGS: %08lx (%s %.*s) \n",
+ printk(KERN_EMERG "CPU: %d\n"
+ KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n"
+ KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n",
smp_processor_id(), 0xffff & regs->xcs, regs->eip,
print_tainted(), regs->eflags, system_utsname.release,
(int)strcspn(system_utsname.version, " "),
@@ -313,6 +374,8 @@ void show_registers(struct pt_regs *regs)
*/
if (in_kernel) {
u8 __user *eip;
+ int code_bytes = 64;
+ unsigned char c;
printk("\n" KERN_EMERG "Stack: ");
show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
@@ -320,9 +383,12 @@ void show_registers(struct pt_regs *regs)
printk(KERN_EMERG "Code: ");
eip = (u8 __user *)regs->eip - 43;
- for (i = 0; i < 64; i++, eip++) {
- unsigned char c;
-
+ if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
+ /* try starting at EIP */
+ eip = (u8 __user *)regs->eip;
+ code_bytes = 32;
+ }
+ for (i = 0; i < code_bytes; i++, eip++) {
if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
printk(" Bad EIP value.");
break;
@@ -343,7 +409,7 @@ static void handle_BUG(struct pt_regs *regs)
if (eip < PAGE_OFFSET)
return;
- if (__get_user(ud2, (unsigned short __user *)eip))
+ if (probe_kernel_address((unsigned short __user *)eip, ud2))
return;
if (ud2 != 0x0b0f)
return;
@@ -356,7 +422,8 @@ static void handle_BUG(struct pt_regs *regs)
char *file;
char c;
- if (__get_user(line, (unsigned short __user *)(eip + 2)))
+ if (probe_kernel_address((unsigned short __user *)(eip + 2),
+ line))
break;
if (__get_user(file, (char * __user *)(eip + 4)) ||
(unsigned long)file < PAGE_OFFSET || __get_user(c, file))
@@ -629,18 +696,24 @@ gp_in_kernel:
}
}
-static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
+static __kprobes void
+mem_parity_error(unsigned char reason, struct pt_regs * regs)
{
- printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
- "to continue\n");
+ printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
+ "CPU %d.\n", reason, smp_processor_id());
printk(KERN_EMERG "You probably have a hardware problem with your RAM "
"chips\n");
+ if (panic_on_unrecovered_nmi)
+ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
/* Clear and disable the memory parity error line. */
clear_mem_error(reason);
}
-static void io_check_error(unsigned char reason, struct pt_regs * regs)
+static __kprobes void
+io_check_error(unsigned char reason, struct pt_regs * regs)
{
unsigned long i;
@@ -656,7 +729,8 @@ static void io_check_error(unsigned char reason, struct pt_regs * regs)
outb(reason, 0x61);
}
-static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+static __kprobes void
+unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
{
#ifdef CONFIG_MCA
/* Might actually be able to figure out what the guilty party
@@ -666,15 +740,18 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
return;
}
#endif
- printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
- printk("Dazed and confused, but trying to continue\n");
- printk("Do you have a strange power saving mode enabled?\n");
+ printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
+ "CPU %d.\n", reason, smp_processor_id());
+ printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+ if (panic_on_unrecovered_nmi)
+ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
}
static DEFINE_SPINLOCK(nmi_print_lock);
-void die_nmi (struct pt_regs *regs, const char *msg)
+void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
{
if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
NOTIFY_STOP)
@@ -706,7 +783,7 @@ void die_nmi (struct pt_regs *regs, const char *msg)
do_exit(SIGSEGV);
}
-static void default_do_nmi(struct pt_regs * regs)
+static __kprobes void default_do_nmi(struct pt_regs * regs)
{
unsigned char reason = 0;
@@ -723,12 +800,12 @@ static void default_do_nmi(struct pt_regs * regs)
* Ok, so this is none of the documented NMI sources,
* so it must be the NMI watchdog.
*/
- if (nmi_watchdog) {
- nmi_watchdog_tick(regs);
+ if (nmi_watchdog_tick(regs, reason))
return;
- }
+ if (!do_nmi_callback(regs, smp_processor_id()))
#endif
- unknown_nmi_error(reason, regs);
+ unknown_nmi_error(reason, regs);
+
return;
}
if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
@@ -744,14 +821,7 @@ static void default_do_nmi(struct pt_regs * regs)
reassert_nmi();
}
-static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
-{
- return 0;
-}
-
-static nmi_callback_t nmi_callback = dummy_nmi_callback;
-
-fastcall void do_nmi(struct pt_regs * regs, long error_code)
+fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
{
int cpu;
@@ -761,25 +831,11 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code)
++nmi_count(cpu);
- if (!rcu_dereference(nmi_callback)(regs, cpu))
- default_do_nmi(regs);
+ default_do_nmi(regs);
nmi_exit();
}
-void set_nmi_callback(nmi_callback_t callback)
-{
- vmalloc_sync_all();
- rcu_assign_pointer(nmi_callback, callback);
-}
-EXPORT_SYMBOL_GPL(set_nmi_callback);
-
-void unset_nmi_callback(void)
-{
- nmi_callback = dummy_nmi_callback;
-}
-EXPORT_SYMBOL_GPL(unset_nmi_callback);
-
#ifdef CONFIG_KPROBES
fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
{
@@ -1119,20 +1175,6 @@ void __init trap_init_f00f_bug(void)
}
#endif
-#define _set_gate(gate_addr,type,dpl,addr,seg) \
-do { \
- int __d0, __d1; \
- __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
- "movw %4,%%dx\n\t" \
- "movl %%eax,%0\n\t" \
- "movl %%edx,%1" \
- :"=m" (*((long *) (gate_addr))), \
- "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
- :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
- "3" ((char *) (addr)),"2" ((seg) << 16)); \
-} while (0)
-
-
/*
* This needs to use 'idt_table' rather than 'idt', and
* thus use the _nonmapped_ version of the IDT, as the
@@ -1141,7 +1183,7 @@ do { \
*/
void set_intr_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
+ _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
}
/*
@@ -1149,22 +1191,22 @@ void set_intr_gate(unsigned int n, void *addr)
*/
static inline void set_system_intr_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS);
+ _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
}
static void __init set_trap_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
+ _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
}
static void __init set_system_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
+ _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
}
static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
{
- _set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
+ _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
}
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 7e0d8dab207..b8fa0a8b2e4 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -192,7 +192,7 @@ int recalibrate_cpu_khz(void)
EXPORT_SYMBOL(recalibrate_cpu_khz);
-void tsc_init(void)
+void __init tsc_init(void)
{
if (!cpu_has_tsc || tsc_disable)
return;
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 2d4f1386e2b..1e7ac1c44dd 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -13,6 +13,12 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
OUTPUT_ARCH(i386)
ENTRY(phys_startup_32)
jiffies = jiffies_64;
+
+PHDRS {
+ text PT_LOAD FLAGS(5); /* R_E */
+ data PT_LOAD FLAGS(7); /* RWE */
+ note PT_NOTE FLAGS(4); /* R__ */
+}
SECTIONS
{
. = __KERNEL_START;
@@ -26,7 +32,7 @@ SECTIONS
KPROBES_TEXT
*(.fixup)
*(.gnu.warning)
- } = 0x9090
+ } :text = 0x9090
_etext = .; /* End of text section */
@@ -48,7 +54,7 @@ SECTIONS
.data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
*(.data)
CONSTRUCTORS
- }
+ } :data
. = ALIGN(4096);
__nosave_begin = .;
@@ -184,4 +190,6 @@ SECTIONS
STABS_DEBUG
DWARF_DEBUG
+
+ NOTES
}