summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorH. Peter Anvin <hpa@linux.intel.com>2012-01-19 12:56:50 -0800
committerH. Peter Anvin <hpa@linux.intel.com>2012-01-19 12:56:50 -0800
commit282f445a779ed76fca9884fe377bf56a3088b208 (patch)
treed9abcf526baee0100672851e0a8894c19e762a39 /arch/x86/kernel
parent68f30fbee19cc67849b9fa8e153ede70758afe81 (diff)
parent90a4c0f51e8e44111a926be6f4c87af3938a79c3 (diff)
Merge remote-tracking branch 'linus/master' into x86/urgent
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/amd_nb.c31
-rw-r--r--arch/x86/kernel/apm_32.c16
-rw-r--r--arch/x86/kernel/asm-offsets.c2
-rw-r--r--arch/x86/kernel/asm-offsets_32.c8
-rw-r--r--arch/x86/kernel/asm-offsets_64.c19
-rw-r--r--arch/x86/kernel/cpu/common.c24
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c25
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c138
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c19
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c63
-rw-r--r--arch/x86/kernel/cpuid.c2
-rw-r--r--arch/x86/kernel/e820.c63
-rw-r--r--arch/x86/kernel/early_printk.c4
-rw-r--r--arch/x86/kernel/entry_32.S47
-rw-r--r--arch/x86/kernel/entry_64.S232
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/kernel/hpet.c1
-rw-r--r--arch/x86/kernel/irq_32.c5
-rw-r--r--arch/x86/kernel/irq_64.c35
-rw-r--r--arch/x86/kernel/irqinit.c2
-rw-r--r--arch/x86/kernel/kvm.c181
-rw-r--r--arch/x86/kernel/microcode_core.c64
-rw-r--r--arch/x86/kernel/msr.c2
-rw-r--r--arch/x86/kernel/nmi.c102
-rw-r--r--arch/x86/kernel/nmi_selftest.c180
-rw-r--r--arch/x86/kernel/pci-dma.c11
-rw-r--r--arch/x86/kernel/ptrace.c25
-rw-r--r--arch/x86/kernel/setup.c7
-rw-r--r--arch/x86/kernel/signal.c6
-rw-r--r--arch/x86/kernel/smp.c72
-rw-r--r--arch/x86/kernel/smpboot.c17
-rw-r--r--arch/x86/kernel/syscall_32.c25
-rw-r--r--arch/x86/kernel/syscall_64.c20
-rw-r--r--arch/x86/kernel/syscall_table_32.S350
-rw-r--r--arch/x86/kernel/traps.c20
-rw-r--r--arch/x86/kernel/tsc.c20
-rw-r--r--arch/x86/kernel/vm86_32.c6
-rw-r--r--arch/x86/kernel/x86_init.c1
40 files changed, 997 insertions, 860 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8baca3c4871..5369059c07a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,7 +25,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
+obj-y += syscall_$(BITS).o
+obj-$(CONFIG_X86_64) += vsyscall_64.o
obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
@@ -80,6 +81,7 @@ obj-$(CONFIG_APB_TIMER) += apb_timer.o
obj-$(CONFIG_AMD_NB) += amd_nb.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
+obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 013c1810ce7..be16854591c 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -119,6 +119,37 @@ bool __init early_is_amd_nb(u32 device)
return false;
}
+struct resource *amd_get_mmconfig_range(struct resource *res)
+{
+ u32 address;
+ u64 base, msr;
+ unsigned segn_busn_bits;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return NULL;
+
+ /* assume all cpus from fam10h have mmconfig */
+ if (boot_cpu_data.x86 < 0x10)
+ return NULL;
+
+ address = MSR_FAM10H_MMIO_CONF_BASE;
+ rdmsrl(address, msr);
+
+ /* mmconfig is not enabled */
+ if (!(msr & FAM10H_MMIO_CONF_ENABLE))
+ return NULL;
+
+ base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
+
+ segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
+ FAM10H_MMIO_CONF_BUSRANGE_MASK;
+
+ res->flags = IORESOURCE_MEM;
+ res->start = base;
+ res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
+ return res;
+}
+
int amd_get_subcaches(int cpu)
{
struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index a46bd383953..f76623cbe26 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -383,21 +383,21 @@ static int ignore_sys_suspend;
static int ignore_normal_resume;
static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
-static int debug __read_mostly;
-static int smp __read_mostly;
+static bool debug __read_mostly;
+static bool smp __read_mostly;
static int apm_disabled = -1;
#ifdef CONFIG_SMP
-static int power_off;
+static bool power_off;
#else
-static int power_off = 1;
+static bool power_off = 1;
#endif
-static int realmode_power_off;
+static bool realmode_power_off;
#ifdef CONFIG_APM_ALLOW_INTS
-static int allow_ints = 1;
+static bool allow_ints = 1;
#else
-static int allow_ints;
+static bool allow_ints;
#endif
-static int broken_psr;
+static bool broken_psr;
static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 4f13fafc526..68de2dc962e 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -67,4 +67,6 @@ void common(void) {
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
+ OFFSET(BP_pref_address, boot_params, hdr.pref_address);
+ OFFSET(BP_code32_start, boot_params, hdr.code32_start);
}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 395a10e6806..85d98ab15cd 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -3,6 +3,11 @@
#include <linux/lguest.h>
#include "../../../drivers/lguest/lg.h"
+#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_32.h>
+};
+
/* workaround for a warning with -Wmissing-prototypes */
void foo(void);
@@ -76,4 +81,7 @@ void foo(void)
OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
#endif
+ BLANK();
+ DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+ DEFINE(NR_syscalls, sizeof(syscalls));
}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72a1194af2..834e897b1e2 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,11 +1,12 @@
#include <asm/ia32.h>
-#define __NO_STUBS 1
-#undef __SYSCALL
-#undef _ASM_X86_UNISTD_64_H
-#define __SYSCALL(nr, sym) [nr] = 1,
-static char syscalls[] = {
-#include <asm/unistd.h>
+#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
+static char syscalls_64[] = {
+#include <asm/syscalls_64.h>
+};
+#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+static char syscalls_ia32[] = {
+#include <asm/syscalls_32.h>
};
int main(void)
@@ -72,7 +73,11 @@ int main(void)
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
BLANK();
- DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+ DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
+ DEFINE(NR_syscalls, sizeof(syscalls_64));
+
+ DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1);
+ DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
return 0;
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 850f2963a42..d43cad74f16 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1021,6 +1021,8 @@ __setup("clearcpuid=", setup_disablecpuid);
#ifdef CONFIG_X86_64
struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
+struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
+ (unsigned long) nmi_idt_table };
DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE);
@@ -1085,6 +1087,26 @@ unsigned long kernel_eflags;
*/
DEFINE_PER_CPU(struct orig_ist, orig_ist);
+static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
+DEFINE_PER_CPU(int, debug_stack_usage);
+
+int is_debug_stack(unsigned long addr)
+{
+ return __get_cpu_var(debug_stack_usage) ||
+ (addr <= __get_cpu_var(debug_stack_addr) &&
+ addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
+}
+
+void debug_stack_set_zero(void)
+{
+ load_idt((const struct desc_ptr *)&nmi_idt_descr);
+}
+
+void debug_stack_reset(void)
+{
+ load_idt((const struct desc_ptr *)&idt_descr);
+}
+
#else /* CONFIG_X86_64 */
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
@@ -1212,6 +1234,8 @@ void __cpuinit cpu_init(void)
estacks += exception_stack_sizes[v];
oist->ist[v] = t->x86_tss.ist[v] =
(unsigned long)estacks;
+ if (v == DEBUG_STACK-1)
+ per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
}
}
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index a3b0811693c..6b45e5e7a90 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -844,8 +844,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
#include <linux/kobject.h>
#include <linux/sysfs.h>
-
-extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
+#include <linux/cpu.h>
/* pointer to kobject for cpuX/cache */
static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
@@ -1073,9 +1072,9 @@ err_out:
static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
/* Add/Remove cache interface for CPU device */
-static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
+static int __cpuinit cache_add_dev(struct device *dev)
{
- unsigned int cpu = sys_dev->id;
+ unsigned int cpu = dev->id;
unsigned long i, j;
struct _index_kobject *this_object;
struct _cpuid4_info *this_leaf;
@@ -1087,7 +1086,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
&ktype_percpu_entry,
- &sys_dev->kobj, "%s", "cache");
+ &dev->kobj, "%s", "cache");
if (retval < 0) {
cpuid4_cache_sysfs_exit(cpu);
return retval;
@@ -1124,9 +1123,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
return 0;
}
-static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
+static void __cpuinit cache_remove_dev(struct device *dev)
{
- unsigned int cpu = sys_dev->id;
+ unsigned int cpu = dev->id;
unsigned long i;
if (per_cpu(ici_cpuid4_info, cpu) == NULL)
@@ -1145,17 +1144,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
+ struct device *dev;
- sys_dev = get_cpu_sysdev(cpu);
+ dev = get_cpu_device(cpu);
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- cache_add_dev(sys_dev);
+ cache_add_dev(dev);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- cache_remove_dev(sys_dev);
+ cache_remove_dev(dev);
break;
}
return NOTIFY_OK;
@@ -1174,9 +1173,9 @@ static int __cpuinit cache_sysfs_init(void)
for_each_online_cpu(i) {
int err;
- struct sys_device *sys_dev = get_cpu_sysdev(i);
+ struct device *dev = get_cpu_device(i);
- err = cache_add_dev(sys_dev);
+ err = cache_add_dev(dev);
if (err)
return err;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index fefcc69ee8b..ed44c8a6585 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,4 +1,4 @@
-#include <linux/sysdev.h>
+#include <linux/device.h>
#include <asm/mce.h>
enum severity_level {
@@ -17,7 +17,7 @@ enum severity_level {
struct mce_bank {
u64 ctl; /* subevents to enable */
unsigned char init; /* initialise bank? */
- struct sysdev_attribute attr; /* sysdev attribute */
+ struct device_attribute attr; /* device attribute */
char attrname[ATTR_LEN]; /* attribute name */
};
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index cbe82b5918c..5a11ae2e9e9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -19,7 +19,7 @@
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
@@ -1818,7 +1818,7 @@ static struct syscore_ops mce_syscore_ops = {
};
/*
- * mce_sysdev: Sysfs support
+ * mce_device: Sysfs support
*/
static void mce_cpu_restart(void *data)
@@ -1854,27 +1854,28 @@ static void mce_enable_ce(void *all)
__mcheck_cpu_init_timer();
}
-static struct sysdev_class mce_sysdev_class = {
+static struct bus_type mce_subsys = {
.name = "machinecheck",
+ .dev_name = "machinecheck",
};
-DEFINE_PER_CPU(struct sys_device, mce_sysdev);
+struct device *mce_device[CONFIG_NR_CPUS];
__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
-static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
+static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
return container_of(attr, struct mce_bank, attr);
}
-static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t show_bank(struct device *s, struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}
-static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_bank(struct device *s, struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
@@ -1889,14 +1890,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
}
static ssize_t
-show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
strcpy(buf, mce_helper);
strcat(buf, "\n");
return strlen(mce_helper) + 1;
}
-static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
const char *buf, size_t siz)
{
char *p;
@@ -1911,8 +1912,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
return strlen(mce_helper) + !!p;
}
-static ssize_t set_ignore_ce(struct sys_device *s,
- struct sysdev_attribute *attr,
+static ssize_t set_ignore_ce(struct device *s,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
@@ -1935,8 +1936,8 @@ static ssize_t set_ignore_ce(struct sys_device *s,
return size;
}
-static ssize_t set_cmci_disabled(struct sys_device *s,
- struct sysdev_attribute *attr,
+static ssize_t set_cmci_disabled(struct device *s,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
@@ -1958,108 +1959,117 @@ static ssize_t set_cmci_disabled(struct sys_device *s,
return size;
}
-static ssize_t store_int_with_restart(struct sys_device *s,
- struct sysdev_attribute *attr,
+static ssize_t store_int_with_restart(struct device *s,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
- ssize_t ret = sysdev_store_int(s, attr, buf, size);
+ ssize_t ret = device_store_int(s, attr, buf, size);
mce_restart();
return ret;
}
-static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
-static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
-static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
-static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
+static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
+static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
+static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
-static struct sysdev_ext_attribute attr_check_interval = {
- _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
- store_int_with_restart),
+static struct dev_ext_attribute dev_attr_check_interval = {
+ __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
&check_interval
};
-static struct sysdev_ext_attribute attr_ignore_ce = {
- _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
+static struct dev_ext_attribute dev_attr_ignore_ce = {
+ __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
&mce_ignore_ce
};
-static struct sysdev_ext_attribute attr_cmci_disabled = {
- _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
+static struct dev_ext_attribute dev_attr_cmci_disabled = {
+ __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
&mce_cmci_disabled
};
-static struct sysdev_attribute *mce_sysdev_attrs[] = {
- &attr_tolerant.attr,
- &attr_check_interval.attr,
- &attr_trigger,
- &attr_monarch_timeout.attr,
- &attr_dont_log_ce.attr,
- &attr_ignore_ce.attr,
- &attr_cmci_disabled.attr,
+static struct device_attribute *mce_device_attrs[] = {
+ &dev_attr_tolerant.attr,
+ &dev_attr_check_interval.attr,
+ &dev_attr_trigger,
+ &dev_attr_monarch_timeout.attr,
+ &dev_attr_dont_log_ce.attr,
+ &dev_attr_ignore_ce.attr,
+ &dev_attr_cmci_disabled.attr,
NULL
};
-static cpumask_var_t mce_sysdev_initialized;
+static cpumask_var_t mce_device_initialized;
-/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_sysdev_create(unsigned int cpu)
+static void mce_device_release(struct device *dev)
{
- struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
+ kfree(dev);
+}
+
+/* Per cpu device init. All of the cpus still share the same ctrl bank: */
+static __cpuinit int mce_device_create(unsigned int cpu)
+{
+ struct device *dev;
int err;
int i, j;
if (!mce_available(&boot_cpu_data))
return -EIO;
- memset(&sysdev->kobj, 0, sizeof(struct kobject));
- sysdev->id = cpu;
- sysdev->cls = &mce_sysdev_class;
+ dev = kzalloc(sizeof *dev, GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ dev->id = cpu;
+ dev->bus = &mce_subsys;
+ dev->release = &mce_device_release;
- err = sysdev_register(sysdev);
+ err = device_register(dev);
if (err)
return err;
- for (i = 0; mce_sysdev_attrs[i]; i++) {
- err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
+ for (i = 0; mce_device_attrs[i]; i++) {
+ err = device_create_file(dev, mce_device_attrs[i]);
if (err)
goto error;
}
for (j = 0; j < banks; j++) {
- err = sysdev_create_file(sysdev, &mce_banks[j].attr);
+ err = device_create_file(dev, &mce_banks[j].attr);
if (err)
goto error2;
}
- cpumask_set_cpu(cpu, mce_sysdev_initialized);
+ cpumask_set_cpu(cpu, mce_device_initialized);
+ mce_device[cpu] = dev;
return 0;
error2:
while (--j >= 0)
- sysdev_remove_file(sysdev, &mce_banks[j].attr);
+ device_remove_file(dev, &mce_banks[j].attr);
error:
while (--i >= 0)
- sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
+ device_remove_file(dev, mce_device_attrs[i]);
- sysdev_unregister(sysdev);
+ device_unregister(dev);
return err;
}
-static __cpuinit void mce_sysdev_remove(unsigned int cpu)
+static __cpuinit void mce_device_remove(unsigned int cpu)
{
- struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
+ struct device *dev = mce_device[cpu];
int i;
- if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
+ if (!cpumask_test_cpu(cpu, mce_device_initialized))
return;
- for (i = 0; mce_sysdev_attrs[i]; i++)
- sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
+ for (i = 0; mce_device_attrs[i]; i++)
+ device_remove_file(dev, mce_device_attrs[i]);
for (i = 0; i < banks; i++)
- sysdev_remove_file(sysdev, &mce_banks[i].attr);
+ device_remove_file(dev, &mce_banks[i].attr);
- sysdev_unregister(sysdev);
- cpumask_clear_cpu(cpu, mce_sysdev_initialized);
+ device_unregister(dev);
+ cpumask_clear_cpu(cpu, mce_device_initialized);
+ mce_device[cpu] = NULL;
}
/* Make sure there are no machine checks on offlined CPUs. */
@@ -2109,7 +2119,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- mce_sysdev_create(cpu);
+ mce_device_create(cpu);
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
break;
@@ -2117,7 +2127,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DEAD_FROZEN:
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
- mce_sysdev_remove(cpu);
+ mce_device_remove(cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
@@ -2151,7 +2161,7 @@ static __init void mce_init_banks(void)
for (i = 0; i < banks; i++) {
struct mce_bank *b = &mce_banks[i];
- struct sysdev_attribute *a = &b->attr;
+ struct device_attribute *a = &b->attr;
sysfs_attr_init(&a->attr);
a->attr.name = b->attrname;
@@ -2171,16 +2181,16 @@ static __init int mcheck_init_device(void)
if (!mce_available(&boot_cpu_data))
return -EIO;
- zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
+ zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
mce_init_banks();
- err = sysdev_class_register(&mce_sysdev_class);
+ err = subsys_system_register(&mce_subsys, NULL);
if (err)
return err;
for_each_online_cpu(i) {
- err = mce_sysdev_create(i);
+ err = mce_device_create(i);
if (err)
return err;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 1d76872b6a4..786e76a8632 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -17,7 +17,6 @@
#include <linux/notifier.h>
#include <linux/kobject.h>
#include <linux/percpu.h>
-#include <linux/sysdev.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
@@ -524,6 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
{
int i, err = 0;
struct threshold_bank *b = NULL;
+ struct device *dev = mce_device[cpu];
char name[32];
sprintf(name, "threshold_bank%i", bank);
@@ -544,8 +544,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (!b)
goto out;
- err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
- b->kobj, name);
+ err = sysfs_create_link(&dev->kobj, b->kobj, name);
if (err)
goto out;
@@ -566,7 +565,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;
}
- b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
+ b->kobj = kobject_create_and_add(name, &dev->kobj);
if (!b->kobj)
goto out_free;
@@ -586,8 +585,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (i == cpu)
continue;
- err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
- b->kobj, name);
+ dev = mce_device[i];
+ if (dev)
+ err = sysfs_create_link(&dev->kobj,b->kobj, name);
if (err)
goto out;
@@ -650,6 +650,7 @@ static void deallocate_threshold_block(unsigned int cpu,
static void threshold_remove_bank(unsigned int cpu, int bank)
{
struct threshold_bank *b;
+ struct device *dev;
char name[32];
int i = 0;
@@ -664,7 +665,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
#ifdef CONFIG_SMP
/* sibling symlink */
if (shared_bank[bank] && b->blocks->cpu != cpu) {
- sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
+ sysfs_remove_link(&mce_device[cpu]->kobj, name);
per_cpu(threshold_banks, cpu)[bank] = NULL;
return;
@@ -676,7 +677,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
if (i == cpu)
continue;
- sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
+ dev = mce_device[i];
+ if (dev)
+ sysfs_remove_link(&dev->kobj, name);
per_cpu(threshold_banks, i)[bank] = NULL;
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 39c6089891e..67bb17a37a0 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -19,7 +19,6 @@
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/export.h>
-#include <linux/sysdev.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/smp.h>
@@ -69,16 +68,16 @@ static atomic_t therm_throt_en = ATOMIC_INIT(0);
static u32 lvtthmr_init __read_mostly;
#ifdef CONFIG_SYSFS
-#define define_therm_throt_sysdev_one_ro(_name) \
- static SYSDEV_ATTR(_name, 0444, \
- therm_throt_sysdev_show_##_name, \
+#define define_therm_throt_device_one_ro(_name) \
+ static DEVICE_ATTR(_name, 0444, \
+ therm_throt_device_show_##_name, \
NULL) \
-#define define_therm_throt_sysdev_show_func(event, name) \
+#define define_therm_throt_device_show_func(event, name) \
\
-static ssize_t therm_throt_sysdev_show_##event##_##name( \
- struct sys_device *dev, \
- struct sysdev_attribute *attr, \
+static ssize_t therm_throt_device_show_##event##_##name( \
+ struct device *dev, \
+ struct device_attribute *attr, \
char *buf) \
{ \
unsigned int cpu = dev->id; \
@@ -95,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name( \
return ret; \
}
-define_therm_throt_sysdev_show_func(core_throttle, count);
-define_therm_throt_sysdev_one_ro(core_throttle_count);
+define_therm_throt_device_show_func(core_throttle, count);
+define_therm_throt_device_one_ro(core_throttle_count);
-define_therm_throt_sysdev_show_func(core_power_limit, count);
-define_therm_throt_sysdev_one_ro(core_power_limit_count);
+define_therm_throt_device_show_func(core_power_limit, count);
+define_therm_throt_device_one_ro(core_power_limit_count);
-define_therm_throt_sysdev_show_func(package_throttle, count);
-define_therm_throt_sysdev_one_ro(package_throttle_count);
+define_therm_throt_device_show_func(package_throttle, count);
+define_therm_throt_device_one_ro(package_throttle_count);
-define_therm_throt_sysdev_show_func(package_power_limit, count);
-define_therm_throt_sysdev_one_ro(package_power_limit_count);
+define_therm_throt_device_show_func(package_power_limit, count);
+define_therm_throt_device_one_ro(package_power_limit_count);
static struct attribute *thermal_throttle_attrs[] = {
- &attr_core_throttle_count.attr,
+ &dev_attr_core_throttle_count.attr,
NULL
};
@@ -223,36 +222,36 @@ static int thresh_event_valid(int event)
#ifdef CONFIG_SYSFS
/* Add/Remove thermal_throttle interface for CPU device: */
-static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
+static __cpuinit int thermal_throttle_add_dev(struct device *dev,
unsigned int cpu)
{
int err;
struct cpuinfo_x86 *c = &cpu_data(cpu);
- err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
+ err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
if (err)
return err;
if (cpu_has(c, X86_FEATURE_PLN))
- err = sysfs_add_file_to_group(&sys_dev->kobj,
- &attr_core_power_limit_count.attr,
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_core_power_limit_count.attr,
thermal_attr_group.name);
if (cpu_has(c, X86_FEATURE_PTS)) {
- err = sysfs_add_file_to_group(&sys_dev->kobj,
- &attr_package_throttle_count.attr,
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_throttle_count.attr,
thermal_attr_group.name);
if (cpu_has(c, X86_FEATURE_PLN))
- err = sysfs_add_file_to_group(&sys_dev->kobj,
- &attr_package_power_limit_count.attr,
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_power_limit_count.attr,
thermal_attr_group.name);
}
return err;
}
-static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
+static __cpuinit void thermal_throttle_remove_dev(struct device *dev)
{
- sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
+ sysfs_remove_group(&dev->kobj, &thermal_attr_group);
}
/* Mutex protecting device creation against CPU hotplug: */
@@ -265,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
+ struct device *dev;
int err = 0;
- sys_dev = get_cpu_sysdev(cpu);
+ dev = get_cpu_device(cpu);
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
mutex_lock(&therm_cpu_lock);
- err = thermal_throttle_add_dev(sys_dev, cpu);
+ err = thermal_throttle_add_dev(dev, cpu);
mutex_unlock(&therm_cpu_lock);
WARN_ON(err);
break;
@@ -283,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
case CPU_DEAD:
case CPU_DEAD_FROZEN:
mutex_lock(&therm_cpu_lock);
- thermal_throttle_remove_dev(sys_dev);
+ thermal_throttle_remove_dev(dev);
mutex_unlock(&therm_cpu_lock);
break;
}
@@ -310,7 +309,7 @@ static __init int thermal_throttle_init_device(void)
#endif
/* connect live CPUs to sysfs */
for_each_online_cpu(cpu) {
- err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu);
+ err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
WARN_ON(err);
}
#ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 212a6a42527..a524353d93f 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
.notifier_call = cpuid_class_cpu_callback,
};
-static char *cpuid_devnode(struct device *dev, mode_t *mode)
+static char *cpuid_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 8071e2f3d6e..62d61e9976e 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -19,6 +19,7 @@
#include <linux/acpi.h>
#include <linux/firmware-map.h>
#include <linux/memblock.h>
+#include <linux/sort.h>
#include <asm/e820.h>
#include <asm/proto.h>
@@ -227,22 +228,38 @@ void __init e820_print_map(char *who)
* ____________________33__
* ______________________4_
*/
+struct change_member {
+ struct e820entry *pbios; /* pointer to original bios entry */
+ unsigned long long addr; /* address for this change point */
+};
+
+static int __init cpcompare(const void *a, const void *b)
+{
+ struct change_member * const *app = a, * const *bpp = b;
+ const struct change_member *ap = *app, *bp = *bpp;
+
+ /*
+ * Inputs are pointers to two elements of change_point[]. If their
+ * addresses are unequal, their difference dominates. If the addresses
+ * are equal, then consider one that represents the end of its region
+ * to be greater than one that does not.
+ */
+ if (ap->addr != bp->addr)
+ return ap->addr > bp->addr ? 1 : -1;
+
+ return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr);
+}
int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
u32 *pnr_map)
{
- struct change_member {
- struct e820entry *pbios; /* pointer to original bios entry */
- unsigned long long addr; /* address for this change point */
- };
static struct change_member change_point_list[2*E820_X_MAX] __initdata;
static struct change_member *change_point[2*E820_X_MAX] __initdata;
static struct e820entry *overlap_list[E820_X_MAX] __initdata;
static struct e820entry new_bios[E820_X_MAX] __initdata;
- struct change_member *change_tmp;
unsigned long current_type, last_type;
unsigned long long last_addr;
- int chgidx, still_changing;
+ int chgidx;
int overlap_entries;
int new_bios_entry;
int old_nr, new_nr, chg_nr;
@@ -279,35 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
chg_nr = chgidx;
/* sort change-point list by memory addresses (low -> high) */
- still_changing = 1;
- while (still_changing) {
- still_changing = 0;
- for (i = 1; i < chg_nr; i++) {
- unsigned long long curaddr, lastaddr;
- unsigned long long curpbaddr, lastpbaddr;
-
- curaddr = change_point[i]->addr;
- lastaddr = change_point[i - 1]->addr;
- curpbaddr = change_point[i]->pbios->addr;
- lastpbaddr = change_point[i - 1]->pbios->addr;
-
- /*
- * swap entries, when:
- *
- * curaddr > lastaddr or
- * curaddr == lastaddr and curaddr == curpbaddr and
- * lastaddr != lastpbaddr
- */
- if (curaddr < lastaddr ||
- (curaddr == lastaddr && curaddr == curpbaddr &&
- lastaddr != lastpbaddr)) {
- change_tmp = change_point[i];
- change_point[i] = change_point[i-1];
- change_point[i-1] = change_tmp;
- still_changing = 1;
- }
- }
- }
+ sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);
/* create a new bios memory map, removing overlaps */
overlap_entries = 0; /* number of entries in the overlap table */
@@ -714,7 +703,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
}
#endif
-#ifdef CONFIG_HIBERNATION
+#ifdef CONFIG_ACPI
/**
* Mark ACPI NVS memory region, so that we can save/restore it during
* hibernation and the subsequent resume.
@@ -727,7 +716,7 @@ static int __init e820_mark_nvs_memory(void)
struct e820entry *ei = &e820.map[i];
if (ei->type == E820_NVS)
- suspend_nvs_register(ei->addr, ei->size);
+ acpi_nvs_register(ei->addr, ei->size);
}
return 0;
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index cd28a350f7f..9b9f18b4991 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -240,14 +240,14 @@ static int __init setup_early_printk(char *buf)
if (!strncmp(buf, "xen", 3))
early_console_register(&xenboot_console, keep);
#endif
-#ifdef CONFIG_EARLY_PRINTK_MRST
+#ifdef CONFIG_EARLY_PRINTK_INTEL_MID
if (!strncmp(buf, "mrst", 4)) {
mrst_early_console_init();
early_console_register(&early_mrst_console, keep);
}
if (!strncmp(buf, "hsu", 3)) {
- hsu_early_console_init();
+ hsu_early_console_init(buf + 3);
early_console_register(&early_hsu_console, keep);
}
#endif
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 22d0e21b4dd..79d97e68f04 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -42,6 +42,7 @@
*/
#include <linux/linkage.h>
+#include <linux/err.h>
#include <asm/thread_info.h>
#include <asm/irqflags.h>
#include <asm/errno.h>
@@ -81,8 +82,6 @@
* enough to patch inline, increasing performance.
*/
-#define nr_syscalls ((syscall_table_size)/4)
-
#ifdef CONFIG_PREEMPT
#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
@@ -423,7 +422,7 @@ sysenter_past_esp:
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz sysenter_audit
sysenter_do_call:
- cmpl $(nr_syscalls), %eax
+ cmpl $(NR_syscalls), %eax
jae syscall_badsys
call *sys_call_table(,%eax,4)
movl %eax,PT_EAX(%esp)
@@ -455,7 +454,7 @@ sysenter_audit:
movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
movl %eax,%edx /* 2nd arg: syscall number */
movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
- call audit_syscall_entry
+ call __audit_syscall_entry
pushl_cfi %ebx
movl PT_EAX(%esp),%eax /* reload syscall number */
jmp sysenter_do_call
@@ -466,11 +465,10 @@ sysexit_audit:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
movl %eax,%edx /* second arg, syscall return value */
- cmpl $0,%eax /* is it < 0? */
- setl %al /* 1 if so, 0 if not */
+ cmpl $-MAX_ERRNO,%eax /* is it an error ? */
+ setbe %al /* 1 if so, 0 if not */
movzbl %al,%eax /* zero-extend that */
- inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
- call audit_syscall_exit
+ call __audit_syscall_exit
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
@@ -504,7 +502,7 @@ ENTRY(system_call)
# system call tracing in operation / emulation
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz syscall_trace_entry
- cmpl $(nr_syscalls), %eax
+ cmpl $(NR_syscalls), %eax
jae syscall_badsys
syscall_call:
call *sys_call_table(,%eax,4)
@@ -654,7 +652,7 @@ syscall_trace_entry:
movl %esp, %eax
call syscall_trace_enter
/* What it returned is what we'll actually use. */
- cmpl $(nr_syscalls), %eax
+ cmpl $(NR_syscalls), %eax
jnae syscall_call
jmp syscall_exit
END(syscall_trace_entry)
@@ -694,29 +692,28 @@ END(syscall_badsys)
* System calls that need a pt_regs pointer.
*/
#define PTREGSCALL0(name) \
- ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
leal 4(%esp),%eax; \
- jmp sys_##name;
+ jmp sys_##name; \
+ENDPROC(ptregs_##name)
#define PTREGSCALL1(name) \
- ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
leal 4(%esp),%edx; \
movl (PT_EBX+4)(%esp),%eax; \
- jmp sys_##name;
+ jmp sys_##name; \
+ENDPROC(ptregs_##name)
#define PTREGSCALL2(name) \
- ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
leal 4(%esp),%ecx; \
movl (PT_ECX+4)(%esp),%edx; \
movl (PT_EBX+4)(%esp),%eax; \
- jmp sys_##name;
+ jmp sys_##name; \
+ENDPROC(ptregs_##name)
#define PTREGSCALL3(name) \
- ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
CFI_STARTPROC; \
leal 4(%esp),%eax; \
pushl_cfi %eax; \
@@ -741,8 +738,7 @@ PTREGSCALL2(vm86)
PTREGSCALL1(vm86old)
/* Clone is an oddball. The 4th arg is in %edi */
- ALIGN;
-ptregs_clone:
+ENTRY(ptregs_clone)
CFI_STARTPROC
leal 4(%esp),%eax
pushl_cfi %eax
@@ -1213,11 +1209,6 @@ return_to_handler:
jmp *%ecx
#endif
-.section .rodata,"a"
-#include "syscall_table_32.S"
-
-syscall_table_size=(.-sys_call_table)
-
/*
* Some functions should be protected against kprobes
*/
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a20e1cb9dc8..3fe8239fd8f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -55,6 +55,7 @@
#include <asm/paravirt.h>
#include <asm/ftrace.h>
#include <asm/percpu.h>
+#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
@@ -548,7 +549,7 @@ badsys:
#ifdef CONFIG_AUDITSYSCALL
/*
* Fast path for syscall audit without full syscall trace.
- * We just call audit_syscall_entry() directly, and then
+ * We just call __audit_syscall_entry() directly, and then
* jump back to the normal fast path.
*/
auditsys:
@@ -558,22 +559,21 @@ auditsys:
movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
movq %rax,%rsi /* 2nd arg: syscall number */
movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
- call audit_syscall_entry
+ call __audit_syscall_entry
LOAD_ARGS 0 /* reload call-clobbered registers */
jmp system_call_fastpath
/*
- * Return fast path for syscall audit. Call audit_syscall_exit()
+ * Return fast path for syscall audit. Call __audit_syscall_exit()
* directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
* masked off.
*/
sysret_audit:
movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
- cmpq $0,%rsi /* is it < 0? */
- setl %al /* 1 if so, 0 if not */
+ cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
+ setbe %al /* 1 if so, 0 if not */
movzbl %al,%edi /* zero-extend that into %edi */
- inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
- call audit_syscall_exit
+ call __audit_syscall_exit
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
jmp sysret_check
#endif /* CONFIG_AUDITSYSCALL */
@@ -1480,62 +1480,214 @@ ENTRY(error_exit)
CFI_ENDPROC
END(error_exit)
+/*
+ * Test if a given stack is an NMI stack or not.
+ */
+ .macro test_in_nmi reg stack nmi_ret normal_ret
+ cmpq %\reg, \stack
+ ja \normal_ret
+ subq $EXCEPTION_STKSZ, %\reg
+ cmpq %\reg, \stack
+ jb \normal_ret
+ jmp \nmi_ret
+ .endm
/* runs on exception stack */
ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1
+ /*
+ * We allow breakpoints in NMIs. If a breakpoint occurs, then
+ * the iretq it performs will take us out of NMI context.
+ * This means that we can have nested NMIs where the next
+ * NMI is using the top of the stack of the previous NMI. We
+ * can't let it execute because the nested NMI will corrupt the
+ * stack of the previous NMI. NMI handlers are not re-entrant
+ * anyway.
+ *
+ * To handle this case we do the following:
+ * Check the a special location on the stack that contains
+ * a variable that is set when NMIs are executing.
+ * The interrupted task's stack is also checked to see if it
+ * is an NMI stack.
+ * If the variable is not set and the stack is not the NMI
+ * stack then:
+ * o Set the special variable on the stack
+ * o Copy the interrupt frame into a "saved" location on the stack
+ * o Copy the interrupt frame into a "copy" location on the stack
+ * o Continue processing the NMI
+ * If the variable is set or the previous stack is the NMI stack:
+ * o Modify the "copy" location to jump to the repeate_nmi
+ * o return back to the first NMI
+ *
+ * Now on exit of the first NMI, we first clear the stack variable
+ * The NMI stack will tell any nested NMIs at that point that it is
+ * nested. Then we pop the stack normally with iret, and if there was
+ * a nested NMI that updated the copy interrupt stack frame, a
+ * jump will be made to the repeat_nmi code that will handle the second
+ * NMI.
+ */
+
+ /* Use %rdx as out temp variable throughout */
+ pushq_cfi %rdx
+
+ /*
+ * Check the special variable on the stack to see if NMIs are
+ * executing.
+ */
+ cmp $1, -8(%rsp)
+ je nested_nmi
+
+ /*
+ * Now test if the previous stack was an NMI stack.
+ * We need the double check. We check the NMI stack to satisfy the
+ * race when the first NMI clears the variable before returning.
+ * We check the variable because the first NMI could be in a
+ * breakpoint routine using a breakpoint stack.
+ */
+ lea 6*8(%rsp), %rdx
+ test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+
+nested_nmi:
+ /*
+ * Do nothing if we interrupted the fixup in repeat_nmi.
+ * It's about to repeat the NMI handler, so we are fine
+ * with ignoring this one.
+ */
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+
+1:
+ /* Set up the interrupted NMIs stack to jump to repeat_nmi */
+ leaq -6*8(%rsp), %rdx
+ movq %rdx, %rsp
+ CFI_ADJUST_CFA_OFFSET 6*8
+ pushq_cfi $__KERNEL_DS
+ pushq_cfi %rdx
+ pushfq_cfi
+ pushq_cfi $__KERNEL_CS
+ pushq_cfi $repeat_nmi
+
+ /* Put stack back */
+ addq $(11*8), %rsp
+ CFI_ADJUST_CFA_OFFSET -11*8
+
+nested_nmi_out:
+ popq_cfi %rdx
+
+ /* No need to check faults here */
+ INTERRUPT_RETURN
+
+first_nmi:
+ /*
+ * Because nested NMIs will use the pushed location that we
+ * stored in rdx, we must keep that space available.
+ * Here's what our stack frame will look like:
+ * +-------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +-------------------------+
+ * | temp storage for rdx |
+ * +-------------------------+
+ * | NMI executing variable |
+ * +-------------------------+
+ * | Saved SS |
+ * | Saved Return RSP |
+ * | Saved RFLAGS |
+ * | Saved CS |
+ * | Saved RIP |
+ * +-------------------------+
+ * | copied SS |
+ * | copied Return RSP |
+ * | copied RFLAGS |
+ * | copied CS |
+ * | copied RIP |
+ * +-------------------------+
+ * | pt_regs |
+ * +-------------------------+
+ *
+ * The saved RIP is used to fix up the copied RIP that a nested
+ * NMI may zero out. The original stack frame and the temp storage
+ * is also used by nested NMIs and can not be trusted on exit.
+ */
+ /* Set the NMI executing variable on the stack. */
+ pushq_cfi $1
+
+ /* Copy the stack frame to the Saved frame */
+ .rept 5
+ pushq_cfi 6*8(%rsp)
+ .endr
+
+ /* Make another copy, this one may be modified by nested NMIs */
+ .rept 5
+ pushq_cfi 4*8(%rsp)
+ .endr
+
+ /* Do not pop rdx, nested NMIs will corrupt it */
+ movq 11*8(%rsp), %rdx
+
+ /*
+ * Everything below this point can be preempted by a nested
+ * NMI if the first NMI took an exception. Repeated NMIs
+ * caused by an exception and nested NMI will start here, and
+ * can still be preempted by another NMI.
+ */
+restart_nmi:
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
subq $ORIG_RAX-R15, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+ /*
+ * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+ * as we should not be calling schedule in NMI context.
+ * Even with normal interrupts enabled. An NMI should not be
+ * setting NEED_RESCHED or anything that normal interrupts and
+ * exceptions might do.
+ */
call save_paranoid
DEFAULT_FRAME 0
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp,%rdi
movq $-1,%rsi
call do_nmi
-#ifdef CONFIG_TRACE_IRQFLAGS
- /* paranoidexit; without TRACE_IRQS_OFF */
- /* ebx: no swapgs flag */
- DISABLE_INTERRUPTS(CLBR_NONE)
testl %ebx,%ebx /* swapgs needed? */
jnz nmi_restore
- testl $3,CS(%rsp)
- jnz nmi_userspace
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_ALL 8
+ /* Clear the NMI executing stack variable */
+ movq $0, 10*8(%rsp)
jmp irq_return
-nmi_userspace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz nmi_swapgs
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz nmi_schedule
- movl %ebx,%edx /* arg3: thread flags */
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- jmp nmi_userspace
-nmi_schedule:
- ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
- DISABLE_INTERRUPTS(CLBR_ANY)
- jmp nmi_userspace
- CFI_ENDPROC
-#else
- jmp paranoid_exit
CFI_ENDPROC
-#endif
END(nmi)
+ /*
+ * If an NMI hit an iret because of an exception or breakpoint,
+ * it can lose its NMI context, and a nested NMI may come in.
+ * In that case, the nested NMI will change the preempted NMI's
+ * stack to jump to here when it does the final iret.
+ */
+repeat_nmi:
+ INTR_FRAME
+ /* Update the stack variable to say we are still in NMI */
+ movq $1, 5*8(%rsp)
+
+ /* copy the saved stack back to copy stack */
+ .rept 5
+ pushq_cfi 4*8(%rsp)
+ .endr
+
+ jmp restart_nmi
+ CFI_ENDPROC
+end_repeat_nmi:
+
ENTRY(ignore_sysret)
CFI_STARTPROC
mov $-ENOSYS,%eax
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e11e39478a4..40f4eb3766d 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -417,6 +417,10 @@ ENTRY(phys_base)
ENTRY(idt_table)
.skip IDT_ENTRIES * 16
+ .align L1_CACHE_BYTES
+ENTRY(nmi_idt_table)
+ .skip IDT_ENTRIES * 16
+
__PAGE_ALIGNED_BSS
.align PAGE_SIZE
ENTRY(empty_zero_page)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 07b0a56a754..ad0de0c2714 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -2,7 +2,6 @@
#include <linux/clockchips.h>
#include <linux/interrupt.h>
#include <linux/export.h>
-#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/i8253.h>
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 72090705a65..40fc86161d9 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -28,6 +28,9 @@ DEFINE_PER_CPU(struct pt_regs *, irq_regs);
EXPORT_PER_CPU_SYMBOL(irq_regs);
#ifdef CONFIG_DEBUG_STACKOVERFLOW
+
+int sysctl_panic_on_stackoverflow __read_mostly;
+
/* Debugging check for stack overflow: is there less than 1KB free? */
static int check_stack_overflow(void)
{
@@ -43,6 +46,8 @@ static void print_stack_overflow(void)
{
printk(KERN_WARNING "low stack detected by irq handler\n");
dump_stack();
+ if (sysctl_panic_on_stackoverflow)
+ panic("low stack detected by irq handler - check messages\n");
}
#else
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 69bca468c47..d04d3ecded6 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -26,6 +26,8 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
DEFINE_PER_CPU(struct pt_regs *, irq_regs);
EXPORT_PER_CPU_SYMBOL(irq_regs);
+int sysctl_panic_on_stackoverflow;
+
/*
* Probabilistic stack overflow check:
*
@@ -36,18 +38,39 @@ EXPORT_PER_CPU_SYMBOL(irq_regs);
static inline void stack_overflow_check(struct pt_regs *regs)
{
#ifdef CONFIG_DEBUG_STACKOVERFLOW
+#define STACK_TOP_MARGIN 128
+ struct orig_ist *oist;
+ u64 irq_stack_top, irq_stack_bottom;
+ u64 estack_top, estack_bottom;
u64 curbase = (u64)task_stack_page(current);
if (user_mode_vm(regs))
return;
- WARN_ONCE(regs->sp >= curbase &&
- regs->sp <= curbase + THREAD_SIZE &&
- regs->sp < curbase + sizeof(struct thread_info) +
- sizeof(struct pt_regs) + 128,
+ if (regs->sp >= curbase + sizeof(struct thread_info) +
+ sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
+ regs->sp <= curbase + THREAD_SIZE)
+ return;
+
+ irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) +
+ STACK_TOP_MARGIN;
+ irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr);
+ if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
+ return;
+
+ oist = &__get_cpu_var(orig_ist);
+ estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
+ estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
+ if (regs->sp >= estack_top && regs->sp <= estack_bottom)
+ return;
+
+ WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
+ current->comm, curbase, regs->sp,
+ irq_stack_top, irq_stack_bottom,
+ estack_top, estack_bottom);
- "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
- current->comm, curbase, regs->sp);
+ if (sysctl_panic_on_stackoverflow)
+ panic("low stack detected by irq handler - check messages\n");
#endif
}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index b3300e6bace..313fb5cddbc 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -9,7 +9,7 @@
#include <linux/kprobes.h>
#include <linux/init.h>
#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
#include <linux/bitops.h>
#include <linux/acpi.h>
#include <linux/io.h>
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a9c2116001d..f0c6fd6f176 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,8 +39,6 @@
#include <asm/desc.h>
#include <asm/tlbflush.h>
-#define MMU_QUEUE_SIZE 1024
-
static int kvmapf = 1;
static int parse_no_kvmapf(char *arg)
@@ -60,21 +58,10 @@ static int parse_no_stealacc(char *arg)
early_param("no-steal-acc", parse_no_stealacc);
-struct kvm_para_state {
- u8 mmu_queue[MMU_QUEUE_SIZE];
- int mmu_queue_len;
-};
-
-static DEFINE_PER_CPU(struct kvm_para_state, para_state);
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
static int has_steal_clock = 0;
-static struct kvm_para_state *kvm_para_state(void)
-{
- return &per_cpu(para_state, raw_smp_processor_id());
-}
-
/*
* No need for any "IO delay" on KVM
*/
@@ -271,151 +258,6 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
}
}
-static void kvm_mmu_op(void *buffer, unsigned len)
-{
- int r;
- unsigned long a1, a2;
-
- do {
- a1 = __pa(buffer);
- a2 = 0; /* on i386 __pa() always returns <4G */
- r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
- buffer += r;
- len -= r;
- } while (len);
-}
-
-static void mmu_queue_flush(struct kvm_para_state *state)
-{
- if (state->mmu_queue_len) {
- kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
- state->mmu_queue_len = 0;
- }
-}
-
-static void kvm_deferred_mmu_op(void *buffer, int len)
-{
- struct kvm_para_state *state = kvm_para_state();
-
- if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
- kvm_mmu_op(buffer, len);
- return;
- }
- if (state->mmu_queue_len + len > sizeof state->mmu_queue)
- mmu_queue_flush(state);
- memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
- state->mmu_queue_len += len;
-}
-
-static void kvm_mmu_write(void *dest, u64 val)
-{
- __u64 pte_phys;
- struct kvm_mmu_op_write_pte wpte;
-
-#ifdef CONFIG_HIGHPTE
- struct page *page;
- unsigned long dst = (unsigned long) dest;
-
- page = kmap_atomic_to_page(dest);
- pte_phys = page_to_pfn(page);
- pte_phys <<= PAGE_SHIFT;
- pte_phys += (dst & ~(PAGE_MASK));
-#else
- pte_phys = (unsigned long)__pa(dest);
-#endif
- wpte.header.op = KVM_MMU_OP_WRITE_PTE;
- wpte.pte_val = val;
- wpte.pte_phys = pte_phys;
-
- kvm_deferred_mmu_op(&wpte, sizeof wpte);
-}
-
-/*
- * We only need to hook operations that are MMU writes. We hook these so that
- * we can use lazy MMU mode to batch these operations. We could probably
- * improve the performance of the host code if we used some of the information
- * here to simplify processing of batched writes.
- */
-static void kvm_set_pte(pte_t *ptep, pte_t pte)
-{
- kvm_mmu_write(ptep, pte_val(pte));
-}
-
-static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- kvm_mmu_write(ptep, pte_val(pte));
-}
-
-static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
-{
- kvm_mmu_write(pmdp, pmd_val(pmd));
-}
-
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
-static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
- kvm_mmu_write(ptep, pte_val(pte));
-}
-
-static void kvm_pte_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- kvm_mmu_write(ptep, 0);
-}
-
-static void kvm_pmd_clear(pmd_t *pmdp)
-{
- kvm_mmu_write(pmdp, 0);
-}
-#endif
-
-static void kvm_set_pud(pud_t *pudp, pud_t pud)
-{
- kvm_mmu_write(pudp, pud_val(pud));
-}
-
-#if PAGETABLE_LEVELS == 4
-static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
-{
- kvm_mmu_write(pgdp, pgd_val(pgd));
-}
-#endif
-#endif /* PAGETABLE_LEVELS >= 3 */
-
-static void kvm_flush_tlb(void)
-{
- struct kvm_mmu_op_flush_tlb ftlb = {
- .header.op = KVM_MMU_OP_FLUSH_TLB,
- };
-
- kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
-}
-
-static void kvm_release_pt(unsigned long pfn)
-{
- struct kvm_mmu_op_release_pt rpt = {
- .header.op = KVM_MMU_OP_RELEASE_PT,
- .pt_phys = (u64)pfn << PAGE_SHIFT,
- };
-
- kvm_mmu_op(&rpt, sizeof rpt);
-}
-
-static void kvm_enter_lazy_mmu(void)
-{
- paravirt_enter_lazy_mmu();
-}
-
-static void kvm_leave_lazy_mmu(void)
-{
- struct kvm_para_state *state = kvm_para_state();
-
- mmu_queue_flush(state);
- paravirt_leave_lazy_mmu();
-}
-
static void __init paravirt_ops_setup(void)
{
pv_info.name = "KVM";
@@ -424,29 +266,6 @@ static void __init paravirt_ops_setup(void)
if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
pv_cpu_ops.io_delay = kvm_io_delay;
- if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
- pv_mmu_ops.set_pte = kvm_set_pte;
- pv_mmu_ops.set_pte_at = kvm_set_pte_at;
- pv_mmu_ops.set_pmd = kvm_set_pmd;
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
- pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
- pv_mmu_ops.pte_clear = kvm_pte_clear;
- pv_mmu_ops.pmd_clear = kvm_pmd_clear;
-#endif
- pv_mmu_ops.set_pud = kvm_set_pud;
-#if PAGETABLE_LEVELS == 4
- pv_mmu_ops.set_pgd = kvm_set_pgd;
-#endif
-#endif
- pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
- pv_mmu_ops.release_pte = kvm_release_pt;
- pv_mmu_ops.release_pmd = kvm_release_pt;
- pv_mmu_ops.release_pud = kvm_release_pt;
-
- pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
- pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
- }
#ifdef CONFIG_X86_IO_APIC
no_timer_check = 1;
#endif
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 9302e2d0eb4..fda91c30710 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -292,8 +292,8 @@ static int reload_for_cpu(int cpu)
return err;
}
-static ssize_t reload_store(struct sys_device *dev,
- struct sysdev_attribute *attr,
+static ssize_t reload_store(struct device *dev,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
unsigned long val;
@@ -318,30 +318,30 @@ static ssize_t reload_store(struct sys_device *dev,
return ret;
}
-static ssize_t version_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
+static ssize_t version_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
}
-static ssize_t pf_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
+static ssize_t pf_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
}
-static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
-static SYSDEV_ATTR(version, 0400, version_show, NULL);
-static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
+static DEVICE_ATTR(reload, 0200, NULL, reload_store);
+static DEVICE_ATTR(version, 0400, version_show, NULL);
+static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
static struct attribute *mc_default_attrs[] = {
- &attr_reload.attr,
- &attr_version.attr,
- &attr_processor_flags.attr,
+ &dev_attr_reload.attr,
+ &dev_attr_version.attr,
+ &dev_attr_processor_flags.attr,
NULL
};
@@ -405,43 +405,45 @@ static enum ucode_state microcode_update_cpu(int cpu)
return ustate;
}
-static int mc_sysdev_add(struct sys_device *sys_dev)
+static int mc_device_add(struct device *dev, struct subsys_interface *sif)
{
- int err, cpu = sys_dev->id;
+ int err, cpu = dev->id;
if (!cpu_online(cpu))
return 0;
pr_debug("CPU%d added\n", cpu);
- err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
+ err = sysfs_create_group(&dev->kobj, &mc_attr_group);
if (err)
return err;
if (microcode_init_cpu(cpu) == UCODE_ERROR) {
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
return -EINVAL;
}
return err;
}
-static int mc_sysdev_remove(struct sys_device *sys_dev)
+static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
{
- int cpu = sys_dev->id;
+ int cpu = dev->id;
if (!cpu_online(cpu))
return 0;
pr_debug("CPU%d removed\n", cpu);
microcode_fini_cpu(cpu);
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
return 0;
}
-static struct sysdev_driver mc_sysdev_driver = {
- .add = mc_sysdev_add,
- .remove = mc_sysdev_remove,
+static struct subsys_interface mc_cpu_interface = {
+ .name = "microcode",
+ .subsys = &cpu_subsys,
+ .add_dev = mc_device_add,
+ .remove_dev = mc_device_remove,
};
/**
@@ -464,9 +466,9 @@ static __cpuinit int
mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
+ struct device *dev;
- sys_dev = get_cpu_sysdev(cpu);
+ dev = get_cpu_device(cpu);
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
@@ -474,13 +476,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
pr_debug("CPU%d added\n", cpu);
- if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
+ if (sysfs_create_group(&dev->kobj, &mc_attr_group))
pr_err("Failed to create group for CPU%d\n", cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
/* Suspend is in progress, only remove the interface */
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
pr_debug("CPU%d removed\n", cpu);
break;
@@ -525,7 +527,7 @@ static int __init microcode_init(void)
get_online_cpus();
mutex_lock(&microcode_mutex);
- error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+ error = subsys_interface_register(&mc_cpu_interface);
mutex_unlock(&microcode_mutex);
put_online_cpus();
@@ -535,7 +537,7 @@ static int __init microcode_init(void)
error = microcode_dev_init();
if (error)
- goto out_sysdev_driver;
+ goto out_driver;
register_syscore_ops(&mc_syscore_ops);
register_hotcpu_notifier(&mc_cpu_notifier);
@@ -545,11 +547,11 @@ static int __init microcode_init(void)
return 0;
-out_sysdev_driver:
+out_driver:
get_online_cpus();
mutex_lock(&microcode_mutex);
- sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+ subsys_interface_unregister(&mc_cpu_interface);
mutex_unlock(&microcode_mutex);
put_online_cpus();
@@ -573,7 +575,7 @@ static void __exit microcode_exit(void)
get_online_cpus();
mutex_lock(&microcode_mutex);
- sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+ subsys_interface_unregister(&mc_cpu_interface);
mutex_unlock(&microcode_mutex);
put_online_cpus();
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 12fcbe2c143..96356762a51 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -236,7 +236,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
.notifier_call = msr_class_cpu_callback,
};
-static char *msr_devnode(struct device *dev, mode_t *mode)
+static char *msr_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index e88f37b58dd..47acaf31916 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -405,9 +405,108 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
unknown_nmi_error(reason, regs);
}
+/*
+ * NMIs can hit breakpoints which will cause it to lose its
+ * NMI context with the CPU when the breakpoint does an iret.
+ */
+#ifdef CONFIG_X86_32
+/*
+ * For i386, NMIs use the same stack as the kernel, and we can
+ * add a workaround to the iret problem in C. Simply have 3 states
+ * the NMI can be in.
+ *
+ * 1) not running
+ * 2) executing
+ * 3) latched
+ *
+ * When no NMI is in progress, it is in the "not running" state.
+ * When an NMI comes in, it goes into the "executing" state.
+ * Normally, if another NMI is triggered, it does not interrupt
+ * the running NMI and the HW will simply latch it so that when
+ * the first NMI finishes, it will restart the second NMI.
+ * (Note, the latch is binary, thus multiple NMIs triggering,
+ * when one is running, are ignored. Only one NMI is restarted.)
+ *
+ * If an NMI hits a breakpoint that executes an iret, another
+ * NMI can preempt it. We do not want to allow this new NMI
+ * to run, but we want to execute it when the first one finishes.
+ * We set the state to "latched", and the first NMI will perform
+ * an cmpxchg on the state, and if it doesn't successfully
+ * reset the state to "not running" it will restart the next
+ * NMI.
+ */
+enum nmi_states {
+ NMI_NOT_RUNNING,
+ NMI_EXECUTING,
+ NMI_LATCHED,
+};
+static DEFINE_PER_CPU(enum nmi_states, nmi_state);
+
+#define nmi_nesting_preprocess(regs) \
+ do { \
+ if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \
+ __get_cpu_var(nmi_state) = NMI_LATCHED; \
+ return; \
+ } \
+ nmi_restart: \
+ __get_cpu_var(nmi_state) = NMI_EXECUTING; \
+ } while (0)
+
+#define nmi_nesting_postprocess() \
+ do { \
+ if (cmpxchg(&__get_cpu_var(nmi_state), \
+ NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \
+ goto nmi_restart; \
+ } while (0)
+#else /* x86_64 */
+/*
+ * In x86_64 things are a bit more difficult. This has the same problem
+ * where an NMI hitting a breakpoint that calls iret will remove the
+ * NMI context, allowing a nested NMI to enter. What makes this more
+ * difficult is that both NMIs and breakpoints have their own stack.
+ * When a new NMI or breakpoint is executed, the stack is set to a fixed
+ * point. If an NMI is nested, it will have its stack set at that same
+ * fixed address that the first NMI had, and will start corrupting the
+ * stack. This is handled in entry_64.S, but the same problem exists with
+ * the breakpoint stack.
+ *
+ * If a breakpoint is being processed, and the debug stack is being used,
+ * if an NMI comes in and also hits a breakpoint, the stack pointer
+ * will be set to the same fixed address as the breakpoint that was
+ * interrupted, causing that stack to be corrupted. To handle this case,
+ * check if the stack that was interrupted is the debug stack, and if
+ * so, change the IDT so that new breakpoints will use the current stack
+ * and not switch to the fixed address. On return of the NMI, switch back
+ * to the original IDT.
+ */
+static DEFINE_PER_CPU(int, update_debug_stack);
+
+static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+{
+ /*
+ * If we interrupted a breakpoint, it is possible that
+ * the nmi handler will have breakpoints too. We need to
+ * change the IDT such that breakpoints that happen here
+ * continue to use the NMI stack.
+ */
+ if (unlikely(is_debug_stack(regs->sp))) {
+ debug_stack_set_zero();
+ __get_cpu_var(update_debug_stack) = 1;
+ }
+}
+
+static inline void nmi_nesting_postprocess(void)
+{
+ if (unlikely(__get_cpu_var(update_debug_stack)))
+ debug_stack_reset();
+}
+#endif
+
dotraplinkage notrace __kprobes void
do_nmi(struct pt_regs *regs, long error_code)
{
+ nmi_nesting_preprocess(regs);
+
nmi_enter();
inc_irq_stat(__nmi_count);
@@ -416,6 +515,9 @@ do_nmi(struct pt_regs *regs, long error_code)
default_do_nmi(regs);
nmi_exit();
+
+ /* On i386, may loop back to preprocess */
+ nmi_nesting_postprocess();
}
void stop_nmi(void)
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
new file mode 100644
index 00000000000..0d01a8ea4e1
--- /dev/null
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -0,0 +1,180 @@
+/*
+ * arch/x86/kernel/nmi-selftest.c
+ *
+ * Testsuite for NMI: IPIs
+ *
+ * Started by Don Zickus:
+ * (using lib/locking-selftest.c as a guide)
+ *
+ * Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com>
+ */
+
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+
+#include <asm/apic.h>
+#include <asm/nmi.h>
+
+#define SUCCESS 0
+#define FAILURE 1
+#define TIMEOUT 2
+
+static int nmi_fail;
+
+/* check to see if NMI IPIs work on this machine */
+static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly;
+
+static int testcase_total;
+static int testcase_successes;
+static int expected_testcase_failures;
+static int unexpected_testcase_failures;
+static int unexpected_testcase_unknowns;
+
+static int nmi_unk_cb(unsigned int val, struct pt_regs *regs)
+{
+ unexpected_testcase_unknowns++;
+ return NMI_HANDLED;
+}
+
+static void init_nmi_testsuite(void)
+{
+ /* trap all the unknown NMIs we may generate */
+ register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
+}
+
+static void cleanup_nmi_testsuite(void)
+{
+ unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
+}
+
+static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask)))
+ return NMI_HANDLED;
+
+ return NMI_DONE;
+}
+
+static void test_nmi_ipi(struct cpumask *mask)
+{
+ unsigned long timeout;
+
+ if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
+ NMI_FLAG_FIRST, "nmi_selftest")) {
+ nmi_fail = FAILURE;
+ return;
+ }
+
+ /* sync above data before sending NMI */
+ wmb();
+
+ apic->send_IPI_mask(mask, NMI_VECTOR);
+
+ /* Don't wait longer than a second */
+ timeout = USEC_PER_SEC;
+ while (!cpumask_empty(mask) && timeout--)
+ udelay(1);
+
+ /* What happens if we timeout, do we still unregister?? */
+ unregister_nmi_handler(NMI_LOCAL, "nmi_selftest");
+
+ if (!timeout)
+ nmi_fail = TIMEOUT;
+ return;
+}
+
+static void remote_ipi(void)
+{
+ cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+ if (!cpumask_empty(to_cpumask(nmi_ipi_mask)))
+ test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void local_ipi(void)
+{
+ cpumask_clear(to_cpumask(nmi_ipi_mask));
+ cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+ test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void reset_nmi(void)
+{
+ nmi_fail = 0;
+}
+
+static void dotest(void (*testcase_fn)(void), int expected)
+{
+ testcase_fn();
+ /*
+ * Filter out expected failures:
+ */
+ if (nmi_fail != expected) {
+ unexpected_testcase_failures++;
+
+ if (nmi_fail == FAILURE)
+ printk("FAILED |");
+ else if (nmi_fail == TIMEOUT)
+ printk("TIMEOUT|");
+ else
+ printk("ERROR |");
+ dump_stack();
+ } else {
+ testcase_successes++;
+ printk(" ok |");
+ }
+ testcase_total++;
+
+ reset_nmi();
+}
+
+static inline void print_testname(const char *testname)
+{
+ printk("%12s:", testname);
+}
+
+void nmi_selftest(void)
+{
+ init_nmi_testsuite();
+
+ /*
+ * Run the testsuite:
+ */
+ printk("----------------\n");
+ printk("| NMI testsuite:\n");
+ printk("--------------------\n");
+
+ print_testname("remote IPI");
+ dotest(remote_ipi, SUCCESS);
+ printk("\n");
+ print_testname("local IPI");
+ dotest(local_ipi, SUCCESS);
+ printk("\n");
+
+ cleanup_nmi_testsuite();
+
+ if (unexpected_testcase_failures) {
+ printk("--------------------\n");
+ printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
+ unexpected_testcase_failures, testcase_total);
+ printk("-----------------------------------------------------------------\n");
+ } else if (expected_testcase_failures && testcase_successes) {
+ printk("--------------------\n");
+ printk("%3d out of %3d testcases failed, as expected. |\n",
+ expected_testcase_failures, testcase_total);
+ printk("----------------------------------------------------\n");
+ } else if (expected_testcase_failures && !testcase_successes) {
+ printk("--------------------\n");
+ printk("All %3d testcases failed, as expected. |\n",
+ expected_testcase_failures);
+ printk("----------------------------------------\n");
+ } else {
+ printk("--------------------\n");
+ printk("Good, all %3d testcases passed! |\n",
+ testcase_successes);
+ printk("---------------------------------\n");
+ }
+}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 80dc793b3f6..1c4d769e21e 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -45,6 +45,15 @@ int iommu_detected __read_mostly = 0;
*/
int iommu_pass_through __read_mostly;
+/*
+ * Group multi-function PCI devices into a single device-group for the
+ * iommu_device_group interface. This tells the iommu driver to pretend
+ * it cannot distinguish between functions of a device, exposing only one
+ * group for the device. Useful for disallowing use of individual PCI
+ * functions from userspace drivers.
+ */
+int iommu_group_mf __read_mostly;
+
extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
/* Dummy device used for NULL arguments (normally ISA). */
@@ -169,6 +178,8 @@ static __init int iommu_setup(char *p)
#endif
if (!strncmp(p, "pt", 2))
iommu_pass_through = 1;
+ if (!strncmp(p, "group_mf", 8))
+ iommu_group_mf = 1;
gart_parse_options(p);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 89a04c7b5bb..50267386b76 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs)
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_enter(regs, regs->orig_ax);
- if (unlikely(current->audit_context)) {
- if (IS_IA32)
- audit_syscall_entry(AUDIT_ARCH_I386,
- regs->orig_ax,
- regs->bx, regs->cx,
- regs->dx, regs->si);
+ if (IS_IA32)
+ audit_syscall_entry(AUDIT_ARCH_I386,
+ regs->orig_ax,
+ regs->bx, regs->cx,
+ regs->dx, regs->si);
#ifdef CONFIG_X86_64
- else
- audit_syscall_entry(AUDIT_ARCH_X86_64,
- regs->orig_ax,
- regs->di, regs->si,
- regs->dx, regs->r10);
+ else
+ audit_syscall_entry(AUDIT_ARCH_X86_64,
+ regs->orig_ax,
+ regs->di, regs->si,
+ regs->dx, regs->r10);
#endif
- }
return ret ?: regs->orig_ax;
}
@@ -1414,8 +1412,7 @@ void syscall_trace_leave(struct pt_regs *regs)
{
bool step;
- if (unlikely(current->audit_context))
- audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
+ audit_syscall_exit(regs);
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_exit(regs, regs->ax);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d05444ac2ae..d7d5099fe87 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -749,12 +749,7 @@ void __init setup_arch(char **cmdline_p)
#endif
#ifdef CONFIG_EFI
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-#ifdef CONFIG_X86_32
- "EL32",
-#else
- "EL64",
-#endif
- 4)) {
+ EFI_LOADER_SIGNATURE, 4)) {
efi_enabled = 1;
efi_memblock_x86_reserve_range();
}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 54ddaeb221c..46a01bdc27e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -682,7 +682,6 @@ static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
struct pt_regs *regs)
{
- sigset_t blocked;
int ret;
/* Are we from a system call? */
@@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
*/
regs->flags &= ~X86_EFLAGS_TF;
- sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
- if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&blocked, sig);
- set_current_blocked(&blocked);
+ block_sigmask(ka, sig);
tracehook_signal_handler(sig, info, ka, regs,
test_thread_flag(TIF_SINGLESTEP));
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 16204dc1548..66c74f481ca 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -29,6 +29,7 @@
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/apic.h>
+#include <asm/nmi.h>
/*
* Some notes on x86 processor bugs affecting SMP operation:
*
@@ -148,6 +149,60 @@ void native_send_call_func_ipi(const struct cpumask *mask)
free_cpumask_var(allbutself);
}
+static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+
+static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
+{
+ /* We are registered on stopping cpu too, avoid spurious NMI */
+ if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
+ return NMI_HANDLED;
+
+ stop_this_cpu(NULL);
+
+ return NMI_HANDLED;
+}
+
+static void native_nmi_stop_other_cpus(int wait)
+{
+ unsigned long flags;
+ unsigned long timeout;
+
+ if (reboot_force)
+ return;
+
+ /*
+ * Use an own vector here because smp_call_function
+ * does lots of things not suitable in a panic situation.
+ */
+ if (num_online_cpus() > 1) {
+ /* did someone beat us here? */
+ if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
+ return;
+
+ if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+ NMI_FLAG_FIRST, "smp_stop"))
+ /* Note: we ignore failures here */
+ return;
+
+ /* sync above data before sending NMI */
+ wmb();
+
+ apic->send_IPI_allbutself(NMI_VECTOR);
+
+ /*
+ * Don't wait longer than a second if the caller
+ * didn't ask us to wait.
+ */
+ timeout = USEC_PER_SEC;
+ while (num_online_cpus() > 1 && (wait || timeout--))
+ udelay(1);
+ }
+
+ local_irq_save(flags);
+ disable_local_APIC();
+ local_irq_restore(flags);
+}
+
/*
* this function calls the 'stop' function on all other CPUs in the system.
*/
@@ -160,7 +215,7 @@ asmlinkage void smp_reboot_interrupt(void)
irq_exit();
}
-static void native_stop_other_cpus(int wait)
+static void native_irq_stop_other_cpus(int wait)
{
unsigned long flags;
unsigned long timeout;
@@ -194,6 +249,11 @@ static void native_stop_other_cpus(int wait)
local_irq_restore(flags);
}
+static void native_smp_disable_nmi_ipi(void)
+{
+ smp_ops.stop_other_cpus = native_irq_stop_other_cpus;
+}
+
/*
* Reschedule call back.
*/
@@ -225,12 +285,20 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
irq_exit();
}
+static int __init nonmi_ipi_setup(char *str)
+{
+ native_smp_disable_nmi_ipi();
+ return 1;
+}
+
+__setup("nonmi_ipi", nonmi_ipi_setup);
+
struct smp_ops smp_ops = {
.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
.smp_prepare_cpus = native_smp_prepare_cpus,
.smp_cpus_done = native_smp_cpus_done,
- .stop_other_cpus = native_stop_other_cpus,
+ .stop_other_cpus = native_nmi_stop_other_cpus,
.smp_send_reschedule = native_smp_send_reschedule,
.cpu_up = native_cpu_up,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e38e21754ee..66d250c00d1 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -207,23 +207,29 @@ static void __cpuinit smp_callin(void)
* Need to setup vector mappings before we enable interrupts.
*/
setup_vector_irq(smp_processor_id());
+
+ /*
+ * Save our processor parameters. Note: this information
+ * is needed for clock calibration.
+ */
+ smp_store_cpu_info(cpuid);
+
/*
* Get our bogomips.
+ * Update loops_per_jiffy in cpu_data. Previous call to
+ * smp_store_cpu_info() stored a value that is close but not as
+ * accurate as the value just calculated.
*
* Need to enable IRQs because it can take longer and then
* the NMI watchdog might kill us.
*/
local_irq_enable();
calibrate_delay();
+ cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
local_irq_disable();
pr_debug("Stack at about %p\n", &cpuid);
/*
- * Save our processor parameters
- */
- smp_store_cpu_info(cpuid);
-
- /*
* This must be done before setting cpu_online_mask
* or calling notify_cpu_starting.
*/
@@ -1143,6 +1149,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
{
pr_debug("Boot done.\n");
+ nmi_selftest();
impress_friends();
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
new file mode 100644
index 00000000000..147fcd4941c
--- /dev/null
+++ b/arch/x86/kernel/syscall_32.c
@@ -0,0 +1,25 @@
+/* System call table for i386. */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <asm/asm-offsets.h>
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_32.h>
+#undef __SYSCALL_I386
+
+#define __SYSCALL_I386(nr, sym, compat) [nr] = sym,
+
+typedef asmlinkage void (*sys_call_ptr_t)(void);
+
+extern asmlinkage void sys_ni_syscall(void);
+
+const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
+ /*
+ * Smells like a compiler bug -- it doesn't work
+ * when the & below is removed.
+ */
+ [0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_32.h>
+};
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index de87d600829..7ac7943be02 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -5,15 +5,11 @@
#include <linux/cache.h>
#include <asm/asm-offsets.h>
-#define __NO_STUBS
+#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_64.h>
+#undef __SYSCALL_64
-#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef _ASM_X86_UNISTD_64_H
-#include <asm/unistd_64.h>
-
-#undef __SYSCALL
-#define __SYSCALL(nr, sym) [nr] = sym,
-#undef _ASM_X86_UNISTD_64_H
+#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
typedef void (*sys_call_ptr_t)(void);
@@ -21,9 +17,9 @@ extern void sys_ni_syscall(void);
const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
/*
- *Smells like a like a compiler bug -- it doesn't work
- *when the & below is removed.
- */
+ * Smells like a compiler bug -- it doesn't work
+ * when the & below is removed.
+ */
[0 ... __NR_syscall_max] = &sys_ni_syscall,
-#include <asm/unistd_64.h>
+#include <asm/syscalls_64.h>
};
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
deleted file mode 100644
index 9a0e3129392..00000000000
--- a/arch/x86/kernel/syscall_table_32.S
+++ /dev/null
@@ -1,350 +0,0 @@
-ENTRY(sys_call_table)
- .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
- .long sys_exit
- .long ptregs_fork
- .long sys_read
- .long sys_write
- .long sys_open /* 5 */
- .long sys_close
- .long sys_waitpid
- .long sys_creat
- .long sys_link
- .long sys_unlink /* 10 */
- .long ptregs_execve
- .long sys_chdir
- .long sys_time
- .long sys_mknod
- .long sys_chmod /* 15 */
- .long sys_lchown16
- .long sys_ni_syscall /* old break syscall holder */
- .long sys_stat
- .long sys_lseek
- .long sys_getpid /* 20 */
- .long sys_mount
- .long sys_oldumount
- .long sys_setuid16
- .long sys_getuid16
- .long sys_stime /* 25 */
- .long sys_ptrace
- .long sys_alarm
- .long sys_fstat
- .long sys_pause
- .long sys_utime /* 30 */
- .long sys_ni_syscall /* old stty syscall holder */
- .long sys_ni_syscall /* old gtty syscall holder */
- .long sys_access
- .long sys_nice
- .long sys_ni_syscall /* 35 - old ftime syscall holder */
- .long sys_sync
- .long sys_kill
- .long sys_rename
- .long sys_mkdir
- .long sys_rmdir /* 40 */
- .long sys_dup
- .long sys_pipe
- .long sys_times
- .long sys_ni_syscall /* old prof syscall holder */
- .long sys_brk /* 45 */
- .long sys_setgid16
- .long sys_getgid16
- .long sys_signal
- .long sys_geteuid16
- .long sys_getegid16 /* 50 */
- .long sys_acct
- .long sys_umount /* recycled never used phys() */
- .long sys_ni_syscall /* old lock syscall holder */
- .long sys_ioctl
- .long sys_fcntl /* 55 */
- .long sys_ni_syscall /* old mpx syscall holder */
- .long sys_setpgid
- .long sys_ni_syscall /* old ulimit syscall holder */
- .long sys_olduname
- .long sys_umask /* 60 */
- .long sys_chroot
- .long sys_ustat
- .long sys_dup2
- .long sys_getppid
- .long sys_getpgrp /* 65 */
- .long sys_setsid
- .long sys_sigaction
- .long sys_sgetmask
- .long sys_ssetmask
- .long sys_setreuid16 /* 70 */
- .long sys_setregid16
- .long sys_sigsuspend
- .long sys_sigpending
- .long sys_sethostname
- .long sys_setrlimit /* 75 */
- .long sys_old_getrlimit
- .long sys_getrusage
- .long sys_gettimeofday
- .long sys_settimeofday
- .long sys_getgroups16 /* 80 */
- .long sys_setgroups16
- .long sys_old_select
- .long sys_symlink
- .long sys_lstat
- .long sys_readlink /* 85 */
- .long sys_uselib
- .long sys_swapon
- .long sys_reboot
- .long sys_old_readdir
- .long sys_old_mmap /* 90 */
- .long sys_munmap
- .long sys_truncate
- .long sys_ftruncate
- .long sys_fchmod
- .long sys_fchown16 /* 95 */
- .long sys_getpriority
- .long sys_setpriority
- .long sys_ni_syscall /* old profil syscall holder */
- .long sys_statfs
- .long sys_fstatfs /* 100 */
- .long sys_ioperm
- .long sys_socketcall
- .long sys_syslog
- .long sys_setitimer
- .long sys_getitimer /* 105 */
- .long sys_newstat
- .long sys_newlstat
- .long sys_newfstat
- .long sys_uname
- .long ptregs_iopl /* 110 */
- .long sys_vhangup
- .long sys_ni_syscall /* old "idle" system call */
- .long ptregs_vm86old
- .long sys_wait4
- .long sys_swapoff /* 115 */
- .long sys_sysinfo
- .long sys_ipc
- .long sys_fsync
- .long ptregs_sigreturn
- .long ptregs_clone /* 120 */
- .long sys_setdomainname
- .long sys_newuname
- .long sys_modify_ldt
- .long sys_adjtimex
- .long sys_mprotect /* 125 */
- .long sys_sigprocmask
- .long sys_ni_syscall /* old "create_module" */
- .long sys_init_module
- .long sys_delete_module
- .long sys_ni_syscall /* 130: old "get_kernel_syms" */
- .long sys_quotactl
- .long sys_getpgid
- .long sys_fchdir
- .long sys_bdflush
- .long sys_sysfs /* 135 */
- .long sys_personality
- .long sys_ni_syscall /* reserved for afs_syscall */
- .long sys_setfsuid16
- .long sys_setfsgid16
- .long sys_llseek /* 140 */
- .long sys_getdents
- .long sys_select
- .long sys_flock
- .long sys_msync
- .long sys_readv /* 145 */
- .long sys_writev
- .long sys_getsid
- .long sys_fdatasync
- .long sys_sysctl
- .long sys_mlock /* 150 */
- .long sys_munlock
- .long sys_mlockall
- .long sys_munlockall
- .long sys_sched_setparam
- .long sys_sched_getparam /* 155 */
- .long sys_sched_setscheduler
- .long sys_sched_getscheduler
- .long sys_sched_yield
- .long sys_sched_get_priority_max
- .long sys_sched_get_priority_min /* 160 */
- .long sys_sched_rr_get_interval
- .long sys_nanosleep
- .long sys_mremap
- .long sys_setresuid16
- .long sys_getresuid16 /* 165 */
- .long ptregs_vm86
- .long sys_ni_syscall /* Old sys_query_module */
- .long sys_poll
- .long sys_ni_syscall /* Old nfsservctl */
- .long sys_setresgid16 /* 170 */
- .long sys_getresgid16
- .long sys_prctl
- .long ptregs_rt_sigreturn
- .long sys_rt_sigaction
- .long sys_rt_sigprocmask /* 175 */
- .long sys_rt_sigpending
- .long sys_rt_sigtimedwait
- .long sys_rt_sigqueueinfo
- .long sys_rt_sigsuspend
- .long sys_pread64 /* 180 */
- .long sys_pwrite64
- .long sys_chown16
- .long sys_getcwd
- .long sys_capget
- .long sys_capset /* 185 */
- .long ptregs_sigaltstack
- .long sys_sendfile
- .long sys_ni_syscall /* reserved for streams1 */
- .long sys_ni_syscall /* reserved for streams2 */
- .long ptregs_vfork /* 190 */
- .long sys_getrlimit
- .long sys_mmap_pgoff
- .long sys_truncate64
- .long sys_ftruncate64
- .long sys_stat64 /* 195 */
- .long sys_lstat64
- .long sys_fstat64
- .long sys_lchown
- .long sys_getuid
- .long sys_getgid /* 200 */
- .long sys_geteuid
- .long sys_getegid
- .long sys_setreuid
- .long sys_setregid
- .long sys_getgroups /* 205 */
- .long sys_setgroups
- .long sys_fchown
- .long sys_setresuid
- .long sys_getresuid
- .long sys_setresgid /* 210 */
- .long sys_getresgid
- .long sys_chown
- .long sys_setuid
- .long sys_setgid
- .long sys_setfsuid /* 215 */
- .long sys_setfsgid
- .long sys_pivot_root
- .long sys_mincore
- .long sys_madvise
- .long sys_getdents64 /* 220 */
- .long sys_fcntl64
- .long sys_ni_syscall /* reserved for TUX */
- .long sys_ni_syscall
- .long sys_gettid
- .long sys_readahead /* 225 */
- .long sys_setxattr
- .long sys_lsetxattr
- .long sys_fsetxattr
- .long sys_getxattr
- .long sys_lgetxattr /* 230 */
- .long sys_fgetxattr
- .long sys_listxattr
- .long sys_llistxattr
- .long sys_flistxattr
- .long sys_removexattr /* 235 */
- .long sys_lremovexattr
- .long sys_fremovexattr
- .long sys_tkill
- .long sys_sendfile64
- .long sys_futex /* 240 */
- .long sys_sched_setaffinity
- .long sys_sched_getaffinity
- .long sys_set_thread_area
- .long sys_get_thread_area
- .long sys_io_setup /* 245 */
- .long sys_io_destroy
- .long sys_io_getevents
- .long sys_io_submit
- .long sys_io_cancel
- .long sys_fadvise64 /* 250 */
- .long sys_ni_syscall
- .long sys_exit_group
- .long sys_lookup_dcookie
- .long sys_epoll_create
- .long sys_epoll_ctl /* 255 */
- .long sys_epoll_wait
- .long sys_remap_file_pages
- .long sys_set_tid_address
- .long sys_timer_create
- .long sys_timer_settime /* 260 */
- .long sys_timer_gettime
- .long sys_timer_getoverrun
- .long sys_timer_delete
- .long sys_clock_settime
- .long sys_clock_gettime /* 265 */
- .long sys_clock_getres
- .long sys_clock_nanosleep
- .long sys_statfs64
- .long sys_fstatfs64
- .long sys_tgkill /* 270 */
- .long sys_utimes
- .long sys_fadvise64_64
- .long sys_ni_syscall /* sys_vserver */
- .long sys_mbind
- .long sys_get_mempolicy
- .long sys_set_mempolicy
- .long sys_mq_open
- .long sys_mq_unlink
- .long sys_mq_timedsend
- .long sys_mq_timedreceive /* 280 */
- .long sys_mq_notify
- .long sys_mq_getsetattr
- .long sys_kexec_load
- .long sys_waitid
- .long sys_ni_syscall /* 285 */ /* available */
- .long sys_add_key
- .long sys_request_key
- .long sys_keyctl
- .long sys_ioprio_set
- .long sys_ioprio_get /* 290 */
- .long sys_inotify_init
- .long sys_inotify_add_watch
- .long sys_inotify_rm_watch
- .long sys_migrate_pages
- .long sys_openat /* 295 */
- .long sys_mkdirat
- .long sys_mknodat
- .long sys_fchownat
- .long sys_futimesat
- .long sys_fstatat64 /* 300 */
- .long sys_unlinkat
- .long sys_renameat
- .long sys_linkat
- .long sys_symlinkat
- .long sys_readlinkat /* 305 */
- .long sys_fchmodat
- .long sys_faccessat
- .long sys_pselect6
- .long sys_ppoll
- .long sys_unshare /* 310 */
- .long sys_set_robust_list
- .long sys_get_robust_list
- .long sys_splice
- .long sys_sync_file_range
- .long sys_tee /* 315 */
- .long sys_vmsplice
- .long sys_move_pages
- .long sys_getcpu
- .long sys_epoll_pwait
- .long sys_utimensat /* 320 */
- .long sys_signalfd
- .long sys_timerfd_create
- .long sys_eventfd
- .long sys_fallocate
- .long sys_timerfd_settime /* 325 */
- .long sys_timerfd_gettime
- .long sys_signalfd4
- .long sys_eventfd2
- .long sys_epoll_create1
- .long sys_dup3 /* 330 */
- .long sys_pipe2
- .long sys_inotify_init1
- .long sys_preadv
- .long sys_pwritev
- .long sys_rt_tgsigqueueinfo /* 335 */
- .long sys_perf_event_open
- .long sys_recvmmsg
- .long sys_fanotify_init
- .long sys_fanotify_mark
- .long sys_prlimit64 /* 340 */
- .long sys_name_to_handle_at
- .long sys_open_by_handle_at
- .long sys_clock_adjtime
- .long sys_syncfs
- .long sys_sendmmsg /* 345 */
- .long sys_setns
- .long sys_process_vm_readv
- .long sys_process_vm_writev
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index fa1191fb679..482ec3af206 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -311,9 +311,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
== NOTIFY_STOP)
return;
+ /*
+ * Let others (NMI) know that the debug stack is in use
+ * as we may switch to the interrupt stack.
+ */
+ debug_stack_usage_inc();
preempt_conditional_sti(regs);
do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
preempt_conditional_cli(regs);
+ debug_stack_usage_dec();
}
#ifdef CONFIG_X86_64
@@ -406,6 +412,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
SIGTRAP) == NOTIFY_STOP)
return;
+ /*
+ * Let others (NMI) know that the debug stack is in use
+ * as we may switch to the interrupt stack.
+ */
+ debug_stack_usage_inc();
+
/* It's safe to allow irq's after DR6 has been saved */
preempt_conditional_sti(regs);
@@ -413,6 +425,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
handle_vm86_trap((struct kernel_vm86_regs *) regs,
error_code, 1);
preempt_conditional_cli(regs);
+ debug_stack_usage_dec();
return;
}
@@ -432,6 +445,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
send_sigtrap(tsk, regs, error_code, si_code);
preempt_conditional_cli(regs);
+ debug_stack_usage_dec();
return;
}
@@ -718,4 +732,10 @@ void __init trap_init(void)
cpu_init();
x86_init.irqs.trap_init();
+
+#ifdef CONFIG_X86_64
+ memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
+ set_nmi_gate(1, &debug);
+ set_nmi_gate(3, &int3);
+#endif
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index f5469461117..a62c201c97e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -993,3 +993,23 @@ void __init tsc_init(void)
check_system_tsc_reliable();
}
+#ifdef CONFIG_SMP
+/*
+ * If we have a constant TSC and are using the TSC for the delay loop,
+ * we can skip clock calibration if another cpu in the same socket has already
+ * been calibrated. This assumes that CONSTANT_TSC applies to all
+ * cpus in the socket - this should be a safe assumption.
+ */
+unsigned long __cpuinit calibrate_delay_is_known(void)
+{
+ int i, cpu = smp_processor_id();
+
+ if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
+ return 0;
+
+ for_each_online_cpu(i)
+ if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id)
+ return cpu_data(i).loops_per_jiffy;
+ return 0;
+}
+#endif
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 863f8753ab0..b466cab5ba1 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -335,9 +335,11 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
if (info->flags & VM86_SCREEN_BITMAP)
mark_screen_rdonly(tsk->mm);
- /*call audit_syscall_exit since we do not exit via the normal paths */
+ /*call __audit_syscall_exit since we do not exit via the normal paths */
+#ifdef CONFIG_AUDITSYSCALL
if (unlikely(current->audit_context))
- audit_syscall_exit(AUDITSC_RESULT(0), 0);
+ __audit_syscall_exit(1, 0);
+#endif
__asm__ __volatile__(
"movl %0,%%esp\n\t"
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 91f83e21b98..947a06ccc67 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -115,4 +115,5 @@ struct x86_msi_ops x86_msi = {
.setup_msi_irqs = native_setup_msi_irqs,
.teardown_msi_irq = native_teardown_msi_irq,
.teardown_msi_irqs = default_teardown_msi_irqs,
+ .restore_msi_irqs = default_restore_msi_irqs,
};