diff options
Diffstat (limited to 'arch/x86/kernel')
27 files changed, 534 insertions, 355 deletions
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e48cafcf92a..bacf4b0d91f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1706,3 +1706,9 @@ int __acpi_release_global_lock(unsigned int *lock) } while (unlikely (val != old)); return old & 0x1; } + +void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) +{ + e820_add_region(addr, size, E820_ACPI); + update_e820(); +} diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index a65829ac2b9..9c2aa89a11c 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -22,6 +22,7 @@ #include <linux/hardirq.h> #include <linux/delay.h> +#include <asm/numachip/numachip.h> #include <asm/numachip/numachip_csr.h> #include <asm/smp.h> #include <asm/apic.h> @@ -179,6 +180,7 @@ static int __init numachip_system_init(void) return 0; x86_cpuinit.fixup_cpu_id = fixup_cpu_id; + x86_init.pci.arch_init = pci_numachip_init; map_csrs(); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 6a05c1d327a..5b7d4fa5d3b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -24,8 +24,6 @@ struct mce_bank { int mce_severity(struct mce *a, int tolerant, char **msg); struct dentry *mce_get_debugfs_dir(void); -extern int mce_ser; - extern struct mce_bank *mce_banks; #ifdef CONFIG_X86_MCE_INTEL diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 13017626f9a..beb1f1689e5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -193,9 +193,9 @@ int mce_severity(struct mce *m, int tolerant, char **msg) continue; if ((m->mcgstatus & s->mcgmask) != s->mcgres) continue; - if (s->ser == SER_REQUIRED && !mce_ser) + if (s->ser == SER_REQUIRED && !mca_cfg.ser) continue; - if (s->ser == NO_SER && mce_ser) + if (s->ser == NO_SER && mca_cfg.ser) continue; if (s->context && ctx != s->context) continue; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 46cbf868969..80dbda84f1c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -58,34 +58,26 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); #define CREATE_TRACE_POINTS #include <trace/events/mce.h> -int mce_disabled __read_mostly; - #define SPINUNIT 100 /* 100ns */ atomic_t mce_entry; DEFINE_PER_CPU(unsigned, mce_exception_count); -/* - * Tolerant levels: - * 0: always panic on uncorrected errors, log corrected errors - * 1: panic or SIGBUS on uncorrected errors, log corrected errors - * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors - * 3: never panic or SIGBUS, log all errors (for testing only) - */ -static int tolerant __read_mostly = 1; -static int banks __read_mostly; -static int rip_msr __read_mostly; -static int mce_bootlog __read_mostly = -1; -static int monarch_timeout __read_mostly = -1; -static int mce_panic_timeout __read_mostly; -static int mce_dont_log_ce __read_mostly; -int mce_cmci_disabled __read_mostly; -int mce_ignore_ce __read_mostly; -int mce_ser __read_mostly; -int mce_bios_cmci_threshold __read_mostly; - -struct mce_bank *mce_banks __read_mostly; +struct mce_bank *mce_banks __read_mostly; + +struct mca_config mca_cfg __read_mostly = { + .bootlog = -1, + /* + * Tolerant levels: + * 0: always panic on uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ + .tolerant = 1, + .monarch_timeout = -1 +}; /* User mode helper program triggered by machine check event */ static unsigned long mce_need_notify; @@ -302,7 +294,7 @@ static void wait_for_panic(void) while (timeout-- > 0) udelay(1); if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; + panic_timeout = mca_cfg.panic_timeout; panic("Panicing machine check CPU died"); } @@ -360,7 +352,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); if (!fake_panic) { if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; + panic_timeout = mca_cfg.panic_timeout; panic(msg); } else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); @@ -372,7 +364,7 @@ static int msr_to_offset(u32 msr) { unsigned bank = __this_cpu_read(injectm.bank); - if (msr == rip_msr) + if (msr == mca_cfg.rip_msr) return offsetof(struct mce, ip); if (msr == MSR_IA32_MCx_STATUS(bank)) return offsetof(struct mce, status); @@ -451,8 +443,8 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) m->cs |= 3; } /* Use accurate RIP reporting if available. */ - if (rip_msr) - m->ip = mce_rdmsrl(rip_msr); + if (mca_cfg.rip_msr) + m->ip = mce_rdmsrl(mca_cfg.rip_msr); } } @@ -513,7 +505,7 @@ static int mce_ring_add(unsigned long pfn) int mce_available(struct cpuinfo_x86 *c) { - if (mce_disabled) + if (mca_cfg.disabled) return 0; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -565,7 +557,7 @@ static void mce_read_aux(struct mce *m, int i) /* * Mask the reported address by the reported granularity. */ - if (mce_ser && (m->status & MCI_STATUS_MISCV)) { + if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) { u8 shift = MCI_MISC_ADDR_LSB(m->misc); m->addr >>= shift; m->addr <<= shift; @@ -599,7 +591,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_gather_info(&m, NULL); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; @@ -620,7 +612,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * TBD do the same check for MCI_STATUS_EN here? */ if (!(flags & MCP_UC) && - (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) + (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) continue; mce_read_aux(&m, i); @@ -631,7 +623,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) + if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) mce_log(&m); /* @@ -658,14 +650,14 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, { int i, ret = 0; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (m->status & MCI_STATUS_VAL) { __set_bit(i, validp); if (quirk_no_way_out) quirk_no_way_out(i, m, regs); } - if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) + if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) ret = 1; } return ret; @@ -696,11 +688,11 @@ static int mce_timed_out(u64 *t) rmb(); if (atomic_read(&mce_paniced)) wait_for_panic(); - if (!monarch_timeout) + if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { /* CHECKME: Make panic default for 1 too? */ - if (tolerant < 1) + if (mca_cfg.tolerant < 1) mce_panic("Timeout synchronizing machine check over CPUs", NULL, NULL); cpu_missing = 1; @@ -750,7 +742,8 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, + int severity = mce_severity(&per_cpu(mces_seen, cpu), + mca_cfg.tolerant, &nmsg); if (severity > global_worst) { msg = nmsg; @@ -764,7 +757,7 @@ static void mce_reign(void) * This dumps all the mces in the log buffer and stops the * other CPUs. */ - if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) + if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) mce_panic("Fatal Machine check", m, msg); /* @@ -777,7 +770,7 @@ static void mce_reign(void) * No machine check event found. Must be some external * source or one CPU is hung. Panic. */ - if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) + if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) mce_panic("Machine check from unknown source", NULL, NULL); /* @@ -801,7 +794,7 @@ static int mce_start(int *no_way_out) { int order; int cpus = num_online_cpus(); - u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; if (!timeout) return -1; @@ -865,7 +858,7 @@ static int mce_start(int *no_way_out) static int mce_end(int order) { int ret = -1; - u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; if (!timeout) goto reset; @@ -946,7 +939,7 @@ static void mce_clear_state(unsigned long *toclear) { int i; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { if (test_bit(i, toclear)) mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } @@ -1011,6 +1004,7 @@ static void mce_clear_info(struct mce_info *mi) */ void do_machine_check(struct pt_regs *regs, long error_code) { + struct mca_config *cfg = &mca_cfg; struct mce m, *final; int i; int worst = 0; @@ -1022,7 +1016,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) int order; /* * If no_way_out gets set, there is no safe way to recover from this - * MCE. If tolerant is cranked up, we'll try anyway. + * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. */ int no_way_out = 0; /* @@ -1038,7 +1032,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) this_cpu_inc(mce_exception_count); - if (!banks) + if (!cfg->banks) goto out; mce_gather_info(&m, regs); @@ -1065,7 +1059,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * because the first one to see it will clear it. */ order = mce_start(&no_way_out); - for (i = 0; i < banks; i++) { + for (i = 0; i < cfg->banks; i++) { __clear_bit(i, toclear); if (!test_bit(i, valid_banks)) continue; @@ -1084,7 +1078,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Non uncorrected or non signaled errors are handled by * machine_check_poll. Leave them alone, unless this panics. */ - if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && !no_way_out) continue; @@ -1093,7 +1087,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ add_taint(TAINT_MACHINE_CHECK); - severity = mce_severity(&m, tolerant, NULL); + severity = mce_severity(&m, cfg->tolerant, NULL); /* * When machine check was for corrected handler don't touch, @@ -1117,7 +1111,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * When the ring overflows we just ignore the AO error. * RED-PEN add some logging mechanism when * usable_address or mce_add_ring fails. - * RED-PEN don't ignore overflow for tolerant == 0 + * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0 */ if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) mce_ring_add(m.addr >> PAGE_SHIFT); @@ -1149,7 +1143,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * issues we try to recover, or limit damage to the current * process. */ - if (tolerant < 3) { + if (cfg->tolerant < 3) { if (no_way_out) mce_panic("Fatal machine check on current CPU", &m, msg); if (worst == MCE_AR_SEVERITY) { @@ -1377,11 +1371,13 @@ EXPORT_SYMBOL_GPL(mce_notify_irq); static int __cpuinit __mcheck_cpu_mce_banks_init(void) { int i; + u8 num_banks = mca_cfg.banks; - mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); + mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL); if (!mce_banks) return -ENOMEM; - for (i = 0; i < banks; i++) { + + for (i = 0; i < num_banks; i++) { struct mce_bank *b = &mce_banks[i]; b->ctl = -1ULL; @@ -1401,7 +1397,7 @@ static int __cpuinit __mcheck_cpu_cap_init(void) rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - if (!banks) + if (!mca_cfg.banks) pr_info("CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { @@ -1411,8 +1407,9 @@ static int __cpuinit __mcheck_cpu_cap_init(void) } /* Don't support asymmetric configurations today */ - WARN_ON(banks != 0 && b != banks); - banks = b; + WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks); + mca_cfg.banks = b; + if (!mce_banks) { int err = __mcheck_cpu_mce_banks_init(); @@ -1422,25 +1419,29 @@ static int __cpuinit __mcheck_cpu_cap_init(void) /* Use accurate RIP reporting if available. */ if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) - rip_msr = MSR_IA32_MCG_EIP; + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; if (cap & MCG_SER_P) - mce_ser = 1; + mca_cfg.ser = true; return 0; } static void __mcheck_cpu_init_generic(void) { + enum mcp_flags m_fl = 0; mce_banks_t all_banks; u64 cap; int i; + if (!mca_cfg.bootlog) + m_fl = MCP_DONTLOG; + /* * Log the machine checks left over from the previous reset. */ bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); + machine_check_poll(MCP_UC | m_fl, &all_banks); set_in_cr4(X86_CR4_MCE); @@ -1448,7 +1449,7 @@ static void __mcheck_cpu_init_generic(void) if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (!b->init) @@ -1489,6 +1490,8 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) /* Add per CPU specific workarounds here */ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { + struct mca_config *cfg = &mca_cfg; + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { pr_info("unknown CPU type - not enabling MCE support\n"); return -EOPNOTSUPP; @@ -1496,7 +1499,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && banks > 4) { + if (c->x86 == 15 && cfg->banks > 4) { /* * disable GART TBL walk error reporting, which * trips off incorrectly with the IOMMU & 3ware @@ -1504,18 +1507,18 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } - if (c->x86 <= 17 && mce_bootlog < 0) { + if (c->x86 <= 17 && cfg->bootlog < 0) { /* * Lots of broken BIOS around that don't clear them * by default and leave crap in there. Don't log: */ - mce_bootlog = 0; + cfg->bootlog = 0; } /* * Various K7s with broken bank 0 around. Always disable * by default. */ - if (c->x86 == 6 && banks > 0) + if (c->x86 == 6 && cfg->banks > 0) mce_banks[0].ctl = 0; /* @@ -1566,7 +1569,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) + if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0) mce_banks[0].init = 0; /* @@ -1574,23 +1577,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) * synchronization with a one second timeout. */ if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && - monarch_timeout < 0) - monarch_timeout = USEC_PER_SEC; + cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; /* * There are also broken BIOSes on some Pentium M and * earlier systems: */ - if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) - mce_bootlog = 0; + if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) + cfg->bootlog = 0; if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; } - if (monarch_timeout < 0) - monarch_timeout = 0; - if (mce_bootlog != 0) - mce_panic_timeout = 30; + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = 0; + if (cfg->bootlog != 0) + cfg->panic_timeout = 30; return 0; } @@ -1635,7 +1638,7 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t) __this_cpu_write(mce_next_interval, iv); - if (mce_ignore_ce || !iv) + if (mca_cfg.ignore_ce || !iv) return; t->expires = round_jiffies(jiffies + iv); @@ -1668,7 +1671,7 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = */ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) { - if (mce_disabled) + if (mca_cfg.disabled) return; if (__mcheck_cpu_ancient_init(c)) @@ -1678,7 +1681,7 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) return; if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { - mce_disabled = 1; + mca_cfg.disabled = true; return; } @@ -1951,6 +1954,8 @@ static struct miscdevice mce_chrdev_device = { */ static int __init mcheck_enable(char *str) { + struct mca_config *cfg = &mca_cfg; + if (*str == 0) { enable_p5_mce(); return 1; @@ -1958,22 +1963,22 @@ static int __init mcheck_enable(char *str) if (*str == '=') str++; if (!strcmp(str, "off")) - mce_disabled = 1; + cfg->disabled = true; else if (!strcmp(str, "no_cmci")) - mce_cmci_disabled = 1; + cfg->cmci_disabled = true; else if (!strcmp(str, "dont_log_ce")) - mce_dont_log_ce = 1; + cfg->dont_log_ce = true; else if (!strcmp(str, "ignore_ce")) - mce_ignore_ce = 1; + cfg->ignore_ce = true; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) - mce_bootlog = (str[0] == 'b'); + cfg->bootlog = (str[0] == 'b'); else if (!strcmp(str, "bios_cmci_threshold")) - mce_bios_cmci_threshold = 1; + cfg->bios_cmci_threshold = true; else if (isdigit(str[0])) { - get_option(&str, &tolerant); + get_option(&str, &(cfg->tolerant)); if (*str == ',') { ++str; - get_option(&str, &monarch_timeout); + get_option(&str, &(cfg->monarch_timeout)); } } else { pr_info("mce argument %s ignored. Please use /sys\n", str); @@ -2002,7 +2007,7 @@ static int mce_disable_error_reporting(void) { int i; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -2142,15 +2147,15 @@ static ssize_t set_ignore_ce(struct device *s, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - if (mce_ignore_ce ^ !!new) { + if (mca_cfg.ignore_ce ^ !!new) { if (new) { /* disable ce features */ mce_timer_delete_all(); on_each_cpu(mce_disable_cmci, NULL, 1); - mce_ignore_ce = 1; + mca_cfg.ignore_ce = true; } else { /* enable ce features */ - mce_ignore_ce = 0; + mca_cfg.ignore_ce = false; on_each_cpu(mce_enable_ce, (void *)1, 1); } } @@ -2166,14 +2171,14 @@ static ssize_t set_cmci_disabled(struct device *s, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - if (mce_cmci_disabled ^ !!new) { + if (mca_cfg.cmci_disabled ^ !!new) { if (new) { /* disable cmci */ on_each_cpu(mce_disable_cmci, NULL, 1); - mce_cmci_disabled = 1; + mca_cfg.cmci_disabled = true; } else { /* enable cmci */ - mce_cmci_disabled = 0; + mca_cfg.cmci_disabled = false; on_each_cpu(mce_enable_ce, NULL, 1); } } @@ -2190,9 +2195,9 @@ static ssize_t store_int_with_restart(struct device *s, } static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); -static DEVICE_INT_ATTR(tolerant, 0644, tolerant); -static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout); -static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); +static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); +static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); +static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); static struct dev_ext_attribute dev_attr_check_interval = { __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), @@ -2200,13 +2205,13 @@ static struct dev_ext_attribute dev_attr_check_interval = { }; static struct dev_ext_attribute dev_attr_ignore_ce = { - __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce), - &mce_ignore_ce + __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce), + &mca_cfg.ignore_ce }; static struct dev_ext_attribute dev_attr_cmci_disabled = { - __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled), - &mce_cmci_disabled + __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled), + &mca_cfg.cmci_disabled }; static struct device_attribute *mce_device_attrs[] = { @@ -2253,7 +2258,7 @@ static __cpuinit int mce_device_create(unsigned int cpu) if (err) goto error; } - for (j = 0; j < banks; j++) { + for (j = 0; j < mca_cfg.banks; j++) { err = device_create_file(dev, &mce_banks[j].attr); if (err) goto error2; @@ -2285,7 +2290,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu) for (i = 0; mce_device_attrs[i]; i++) device_remove_file(dev, mce_device_attrs[i]); - for (i = 0; i < banks; i++) + for (i = 0; i < mca_cfg.banks; i++) device_remove_file(dev, &mce_banks[i].attr); device_unregister(dev); @@ -2304,7 +2309,7 @@ static void __cpuinit mce_disable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -2322,7 +2327,7 @@ static void __cpuinit mce_reenable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -2375,7 +2380,7 @@ static __init void mce_init_banks(void) { int i; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; struct device_attribute *a = &b->attr; @@ -2426,7 +2431,7 @@ device_initcall_sync(mcheck_init_device); */ static int __init mcheck_disable(char *str) { - mce_disabled = 1; + mca_cfg.disabled = true; return 1; } __setup("nomce", mcheck_disable); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 4f9a3cbfc4a..402c454fbff 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -53,7 +53,7 @@ static int cmci_supported(int *banks) { u64 cap; - if (mce_cmci_disabled || mce_ignore_ce) + if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce) return 0; /* @@ -200,7 +200,7 @@ static void cmci_discover(int banks) continue; } - if (!mce_bios_cmci_threshold) { + if (!mca_cfg.bios_cmci_threshold) { val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; val |= CMCI_THRESHOLD; } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { @@ -227,7 +227,7 @@ static void cmci_discover(int banks) * set the thresholds properly or does not work with * this boot option. Note down now and report later. */ - if (mce_bios_cmci_threshold && bios_zero_thresh && + if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) bios_wrong_thresh = 1; } else { @@ -235,7 +235,7 @@ static void cmci_discover(int banks) } } raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); - if (mce_bios_cmci_threshold && bios_wrong_thresh) { + if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { pr_info_once( "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); pr_info_once( diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4428fd178bc..6774c17a557 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -340,9 +340,6 @@ int x86_setup_perfctr(struct perf_event *event) /* BTS is currently only allowed for user-mode. */ if (!attr->exclude_kernel) return -EOPNOTSUPP; - - if (!attr->exclude_guest) - return -EOPNOTSUPP; } hwc->config |= config; @@ -385,9 +382,6 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip) { int precise = 0; - if (!event->attr.exclude_guest) - return -EOPNOTSUPP; - /* Support for constant skid */ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { precise++; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 3cf3d97cce3..b43200dbfe7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2500,7 +2500,7 @@ static bool pcidrv_registered; /* * add a pci uncore device */ -static int __devinit uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) +static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) { struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; @@ -2571,8 +2571,8 @@ static void uncore_pci_remove(struct pci_dev *pdev) kfree(box); } -static int __devinit uncore_pci_probe(struct pci_dev *pdev, - const struct pci_device_id *id) +static int uncore_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) { struct intel_uncore_type *type; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index fbd89556229..3286a92e662 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -26,11 +26,6 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, #ifdef CONFIG_X86_32 static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) { - /* - * We use exception 16 if we have hardware math and we've either seen - * it or the CPU claims it is internal - */ - int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu); seq_printf(m, "fdiv_bug\t: %s\n" "hlt_bug\t\t: %s\n" @@ -45,7 +40,7 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) c->f00f_bug ? "yes" : "no", c->coma_bug ? "yes" : "no", c->hard_math ? "yes" : "no", - fpu_exception ? "yes" : "no", + c->hard_math ? "yes" : "no", c->cpuid_level, c->wp_works_ok ? "yes" : "no"); } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 13ad89971d4..74467feb4dc 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -16,6 +16,7 @@ #include <linux/delay.h> #include <linux/elf.h> #include <linux/elfcore.h> +#include <linux/module.h> #include <asm/processor.h> #include <asm/hardirq.h> @@ -30,6 +31,27 @@ int in_crash_kexec; +/* + * This is used to VMCLEAR all VMCSs loaded on the + * processor. And when loading kvm_intel module, the + * callback function pointer will be assigned. + * + * protected by rcu. + */ +crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; +EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); + +static inline void cpu_crash_vmclear_loaded_vmcss(void) +{ + crash_vmclear_fn *do_vmclear_operation = NULL; + + rcu_read_lock(); + do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); + if (do_vmclear_operation) + do_vmclear_operation(); + rcu_read_unlock(); +} + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) @@ -46,6 +68,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) #endif crash_save_cpu(regs, cpu); + /* + * VMCLEAR VMCSs loaded on all cpus if needed. + */ + cpu_crash_vmclear_loaded_vmcss(); + /* Disable VMX or SVM if needed. * * We need to disable virtualization on all CPUs. @@ -88,6 +115,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs) kdump_nmi_shootdown_cpus(); + /* + * VMCLEAR VMCSs loaded on this cpu if needed. + */ + cpu_crash_vmclear_loaded_vmcss(); + /* Booting kdump kernel with VMX or SVM enabled won't work, * because (among other limitations) we can't disable paging * with the virt flags. diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c763116c535..6ed91d9980e 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -739,7 +739,6 @@ ENTRY(ptregs_##name) ; \ ENDPROC(ptregs_##name) PTREGSCALL1(iopl) -PTREGSCALL2(sigaltstack) PTREGSCALL0(sigreturn) PTREGSCALL0(rt_sigreturn) PTREGSCALL2(vm86) @@ -1066,7 +1065,6 @@ ENTRY(xen_failsafe_callback) lea 16(%esp),%esp CFI_ADJUST_CFA_OFFSET -16 jz 5f - addl $16,%esp jmp iret_exc 5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 70641aff0c2..cb3c591339a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -864,7 +864,6 @@ END(stub_\func) FORK_LIKE clone FORK_LIKE fork FORK_LIKE vfork - PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx PTREGSCALL stub_iopl, sys_iopl, %rsi ENTRY(ptregscall_common) @@ -913,8 +912,6 @@ ENTRY(stub_rt_sigreturn) END(stub_rt_sigreturn) #ifdef CONFIG_X86_X32_ABI - PTREGSCALL stub_x32_sigaltstack, sys32_sigaltstack, %rdx - ENTRY(stub_x32_rt_sigreturn) CFI_STARTPROC addq $8, %rsp @@ -1784,6 +1781,7 @@ first_nmi: * Leave room for the "copied" frame */ subq $(5*8), %rsp + CFI_ADJUST_CFA_OFFSET 5*8 /* Copy the stack frame to the Saved frame */ .rept 5 @@ -1866,10 +1864,8 @@ end_repeat_nmi: nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: - RESTORE_ALL 8 - - /* Pop the extra iret frame */ - addq $(5*8), %rsp + /* Pop the extra iret frame at once */ + RESTORE_ALL 6*8 /* Clear the NMI executing stack variable */ movq $0, 5*8(%rsp) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 8e7f6556028..c8932c79e78 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -300,6 +300,12 @@ ENTRY(startup_32_smp) leal -__PAGE_OFFSET(%ecx),%esp default_entry: +#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ + X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ + X86_CR0_PG) + movl $(CR0_STATE & ~X86_CR0_PG),%eax + movl %eax,%cr0 + /* * New page tables may be in 4Mbyte page mode and may * be using the global pages. @@ -364,8 +370,7 @@ default_entry: */ movl $pa(initial_page_table), %eax movl %eax,%cr3 /* set the page table pointer.. */ - movl %cr0,%eax - orl $X86_CR0_PG,%eax + movl $CR0_STATE,%eax movl %eax,%cr0 /* ..and set paging (PG) bit */ ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ 1: diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 6e03b0d6913..7dc4e459c2b 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -42,39 +42,6 @@ * (these are usually mapped into the 0x30-0xff vector range) */ -#ifdef CONFIG_X86_32 -/* - * Note that on a 486, we don't want to do a SIGFPE on an irq13 - * as the irq is unreliable, and exception 16 works correctly - * (ie as explained in the intel literature). On a 386, you - * can't use exception 16 due to bad IBM design, so we have to - * rely on the less exact irq13. - * - * Careful.. Not only is IRQ13 unreliable, but it is also - * leads to races. IBM designers who came up with it should - * be shot. - */ - -static irqreturn_t math_error_irq(int cpl, void *dev_id) -{ - outb(0, 0xF0); - if (ignore_fpu_irq || !boot_cpu_data.hard_math) - return IRQ_NONE; - math_error(get_irq_regs(), 0, X86_TRAP_MF); - return IRQ_HANDLED; -} - -/* - * New motherboards sometimes make IRQ 13 be a PCI interrupt, - * so allow interrupt sharing. - */ -static struct irqaction fpu_irq = { - .handler = math_error_irq, - .name = "fpu", - .flags = IRQF_NO_THREAD, -}; -#endif - /* * IRQ2 is cascade interrupt to second interrupt controller */ @@ -242,13 +209,6 @@ void __init native_init_IRQ(void) setup_irq(2, &irq2); #ifdef CONFIG_X86_32 - /* - * External FPU? Set up irq13 if so, for - * original braindamaged IBM FERR coupling. - */ - if (boot_cpu_data.hard_math && !cpu_has_fpu) - setup_irq(FPU_IRQ, &fpu_irq); - irq_ctx_init(smp_processor_id()); #endif } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 4180a874c76..9c2bd8bd4b4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -42,6 +42,8 @@ #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> +#include <asm/kvm_guest.h> +#include <asm/context_tracking.h> static int kvmapf = 1; @@ -62,6 +64,15 @@ static int parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); +static int kvmclock_vsyscall = 1; +static int parse_no_kvmclock_vsyscall(char *arg) +{ + kvmclock_vsyscall = 0; + return 0; +} + +early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); + static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; @@ -110,11 +121,8 @@ void kvm_async_pf_task_wait(u32 token) struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; struct kvm_task_sleep_node n, *e; DEFINE_WAIT(wait); - int cpu, idle; - cpu = get_cpu(); - idle = idle_cpu(cpu); - put_cpu(); + rcu_irq_enter(); spin_lock(&b->lock); e = _find_apf_task(b, token); @@ -123,12 +131,14 @@ void kvm_async_pf_task_wait(u32 token) hlist_del(&e->link); kfree(e); spin_unlock(&b->lock); + + rcu_irq_exit(); return; } n.token = token; n.cpu = smp_processor_id(); - n.halted = idle || preempt_count() > 1; + n.halted = is_idle_task(current) || preempt_count() > 1; init_waitqueue_head(&n.wq); hlist_add_head(&n.link, &b->list); spin_unlock(&b->lock); @@ -147,13 +157,16 @@ void kvm_async_pf_task_wait(u32 token) /* * We cannot reschedule. So halt. */ + rcu_irq_exit(); native_safe_halt(); + rcu_irq_enter(); local_irq_disable(); } } if (!n.halted) finish_wait(&n.wq, &wait); + rcu_irq_exit(); return; } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); @@ -247,10 +260,10 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ - rcu_irq_enter(); + exception_enter(regs); exit_idle(); kvm_async_pf_task_wait((u32)read_cr2()); - rcu_irq_exit(); + exception_exit(regs); break; case KVM_PV_REASON_PAGE_READY: rcu_irq_enter(); @@ -471,6 +484,9 @@ void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); + if (kvmclock_vsyscall) + kvm_setup_vsyscall_timeinfo(); + #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; register_cpu_notifier(&kvm_cpu_notifier); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f1b42b3a186..220a360010f 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -23,6 +23,7 @@ #include <asm/apic.h> #include <linux/percpu.h> #include <linux/hardirq.h> +#include <linux/memblock.h> #include <asm/x86_init.h> #include <asm/reboot.h> @@ -39,7 +40,7 @@ static int parse_no_kvmclock(char *arg) early_param("no-kvmclock", parse_no_kvmclock); /* The hypervisor will put information about time periodically here */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); +static struct pvclock_vsyscall_time_info *hv_clock; static struct pvclock_wall_clock wall_clock; /* @@ -52,15 +53,20 @@ static unsigned long kvm_get_wallclock(void) struct pvclock_vcpu_time_info *vcpu_time; struct timespec ts; int low, high; + int cpu; low = (int)__pa_symbol(&wall_clock); high = ((u64)__pa_symbol(&wall_clock) >> 32); native_write_msr(msr_kvm_wall_clock, low, high); - vcpu_time = &get_cpu_var(hv_clock); + preempt_disable(); + cpu = smp_processor_id(); + + vcpu_time = &hv_clock[cpu].pvti; pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); - put_cpu_var(hv_clock); + + preempt_enable(); return ts.tv_sec; } @@ -74,9 +80,11 @@ static cycle_t kvm_clock_read(void) { struct pvclock_vcpu_time_info *src; cycle_t ret; + int cpu; preempt_disable_notrace(); - src = &__get_cpu_var(hv_clock); + cpu = smp_processor_id(); + src = &hv_clock[cpu].pvti; ret = pvclock_clocksource_read(src); preempt_enable_notrace(); return ret; @@ -99,8 +107,15 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs) static unsigned long kvm_get_tsc_khz(void) { struct pvclock_vcpu_time_info *src; - src = &per_cpu(hv_clock, 0); - return pvclock_tsc_khz(src); + int cpu; + unsigned long tsc_khz; + + preempt_disable(); + cpu = smp_processor_id(); + src = &hv_clock[cpu].pvti; + tsc_khz = pvclock_tsc_khz(src); + preempt_enable(); + return tsc_khz; } static void kvm_get_preset_lpj(void) @@ -119,10 +134,14 @@ bool kvm_check_and_clear_guest_paused(void) { bool ret = false; struct pvclock_vcpu_time_info *src; + int cpu = smp_processor_id(); - src = &__get_cpu_var(hv_clock); + if (!hv_clock) + return ret; + + src = &hv_clock[cpu].pvti; if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { - __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); + src->flags &= ~PVCLOCK_GUEST_STOPPED; ret = true; } @@ -141,9 +160,10 @@ int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); int low, high, ret; + struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; - low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; - high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); + low = (int)__pa(src) | 1; + high = ((u64)__pa(src) >> 32); ret = native_write_msr_safe(msr_kvm_system_time, low, high); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); @@ -197,6 +217,8 @@ static void kvm_shutdown(void) void __init kvmclock_init(void) { + unsigned long mem; + if (!kvm_para_available()) return; @@ -209,8 +231,18 @@ void __init kvmclock_init(void) printk(KERN_INFO "kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); - if (kvm_register_clock("boot clock")) + mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS, + PAGE_SIZE); + if (!mem) + return; + hv_clock = __va(mem); + + if (kvm_register_clock("boot clock")) { + hv_clock = NULL; + memblock_free(mem, + sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); return; + } pv_time_ops.sched_clock = kvm_clock_read; x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; @@ -233,3 +265,37 @@ void __init kvmclock_init(void) if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); } + +int __init kvm_setup_vsyscall_timeinfo(void) +{ +#ifdef CONFIG_X86_64 + int cpu; + int ret; + u8 flags; + struct pvclock_vcpu_time_info *vcpu_time; + unsigned int size; + + size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS; + + preempt_disable(); + cpu = smp_processor_id(); + + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) { + preempt_enable(); + return 1; + } + + if ((ret = pvclock_init_vsyscall(hv_clock, size))) { + preempt_enable(); + return ret; + } + + preempt_enable(); + + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; +#endif + return 0; +} diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index a7c5661f849..4929502c137 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -174,6 +174,9 @@ static int msr_open(struct inode *inode, struct file *file) unsigned int cpu; struct cpuinfo_x86 *c; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + cpu = iminor(file->f_path.dentry->d_inode); if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; /* No such CPU */ diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index de2b7ad7027..872079a67e4 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -56,7 +56,7 @@ struct device x86_dma_fallback_dev = { EXPORT_SYMBOL(x86_dma_fallback_dev); /* Number of entries preallocated for DMA-API debugging */ -#define PREALLOC_DMA_DEBUG_ENTRIES 32768 +#define PREALLOC_DMA_DEBUG_ENTRIES 65536 int dma_set_mask(struct device *dev, u64 mask) { @@ -265,7 +265,7 @@ rootfs_initcall(pci_iommu_init); #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ -static __devinit void via_no_dac(struct pci_dev *dev) +static void via_no_dac(struct pci_dev *dev) { if (forbid_dac == 0) { dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 42eb3300dfc..85c39590c1a 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -17,23 +17,13 @@ #include <linux/kernel.h> #include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/sched.h> +#include <linux/gfp.h> +#include <linux/bootmem.h> +#include <asm/fixmap.h> #include <asm/pvclock.h> -/* - * These are perodically updated - * xen: magic shared_info page - * kvm: gpa registered via msr - * and then copied here. - */ -struct pvclock_shadow_time { - u64 tsc_timestamp; /* TSC at last update of time vals. */ - u64 system_timestamp; /* Time, in nanosecs, since boot. */ - u32 tsc_to_nsec_mul; - int tsc_shift; - u32 version; - u8 flags; -}; - static u8 valid_flags __read_mostly = 0; void pvclock_set_flags(u8 flags) @@ -41,34 +31,6 @@ void pvclock_set_flags(u8 flags) valid_flags = flags; } -static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) -{ - u64 delta = native_read_tsc() - shadow->tsc_timestamp; - return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul, - shadow->tsc_shift); -} - -/* - * Reads a consistent set of time-base values from hypervisor, - * into a shadow data area. - */ -static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, - struct pvclock_vcpu_time_info *src) -{ - do { - dst->version = src->version; - rmb(); /* fetch version before data */ - dst->tsc_timestamp = src->tsc_timestamp; - dst->system_timestamp = src->system_time; - dst->tsc_to_nsec_mul = src->tsc_to_system_mul; - dst->tsc_shift = src->tsc_shift; - dst->flags = src->flags; - rmb(); /* test version after fetching data */ - } while ((src->version & 1) || (dst->version != src->version)); - - return dst->version; -} - unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) { u64 pv_tsc_khz = 1000000ULL << 32; @@ -88,23 +50,32 @@ void pvclock_resume(void) atomic64_set(&last_value, 0); } +u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) +{ + unsigned version; + cycle_t ret; + u8 flags; + + do { + version = __pvclock_read_cycles(src, &ret, &flags); + } while ((src->version & 1) || version != src->version); + + return flags & valid_flags; +} + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { - struct pvclock_shadow_time shadow; unsigned version; - cycle_t ret, offset; + cycle_t ret; u64 last; + u8 flags; do { - version = pvclock_get_time_values(&shadow, src); - barrier(); - offset = pvclock_get_nsec_offset(&shadow); - ret = shadow.system_timestamp + offset; - barrier(); - } while (version != src->version); + version = __pvclock_read_cycles(src, &ret, &flags); + } while ((src->version & 1) || version != src->version); if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && - (shadow.flags & PVCLOCK_TSC_STABLE_BIT)) + (flags & PVCLOCK_TSC_STABLE_BIT)) return ret; /* @@ -156,3 +127,71 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } + +static struct pvclock_vsyscall_time_info *pvclock_vdso_info; + +static struct pvclock_vsyscall_time_info * +pvclock_get_vsyscall_user_time_info(int cpu) +{ + if (!pvclock_vdso_info) { + BUG(); + return NULL; + } + + return &pvclock_vdso_info[cpu]; +} + +struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) +{ + return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; +} + +#ifdef CONFIG_X86_64 +static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, + void *v) +{ + struct task_migration_notifier *mn = v; + struct pvclock_vsyscall_time_info *pvti; + + pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); + + /* this is NULL when pvclock vsyscall is not initialized */ + if (unlikely(pvti == NULL)) + return NOTIFY_DONE; + + pvti->migrate_count++; + + return NOTIFY_DONE; +} + +static struct notifier_block pvclock_migrate = { + .notifier_call = pvclock_task_migrate, +}; + +/* + * Initialize the generic pvclock vsyscall state. This will allocate + * a/some page(s) for the per-vcpu pvclock information, set up a + * fixmap mapping for the page(s) + */ + +int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, + int size) +{ + int idx; + + WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); + + pvclock_vdso_info = i; + + for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { + __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, + __pa_symbol(i) + (idx*PAGE_SIZE), + PAGE_KERNEL_VVAR); + } + + + register_task_migration_notifier(&pvclock_migrate); + + return 0; +} +#endif diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 1b27de56356..26ee48a33dc 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -8,7 +8,7 @@ #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) -static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) +static void quirk_intel_irqbalance(struct pci_dev *dev) { u8 config; u16 word; @@ -512,7 +512,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, #if defined(CONFIG_PCI) && defined(CONFIG_NUMA) /* Set correct numa_node information for AMD NB functions */ -static void __devinit quirk_amd_nb_node(struct pci_dev *dev) +static void quirk_amd_nb_node(struct pci_dev *dev) { struct pci_dev *nb_ht; unsigned int devfn; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 4e8ba39eaf0..76fa1e9a2b3 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -584,7 +584,7 @@ static void native_machine_emergency_restart(void) break; case BOOT_EFI: - if (efi_enabled) + if (efi_enabled(EFI_RUNTIME_SERVICES)) efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ca45696f30f..8b24289cc10 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -143,11 +143,7 @@ int default_check_phys_apicid_present(int phys_apicid) } #endif -#ifndef CONFIG_DEBUG_BOOT_PARAMS -struct boot_params __initdata boot_params; -#else struct boot_params boot_params; -#endif /* * Machine setup.. @@ -614,6 +610,83 @@ static __init void reserve_ibft_region(void) static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; +static bool __init snb_gfx_workaround_needed(void) +{ +#ifdef CONFIG_PCI + int i; + u16 vendor, devid; + static const __initconst u16 snb_ids[] = { + 0x0102, + 0x0112, + 0x0122, + 0x0106, + 0x0116, + 0x0126, + 0x010a, + }; + + /* Assume no if something weird is going on with PCI */ + if (!early_pci_allowed()) + return false; + + vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID); + if (vendor != 0x8086) + return false; + + devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID); + for (i = 0; i < ARRAY_SIZE(snb_ids); i++) + if (devid == snb_ids[i]) + return true; +#endif + + return false; +} + +/* + * Sandy Bridge graphics has trouble with certain ranges, exclude + * them from allocation. + */ +static void __init trim_snb_memory(void) +{ + static const __initconst unsigned long bad_pages[] = { + 0x20050000, + 0x20110000, + 0x20130000, + 0x20138000, + 0x40004000, + }; + int i; + + if (!snb_gfx_workaround_needed()) + return; + + printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n"); + + /* + * Reserve all memory below the 1 MB mark that has not + * already been reserved. + */ + memblock_reserve(0, 1<<20); + + for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { + if (memblock_reserve(bad_pages[i], PAGE_SIZE)) + printk(KERN_WARNING "failed to reserve 0x%08lx\n", + bad_pages[i]); + } +} + +/* + * Here we put platform-specific memory range workarounds, i.e. + * memory known to be corrupt or otherwise in need to be reserved on + * specific platforms. + * + * If this gets used more widely it could use a real dispatch mechanism. + */ +static void __init trim_platform_memory_ranges(void) +{ + trim_snb_memory(); +} + static void __init trim_bios_range(void) { /* @@ -634,6 +707,7 @@ static void __init trim_bios_range(void) * take them out. */ e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } @@ -733,15 +807,15 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, "EL32", 4)) { - efi_enabled = 1; - efi_64bit = false; + set_bit(EFI_BOOT, &x86_efi_facility); } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, "EL64", 4)) { - efi_enabled = 1; - efi_64bit = true; + set_bit(EFI_BOOT, &x86_efi_facility); + set_bit(EFI_64BIT, &x86_efi_facility); } - if (efi_enabled && efi_memblock_x86_reserve_range()) - efi_enabled = 0; + + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); #endif x86_init.oem.arch_setup(); @@ -814,7 +888,7 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); - if (efi_enabled) + if (efi_enabled(EFI_BOOT)) efi_init(); dmi_scan_machine(); @@ -897,7 +971,7 @@ void __init setup_arch(char **cmdline_p) * The EFI specification says that boot service code won't be called * after ExitBootServices(). This is, in fact, a lie. */ - if (efi_enabled) + if (efi_enabled(EFI_MEMMAP)) efi_reserve_boot_services(); /* preallocate 4k for mptable mpc */ @@ -912,6 +986,8 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); + trim_platform_memory_ranges(); + init_gbpages(); /* max_pfn_mapped is updated here */ @@ -956,6 +1032,10 @@ void __init setup_arch(char **cmdline_p) reserve_initrd(); +#if defined(CONFIG_ACPI) && defined(CONFIG_BLK_DEV_INITRD) + acpi_initrd_override((void *)initrd_start, initrd_end - initrd_start); +#endif + reserve_crashkernel(); vsmp_init(); @@ -1034,7 +1114,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) - if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) + if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) conswitchp = &vga_con; #elif defined(CONFIG_DUMMY_CONSOLE) conswitchp = &dummy_con; @@ -1051,14 +1131,14 @@ void __init setup_arch(char **cmdline_p) register_refined_jiffies(CLOCK_TICK_RATE); #ifdef CONFIG_EFI - /* Once setup is done above, disable efi_enabled on mismatched - * firmware/kernel archtectures since there is no support for - * runtime services. + /* Once setup is done above, unmap the EFI memory map on + * mismatched firmware/kernel archtectures since there is no + * support for runtime services. */ - if (efi_enabled && IS_ENABLED(CONFIG_X86_64) != efi_64bit) { + if (efi_enabled(EFI_BOOT) && + IS_ENABLED(CONFIG_X86_64) != efi_enabled(EFI_64BIT)) { pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); efi_unmap_memmap(); - efi_enabled = 0; } #endif } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index fbbb604313a..d6bf1f34a6e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -364,10 +364,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - put_user_ex(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= __save_altstack(&frame->uc.uc_stack, regs->sp); /* Set up to return from userspace. */ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); @@ -414,7 +411,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, struct rt_sigframe __user *frame; void __user *fp = NULL; int err = 0; - struct task_struct *me = current; frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp); @@ -433,10 +429,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - put_user_ex(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= __save_altstack(&frame->uc.uc_stack, regs->sp); /* Set up to return from userspace. If provided, use a stub already in userspace. */ @@ -503,10 +496,7 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - put_user_ex(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); if (ka->sa.sa_flags & SA_RESTORER) { @@ -603,13 +593,6 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, } #endif /* CONFIG_X86_32 */ -long -sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, - struct pt_regs *regs) -{ - return do_sigaltstack(uss, uoss, regs->sp); -} - /* * Do a signal return; undo the signal stack. */ @@ -659,7 +642,7 @@ long sys_rt_sigreturn(struct pt_regs *regs) if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) + if (restore_altstack(&frame->uc.uc_stack)) goto badframe; return ax; @@ -865,7 +848,6 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) struct rt_sigframe_x32 __user *frame; sigset_t set; unsigned long ax; - struct pt_regs tregs; frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); @@ -879,8 +861,7 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; - tregs = *regs; - if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) + if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; return ax; diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index cd3b2438a98..9b4d51d0c0d 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -165,10 +165,11 @@ void set_task_blockstep(struct task_struct *task, bool on) * Ensure irq/preemption can't change debugctl in between. * Note also that both TIF_BLOCKSTEP and debugctl should * be changed atomically wrt preemption. - * FIXME: this means that set/clear TIF_BLOCKSTEP is simply - * wrong if task != current, SIGKILL can wakeup the stopped - * tracee and set/clear can play with the running task, this - * can confuse the next __switch_to_xtra(). + * + * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if + * task is current or it can't be running, otherwise we can race + * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but + * PTRACE_KILL is not safe. */ local_irq_disable(); debugctl = get_debugctlmsr(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index eb8586693e0..ecffca11f4e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -69,9 +69,6 @@ asmlinkage int system_call(void); -/* Do we ignore FPU interrupts ? */ -char ignore_fpu_irq; - /* * The IDT has to be page-aligned to simplify the Pentium * F0 0F bug workaround. @@ -564,9 +561,6 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { -#ifdef CONFIG_X86_32 - ignore_fpu_irq = 1; -#endif exception_enter(regs); math_error(regs, error_code, X86_TRAP_MF); exception_exit(regs); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5c9687b1bde..1dfe69cc78a 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -182,7 +182,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); - split_huge_page_pmd(mm, pmd); + split_huge_page_pmd_mm(mm, 0xA0000, pmd); if (pmd_none_or_clear_bad(pmd)) goto out; pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 3a3e8c9e280..9a907a67be8 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -145,19 +145,6 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } -#ifdef CONFIG_SECCOMP -static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr) -{ - if (!seccomp_mode(&tsk->seccomp)) - return 0; - task_pt_regs(tsk)->orig_ax = syscall_nr; - task_pt_regs(tsk)->ax = syscall_nr; - return __secure_computing(syscall_nr); -} -#else -#define vsyscall_seccomp(_tsk, _nr) 0 -#endif - static bool write_ok_or_segv(unsigned long ptr, size_t size) { /* @@ -190,10 +177,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; - int vsyscall_nr; + int vsyscall_nr, syscall_nr, tmp; int prev_sig_on_uaccess_error; long ret; - int skip; /* * No point in checking CS -- the only way to get here is a user mode @@ -225,56 +211,84 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) } tsk = current; - /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. - */ - prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; - current_thread_info()->sig_on_uaccess_error = 1; /* + * Check for access_ok violations and find the syscall nr. + * * NULL is a valid user pointer (in the access_ok sense) on 32-bit and * 64-bit, so we don't need to special-case it here. For all the * vsyscalls, NULL means "don't write anything" not "write it at * address 0". */ - ret = -EFAULT; - skip = 0; switch (vsyscall_nr) { case 0: - skip = vsyscall_seccomp(tsk, __NR_gettimeofday); - if (skip) - break; - if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || - !write_ok_or_segv(regs->si, sizeof(struct timezone))) - break; + !write_ok_or_segv(regs->si, sizeof(struct timezone))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_gettimeofday; + break; + + case 1: + if (!write_ok_or_segv(regs->di, sizeof(time_t))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_time; + break; + + case 2: + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_getcpu; + break; + } + + /* + * Handle seccomp. regs->ip must be the original value. + * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. + * + * We could optimize the seccomp disabled case, but performance + * here doesn't matter. + */ + regs->orig_ax = syscall_nr; + regs->ax = -ENOSYS; + tmp = secure_computing(syscall_nr); + if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { + warn_bad_vsyscall(KERN_DEBUG, regs, + "seccomp tried to change syscall nr or ip"); + do_exit(SIGSYS); + } + if (tmp) + goto do_ret; /* skip requested */ + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; + + ret = -EFAULT; + switch (vsyscall_nr) { + case 0: ret = sys_gettimeofday( (struct timeval __user *)regs->di, (struct timezone __user *)regs->si); break; case 1: - skip = vsyscall_seccomp(tsk, __NR_time); - if (skip) - break; - - if (!write_ok_or_segv(regs->di, sizeof(time_t))) - break; - ret = sys_time((time_t __user *)regs->di); break; case 2: - skip = vsyscall_seccomp(tsk, __NR_getcpu); - if (skip) - break; - - if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || - !write_ok_or_segv(regs->si, sizeof(unsigned))) - break; - ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, NULL); @@ -283,12 +297,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; - if (skip) { - if ((long)regs->ax <= 0L) /* seccomp errno emulation */ - goto do_ret; - goto done; /* seccomp trace/trap */ - } - +check_fault: if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, @@ -311,7 +320,6 @@ do_ret: /* Emulate a ret instruction. */ regs->ip = caller; regs->sp += 8; -done: return true; sigsegv: |