diff options
Diffstat (limited to 'arch/x86/kernel')
57 files changed, 1254 insertions, 416 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8f1e77440b2..5d4502c8b98 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -28,8 +28,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += mcount_64.o obj-y += syscall_$(BITS).o vsyscall_gtod.o -obj-$(CONFIG_X86_64) += vsyscall_64.o -obj-$(CONFIG_X86_64) += vsyscall_emu_64.o +obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index f04dbb3069b..5caed1dd7cc 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -21,6 +21,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, {} @@ -30,6 +31,7 @@ EXPORT_SYMBOL(amd_nb_misc_ids); static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, {} diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 4128b5fcb55..c2fd21fed00 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -40,7 +40,7 @@ static unsigned int get_apic_id(unsigned long x) unsigned int id; rdmsrl(MSR_FAM10H_NODE_ID, value); - id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U); + id = ((x >> 24) & 0xffU) | ((value << 2) & 0xff00U); return id; } @@ -145,7 +145,7 @@ static void numachip_send_IPI_all(int vector) static void numachip_send_IPI_self(int vector) { - __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); + apic_write(APIC_SELF_IPI, vector); } static int __init numachip_probe(void) @@ -153,20 +153,8 @@ static int __init numachip_probe(void) return apic == &apic_numachip; } -static void __init map_csrs(void) -{ - printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n", - NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1); - init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); - - printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n", - NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1); - init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); -} - static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) { - if (c->phys_proc_id != node) { c->phys_proc_id = node; per_cpu(cpu_llc_id, smp_processor_id()) = node; @@ -175,19 +163,15 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) static int __init numachip_system_init(void) { - unsigned int val; - if (!numachip_system) return 0; + init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); + init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); + x86_cpuinit.fixup_cpu_id = fixup_cpu_id; x86_init.pci.arch_init = pci_numachip_init; - map_csrs(); - - val = read_lcsr(CSR_G0_NODE_IDS); - printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val); - return 0; } early_initcall(numachip_system_init); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 6a1e71bde32..6873ab925d0 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,6 +18,7 @@ #include <linux/nmi.h> #include <linux/module.h> #include <linux/delay.h> +#include <linux/seq_buf.h> #ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh) @@ -29,14 +30,35 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) #ifdef arch_trigger_all_cpu_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; +static cpumask_t printtrace_mask; + +#define NMI_BUF_SIZE 4096 + +struct nmi_seq_buf { + unsigned char buffer[NMI_BUF_SIZE]; + struct seq_buf seq; +}; + +/* Safe printing in NMI context */ +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; +static void print_seq_line(struct nmi_seq_buf *s, int start, int end) +{ + const char *buf = s->buffer + start; + + printk("%.*s", (end - start) + 1, buf); +} + void arch_trigger_all_cpu_backtrace(bool include_self) { + struct nmi_seq_buf *s; + int len; + int cpu; int i; - int cpu = get_cpu(); + int this_cpu = get_cpu(); if (test_and_set_bit(0, &backtrace_flag)) { /* @@ -49,7 +71,17 @@ void arch_trigger_all_cpu_backtrace(bool include_self) cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); if (!include_self) - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); + + cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); + /* + * Set up per_cpu seq_buf buffers that the NMIs running on the other + * CPUs will write to. + */ + for_each_cpu(cpu, to_cpumask(backtrace_mask)) { + s = &per_cpu(nmi_print_seq, cpu); + seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); + } if (!cpumask_empty(to_cpumask(backtrace_mask))) { pr_info("sending NMI to %s CPUs:\n", @@ -65,11 +97,58 @@ void arch_trigger_all_cpu_backtrace(bool include_self) touch_softlockup_watchdog(); } + /* + * Now that all the NMIs have triggered, we can dump out their + * back traces safely to the console. + */ + for_each_cpu(cpu, &printtrace_mask) { + int last_i = 0; + + s = &per_cpu(nmi_print_seq, cpu); + len = seq_buf_used(&s->seq); + if (!len) + continue; + + /* Print line by line. */ + for (i = 0; i < len; i++) { + if (s->buffer[i] == '\n') { + print_seq_line(s, last_i, i); + last_i = i + 1; + } + } + /* Check if there was a partial line. */ + if (last_i < len) { + print_seq_line(s, last_i, len - 1); + pr_cont("\n"); + } + } + clear_bit(0, &backtrace_flag); smp_mb__after_atomic(); put_cpu(); } +/* + * It is not safe to call printk() directly from NMI handlers. + * It may be fine if the NMI detected a lock up and we have no choice + * but to do so, but doing a NMI on all other CPUs to get a back trace + * can be done with a sysrq-l. We don't want that to lock up, which + * can happen if the NMI interrupts a printk in progress. + * + * Instead, we redirect the vprintk() to this nmi_vprintk() that writes + * the content into a per cpu seq_buf buffer. Then when the NMIs are + * all done, we can safely dump the contents of the seq_buf to a printk() + * from a non NMI context. + */ +static int nmi_vprintk(const char *fmt, va_list args) +{ + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + unsigned int len = seq_buf_used(&s->seq); + + seq_buf_vprintf(&s->seq, fmt, args); + return seq_buf_used(&s->seq) - len; +} + static int arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { @@ -78,12 +157,14 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; + printk_func_t printk_func_save = this_cpu_read(printk_func); - arch_spin_lock(&lock); + /* Replace printk to write into the NMI seq */ + this_cpu_write(printk_func, nmi_vprintk); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); show_regs(regs); - arch_spin_unlock(&lock); + this_cpu_write(printk_func, printk_func_save); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return NMI_HANDLED; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1183d545da1..7ffe0a2b870 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3158,7 +3158,7 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - __write_msi_msg(data->msi_desc, &msg); + __pci_write_msi_msg(data->msi_desc, &msg); return IRQ_SET_MASK_OK_NOCOPY; } @@ -3169,8 +3169,8 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) */ static struct irq_chip msi_chip = { .name = "PCI-MSI", - .irq_unmask = unmask_msi_irq, - .irq_mask = mask_msi_irq, + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, .irq_ack = ack_apic_edge, .irq_set_affinity = msi_set_affinity, .irq_retrigger = ioapic_retrigger_irq, @@ -3196,7 +3196,7 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. */ if (!irq_offset) - write_msi_msg(irq, &msg); + pci_write_msi_msg(irq, &msg); setup_remapped_irq(irq, irq_cfg(irq), chip); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 58487445141..927ec923594 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -378,7 +378,6 @@ static struct cpuidle_driver apm_idle_driver = { { /* entry 1 is for APM idle */ .name = "APM", .desc = "APM idle", - .flags = CPUIDLE_FLAG_TIME_VALID, .exit_latency = 250, /* WAG */ .target_residency = 500, /* WAG */ .enter = &apm_cpu_idle diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index d67c4be3e8b..3b3b9d33ac1 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -1,3 +1,7 @@ +#ifndef __LINUX_KBUILD_H +# error "Please do not build this file directly, build asm-offsets.c instead" +#endif + #include <asm/ucontext.h> #include <linux/lguest.h> diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e7c798b354f..fdcbb4d27c9 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,3 +1,7 @@ +#ifndef __LINUX_KBUILD_H +# error "Please do not build this file directly, build asm-offsets.c instead" +#endif + #include <asm/ia32.h> #define __SYSCALL_64(nr, sym, compat) [nr] = 1, @@ -48,7 +52,6 @@ int main(void) #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); - ENTRY(bx); ENTRY(cx); ENTRY(dx); ENTRY(sp); diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index 06d3e5a14d9..f3672508b24 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall) case __NR_openat: return 3; case __NR_execve: + case __NR_execveat: return 5; default: return 0; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 813d29d00a1..15c5df92f74 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -566,6 +566,17 @@ static void init_amd_k8(struct cpuinfo_x86 *c) if (!c->x86_model_id[0]) strcpy(c->x86_model_id, "Hammer"); + +#ifdef CONFIG_SMP + /* + * Disable TLB flush filter by setting HWCR.FFDIS on K8 + * bit 6 of msr C001_0015 + * + * Errata 63 for SH-B3 steppings + * Errata 122 for all steppings (F+ have it disabled by default) + */ + msr_set_bit(MSR_K7_HWCR, 6); +#endif } static void init_amd_gh(struct cpuinfo_x86 *c) @@ -636,18 +647,6 @@ static void init_amd(struct cpuinfo_x86 *c) { u32 dummy; -#ifdef CONFIG_SMP - /* - * Disable TLB flush filter by setting HWCR.FFDIS on K8 - * bit 6 of msr C001_0015 - * - * Errata 63 for SH-B3 steppings - * Errata 122 for all steppings (F+ have it disabled by default) - */ - if (c->x86 == 0xf) - msr_set_bit(MSR_K7_HWCR, 6); -#endif - early_init_amd(c); /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cfa9b5b2c27..c6049650c09 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -958,14 +958,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_64 -static void vgetcpu_set_mode(void) -{ - if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) - vgetcpu_mode = VGETCPU_RDTSCP; - else - vgetcpu_mode = VGETCPU_LSL; -} - #ifdef CONFIG_IA32_EMULATION /* May not be __init: called during resume */ static void syscall32_cpu_init(void) @@ -1008,8 +1000,6 @@ void __init identify_boot_cpu(void) #ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); -#else - vgetcpu_set_mode(); #endif cpu_detect_tlb(&boot_cpu_data); } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 09edd0b65fe..10b46906767 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -3,6 +3,8 @@ enum severity_level { MCE_NO_SEVERITY, + MCE_DEFERRED_SEVERITY, + MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY, MCE_KEEP_SEVERITY, MCE_SOME_SEVERITY, MCE_AO_SEVERITY, @@ -21,7 +23,7 @@ struct mce_bank { char attrname[ATTR_LEN]; /* attribute name */ }; -int mce_severity(struct mce *a, int tolerant, char **msg); +int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern struct mce_bank *mce_banks; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index c370e1c4468..8bb433043a7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -31,6 +31,7 @@ enum context { IN_KERNEL = 1, IN_USER = 2 }; enum ser { SER_REQUIRED = 1, NO_SER = 2 }; +enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; static struct severity { u64 mask; @@ -40,6 +41,7 @@ static struct severity { unsigned char mcgres; unsigned char ser; unsigned char context; + unsigned char excp; unsigned char covered; char *msg; } severities[] = { @@ -48,6 +50,8 @@ static struct severity { #define USER .context = IN_USER #define SER .ser = SER_REQUIRED #define NOSER .ser = NO_SER +#define EXCP .excp = EXCP_CONTEXT +#define NOEXCP .excp = NO_EXCP #define BITCLR(x) .mask = x, .result = 0 #define BITSET(x) .mask = x, .result = x #define MCGMASK(x, y) .mcgmask = x, .mcgres = y @@ -62,7 +66,7 @@ static struct severity { ), MCESEV( NO, "Not enabled", - BITCLR(MCI_STATUS_EN) + EXCP, BITCLR(MCI_STATUS_EN) ), MCESEV( PANIC, "Processor context corrupt", @@ -71,16 +75,20 @@ static struct severity { /* When MCIP is not set something is very confused */ MCESEV( PANIC, "MCIP not set in MCA handler", - MCGMASK(MCG_STATUS_MCIP, 0) + EXCP, MCGMASK(MCG_STATUS_MCIP, 0) ), /* Neither return not error IP -- no chance to recover -> PANIC */ MCESEV( PANIC, "Neither restart nor error IP", - MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) + EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) ), MCESEV( PANIC, "In kernel and no restart IP", - KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( + DEFERRED, "Deferred error", + NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) ), MCESEV( KEEP, "Corrected error", @@ -89,7 +97,7 @@ static struct severity { /* ignore OVER for UCNA */ MCESEV( - KEEP, "Uncorrected no action required", + UCNA, "Uncorrected no action required", SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) ), MCESEV( @@ -178,8 +186,9 @@ static int error_context(struct mce *m) return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; } -int mce_severity(struct mce *m, int tolerant, char **msg) +int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) { + enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); enum context ctx = error_context(m); struct severity *s; @@ -194,6 +203,8 @@ int mce_severity(struct mce *m, int tolerant, char **msg) continue; if (s->context && ctx != s->context) continue; + if (s->excp && excp != s->excp) + continue; if (msg) *msg = s->msg; s->covered = 1; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 61a9668cebf..d2c611699cd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -292,10 +292,10 @@ static void print_mce(struct mce *m) #define PANIC_TIMEOUT 5 /* 5 seconds */ -static atomic_t mce_paniced; +static atomic_t mce_panicked; static int fake_panic; -static atomic_t mce_fake_paniced; +static atomic_t mce_fake_panicked; /* Panic in progress. Enable interrupts and wait for final IPI */ static void wait_for_panic(void) @@ -319,7 +319,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) /* * Make sure only one CPU runs in machine check panic */ - if (atomic_inc_return(&mce_paniced) > 1) + if (atomic_inc_return(&mce_panicked) > 1) wait_for_panic(); barrier(); @@ -327,7 +327,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) console_verbose(); } else { /* Don't log too much for fake panic */ - if (atomic_inc_return(&mce_fake_paniced) > 1) + if (atomic_inc_return(&mce_fake_panicked) > 1) return; } /* First print corrected ones that are still unlogged */ @@ -575,6 +575,37 @@ static void mce_read_aux(struct mce *m, int i) } } +static bool memory_error(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor == X86_VENDOR_AMD) { + /* + * coming soon + */ + return false; + } else if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * Intel SDM Volume 3B - 15.9.2 Compound Error Codes + * + * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for + * indicating a memory error. Bit 8 is used for indicating a + * cache hierarchy error. The combination of bit 2 and bit 3 + * is used for indicating a `generic' cache hierarchy error + * But we can't just blindly check the above bits, because if + * bit 11 is set, then it is a bus/interconnect error - and + * either way the above bits just gives more detail on what + * bus/interconnect error happened. Note that bit 12 can be + * ignored, as it's the "filter" bit. + */ + return (m->status & 0xef80) == BIT(7) || + (m->status & 0xef00) == BIT(8) || + (m->status & 0xeffc) == 0xc; + } + + return false; +} + DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -595,6 +626,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce m; + int severity; int i; this_cpu_inc(mce_poll_count); @@ -630,6 +662,20 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; + + severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); + + /* + * In the cases where we don't have a valid address after all, + * do not add it into the ring buffer. + */ + if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) { + if (m.status & MCI_STATUS_ADDRV) { + mce_ring_add(m.addr >> PAGE_SHIFT); + mce_schedule_work(); + } + } + /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. @@ -668,7 +714,8 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, if (quirk_no_way_out) quirk_no_way_out(i, m, regs); } - if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) + if (mce_severity(m, mca_cfg.tolerant, msg, true) >= + MCE_PANIC_SEVERITY) ret = 1; } return ret; @@ -697,7 +744,7 @@ static int mce_timed_out(u64 *t) * might have been modified by someone else. */ rmb(); - if (atomic_read(&mce_paniced)) + if (atomic_read(&mce_panicked)) wait_for_panic(); if (!mca_cfg.monarch_timeout) goto out; @@ -754,7 +801,7 @@ static void mce_reign(void) for_each_possible_cpu(cpu) { int severity = mce_severity(&per_cpu(mces_seen, cpu), mca_cfg.tolerant, - &nmsg); + &nmsg, true); if (severity > global_worst) { msg = nmsg; global_worst = severity; @@ -1095,13 +1142,14 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - severity = mce_severity(&m, cfg->tolerant, NULL); + severity = mce_severity(&m, cfg->tolerant, NULL, true); /* - * When machine check was for corrected handler don't touch, - * unless we're panicing. + * When machine check was for corrected/deferred handler don't + * touch, unless we're panicing. */ - if (severity == MCE_KEEP_SEVERITY && !no_way_out) + if ((severity == MCE_KEEP_SEVERITY || + severity == MCE_UCNA_SEVERITY) && !no_way_out) continue; __set_bit(i, toclear); if (severity == MCE_NO_SEVERITY) { @@ -2520,7 +2568,7 @@ struct dentry *mce_get_debugfs_dir(void) static void mce_reset(void) { cpu_missing = 0; - atomic_set(&mce_fake_paniced, 0); + atomic_set(&mce_fake_panicked, 0); atomic_set(&mce_executing, 0); atomic_set(&mce_callin, 0); atomic_set(&global_nwo, 0); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 5d4999f95ae..f1c3769bbd6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -212,12 +212,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - int offset = -1; + int offset = -1, new; for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) - address = MSR_IA32_MC0_MISC + bank * 4; + address = MSR_IA32_MCx_MISC(bank); else if (block == 1) { address = (low & MASK_BLKPTR_LO) >> 21; if (!address) @@ -247,13 +247,18 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) b.address = address; b.interrupt_capable = lvt_interrupt_supported(bank, high); - if (b.interrupt_capable) { - int new = (high & MASK_LVTOFF_HI) >> 20; - offset = setup_APIC_mce(offset, new); - } + if (!b.interrupt_capable) + goto init; + + new = (high & MASK_LVTOFF_HI) >> 20; + offset = setup_APIC_mce(offset, new); + + if ((offset == new) && + (mce_threshold_vector != amd_threshold_interrupt)) + mce_threshold_vector = amd_threshold_interrupt; +init: mce_threshold_block_init(&b, offset); - mce_threshold_vector = amd_threshold_interrupt; } } } @@ -270,18 +275,17 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; + int cpu = smp_processor_id(); unsigned int bank, block; struct mce m; - mce_setup(&m); - /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { - if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) + if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) { - address = MSR_IA32_MC0_MISC + bank * 4; + address = MSR_IA32_MCx_MISC(bank); } else if (block == 1) { address = (low & MASK_BLKPTR_LO) >> 21; if (!address) @@ -309,21 +313,20 @@ static void amd_threshold_interrupt(void) * Log the machine check that caused the threshold * event. */ - machine_check_poll(MCP_TIMESTAMP, - this_cpu_ptr(&mce_poll_banks)); - - if (high & MASK_OVERFLOW_HI) { - rdmsrl(address, m.misc); - rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, - m.status); - m.bank = K8_MCE_THRESHOLD_BASE - + bank * NR_BLOCKS - + block; - mce_log(&m); - return; - } + if (high & MASK_OVERFLOW_HI) + goto log; } } + return; + +log: + mce_setup(&m); + rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); + m.misc = ((u64)high << 32) | low; + m.bank = bank; + mce_log(&m); + + wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); } /* @@ -617,8 +620,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) } } - err = allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); + err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MCx_MISC(bank)); if (!err) goto out; diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 8fffd845e22..bfbbe6195e2 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -376,7 +376,7 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) { enum ucode_state ret; @@ -390,8 +390,8 @@ enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) #if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) /* save BSP's matching patch for early load */ - if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { - struct ucode_patch *p = find_patch(smp_processor_id()); + if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) { + struct ucode_patch *p = find_patch(cpu); if (p) { memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), @@ -444,7 +444,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, goto fw_release; } - ret = load_microcode_amd(c->x86, fw->data, fw->size); + ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size); fw_release: release_firmware(fw); diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index 06674473b0e..737737edbd1 100644 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -389,7 +389,7 @@ int __init save_microcode_in_initrd_amd(void) eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); - ret = load_microcode_amd(eax, container, container_size); + ret = load_microcode_amd(smp_processor_id(), eax, container, container_size); if (ret != UCODE_OK) retval = -EINVAL; @@ -402,3 +402,21 @@ int __init save_microcode_in_initrd_amd(void) return retval; } + +void reload_ucode_amd(void) +{ + struct microcode_amd *mc; + u32 rev, eax; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + + mc = (struct microcode_amd *)amd_ucode_patch; + + if (mc && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + ucode_new_rev = mc->hdr.patch_id; + pr_info("microcode: reload patch_level=0x%08x\n", + ucode_new_rev); + } + } +} diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 2ce9051174e..15c29096136 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -466,13 +466,7 @@ static void mc_bp_resume(void) if (uci->valid && uci->mc) microcode_ops->apply_microcode(cpu); else if (!uci->mc) - /* - * We might resume and not have applied late microcode but still - * have a newer patch stashed from the early loader. We don't - * have it in uci->mc so we have to load it the same way we're - * applying patches early on the APs. - */ - load_ucode_ap(); + reload_early_microcode(); } static struct syscore_ops mc_syscore_ops = { @@ -557,7 +551,7 @@ static int __init microcode_init(void) struct cpuinfo_x86 *c = &cpu_data(0); int error; - if (dis_ucode_ldr) + if (paravirt_enabled() || dis_ucode_ldr) return 0; if (c->x86_vendor == X86_VENDOR_INTEL) diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index 2c017f242a7..d45df4bd16a 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -176,3 +176,24 @@ int __init save_microcode_in_initrd(void) return 0; } + +void reload_early_microcode(void) +{ + int vendor, x86; + + vendor = x86_vendor(); + x86 = x86_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (x86 >= 6) + reload_ucode_intel(); + break; + case X86_VENDOR_AMD: + if (x86 >= 0x10) + reload_ucode_amd(); + break; + default: + break; + } +} diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index b88343f7a3b..ec9df6f9cd4 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -650,8 +650,7 @@ static inline void print_ucode(struct ucode_cpu_info *uci) } #endif -static int apply_microcode_early(struct mc_saved_data *mc_saved_data, - struct ucode_cpu_info *uci) +static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) { struct microcode_intel *mc_intel; unsigned int val[2]; @@ -680,7 +679,10 @@ static int apply_microcode_early(struct mc_saved_data *mc_saved_data, #endif uci->cpu_sig.rev = val[1]; - print_ucode(uci); + if (early) + print_ucode(uci); + else + print_ucode_info(uci, mc_intel->hdr.date); return 0; } @@ -715,12 +717,17 @@ _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, unsigned long initrd_end_early, struct ucode_cpu_info *uci) { + enum ucode_state ret; + collect_cpu_info_early(uci); scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data, mc_saved_in_initrd, uci); - load_microcode(mc_saved_data, mc_saved_in_initrd, - initrd_start_early, uci); - apply_microcode_early(mc_saved_data, uci); + + ret = load_microcode(mc_saved_data, mc_saved_in_initrd, + initrd_start_early, uci); + + if (ret == UCODE_OK) + apply_microcode_early(uci, true); } void __init @@ -749,7 +756,8 @@ load_ucode_intel_bsp(void) initrd_end_early = initrd_start_early + ramdisk_size; _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, - initrd_start_early, initrd_end_early, &uci); + initrd_start_early, initrd_end_early, + &uci); #endif } @@ -783,5 +791,23 @@ void load_ucode_intel_ap(void) collect_cpu_info_early(&uci); load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, initrd_start_addr, &uci); - apply_microcode_early(mc_saved_data_p, &uci); + apply_microcode_early(&uci, true); +} + +void reload_ucode_intel(void) +{ + struct ucode_cpu_info uci; + enum ucode_state ret; + + if (!mc_saved_data.mc_saved_count) + return; + + collect_cpu_info_early(&uci); + + ret = generic_load_microcode_early(mc_saved_data.mc_saved, + mc_saved_data.mc_saved_count, &uci); + if (ret != UCODE_OK) + return; + + apply_microcode_early(&uci, false); } diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index fc5eb390b36..4e6cdb0ddc7 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -253,6 +253,10 @@ struct cpu_hw_events { #define INTEL_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +/* Like UEVENT_CONSTRAINT, but match flags too */ +#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) + #define INTEL_PLD_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index cbb1be3ed9e..a61f5c6911d 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -565,6 +565,21 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_ibs->offset_max, offset + 1); } while (offset < offset_max); + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + /* + * Read IbsBrTarget and IbsOpData4 separately + * depending on their availability. + * Can't add to offset_max as they are staggered + */ + if (ibs_caps & IBS_CAPS_BRNTRGT) { + rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); + size++; + } + if (ibs_caps & IBS_CAPS_OPDATA4) { + rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); + size++; + } + } ibs_data.size = sizeof(u64) * size; regs = *iregs; diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c index 639d1289b1b..97242a9242b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c +++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.c @@ -130,10 +130,7 @@ static ssize_t _iommu_cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { - int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &iommu_cpumask); - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask); } static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL); diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index 30790d798e6..cc6cedb8f25 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -219,7 +219,6 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { - int n; cpumask_t *active_mask; struct pmu *pmu = dev_get_drvdata(dev); @@ -230,10 +229,7 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, else return 0; - n = cpulist_scnprintf(buf, PAGE_SIZE - 2, active_mask); - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, active_mask); } static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 46211bcc813..3c895d480cd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -552,18 +552,18 @@ int intel_pmu_drain_bts_buffer(void) * PEBS */ struct event_constraint intel_core2_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ - INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ EVENT_CONSTRAINT_END }; struct event_constraint intel_atom_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ EVENT_CONSTRAINT_END }; @@ -577,36 +577,36 @@ struct event_constraint intel_slm_pebs_event_constraints[] = { struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ - INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ EVENT_CONSTRAINT_END }; struct event_constraint intel_westmere_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ EVENT_CONSTRAINT_END }; struct event_constraint intel_snb_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ @@ -617,7 +617,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { }; struct event_constraint intel_ivb_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ @@ -628,7 +628,7 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = { }; struct event_constraint intel_hsw_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), @@ -724,6 +724,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) unsigned long ip = regs->ip; int is_64bit = 0; void *kaddr; + int size; /* * We don't need to fixup if the PEBS assist is fault like @@ -758,11 +759,12 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 1; } + size = ip - to; if (!kernel_ip(ip)) { - int size, bytes; + int bytes; u8 *buf = this_cpu_read(insn_buffer); - size = ip - to; /* Must fit our buffer, see above */ + /* 'size' must fit our buffer, see above */ bytes = copy_from_user_nmi(buf, (void __user *)to, size); if (bytes != 0) return 0; @@ -780,11 +782,20 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) #ifdef CONFIG_X86_64 is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); #endif - insn_init(&insn, kaddr, is_64bit); + insn_init(&insn, kaddr, size, is_64bit); insn_get_length(&insn); + /* + * Make sure there was not a problem decoding the + * instruction and getting the length. This is + * doubly important because we have an infinite + * loop if insn.length=0. + */ + if (!insn.length) + break; to += insn.length; kaddr += insn.length; + size -= insn.length; } while (to < ip); if (to == ip) { @@ -886,6 +897,29 @@ static void __intel_pmu_pebs_event(struct perf_event *event, regs.bp = pebs->bp; regs.sp = pebs->sp; + if (sample_type & PERF_SAMPLE_REGS_INTR) { + regs.ax = pebs->ax; + regs.bx = pebs->bx; + regs.cx = pebs->cx; + regs.dx = pebs->dx; + regs.si = pebs->si; + regs.di = pebs->di; + regs.bp = pebs->bp; + regs.sp = pebs->sp; + + regs.flags = pebs->flags; +#ifndef CONFIG_X86_32 + regs.r8 = pebs->r8; + regs.r9 = pebs->r9; + regs.r10 = pebs->r10; + regs.r11 = pebs->r11; + regs.r12 = pebs->r12; + regs.r13 = pebs->r13; + regs.r14 = pebs->r14; + regs.r15 = pebs->r15; +#endif + } + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { regs.ip = pebs->real_ip; regs.flags |= PERF_EFLAGS_EXACT; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 45fa730a528..58f1a94beaf 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -465,7 +465,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) { struct insn insn; void *addr; - int bytes, size = MAX_INSN_SIZE; + int bytes_read, bytes_left; int ret = X86_BR_NONE; int ext, to_plm, from_plm; u8 buf[MAX_INSN_SIZE]; @@ -493,8 +493,10 @@ static int branch_type(unsigned long from, unsigned long to, int abort) return X86_BR_NONE; /* may fail if text not present */ - bytes = copy_from_user_nmi(buf, (void __user *)from, size); - if (bytes != 0) + bytes_left = copy_from_user_nmi(buf, (void __user *)from, + MAX_INSN_SIZE); + bytes_read = MAX_INSN_SIZE - bytes_left; + if (!bytes_read) return X86_BR_NONE; addr = buf; @@ -505,10 +507,19 @@ static int branch_type(unsigned long from, unsigned long to, int abort) * Ensure we don't blindy read any address by validating it is * a known text address. */ - if (kernel_text_address(from)) + if (kernel_text_address(from)) { addr = (void *)from; - else + /* + * Assume we can get the maximum possible size + * when grabbing kernel data. This is not + * _strictly_ true since we could possibly be + * executing up next to a memory hole, but + * it is very unlikely to be a problem. + */ + bytes_read = MAX_INSN_SIZE; + } else { return X86_BR_NONE; + } } /* @@ -518,8 +529,10 @@ static int branch_type(unsigned long from, unsigned long to, int abort) #ifdef CONFIG_X86_64 is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); #endif - insn_init(&insn, addr, is64); + insn_init(&insn, addr, bytes_read, is64); insn_get_opcode(&insn); + if (!insn.opcode.got) + return X86_BR_ABORT; switch (insn.opcode.bytes[0]) { case 0xf: diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index d64f275fe27..673f930c700 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -365,11 +365,7 @@ static void rapl_pmu_event_read(struct perf_event *event) static ssize_t rapl_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { - int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask); - - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); } static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 9762dbd9f3f..08f3fed2b0f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -647,11 +647,7 @@ static int uncore_pmu_event_init(struct perf_event *event) static ssize_t uncore_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { - int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask); - - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask); } static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index f9ed429d6e4..745b158e9a6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -449,7 +449,11 @@ static struct attribute *snbep_uncore_qpi_formats_attr[] = { static struct uncore_event_desc snbep_uncore_imc_events[] = { INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), { /* end: all zeroes */ }, }; @@ -2036,7 +2040,11 @@ static struct intel_uncore_type hswep_uncore_ha = { static struct uncore_event_desc hswep_uncore_imc_events[] = { INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"), INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), { /* end: all zeroes */ }, }; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 5433658e598..e7d8c760847 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -72,7 +72,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (c->x86_mask || c->cpuid_level >= 0) seq_printf(m, "stepping\t: %d\n", c->x86_mask); else - seq_printf(m, "stepping\t: unknown\n"); + seq_puts(m, "stepping\t: unknown\n"); if (c->microcode) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); @@ -92,12 +92,12 @@ static int show_cpuinfo(struct seq_file *m, void *v) show_cpuinfo_core(m, c, cpu); show_cpuinfo_misc(m, c); - seq_printf(m, "flags\t\t:"); + seq_puts(m, "flags\t\t:"); for (i = 0; i < 32*NCAPINTS; i++) if (cpu_has(c, i) && x86_cap_flags[i] != NULL) seq_printf(m, " %s", x86_cap_flags[i]); - seq_printf(m, "\nbugs\t\t:"); + seq_puts(m, "\nbugs\t\t:"); for (i = 0; i < 32*NBUGINTS; i++) { unsigned int bug_bit = 32*NCAPINTS + i; @@ -118,7 +118,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); - seq_printf(m, "power management:"); + seq_puts(m, "power management:"); for (i = 0; i < 32; i++) { if (c->x86_power & (1 << i)) { if (i < ARRAY_SIZE(x86_power_flags) && @@ -131,7 +131,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) } } - seq_printf(m, "\n\n"); + seq_puts(m, "\n\n"); return 0; } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 4a8013d5594..60639093d53 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -36,6 +36,11 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, + { X86_FEATURE_HWP, CR_EAX, 7, 0x00000006, 0 }, + { X86_FEATURE_HWP_NOITFY, CR_EAX, 8, 0x00000006, 0 }, + { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 }, + { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 }, + { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 3225ae6c518..83741a71558 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -143,7 +143,7 @@ static int cpuid_device_create(int cpu) dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, "cpu%d", cpu); - return IS_ERR(dev) ? PTR_ERR(dev) : 0; + return PTR_ERR_OR_ZERO(dev); } static void cpuid_device_destroy(int cpu) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 49f88648161..dd2f07ae9d0 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1114,8 +1114,8 @@ void __init memblock_find_dma_reserve(void) * at first, and assume boot_mem will not take below MAX_DMA_PFN */ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN); - end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN); + start_pfn = min(start_pfn, MAX_DMA_PFN); + end_pfn = min(end_pfn, MAX_DMA_PFN); nr_pages += end_pfn - start_pfn; } diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 2e1a6853e00..fe9f0b79a18 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -455,6 +455,23 @@ struct intel_stolen_funcs { u32 (*base)(int num, int slot, int func, size_t size); }; +static size_t __init gen9_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gmch_ctrl >>= BDW_GMCH_GMS_SHIFT; + gmch_ctrl &= BDW_GMCH_GMS_MASK; + + if (gmch_ctrl < 0xf0) + return gmch_ctrl << 25; /* 32 MB units */ + else + /* 4MB increments starting at 0xf0 for 4MB */ + return (gmch_ctrl - 0xf0 + 1) << 22; +} + +typedef size_t (*stolen_size_fn)(int num, int slot, int func); + static const struct intel_stolen_funcs i830_stolen_funcs __initconst = { .base = i830_stolen_base, .size = i830_stolen_size, @@ -490,6 +507,11 @@ static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = { .size = gen8_stolen_size, }; +static const struct intel_stolen_funcs gen9_stolen_funcs __initconst = { + .base = intel_stolen_base, + .size = gen9_stolen_size, +}; + static const struct intel_stolen_funcs chv_stolen_funcs __initconst = { .base = intel_stolen_base, .size = chv_stolen_size, @@ -523,6 +545,7 @@ static const struct pci_device_id intel_stolen_ids[] __initconst = { INTEL_BDW_M_IDS(&gen8_stolen_funcs), INTEL_BDW_D_IDS(&gen8_stolen_funcs), INTEL_CHV_IDS(&chv_stolen_funcs), + INTEL_SKL_IDS(&gen9_stolen_funcs), }; static void __init intel_graphics_stolen(int num, int slot, int func) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 344b63f18d1..1cf7c97ff17 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1191,10 +1191,10 @@ ENTRY(ftrace_graph_caller) pushl %eax pushl %ecx pushl %edx - movl 0xc(%esp), %edx - lea 0x4(%ebp), %eax + movl 0xc(%esp), %eax + lea 0x4(%ebp), %edx movl (%ebp), %ecx - subl $MCOUNT_INSN_SIZE, %edx + subl $MCOUNT_INSN_SIZE, %eax call prepare_ftrace_return popl %edx popl %ecx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c0226ab5410..90878aa38db 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -652,6 +652,20 @@ ENTRY(stub_execve) CFI_ENDPROC END(stub_execve) +ENTRY(stub_execveat) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call sys_execveat + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_execveat) + /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. @@ -697,6 +711,20 @@ ENTRY(stub_x32_execve) CFI_ENDPROC END(stub_x32_execve) +ENTRY(stub_x32_execveat) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call compat_sys_execveat + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_x32_execveat) + #endif /* diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 94d857fb103..f5d0730e7b0 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -122,9 +122,6 @@ static void init_espfix_random(void) void __init init_espfix_bsp(void) { pgd_t *pgd_p; - pteval_t ptemask; - - ptemask = __supported_pte_mask; /* Install the espfix pud into the kernel page directory */ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3386dc9aa33..2142376dc8c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -17,6 +17,7 @@ #include <linux/ftrace.h> #include <linux/percpu.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/list.h> #include <linux/module.h> @@ -47,7 +48,7 @@ int ftrace_arch_code_modify_post_process(void) union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; struct { - char e8; + unsigned char e8; int offset; } __attribute__((packed)); }; @@ -582,7 +583,7 @@ void ftrace_replace_code(int enable) remove_breakpoints: pr_warn("Failed on %s (%d):\n", report, count); - ftrace_bug(ret, rec ? rec->ip : 0); + ftrace_bug(ret, rec); for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); /* @@ -644,13 +645,8 @@ int __init ftrace_dyn_arch_init(void) { return 0; } -#endif - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - -#ifdef CONFIG_DYNAMIC_FTRACE -extern void ftrace_graph_call(void); +#if defined(CONFIG_X86_64) || defined(CONFIG_FUNCTION_GRAPH_TRACER) static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; @@ -664,6 +660,280 @@ static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) */ return calc.code; } +#endif + +/* Currently only x86_64 supports dynamic trampolines */ +#ifdef CONFIG_X86_64 + +#ifdef CONFIG_MODULES +#include <linux/moduleloader.h> +/* Module allocation simplifies allocating memory for code */ +static inline void *alloc_tramp(unsigned long size) +{ + return module_alloc(size); +} +static inline void tramp_free(void *tramp) +{ + module_free(NULL, tramp); +} +#else +/* Trampolines can only be created if modules are supported */ +static inline void *alloc_tramp(unsigned long size) +{ + return NULL; +} +static inline void tramp_free(void *tramp) { } +#endif + +/* Defined as markers to the end of the ftrace default trampolines */ +extern void ftrace_caller_end(void); +extern void ftrace_regs_caller_end(void); +extern void ftrace_return(void); +extern void ftrace_caller_op_ptr(void); +extern void ftrace_regs_caller_op_ptr(void); + +/* movq function_trace_op(%rip), %rdx */ +/* 0x48 0x8b 0x15 <offset-to-ftrace_trace_op (4 bytes)> */ +#define OP_REF_SIZE 7 + +/* + * The ftrace_ops is passed to the function callback. Since the + * trampoline only services a single ftrace_ops, we can pass in + * that ops directly. + * + * The ftrace_op_code_union is used to create a pointer to the + * ftrace_ops that will be passed to the callback function. + */ +union ftrace_op_code_union { + char code[OP_REF_SIZE]; + struct { + char op[3]; + int offset; + } __attribute__((packed)); +}; + +static unsigned long +create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) +{ + unsigned const char *jmp; + unsigned long start_offset; + unsigned long end_offset; + unsigned long op_offset; + unsigned long offset; + unsigned long size; + unsigned long ip; + unsigned long *ptr; + void *trampoline; + /* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */ + unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; + union ftrace_op_code_union op_ptr; + int ret; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + start_offset = (unsigned long)ftrace_regs_caller; + end_offset = (unsigned long)ftrace_regs_caller_end; + op_offset = (unsigned long)ftrace_regs_caller_op_ptr; + } else { + start_offset = (unsigned long)ftrace_caller; + end_offset = (unsigned long)ftrace_caller_end; + op_offset = (unsigned long)ftrace_caller_op_ptr; + } + + size = end_offset - start_offset; + + /* + * Allocate enough size to store the ftrace_caller code, + * the jmp to ftrace_return, as well as the address of + * the ftrace_ops this trampoline is used for. + */ + trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *)); + if (!trampoline) + return 0; + + *tramp_size = size + MCOUNT_INSN_SIZE + sizeof(void *); + + /* Copy ftrace_caller onto the trampoline memory */ + ret = probe_kernel_read(trampoline, (void *)start_offset, size); + if (WARN_ON(ret < 0)) { + tramp_free(trampoline); + return 0; + } + + ip = (unsigned long)trampoline + size; + + /* The trampoline ends with a jmp to ftrace_return */ + jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_return); + memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE); + + /* + * The address of the ftrace_ops that is used for this trampoline + * is stored at the end of the trampoline. This will be used to + * load the third parameter for the callback. Basically, that + * location at the end of the trampoline takes the place of + * the global function_trace_op variable. + */ + + ptr = (unsigned long *)(trampoline + size + MCOUNT_INSN_SIZE); + *ptr = (unsigned long)ops; + + op_offset -= start_offset; + memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); + + /* Are we pointing to the reference? */ + if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) { + tramp_free(trampoline); + return 0; + } + + /* Load the contents of ptr into the callback parameter */ + offset = (unsigned long)ptr; + offset -= (unsigned long)trampoline + op_offset + OP_REF_SIZE; + + op_ptr.offset = offset; + + /* put in the new offset to the ftrace_ops */ + memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + + /* ALLOC_TRAMP flags lets us know we created it */ + ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; + + return (unsigned long)trampoline; +} + +static unsigned long calc_trampoline_call_offset(bool save_regs) +{ + unsigned long start_offset; + unsigned long call_offset; + + if (save_regs) { + start_offset = (unsigned long)ftrace_regs_caller; + call_offset = (unsigned long)ftrace_regs_call; + } else { + start_offset = (unsigned long)ftrace_caller; + call_offset = (unsigned long)ftrace_call; + } + + return call_offset - start_offset; +} + +void arch_ftrace_update_trampoline(struct ftrace_ops *ops) +{ + ftrace_func_t func; + unsigned char *new; + unsigned long offset; + unsigned long ip; + unsigned int size; + int ret; + + if (ops->trampoline) { + /* + * The ftrace_ops caller may set up its own trampoline. + * In such a case, this code must not modify it. + */ + if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return; + } else { + ops->trampoline = create_trampoline(ops, &size); + if (!ops->trampoline) + return; + ops->trampoline_size = size; + } + + offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); + ip = ops->trampoline + offset; + + func = ftrace_ops_get_func(ops); + + /* Do a safe modify in case the trampoline is executing */ + new = ftrace_call_replace(ip, (unsigned long)func); + ret = update_ftrace_func(ip, new); + + /* The update should never fail */ + WARN_ON(ret); +} + +/* Return the address of the function the trampoline calls */ +static void *addr_from_call(void *ptr) +{ + union ftrace_code_union calc; + int ret; + + ret = probe_kernel_read(&calc, ptr, MCOUNT_INSN_SIZE); + if (WARN_ON_ONCE(ret < 0)) + return NULL; + + /* Make sure this is a call */ + if (WARN_ON_ONCE(calc.e8 != 0xe8)) { + pr_warn("Expected e8, got %x\n", calc.e8); + return NULL; + } + + return ptr + MCOUNT_INSN_SIZE + calc.offset; +} + +void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, + unsigned long frame_pointer); + +/* + * If the ops->trampoline was not allocated, then it probably + * has a static trampoline func, or is the ftrace caller itself. + */ +static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + unsigned long offset; + bool save_regs = rec->flags & FTRACE_FL_REGS_EN; + void *ptr; + + if (ops && ops->trampoline) { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* + * We only know about function graph tracer setting as static + * trampoline. + */ + if (ops->trampoline == FTRACE_GRAPH_ADDR) + return (void *)prepare_ftrace_return; +#endif + return NULL; + } + + offset = calc_trampoline_call_offset(save_regs); + + if (save_regs) + ptr = (void *)FTRACE_REGS_ADDR + offset; + else + ptr = (void *)FTRACE_ADDR + offset; + + return addr_from_call(ptr); +} + +void *arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + unsigned long offset; + + /* If we didn't allocate this trampoline, consider it static */ + if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return static_tramp_func(ops, rec); + + offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); + return addr_from_call((void *)ops->trampoline + offset); +} + +void arch_ftrace_trampoline_free(struct ftrace_ops *ops) +{ + if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return; + + tramp_free((void *)ops->trampoline); + ops->trampoline = 0; +} + +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_graph_call(void); static int ftrace_mod_jmp(unsigned long ip, void *func) { @@ -694,7 +964,7 @@ int ftrace_disable_ftrace_graph_caller(void) * Hook the return address and push it in the stack of return addrs * in current thread info. */ -void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, +void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long frame_pointer) { unsigned long old; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 922d2858102..6307a0f0cf1 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -59,78 +59,78 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%*s: ", prec, "NMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); - seq_printf(p, " Non-maskable interrupts\n"); + seq_puts(p, " Non-maskable interrupts\n"); #ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "%*s: ", prec, "LOC"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); + seq_puts(p, " Local timer interrupts\n"); seq_printf(p, "%*s: ", prec, "SPU"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); + seq_puts(p, " Spurious interrupts\n"); seq_printf(p, "%*s: ", prec, "PMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); - seq_printf(p, " Performance monitoring interrupts\n"); + seq_puts(p, " Performance monitoring interrupts\n"); seq_printf(p, "%*s: ", prec, "IWI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); - seq_printf(p, " IRQ work interrupts\n"); + seq_puts(p, " IRQ work interrupts\n"); seq_printf(p, "%*s: ", prec, "RTR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); - seq_printf(p, " APIC ICR read retries\n"); + seq_puts(p, " APIC ICR read retries\n"); #endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); - seq_printf(p, " Platform interrupts\n"); + seq_puts(p, " Platform interrupts\n"); } #ifdef CONFIG_SMP seq_printf(p, "%*s: ", prec, "RES"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); + seq_puts(p, " Rescheduling interrupts\n"); seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - irq_stats(j)->irq_tlb_count); - seq_printf(p, " Function call interrupts\n"); + seq_puts(p, " Function call interrupts\n"); seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); + seq_puts(p, " TLB shootdowns\n"); #endif #ifdef CONFIG_X86_THERMAL_VECTOR seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); + seq_puts(p, " Thermal event interrupts\n"); #endif #ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); - seq_printf(p, " Threshold APIC interrupts\n"); + seq_puts(p, " Threshold APIC interrupts\n"); #endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); - seq_printf(p, " Machine check exceptions\n"); + seq_puts(p, " Machine check exceptions\n"); seq_printf(p, "%*s: ", prec, "MCP"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); - seq_printf(p, " Machine check polls\n"); + seq_puts(p, " Machine check polls\n"); #endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); - seq_printf(p, " Hypervisor callback interrupts\n"); + seq_puts(p, " Hypervisor callback interrupts\n"); #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 67e6d19ef1b..f7e3cd50ece 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -285,7 +285,7 @@ static int can_probe(unsigned long paddr) * normally used, we just go through if there is no kprobe. */ __addr = recover_probed_instruction(buf, addr); - kernel_insn_init(&insn, (void *)__addr); + kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE); insn_get_length(&insn); /* @@ -330,8 +330,10 @@ int __copy_instruction(u8 *dest, u8 *src) { struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; + unsigned long recovered_insn = + recover_probed_instruction(buf, (unsigned long)src); - kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src)); + kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint, failed to recover */ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) @@ -342,7 +344,7 @@ int __copy_instruction(u8 *dest, u8 *src) if (insn_rip_relative(&insn)) { s64 newdisp; u8 *disp; - kernel_insn_init(&insn, dest); + kernel_insn_init(&insn, dest, insn.length); insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 717b02a22e6..5f8f0b3cc67 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -27,7 +27,7 @@ static nokprobe_inline int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) + struct kprobe_ctlblk *kcb, unsigned long orig_ip) { /* * Emulate singlestep (and also recover regs->ip) @@ -39,6 +39,8 @@ int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, p->post_handler(p, regs, 0); } __this_cpu_write(current_kprobe, NULL); + if (orig_ip) + regs->ip = orig_ip; return 1; } @@ -46,7 +48,7 @@ int skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { if (kprobe_ftrace(p)) - return __skip_singlestep(p, regs, kcb); + return __skip_singlestep(p, regs, kcb, 0); else return 0; } @@ -71,13 +73,14 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (kprobe_running()) { kprobes_inc_nmissed_count(p); } else { + unsigned long orig_ip = regs->ip; /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ regs->ip = ip + sizeof(kprobe_opcode_t); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) - __skip_singlestep(p, regs, kcb); + __skip_singlestep(p, regs, kcb, orig_ip); /* * If pre_handler returns !0, it sets regs->ip and * resets current kprobe. diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index f1314d0bcf0..7c523bbf3dc 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -251,13 +251,15 @@ static int can_optimize(unsigned long paddr) /* Decode instructions */ addr = paddr - offset; while (addr < paddr - offset + size) { /* Decode until function end */ + unsigned long recovered_insn; if (search_exception_tables(addr)) /* * Since some fixup code will jumps into this function, * we can't optimize kprobe in this function. */ return 0; - kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); + recovered_insn = recover_probed_instruction(buf, addr); + kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); /* Another subsystem puts a breakpoint */ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index c73aecf10d3..94ea120fa21 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -21,40 +21,159 @@ # define function_hook mcount #endif +/* All cases save the original rbp (8 bytes) */ +#ifdef CONFIG_FRAME_POINTER +# ifdef CC_USING_FENTRY +/* Save parent and function stack frames (rip and rbp) */ +# define MCOUNT_FRAME_SIZE (8+16*2) +# else +/* Save just function stack frame (rip and rbp) */ +# define MCOUNT_FRAME_SIZE (8+16) +# endif +#else +/* No need to save a stack frame */ +# define MCOUNT_FRAME_SIZE 8 +#endif /* CONFIG_FRAME_POINTER */ + +/* Size of stack used to save mcount regs in save_mcount_regs */ +#define MCOUNT_REG_SIZE (SS+8 + MCOUNT_FRAME_SIZE) + +/* + * gcc -pg option adds a call to 'mcount' in most functions. + * When -mfentry is used, the call is to 'fentry' and not 'mcount' + * and is done before the function's stack frame is set up. + * They both require a set of regs to be saved before calling + * any C code and restored before returning back to the function. + * + * On boot up, all these calls are converted into nops. When tracing + * is enabled, the call can jump to either ftrace_caller or + * ftrace_regs_caller. Callbacks (tracing functions) that require + * ftrace_regs_caller (like kprobes) need to have pt_regs passed to + * it. For this reason, the size of the pt_regs structure will be + * allocated on the stack and the required mcount registers will + * be saved in the locations that pt_regs has them in. + */ + +/* + * @added: the amount of stack added before calling this + * + * After this is called, the following registers contain: + * + * %rdi - holds the address that called the trampoline + * %rsi - holds the parent function (traced function's return address) + * %rdx - holds the original %rbp + */ +.macro save_mcount_regs added=0 + + /* Always save the original rbp */ + pushq %rbp + +#ifdef CONFIG_FRAME_POINTER + /* + * Stack traces will stop at the ftrace trampoline if the frame pointer + * is not set up properly. If fentry is used, we need to save a frame + * pointer for the parent as well as the function traced, because the + * fentry is called before the stack frame is set up, where as mcount + * is called afterward. + */ +#ifdef CC_USING_FENTRY + /* Save the parent pointer (skip orig rbp and our return address) */ + pushq \added+8*2(%rsp) + pushq %rbp + movq %rsp, %rbp + /* Save the return address (now skip orig rbp, rbp and parent) */ + pushq \added+8*3(%rsp) +#else + /* Can't assume that rip is before this (unless added was zero) */ + pushq \added+8(%rsp) +#endif + pushq %rbp + movq %rsp, %rbp +#endif /* CONFIG_FRAME_POINTER */ + + /* + * We add enough stack to save all regs. + */ + subq $(MCOUNT_REG_SIZE - MCOUNT_FRAME_SIZE), %rsp + movq %rax, RAX(%rsp) + movq %rcx, RCX(%rsp) + movq %rdx, RDX(%rsp) + movq %rsi, RSI(%rsp) + movq %rdi, RDI(%rsp) + movq %r8, R8(%rsp) + movq %r9, R9(%rsp) + /* + * Save the original RBP. Even though the mcount ABI does not + * require this, it helps out callers. + */ + movq MCOUNT_REG_SIZE-8(%rsp), %rdx + movq %rdx, RBP(%rsp) + + /* Copy the parent address into %rsi (second parameter) */ +#ifdef CC_USING_FENTRY + movq MCOUNT_REG_SIZE+8+\added(%rsp), %rsi +#else + /* %rdx contains original %rbp */ + movq 8(%rdx), %rsi +#endif + + /* Move RIP to its proper location */ + movq MCOUNT_REG_SIZE+\added(%rsp), %rdi + movq %rdi, RIP(%rsp) + + /* + * Now %rdi (the first parameter) has the return address of + * where ftrace_call returns. But the callbacks expect the + * address of the call itself. + */ + subq $MCOUNT_INSN_SIZE, %rdi + .endm + +.macro restore_mcount_regs + movq R9(%rsp), %r9 + movq R8(%rsp), %r8 + movq RDI(%rsp), %rdi + movq RSI(%rsp), %rsi + movq RDX(%rsp), %rdx + movq RCX(%rsp), %rcx + movq RAX(%rsp), %rax + + /* ftrace_regs_caller can modify %rbp */ + movq RBP(%rsp), %rbp + + addq $MCOUNT_REG_SIZE, %rsp + + .endm + #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(function_hook) retq END(function_hook) -/* skip is set if stack has been adjusted */ -.macro ftrace_caller_setup skip=0 - MCOUNT_SAVE_FRAME \skip +ENTRY(ftrace_caller) + /* save_mcount_regs fills in first two parameters */ + save_mcount_regs +GLOBAL(ftrace_caller_op_ptr) /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx - /* Load ip into the first parameter */ - movq RIP(%rsp), %rdi - subq $MCOUNT_INSN_SIZE, %rdi - /* Load the parent_ip into the second parameter */ -#ifdef CC_USING_FENTRY - movq SS+16(%rsp), %rsi -#else - movq 8(%rbp), %rsi -#endif -.endm - -ENTRY(ftrace_caller) - ftrace_caller_setup /* regs go into 4th parameter (but make it NULL) */ movq $0, %rcx GLOBAL(ftrace_call) call ftrace_stub - MCOUNT_RESTORE_FRAME -ftrace_return: + restore_mcount_regs + + /* + * The copied trampoline must call ftrace_return as it + * still may need to call the function graph tracer. + */ +GLOBAL(ftrace_caller_end) + +GLOBAL(ftrace_return) #ifdef CONFIG_FUNCTION_GRAPH_TRACER GLOBAL(ftrace_graph_call) @@ -66,11 +185,16 @@ GLOBAL(ftrace_stub) END(ftrace_caller) ENTRY(ftrace_regs_caller) - /* Save the current flags before compare (in SS location)*/ + /* Save the current flags before any operations that can change them */ pushfq - /* skip=8 to skip flags saved in SS */ - ftrace_caller_setup 8 + /* added 8 bytes to save flags */ + save_mcount_regs 8 + /* save_mcount_regs fills in first two parameters */ + +GLOBAL(ftrace_regs_caller_op_ptr) + /* Load the ftrace_ops into the 3rd parameter */ + movq function_trace_op(%rip), %rdx /* Save the rest of pt_regs */ movq %r15, R15(%rsp) @@ -79,18 +203,17 @@ ENTRY(ftrace_regs_caller) movq %r12, R12(%rsp) movq %r11, R11(%rsp) movq %r10, R10(%rsp) - movq %rbp, RBP(%rsp) movq %rbx, RBX(%rsp) /* Copy saved flags */ - movq SS(%rsp), %rcx + movq MCOUNT_REG_SIZE(%rsp), %rcx movq %rcx, EFLAGS(%rsp) /* Kernel segments */ movq $__KERNEL_DS, %rcx movq %rcx, SS(%rsp) movq $__KERNEL_CS, %rcx movq %rcx, CS(%rsp) - /* Stack - skipping return address */ - leaq SS+16(%rsp), %rcx + /* Stack - skipping return address and flags */ + leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx movq %rcx, RSP(%rsp) /* regs go into 4th parameter */ @@ -101,11 +224,11 @@ GLOBAL(ftrace_regs_call) /* Copy flags back to SS, to restore them */ movq EFLAGS(%rsp), %rax - movq %rax, SS(%rsp) + movq %rax, MCOUNT_REG_SIZE(%rsp) /* Handlers can change the RIP */ movq RIP(%rsp), %rax - movq %rax, SS+8(%rsp) + movq %rax, MCOUNT_REG_SIZE+8(%rsp) /* restore the rest of pt_regs */ movq R15(%rsp), %r15 @@ -113,19 +236,22 @@ GLOBAL(ftrace_regs_call) movq R13(%rsp), %r13 movq R12(%rsp), %r12 movq R10(%rsp), %r10 - movq RBP(%rsp), %rbp movq RBX(%rsp), %rbx - /* skip=8 to skip flags saved in SS */ - MCOUNT_RESTORE_FRAME 8 + restore_mcount_regs /* Restore flags */ popfq - jmp ftrace_return + /* + * As this jmp to ftrace_return can be a short jump + * it must not be copied into the trampoline. + * The trampoline will add the code to jump + * to the return. + */ +GLOBAL(ftrace_regs_caller_end) - popfq - jmp ftrace_stub + jmp ftrace_return END(ftrace_regs_caller) @@ -136,6 +262,7 @@ ENTRY(function_hook) cmpq $ftrace_stub, ftrace_trace_function jnz trace +fgraph_trace: #ifdef CONFIG_FUNCTION_GRAPH_TRACER cmpq $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller @@ -148,42 +275,35 @@ GLOBAL(ftrace_stub) retq trace: - MCOUNT_SAVE_FRAME - - movq RIP(%rsp), %rdi -#ifdef CC_USING_FENTRY - movq SS+16(%rsp), %rsi -#else - movq 8(%rbp), %rsi -#endif - subq $MCOUNT_INSN_SIZE, %rdi + /* save_mcount_regs fills in first two parameters */ + save_mcount_regs call *ftrace_trace_function - MCOUNT_RESTORE_FRAME + restore_mcount_regs - jmp ftrace_stub + jmp fgraph_trace END(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(ftrace_graph_caller) - MCOUNT_SAVE_FRAME + /* Saves rbp into %rdx and fills first parameter */ + save_mcount_regs #ifdef CC_USING_FENTRY - leaq SS+16(%rsp), %rdi + leaq MCOUNT_REG_SIZE+8(%rsp), %rsi movq $0, %rdx /* No framepointers needed */ #else - leaq 8(%rbp), %rdi - movq (%rbp), %rdx + /* Save address of the return address of traced function */ + leaq 8(%rdx), %rsi + /* ftrace does sanity checks against frame pointers */ + movq (%rdx), %rdx #endif - movq RIP(%rsp), %rsi - subq $MCOUNT_INSN_SIZE, %rsi - call prepare_ftrace_return - MCOUNT_RESTORE_FRAME + restore_mcount_regs retq END(ftrace_graph_caller) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index c9603ac80de..113e7078485 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -22,6 +22,8 @@ * an SMP box will direct the access to CPU %d. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/types.h> @@ -50,11 +52,11 @@ static loff_t msr_seek(struct file *file, loff_t offset, int orig) mutex_lock(&inode->i_mutex); switch (orig) { - case 0: + case SEEK_SET: file->f_pos = offset; ret = file->f_pos; break; - case 1: + case SEEK_CUR: file->f_pos += offset; ret = file->f_pos; break; @@ -206,7 +208,7 @@ static int msr_device_create(int cpu) dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, "msr%d", cpu); - return IS_ERR(dev) ? PTR_ERR(dev) : 0; + return PTR_ERR_OR_ZERO(dev); } static void msr_device_destroy(int cpu) @@ -248,8 +250,7 @@ static int __init msr_init(void) i = 0; if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { - printk(KERN_ERR "msr: unable to get major %d for msr\n", - MSR_MAJOR); + pr_err("unable to get major %d for msr\n", MSR_MAJOR); err = -EBUSY; goto out; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3ed4a68d401..5a2c02913af 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -283,24 +283,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) fpu = switch_fpu_prepare(prev_p, next_p, cpu); - /* - * Reload esp0, LDT and the page table pointer: - */ + /* Reload esp0 and ss1. */ load_sp0(tss, next); - /* - * Switch DS and ES. - * This won't pick up thread selector changes, but I guess that is ok. - */ - savesegment(es, prev->es); - if (unlikely(next->es | prev->es)) - loadsegment(es, next->es); - - savesegment(ds, prev->ds); - if (unlikely(next->ds | prev->ds)) - loadsegment(ds, next->ds); - - /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * @@ -309,41 +294,101 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) savesegment(fs, fsindex); savesegment(gs, gsindex); + /* + * Load TLS before restoring any segments so that segment loads + * reference the correct GDT entries. + */ load_TLS(next, cpu); /* - * Leave lazy mode, flushing any hypercalls made here. - * This must be done before restoring TLS segments so - * the GDT and LDT are properly updated, and must be - * done before math_state_restore, so the TS bit is up - * to date. + * Leave lazy mode, flushing any hypercalls made here. This + * must be done after loading TLS entries in the GDT but before + * loading segments that might reference them, and and it must + * be done before math_state_restore, so the TS bit is up to + * date. */ arch_end_context_switch(next_p); + /* Switch DS and ES. + * + * Reading them only returns the selectors, but writing them (if + * nonzero) loads the full descriptor from the GDT or LDT. The + * LDT for next is loaded in switch_mm, and the GDT is loaded + * above. + * + * We therefore need to write new values to the segment + * registers on every context switch unless both the new and old + * values are zero. + * + * Note that we don't need to do anything for CS and SS, as + * those are saved and restored as part of pt_regs. + */ + savesegment(es, prev->es); + if (unlikely(next->es | prev->es)) + loadsegment(es, next->es); + + savesegment(ds, prev->ds); + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + /* * Switch FS and GS. * - * Segment register != 0 always requires a reload. Also - * reload when it has changed. When prev process used 64bit - * base always reload to avoid an information leak. + * These are even more complicated than FS and GS: they have + * 64-bit bases are that controlled by arch_prctl. Those bases + * only differ from the values in the GDT or LDT if the selector + * is 0. + * + * Loading the segment register resets the hidden base part of + * the register to 0 or the value from the GDT / LDT. If the + * next base address zero, writing 0 to the segment register is + * much faster than using wrmsr to explicitly zero the base. + * + * The thread_struct.fs and thread_struct.gs values are 0 + * if the fs and gs bases respectively are not overridden + * from the values implied by fsindex and gsindex. They + * are nonzero, and store the nonzero base addresses, if + * the bases are overridden. + * + * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should + * be impossible. + * + * Therefore we need to reload the segment registers if either + * the old or new selector is nonzero, and we need to override + * the base address if next thread expects it to be overridden. + * + * This code is unnecessarily slow in the case where the old and + * new indexes are zero and the new base is nonzero -- it will + * unnecessarily write 0 to the selector before writing the new + * base address. + * + * Note: This all depends on arch_prctl being the only way that + * user code can override the segment base. Once wrfsbase and + * wrgsbase are enabled, most of this code will need to change. */ if (unlikely(fsindex | next->fsindex | prev->fs)) { loadsegment(fs, next->fsindex); + /* - * Check if the user used a selector != 0; if yes - * clear 64bit base, since overloaded base is always - * mapped to the Null selector + * If user code wrote a nonzero value to FS, then it also + * cleared the overridden base address. + * + * XXX: if user code wrote 0 to FS and cleared the base + * address itself, we won't notice and we'll incorrectly + * restore the prior base address next time we reschdule + * the process. */ if (fsindex) prev->fs = 0; } - /* when next process has a 64bit base use it */ if (next->fs) wrmsrl(MSR_FS_BASE, next->fs); prev->fsindex = fsindex; if (unlikely(gsindex | next->gsindex | prev->gs)) { load_gs_index(next->gsindex); + + /* This works (and fails) the same way as fsindex above. */ if (gsindex) prev->gs = 0; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ab08aa2276f..ab4734e5411 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -960,6 +960,8 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = _brk_end; + mpx_mm_init(&init_mm); + code_resource.start = __pa_symbol(_text); code_resource.end = __pa_symbol(_etext)-1; data_resource.start = __pa_symbol(_etext); @@ -1190,9 +1192,7 @@ void __init setup_arch(char **cmdline_p) tboot_probe(); -#ifdef CONFIG_X86_64 map_vsyscall(); -#endif generic_apic_probe(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5cdff035774..e4fcb87ba7a 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -30,7 +30,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); #define BOOT_PERCPU_OFFSET 0 #endif -DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; EXPORT_PER_CPU_SYMBOL(this_cpu_off); unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 668d8f2a878..7a8f5845e8e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -99,7 +99,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); /* Per CPU bogomips and other parameters */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); +DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c index 193ec2ce46c..160386e9fc1 100644 --- a/arch/x86/kernel/sysfb.c +++ b/arch/x86/kernel/sysfb.c @@ -67,7 +67,7 @@ static __init int sysfb_init(void) pd = platform_device_register_resndata(NULL, name, 0, NULL, 0, si, sizeof(*si)); - return IS_ERR(pd) ? PTR_ERR(pd) : 0; + return PTR_ERR_OR_ZERO(pd); } /* must execute after PCI subsystem for EFI quirks */ diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c index 86179d40989..764a29f84de 100644 --- a/arch/x86/kernel/sysfb_simplefb.c +++ b/arch/x86/kernel/sysfb_simplefb.c @@ -88,8 +88,5 @@ __init int create_simplefb(const struct screen_info *si, pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0, &res, 1, mode, sizeof(*mode)); - if (IS_ERR(pd)) - return PTR_ERR(pd); - - return 0; + return PTR_ERR_OR_ZERO(pd); } diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 0fa29609b2c..25adc0e16ea 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -23,7 +23,7 @@ #include <asm/time.h> #ifdef CONFIG_X86_64 -__visible DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES; +__visible volatile unsigned long jiffies __cacheline_aligned = INITIAL_JIFFIES; #endif unsigned long profile_pc(struct pt_regs *regs) diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index f7fec09e3e3..3e551eee87b 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -27,6 +27,43 @@ static int get_free_idx(void) return -ESRCH; } +static bool tls_desc_okay(const struct user_desc *info) +{ + if (LDT_empty(info)) + return true; + + /* + * espfix is required for 16-bit data segments, but espfix + * only works for LDT segments. + */ + if (!info->seg_32bit) + return false; + + /* Only allow data segments in the TLS array. */ + if (info->contents > 1) + return false; + + /* + * Non-present segments with DPL 3 present an interesting attack + * surface. The kernel should handle such segments correctly, + * but TLS is very difficult to protect in a sandbox, so prevent + * such segments from being created. + * + * If userspace needs to remove a TLS entry, it can still delete + * it outright. + */ + if (info->seg_not_present) + return false; + +#ifdef CONFIG_X86_64 + /* The L bit makes no sense for data. */ + if (info->lm) + return false; +#endif + + return true; +} + static void set_tls_desc(struct task_struct *p, int idx, const struct user_desc *info, int n) { @@ -66,6 +103,9 @@ int do_set_thread_area(struct task_struct *p, int idx, if (copy_from_user(&info, u_info, sizeof(info))) return -EFAULT; + if (!tls_desc_okay(&info)) + return -EINVAL; + if (idx == -1) idx = info.entry_number; @@ -192,6 +232,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, { struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; const struct user_desc *info; + int i; if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || @@ -205,6 +246,10 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, else info = infobuf; + for (i = 0; i < count / sizeof(struct user_desc); i++) + if (!tls_desc_okay(info + i)) + return -EINVAL; + set_tls_desc(target, GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)), info, count / sizeof(struct user_desc)); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index de801f22128..a9ae2057989 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -60,6 +60,7 @@ #include <asm/fixmap.h> #include <asm/mach_traps.h> #include <asm/alternative.h> +#include <asm/mpx.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -228,7 +229,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error) DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) -DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun) DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) @@ -286,6 +286,89 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) } #endif +dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) +{ + struct task_struct *tsk = current; + struct xsave_struct *xsave_buf; + enum ctx_state prev_state; + struct bndcsr *bndcsr; + siginfo_t *info; + + prev_state = exception_enter(); + if (notify_die(DIE_TRAP, "bounds", regs, error_code, + X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) + goto exit; + conditional_sti(regs); + + if (!user_mode(regs)) + die("bounds", regs, error_code); + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) { + /* The exception is not from Intel MPX */ + goto exit_trap; + } + + /* + * We need to look at BNDSTATUS to resolve this exception. + * It is not directly accessible, though, so we need to + * do an xsave and then pull it out of the xsave buffer. + */ + fpu_save_init(&tsk->thread.fpu); + xsave_buf = &(tsk->thread.fpu.state->xsave); + bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); + if (!bndcsr) + goto exit_trap; + + /* + * The error code field of the BNDSTATUS register communicates status + * information of a bound range exception #BR or operation involving + * bound directory. + */ + switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) { + case 2: /* Bound directory has invalid entry. */ + if (mpx_handle_bd_fault(xsave_buf)) + goto exit_trap; + break; /* Success, it was handled */ + case 1: /* Bound violation. */ + info = mpx_generate_siginfo(regs, xsave_buf); + if (PTR_ERR(info)) { + /* + * We failed to decode the MPX instruction. Act as if + * the exception was not caused by MPX. + */ + goto exit_trap; + } + /* + * Success, we decoded the instruction and retrieved + * an 'info' containing the address being accessed + * which caused the exception. This information + * allows and application to possibly handle the + * #BR exception itself. + */ + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info); + kfree(info); + break; + case 0: /* No exception caused by Intel MPX operations. */ + goto exit_trap; + default: + die("bounds", regs, error_code); + } + +exit: + exception_exit(prev_state); + return; +exit_trap: + /* + * This path out is for all the cases where we could not + * handle the exception in some way (like allocating a + * table or telling userspace about it. We will also end + * up here if the kernel has MPX turned off at compile + * time.. + */ + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL); + exception_exit(prev_state); +} + dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) { @@ -387,7 +470,7 @@ NOKPROBE_SYMBOL(do_int3); * for scheduling or signal handling. The actual stack switch is done in * entry.S */ -asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ @@ -413,7 +496,7 @@ struct bad_iret_stack { struct pt_regs regs; }; -asmlinkage __visible +asmlinkage __visible notrace struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) { /* @@ -436,6 +519,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) BUG_ON(!user_mode_vm(&new_stack->regs)); return new_stack; } +NOKPROBE_SYMBOL(fixup_bad_iret); #endif /* diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 5d1cbfe4ae5..8b96a947021 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -219,7 +219,7 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool { u32 volatile *good_insns; - insn_init(insn, auprobe->insn, x86_64); + insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64); /* has the side-effect of processing the entire instruction */ insn_get_length(insn); if (WARN_ON_ONCE(!insn_complete(insn))) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 49edf2dd361..00bf300fd84 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -186,6 +186,8 @@ SECTIONS * start another segment - init. */ PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu) + ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START, + "per-CPU data too large - increase CONFIG_PHYSICAL_START") #endif INIT_TEXT_SECTION(PAGE_SIZE) diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 957779f4eb4..2dcc6ff6fdc 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -1,59 +1,43 @@ /* + * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net> + * + * Based on the original implementation which is: * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright 2003 Andi Kleen, SuSE Labs. * - * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ] + * Parts of the original code have been moved to arch/x86/vdso/vma.c + * + * This file implements vsyscall emulation. vsyscalls are a legacy ABI: + * Userspace can request certain kernel services by calling fixed + * addresses. This concept is problematic: * - * Thanks to hpa@transmeta.com for some useful hint. - * Special thanks to Ingo Molnar for his early experience with - * a different vsyscall implementation for Linux/IA32 and for the name. + * - It interferes with ASLR. + * - It's awkward to write code that lives in kernel addresses but is + * callable by userspace at fixed addresses. + * - The whole concept is impossible for 32-bit compat userspace. + * - UML cannot easily virtualize a vsyscall. * - * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located - * at virtual address -10Mbyte+1024bytes etc... There are at max 4 - * vsyscalls. One vsyscall can reserve more than 1 slot to avoid - * jumping out of line if necessary. We cannot add more with this - * mechanism because older kernels won't return -ENOSYS. + * As of mid-2014, I believe that there is no new userspace code that + * will use a vsyscall if the vDSO is present. I hope that there will + * soon be no new userspace code that will ever use a vsyscall. * - * Note: the concept clashes with user mode linux. UML users should - * use the vDSO. + * The code in this file emulates vsyscalls when notified of a page + * fault to a vsyscall address. */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/time.h> -#include <linux/init.h> #include <linux/kernel.h> #include <linux/timer.h> -#include <linux/seqlock.h> -#include <linux/jiffies.h> -#include <linux/sysctl.h> -#include <linux/topology.h> -#include <linux/timekeeper_internal.h> -#include <linux/getcpu.h> -#include <linux/cpu.h> -#include <linux/smp.h> -#include <linux/notifier.h> #include <linux/syscalls.h> #include <linux/ratelimit.h> #include <asm/vsyscall.h> -#include <asm/pgtable.h> -#include <asm/compat.h> -#include <asm/page.h> #include <asm/unistd.h> #include <asm/fixmap.h> -#include <asm/errno.h> -#include <asm/io.h> -#include <asm/segment.h> -#include <asm/desc.h> -#include <asm/topology.h> #include <asm/traps.h> #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" -DEFINE_VVAR(int, vgetcpu_mode); - static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; static int __init vsyscall_setup(char *str) @@ -222,6 +206,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) "seccomp tried to change syscall nr or ip"); do_exit(SIGSYS); } + regs->orig_ax = -1; if (tmp) goto do_ret; /* skip requested */ @@ -284,46 +269,54 @@ sigsegv: } /* - * Assume __initcall executes before all user space. Hopefully kmod - * doesn't violate that. We'll find out if it does. + * A pseudo VMA to allow ptrace access for the vsyscall page. This only + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does + * not need special handling anymore: */ -static void vsyscall_set_cpu(int cpu) +static const char *gate_vma_name(struct vm_area_struct *vma) { - unsigned long d; - unsigned long node = 0; -#ifdef CONFIG_NUMA - node = cpu_to_node(cpu); -#endif - if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) - write_rdtscp_aux((node << 12) | cpu); - - /* - * Store cpu number in limit so that it can be loaded quickly - * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node) - */ - d = 0x0f40000000000ULL; - d |= cpu; - d |= (node & 0xf) << 12; - d |= (node >> 4) << 48; - - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); + return "[vsyscall]"; } - -static void cpu_vsyscall_init(void *arg) +static struct vm_operations_struct gate_vma_ops = { + .name = gate_vma_name, +}; +static struct vm_area_struct gate_vma = { + .vm_start = VSYSCALL_ADDR, + .vm_end = VSYSCALL_ADDR + PAGE_SIZE, + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC, + .vm_ops = &gate_vma_ops, +}; + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { - /* preemption should be already off */ - vsyscall_set_cpu(raw_smp_processor_id()); +#ifdef CONFIG_IA32_EMULATION + if (!mm || mm->context.ia32_compat) + return NULL; +#endif + if (vsyscall_mode == NONE) + return NULL; + return &gate_vma; } -static int -cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { - long cpu = (long)arg; + struct vm_area_struct *vma = get_gate_vma(mm); + + if (!vma) + return 0; - if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) - smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} - return NOTIFY_DONE; +/* + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. + */ +int in_gate_area_no_mm(unsigned long addr) +{ + return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; } void __init map_vsyscall(void) @@ -331,24 +324,12 @@ void __init map_vsyscall(void) extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); + if (vsyscall_mode != NONE) + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, + vsyscall_mode == NATIVE + ? PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); } - -static int __init vsyscall_init(void) -{ - cpu_notifier_register_begin(); - - on_each_cpu(cpu_vsyscall_init, NULL, 1); - /* notifier priority > KVM */ - __hotcpu_notifier(cpu_vsyscall_notifier, 30); - - cpu_notifier_register_done(); - - return 0; -} -__initcall(vsyscall_init); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index e48b674639c..234b0722de5 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -116,8 +116,6 @@ struct x86_msi_ops x86_msi = { .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, .setup_hpet_msi = default_setup_hpet_msi, - .msi_mask_irq = default_msi_mask_irq, - .msix_mask_irq = default_msix_mask_irq, }; /* MSI arch specific hooks */ @@ -140,14 +138,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev) { x86_msi.restore_msi_irqs(dev); } -u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) -{ - return x86_msi.msi_mask_irq(desc, mask, flag); -} -u32 arch_msix_mask_irq(struct msi_desc *desc, u32 flag) -{ - return x86_msi.msix_mask_irq(desc, flag); -} #endif struct x86_io_apic_ops x86_io_apic_ops = { |