diff options
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/aperture_64.c | 59 | ||||
-rw-r--r-- | arch/x86/kernel/apic/hw_nmi.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/apic/x2apic_uv_x.c | 26 | ||||
-rw-r--r-- | arch/x86/kernel/apm_32.c | 11 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 50 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/microcode/core.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/microcode/core_early.c | 37 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event.c | 1 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_intel.c | 1 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_intel_ds.c | 22 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/rdrand.c | 1 | ||||
-rw-r--r-- | arch/x86/kernel/entry_64.S | 185 | ||||
-rw-r--r-- | arch/x86/kernel/iosf_mbi.c | 13 | ||||
-rw-r--r-- | arch/x86/kernel/kvm.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/ldt.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/uprobes.c | 551 |
16 files changed, 603 insertions, 368 deletions
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 9fa8aa051f5..76164e173a2 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -10,6 +10,8 @@ * * Copyright 2002 Andi Kleen, SuSE Labs. */ +#define pr_fmt(fmt) "AGP: " fmt + #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> @@ -75,14 +77,13 @@ static u32 __init allocate_aperture(void) addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, aper_size, aper_size); if (!addr) { - printk(KERN_ERR - "Cannot allocate aperture memory hole (%lx,%uK)\n", - addr, aper_size>>10); + pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); return 0; } memblock_reserve(addr, aper_size); - printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, addr); + pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); register_nosave_region(addr >> PAGE_SHIFT, (addr+aper_size) >> PAGE_SHIFT); @@ -126,10 +127,11 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) u64 aper; u32 old_order; - printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func); + pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func); apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14); if (apsizereg == 0xffffffff) { - printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); + pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n", + bus, slot, func); return 0; } @@ -153,16 +155,18 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) * On some sick chips, APSIZE is 0. It means it wants 4G * so let double check that order, and lets trust AMD NB settings: */ - printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n", - aper, 32 << old_order); + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n", + bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1, + 32 << old_order); if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) { - printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n", - 32 << *order, apsizereg); + pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n", + bus, slot, func, 32 << *order, apsizereg); *order = old_order; } - printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", - aper, 32 << *order, apsizereg); + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n", + bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1, + 32 << *order, apsizereg); if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20)) return 0; @@ -218,7 +222,7 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp) } } } - printk(KERN_INFO "No AGP bridge found\n"); + pr_info("No AGP bridge found\n"); return 0; } @@ -310,7 +314,8 @@ void __init early_gart_iommu_check(void) if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { /* reserve it, so we can reuse it in second kernel */ - printk(KERN_INFO "update e820 for GART\n"); + pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n", + aper_base, aper_base + aper_size - 1); e820_add_region(aper_base, aper_size, E820_RESERVED); update_e820(); } @@ -354,7 +359,7 @@ int __init gart_iommu_hole_init(void) !early_pci_allowed()) return -ENODEV; - printk(KERN_INFO "Checking aperture...\n"); + pr_info("Checking aperture...\n"); if (!fallback_aper_force) agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); @@ -395,8 +400,9 @@ int __init gart_iommu_hole_init(void) aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; aper_base <<= 25; - printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", - node, aper_base, aper_size >> 20); + pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n", + node, aper_base, aper_base + aper_size - 1, + aper_size >> 20); node++; if (!aperture_valid(aper_base, aper_size, 64<<20)) { @@ -407,9 +413,9 @@ int __init gart_iommu_hole_init(void) if (!no_iommu && max_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) { - printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n"); - printk(KERN_ERR "please increase GART size in your BIOS setup\n"); - printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n"); + pr_err("you are using iommu with agp, but GART size is less than 64MB\n"); + pr_err("please increase GART size in your BIOS setup\n"); + pr_err("if BIOS doesn't have that option, contact your HW vendor!\n"); printed_gart_size_msg = 1; } } else { @@ -446,13 +452,10 @@ out: force_iommu || valid_agp || fallback_aper_force) { - printk(KERN_INFO - "Your BIOS doesn't leave a aperture memory hole\n"); - printk(KERN_INFO - "Please enable the IOMMU option in the BIOS setup\n"); - printk(KERN_INFO - "This costs you %d MB of RAM\n", - 32 << fallback_aper_order); + pr_info("Your BIOS doesn't leave a aperture memory hole\n"); + pr_info("Please enable the IOMMU option in the BIOS setup\n"); + pr_info("This costs you %dMB of RAM\n", + 32 << fallback_aper_order); aper_order = fallback_aper_order; aper_alloc = allocate_aperture(); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index a698d7165c9..eab67047dec 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -57,7 +57,7 @@ void arch_trigger_all_cpu_backtrace(void) } clear_bit(0, &backtrace_flag); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static int __kprobes diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 7834389ba5b..293b41df54e 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. */ #include <linux/cpumask.h> #include <linux/hardirq.h> @@ -440,6 +440,20 @@ static __initdata struct redir_addr redir_addrs[] = { {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR}, }; +static unsigned char get_n_lshift(int m_val) +{ + union uv3h_gr0_gam_gr_config_u m_gr_config; + + if (is_uv1_hub()) + return m_val; + + if (is_uv2_hub()) + return m_val == 40 ? 40 : 39; + + m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); + return m_gr_config.s3.m_skt; +} + static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) { union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; @@ -849,6 +863,7 @@ void __init uv_system_init(void) int gnode_extra, min_pnode = 999999, max_pnode = -1; unsigned long mmr_base, present, paddr; unsigned short pnode_mask; + unsigned char n_lshift; char *hub = (is_uv1_hub() ? "UV1" : (is_uv2_hub() ? "UV2" : "UV3")); @@ -860,6 +875,7 @@ void __init uv_system_init(void) m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; pnode_mask = (1 << n_val) - 1; + n_lshift = get_n_lshift(m_val); mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; @@ -867,8 +883,9 @@ void __init uv_system_init(void) node_id.v = uv_read_local_mmr(UVH_NODE_ID); gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; gnode_upper = ((unsigned long)gnode_extra << m_val); - pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x\n", - n_val, m_val, pnode_mask, gnode_upper, gnode_extra); + pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x n_lshift 0x%x\n", + n_val, m_val, pnode_mask, gnode_upper, gnode_extra, + n_lshift); pr_info("UV: global MMR base 0x%lx\n", mmr_base); @@ -935,8 +952,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision; uv_cpu_hub_info(cpu)->m_shift = 64 - m_val; - uv_cpu_hub_info(cpu)->n_lshift = is_uv2_1_hub() ? - (m_val == 40 ? 40 : 39) : m_val; + uv_cpu_hub_info(cpu)->n_lshift = n_lshift; pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 3ab03430211..f3a1f04ed4c 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -844,21 +844,10 @@ static int apm_do_idle(void) int polling; int err = 0; - polling = !!(current_thread_info()->status & TS_POLLING); - if (polling) { - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - } if (!need_resched()) { idled = 1; ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err); } - if (polling) - current_thread_info()->status |= TS_POLLING; if (!idled) return 0; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 68317c80de7..6cc800381d1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -704,8 +704,7 @@ static int mce_timed_out(u64 *t) if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { - /* CHECKME: Make panic default for 1 too? */ - if (mca_cfg.tolerant < 1) + if (mca_cfg.tolerant <= 1) mce_panic("Timeout synchronizing machine check over CPUs", NULL, NULL); cpu_missing = 1; @@ -2437,32 +2436,65 @@ static __init int mcheck_init_device(void) int err; int i = 0; - if (!mce_available(&boot_cpu_data)) - return -EIO; + if (!mce_available(&boot_cpu_data)) { + err = -EIO; + goto err_out; + } - zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); + if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { + err = -ENOMEM; + goto err_out; + } mce_init_banks(); err = subsys_system_register(&mce_subsys, NULL); if (err) - return err; + goto err_out_mem; cpu_notifier_register_begin(); for_each_online_cpu(i) { err = mce_device_create(i); if (err) { cpu_notifier_register_done(); - return err; + goto err_device_create; } } - register_syscore_ops(&mce_syscore_ops); __register_hotcpu_notifier(&mce_cpu_notifier); cpu_notifier_register_done(); + register_syscore_ops(&mce_syscore_ops); + /* register character device /dev/mcelog */ - misc_register(&mce_chrdev_device); + err = misc_register(&mce_chrdev_device); + if (err) + goto err_register; + + return 0; + +err_register: + unregister_syscore_ops(&mce_syscore_ops); + + cpu_notifier_register_begin(); + __unregister_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); + +err_device_create: + /* + * We didn't keep track of which devices were created above, but + * even if we had, the set of online cpus might have changed. + * Play safe and remove for every possible cpu, since + * mce_device_remove() will do the right thing. + */ + for_each_possible_cpu(i) + mce_device_remove(i); + +err_out_mem: + free_cpumask_var(mce_device_initialized); + +err_out: + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); return err; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 15c987698b0..dd9d6190b08 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -97,6 +97,9 @@ MODULE_LICENSE("GPL"); static struct microcode_ops *microcode_ops; +bool dis_ucode_ldr; +module_param(dis_ucode_ldr, bool, 0); + /* * Synchronization. * @@ -546,6 +549,9 @@ static int __init microcode_init(void) struct cpuinfo_x86 *c = &cpu_data(0); int error; + if (dis_ucode_ldr) + return 0; + if (c->x86_vendor == X86_VENDOR_INTEL) microcode_ops = init_intel_microcode(); else if (c->x86_vendor == X86_VENDOR_AMD) diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index be7f8514f57..5f28a64e71e 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -17,9 +17,11 @@ * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> +#include <asm/microcode.h> #include <asm/microcode_intel.h> #include <asm/microcode_amd.h> #include <asm/processor.h> +#include <asm/cmdline.h> #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) #define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') @@ -72,10 +74,33 @@ static int x86_family(void) return x86; } +static bool __init check_loader_disabled_bsp(void) +{ +#ifdef CONFIG_X86_32 + const char *cmdline = (const char *)__pa_nodebug(boot_command_line); + const char *opt = "dis_ucode_ldr"; + const char *option = (const char *)__pa_nodebug(opt); + bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); + +#else /* CONFIG_X86_64 */ + const char *cmdline = boot_command_line; + const char *option = "dis_ucode_ldr"; + bool *res = &dis_ucode_ldr; +#endif + + if (cmdline_find_option_bool(cmdline, option)) + *res = true; + + return *res; +} + void __init load_ucode_bsp(void) { int vendor, x86; + if (check_loader_disabled_bsp()) + return; + if (!have_cpuid_p()) return; @@ -96,10 +121,22 @@ void __init load_ucode_bsp(void) } } +static bool check_loader_disabled_ap(void) +{ +#ifdef CONFIG_X86_32 + return __pa_nodebug(dis_ucode_ldr); +#else + return dis_ucode_ldr; +#endif +} + void load_ucode_ap(void) { int vendor, x86; + if (check_loader_disabled_ap()) + return; + if (!have_cpuid_p()) return; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ae407f7226c..89f3b7c1af2 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -721,6 +721,7 @@ int perf_assign_events(struct perf_event **events, int n, return sched.state.unassigned; } +EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index aa333d96688..adb02aa62af 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -169,7 +169,6 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ EVENT_CONSTRAINT_END }; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index ae96cfa5edd..980970cb744 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -108,15 +108,31 @@ static u64 precise_store_data(u64 status) return val; } -static u64 precise_store_data_hsw(u64 status) +static u64 precise_store_data_hsw(struct perf_event *event, u64 status) { union perf_mem_data_src dse; + u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK; dse.val = 0; dse.mem_op = PERF_MEM_OP_STORE; dse.mem_lvl = PERF_MEM_LVL_NA; + + /* + * L1 info only valid for following events: + * + * MEM_UOPS_RETIRED.STLB_MISS_STORES + * MEM_UOPS_RETIRED.LOCK_STORES + * MEM_UOPS_RETIRED.SPLIT_STORES + * MEM_UOPS_RETIRED.ALL_STORES + */ + if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0) + return dse.mem_lvl; + if (status & 1) - dse.mem_lvl = PERF_MEM_LVL_L1; + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + else + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; + /* Nothing else supported. Sorry. */ return dse.val; } @@ -887,7 +903,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, data.data_src.val = load_latency_data(pebs->dse); else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) data.data_src.val = - precise_store_data_hsw(pebs->dse); + precise_store_data_hsw(event, pebs->dse); else data.data_src.val = precise_store_data(pebs->dse); } diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 384df5105fb..136ac74dee8 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -27,6 +27,7 @@ static int __init x86_rdrand_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_RDRAND); + setup_clear_cpu_cap(X86_FEATURE_RDSEED); return 1; } __setup("nordrand", x86_rdrand_setup); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1e96c3628bf..be846d2468f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -36,7 +36,7 @@ * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack * frame that is otherwise undefined after a SYSCALL * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - errorentry/paranoidentry/zeroentry - Define exception entry points. + * - idtentry - Define exception entry points. */ #include <linux/linkage.h> @@ -1203,125 +1203,100 @@ apicinterrupt IRQ_WORK_VECTOR \ /* * Exception entry points. */ -.macro zeroentry sym do_sym -ENTRY(\sym) - INTR_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call error_entry - DEFAULT_FRAME 0 - movq %rsp,%rdi /* pt_regs pointer */ - xorl %esi,%esi /* no error code */ - call \do_sym - jmp error_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm +#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) -.macro paranoidzeroentry sym do_sym +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) - INTR_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call save_paranoid - TRACE_IRQS_OFF - movq %rsp,%rdi /* pt_regs pointer */ - xorl %esi,%esi /* no error code */ - call \do_sym - jmp paranoid_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm + /* Sanity check */ + .if \shift_ist != -1 && \paranoid == 0 + .error "using shift_ist requires paranoid=1" + .endif -#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) -.macro paranoidzeroentry_ist sym do_sym ist -ENTRY(\sym) + .if \has_error_code + XCPT_FRAME + .else INTR_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call save_paranoid - TRACE_IRQS_OFF_DEBUG - movq %rsp,%rdi /* pt_regs pointer */ - xorl %esi,%esi /* no error code */ - subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) - call \do_sym - addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) - jmp paranoid_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm + .endif -.macro errorentry sym do_sym -ENTRY(\sym) - XCPT_FRAME ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME + + .ifeq \has_error_code + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + .endif + subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + + .if \paranoid + call save_paranoid + .else call error_entry + .endif + DEFAULT_FRAME 0 + + .if \paranoid + .if \shift_ist != -1 + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + .else + TRACE_IRQS_OFF + .endif + .endif + movq %rsp,%rdi /* pt_regs pointer */ + + .if \has_error_code movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi,%esi /* no error code */ + .endif + + .if \shift_ist != -1 + subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + .endif + call \do_sym + + .if \shift_ist != -1 + addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + .endif + + .if \paranoid + jmp paranoid_exit /* %ebx: no swapgs flag */ + .else jmp error_exit /* %ebx: no swapgs flag */ + .endif + CFI_ENDPROC END(\sym) .endm #ifdef CONFIG_TRACING -.macro trace_errorentry sym do_sym -errorentry trace(\sym) trace(\do_sym) -errorentry \sym \do_sym +.macro trace_idtentry sym do_sym has_error_code:req +idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code +idtentry \sym \do_sym has_error_code=\has_error_code .endm #else -.macro trace_errorentry sym do_sym -errorentry \sym \do_sym +.macro trace_idtentry sym do_sym has_error_code:req +idtentry \sym \do_sym has_error_code=\has_error_code .endm #endif - /* error code is on the stack already */ -.macro paranoiderrorentry sym do_sym -ENTRY(\sym) - XCPT_FRAME - ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 - call save_paranoid - DEFAULT_FRAME 0 - TRACE_IRQS_OFF - movq %rsp,%rdi /* pt_regs pointer */ - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ - call \do_sym - jmp paranoid_exit /* %ebx: no swapgs flag */ - CFI_ENDPROC -END(\sym) -.endm - -zeroentry divide_error do_divide_error -zeroentry overflow do_overflow -zeroentry bounds do_bounds -zeroentry invalid_op do_invalid_op -zeroentry device_not_available do_device_not_available -paranoiderrorentry double_fault do_double_fault -zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun -errorentry invalid_TSS do_invalid_TSS -errorentry segment_not_present do_segment_not_present -zeroentry spurious_interrupt_bug do_spurious_interrupt_bug -zeroentry coprocessor_error do_coprocessor_error -errorentry alignment_check do_alignment_check -zeroentry simd_coprocessor_error do_simd_coprocessor_error +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry double_fault do_double_fault has_error_code=1 paranoid=1 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 /* Reload gs selector with exception handling */ @@ -1371,7 +1346,7 @@ ENTRY(do_softirq_own_stack) END(do_softirq_own_stack) #ifdef CONFIG_XEN -zeroentry xen_hypervisor_callback xen_do_hypervisor_callback +idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 /* * A note on the "critical region" in our callback handler. @@ -1482,21 +1457,21 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ */ .pushsection .kprobes.text, "ax" -paranoidzeroentry_ist debug do_debug DEBUG_STACK -paranoidzeroentry_ist int3 do_int3 DEBUG_STACK -paranoiderrorentry stack_segment do_stack_segment +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1 #ifdef CONFIG_XEN -zeroentry xen_debug do_debug -zeroentry xen_int3 do_int3 -errorentry xen_stack_segment do_stack_segment +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1 #endif -errorentry general_protection do_general_protection -trace_errorentry page_fault do_page_fault +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1 #ifdef CONFIG_KVM_GUEST -errorentry async_page_fault do_async_page_fault +idtentry async_page_fault do_async_page_fault has_error_code=1 #endif #ifdef CONFIG_X86_MCE -paranoidzeroentry machine_check *machine_check_vector(%rip) +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) #endif /* diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c index c3aae667284..d30acdc1229 100644 --- a/arch/x86/kernel/iosf_mbi.c +++ b/arch/x86/kernel/iosf_mbi.c @@ -25,6 +25,9 @@ #include <asm/iosf_mbi.h> +#define PCI_DEVICE_ID_BAYTRAIL 0x0F00 +#define PCI_DEVICE_ID_QUARK_X1000 0x0958 + static DEFINE_SPINLOCK(iosf_mbi_lock); static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset) @@ -177,6 +180,13 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) } EXPORT_SYMBOL(iosf_mbi_modify); +bool iosf_mbi_available(void) +{ + /* Mbi isn't hot-pluggable. No remove routine is provided */ + return mbi_pdev; +} +EXPORT_SYMBOL(iosf_mbi_available); + static int iosf_mbi_probe(struct pci_dev *pdev, const struct pci_device_id *unused) { @@ -193,7 +203,8 @@ static int iosf_mbi_probe(struct pci_dev *pdev, } static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0F00) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) }, { 0, }, }; MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 0331cb389d6..7e97371387f 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -259,7 +259,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) switch (kvm_read_and_reset_pf_reason()) { default: - do_page_fault(regs, error_code); + trace_do_page_fault(regs, error_code); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index af1d14a9ebd..dcbbaa165bd 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -20,6 +20,8 @@ #include <asm/mmu_context.h> #include <asm/syscalls.h> +int sysctl_ldt16 = 0; + #ifdef CONFIG_SMP static void flush_ldt(void *current_mm) { @@ -234,7 +236,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) * IRET leaking the high bits of the kernel stack address. */ #ifdef CONFIG_X86_64 - if (!ldt_info.seg_32bit) { + if (!ldt_info.seg_32bit && !sysctl_ldt16) { error = -EINVAL; goto out_unlock; } diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 2ed845928b5..ace22916ade 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -53,7 +53,7 @@ #define OPCODE1(insn) ((insn)->opcode.bytes[0]) #define OPCODE2(insn) ((insn)->opcode.bytes[1]) #define OPCODE3(insn) ((insn)->opcode.bytes[2]) -#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value) +#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -229,63 +229,6 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) return -ENOTSUPP; } -/* - * Figure out which fixups arch_uprobe_post_xol() will need to perform, and - * annotate arch_uprobe->fixups accordingly. To start with, - * arch_uprobe->fixups is either zero or it reflects rip-related fixups. - */ -static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) -{ - bool fix_ip = true, fix_call = false; /* defaults */ - int reg; - - insn_get_opcode(insn); /* should be a nop */ - - switch (OPCODE1(insn)) { - case 0x9d: - /* popf */ - auprobe->fixups |= UPROBE_FIX_SETF; - break; - case 0xc3: /* ret/lret */ - case 0xcb: - case 0xc2: - case 0xca: - /* ip is correct */ - fix_ip = false; - break; - case 0xe8: /* call relative - Fix return addr */ - fix_call = true; - break; - case 0x9a: /* call absolute - Fix return addr, not ip */ - fix_call = true; - fix_ip = false; - break; - case 0xff: - insn_get_modrm(insn); - reg = MODRM_REG(insn); - if (reg == 2 || reg == 3) { - /* call or lcall, indirect */ - /* Fix return addr; ip is correct. */ - fix_call = true; - fix_ip = false; - } else if (reg == 4 || reg == 5) { - /* jmp or ljmp, indirect */ - /* ip is correct. */ - fix_ip = false; - } - break; - case 0xea: /* jmp absolute -- ip is correct */ - fix_ip = false; - break; - default: - break; - } - if (fix_ip) - auprobe->fixups |= UPROBE_FIX_IP; - if (fix_call) - auprobe->fixups |= UPROBE_FIX_CALL; -} - #ifdef CONFIG_X86_64 /* * If arch_uprobe->insn doesn't use rip-relative addressing, return @@ -310,15 +253,11 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) * - The displacement is always 4 bytes. */ static void -handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) { u8 *cursor; u8 reg; - if (mm->context.ia32_compat) - return; - - auprobe->rip_rela_target_address = 0x0; if (!insn_rip_relative(insn)) return; @@ -372,7 +311,48 @@ handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct ins cursor++; memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes); } - return; +} + +/* + * If we're emulating a rip-relative instruction, save the contents + * of the scratch register and store the target address in that register. + */ +static void +pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ + if (auprobe->fixups & UPROBE_FIX_RIP_AX) { + autask->saved_scratch_register = regs->ax; + regs->ax = current->utask->vaddr; + regs->ax += auprobe->rip_rela_target_address; + } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { + autask->saved_scratch_register = regs->cx; + regs->cx = current->utask->vaddr; + regs->cx += auprobe->rip_rela_target_address; + } +} + +static void +handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +{ + if (auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { + struct arch_uprobe_task *autask; + + autask = ¤t->utask->autask; + if (auprobe->fixups & UPROBE_FIX_RIP_AX) + regs->ax = autask->saved_scratch_register; + else + regs->cx = autask->saved_scratch_register; + + /* + * The original instruction includes a displacement, and so + * is 4 bytes longer than what we've just single-stepped. + * Caller may need to apply other fixups to handle stuff + * like "jmpq *...(%rip)" and "callq *...(%rip)". + */ + if (correction) + *correction += 4; + } } static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) @@ -401,9 +381,19 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, return validate_insn_64bits(auprobe, insn); } #else /* 32-bit: */ -static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +/* + * No RIP-relative addressing on 32-bit + */ +static void handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) +{ +} +static void pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ +} +static void handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, + long *correction) { - /* No RIP-relative addressing on 32-bit */ } static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) @@ -412,141 +402,311 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, } #endif /* CONFIG_X86_64 */ -/** - * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. - * @mm: the probed address space. - * @arch_uprobe: the probepoint information. - * @addr: virtual address at which to install the probepoint - * Return 0 on success or a -ve number on error. +struct uprobe_xol_ops { + bool (*emulate)(struct arch_uprobe *, struct pt_regs *); + int (*pre_xol)(struct arch_uprobe *, struct pt_regs *); + int (*post_xol)(struct arch_uprobe *, struct pt_regs *); +}; + +static inline int sizeof_long(void) +{ + return is_ia32_task() ? 4 : 8; +} + +static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + pre_xol_rip_insn(auprobe, regs, ¤t->utask->autask); + return 0; +} + +/* + * Adjust the return address pushed by a call insn executed out of line. */ -int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) +static int adjust_ret_addr(unsigned long sp, long correction) { - int ret; - struct insn insn; + int rasize = sizeof_long(); + long ra; - auprobe->fixups = 0; - ret = validate_insn_bits(auprobe, mm, &insn); - if (ret != 0) - return ret; + if (copy_from_user(&ra, (void __user *)sp, rasize)) + return -EFAULT; - handle_riprel_insn(auprobe, mm, &insn); - prepare_fixups(auprobe, &insn); + ra += correction; + if (copy_to_user((void __user *)sp, &ra, rasize)) + return -EFAULT; return 0; } -#ifdef CONFIG_X86_64 -/* - * If we're emulating a rip-relative instruction, save the contents - * of the scratch register and store the target address in that register. - */ -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, - struct arch_uprobe_task *autask) +static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { - if (auprobe->fixups & UPROBE_FIX_RIP_AX) { - autask->saved_scratch_register = regs->ax; - regs->ax = current->utask->vaddr; - regs->ax += auprobe->rip_rela_target_address; - } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { - autask->saved_scratch_register = regs->cx; - regs->cx = current->utask->vaddr; - regs->cx += auprobe->rip_rela_target_address; + struct uprobe_task *utask = current->utask; + long correction = (long)(utask->vaddr - utask->xol_vaddr); + + handle_riprel_post_xol(auprobe, regs, &correction); + if (auprobe->fixups & UPROBE_FIX_IP) + regs->ip += correction; + + if (auprobe->fixups & UPROBE_FIX_CALL) { + if (adjust_ret_addr(regs->sp, correction)) { + regs->sp += sizeof_long(); + return -ERESTART; + } } + + return 0; } -#else -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, - struct arch_uprobe_task *autask) + +static struct uprobe_xol_ops default_xol_ops = { + .pre_xol = default_pre_xol_op, + .post_xol = default_post_xol_op, +}; + +static bool branch_is_call(struct arch_uprobe *auprobe) { - /* No RIP-relative addressing on 32-bit */ + return auprobe->branch.opc1 == 0xe8; } -#endif -/* - * arch_uprobe_pre_xol - prepare to execute out of line. - * @auprobe: the probepoint information. - * @regs: reflects the saved user state of current task. - */ -int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) -{ - struct arch_uprobe_task *autask; +#define CASE_COND \ + COND(70, 71, XF(OF)) \ + COND(72, 73, XF(CF)) \ + COND(74, 75, XF(ZF)) \ + COND(78, 79, XF(SF)) \ + COND(7a, 7b, XF(PF)) \ + COND(76, 77, XF(CF) || XF(ZF)) \ + COND(7c, 7d, XF(SF) != XF(OF)) \ + COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF)) - autask = ¤t->utask->autask; - autask->saved_trap_nr = current->thread.trap_nr; - current->thread.trap_nr = UPROBE_TRAP_NR; - regs->ip = current->utask->xol_vaddr; - pre_xol_rip_insn(auprobe, regs, autask); +#define COND(op_y, op_n, expr) \ + case 0x ## op_y: DO((expr) != 0) \ + case 0x ## op_n: DO((expr) == 0) - autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); - regs->flags |= X86_EFLAGS_TF; - if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) - set_task_blockstep(current, false); +#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf)) - return 0; +static bool is_cond_jmp_opcode(u8 opcode) +{ + switch (opcode) { + #define DO(expr) \ + return true; + CASE_COND + #undef DO + + default: + return false; + } } -/* - * This function is called by arch_uprobe_post_xol() to adjust the return - * address pushed by a call instruction executed out of line. - */ -static int adjust_ret_addr(unsigned long sp, long correction) +static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs) { - int rasize, ncopied; - long ra = 0; + unsigned long flags = regs->flags; - if (is_ia32_task()) - rasize = 4; - else - rasize = 8; + switch (auprobe->branch.opc1) { + #define DO(expr) \ + return expr; + CASE_COND + #undef DO - ncopied = copy_from_user(&ra, (void __user *)sp, rasize); - if (unlikely(ncopied)) - return -EFAULT; + default: /* not a conditional jmp */ + return true; + } +} - ra += correction; - ncopied = copy_to_user((void __user *)sp, &ra, rasize); - if (unlikely(ncopied)) - return -EFAULT; +#undef XF +#undef COND +#undef CASE_COND - return 0; +static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + unsigned long new_ip = regs->ip += auprobe->branch.ilen; + unsigned long offs = (long)auprobe->branch.offs; + + if (branch_is_call(auprobe)) { + unsigned long new_sp = regs->sp - sizeof_long(); + /* + * If it fails we execute this (mangled, see the comment in + * branch_clear_offset) insn out-of-line. In the likely case + * this should trigger the trap, and the probed application + * should die or restart the same insn after it handles the + * signal, arch_uprobe_post_xol() won't be even called. + * + * But there is corner case, see the comment in ->post_xol(). + */ + if (copy_to_user((void __user *)new_sp, &new_ip, sizeof_long())) + return false; + regs->sp = new_sp; + } else if (!check_jmp_cond(auprobe, regs)) { + offs = 0; + } + + regs->ip = new_ip + offs; + return true; } -#ifdef CONFIG_X86_64 -static bool is_riprel_insn(struct arch_uprobe *auprobe) +static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { - return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0); + BUG_ON(!branch_is_call(auprobe)); + /* + * We can only get here if branch_emulate_op() failed to push the ret + * address _and_ another thread expanded our stack before the (mangled) + * "call" insn was executed out-of-line. Just restore ->sp and restart. + * We could also restore ->ip and try to call branch_emulate_op() again. + */ + regs->sp += sizeof_long(); + return -ERESTART; } -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn) { - if (is_riprel_insn(auprobe)) { - struct arch_uprobe_task *autask; + /* + * Turn this insn into "call 1f; 1:", this is what we will execute + * out-of-line if ->emulate() fails. We only need this to generate + * a trap, so that the probed task receives the correct signal with + * the properly filled siginfo. + * + * But see the comment in ->post_xol(), in the unlikely case it can + * succeed. So we need to ensure that the new ->ip can not fall into + * the non-canonical area and trigger #GP. + * + * We could turn it into (say) "pushf", but then we would need to + * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte + * of ->insn[] for set_orig_insn(). + */ + memset(auprobe->insn + insn_offset_immediate(insn), + 0, insn->immediate.nbytes); +} - autask = ¤t->utask->autask; - if (auprobe->fixups & UPROBE_FIX_RIP_AX) - regs->ax = autask->saved_scratch_register; - else - regs->cx = autask->saved_scratch_register; +static struct uprobe_xol_ops branch_xol_ops = { + .emulate = branch_emulate_op, + .post_xol = branch_post_xol_op, +}; + +/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */ +static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) +{ + u8 opc1 = OPCODE1(insn); + + /* has the side-effect of processing the entire instruction */ + insn_get_length(insn); + if (WARN_ON_ONCE(!insn_complete(insn))) + return -ENOEXEC; + + switch (opc1) { + case 0xeb: /* jmp 8 */ + case 0xe9: /* jmp 32 */ + case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ + break; + + case 0xe8: /* call relative */ + branch_clear_offset(auprobe, insn); + break; + case 0x0f: + if (insn->opcode.nbytes != 2) + return -ENOSYS; /* - * The original instruction includes a displacement, and so - * is 4 bytes longer than what we've just single-stepped. - * Fall through to handle stuff like "jmpq *...(%rip)" and - * "callq *...(%rip)". + * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches + * OPCODE1() of the "short" jmp which checks the same condition. */ - if (correction) - *correction += 4; + opc1 = OPCODE2(insn) - 0x10; + default: + if (!is_cond_jmp_opcode(opc1)) + return -ENOSYS; } + + auprobe->branch.opc1 = opc1; + auprobe->branch.ilen = insn->length; + auprobe->branch.offs = insn->immediate.value; + + auprobe->ops = &branch_xol_ops; + return 0; } -#else -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) + +/** + * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. + * @mm: the probed address space. + * @arch_uprobe: the probepoint information. + * @addr: virtual address at which to install the probepoint + * Return 0 on success or a -ve number on error. + */ +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) +{ + struct insn insn; + bool fix_ip = true, fix_call = false; + int ret; + + ret = validate_insn_bits(auprobe, mm, &insn); + if (ret) + return ret; + + ret = branch_setup_xol_ops(auprobe, &insn); + if (ret != -ENOSYS) + return ret; + + /* + * Figure out which fixups arch_uprobe_post_xol() will need to perform, + * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups + * is either zero or it reflects rip-related fixups. + */ + switch (OPCODE1(&insn)) { + case 0x9d: /* popf */ + auprobe->fixups |= UPROBE_FIX_SETF; + break; + case 0xc3: /* ret or lret -- ip is correct */ + case 0xcb: + case 0xc2: + case 0xca: + fix_ip = false; + break; + case 0x9a: /* call absolute - Fix return addr, not ip */ + fix_call = true; + fix_ip = false; + break; + case 0xea: /* jmp absolute -- ip is correct */ + fix_ip = false; + break; + case 0xff: + insn_get_modrm(&insn); + switch (MODRM_REG(&insn)) { + case 2: case 3: /* call or lcall, indirect */ + fix_call = true; + case 4: case 5: /* jmp or ljmp, indirect */ + fix_ip = false; + } + /* fall through */ + default: + handle_riprel_insn(auprobe, &insn); + } + + if (fix_ip) + auprobe->fixups |= UPROBE_FIX_IP; + if (fix_call) + auprobe->fixups |= UPROBE_FIX_CALL; + + auprobe->ops = &default_xol_ops; + return 0; +} + +/* + * arch_uprobe_pre_xol - prepare to execute out of line. + * @auprobe: the probepoint information. + * @regs: reflects the saved user state of current task. + */ +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - /* No RIP-relative addressing on 32-bit */ + struct uprobe_task *utask = current->utask; + + regs->ip = utask->xol_vaddr; + utask->autask.saved_trap_nr = current->thread.trap_nr; + current->thread.trap_nr = UPROBE_TRAP_NR; + + utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF); + regs->flags |= X86_EFLAGS_TF; + if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) + set_task_blockstep(current, false); + + if (auprobe->ops->pre_xol) + return auprobe->ops->pre_xol(auprobe, regs); + return 0; } -#endif /* * If xol insn itself traps and generates a signal(Say, @@ -592,22 +752,25 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) */ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - struct uprobe_task *utask; - long correction; - int result = 0; + struct uprobe_task *utask = current->utask; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); - utask = current->utask; - current->thread.trap_nr = utask->autask.saved_trap_nr; - correction = (long)(utask->vaddr - utask->xol_vaddr); - handle_riprel_post_xol(auprobe, regs, &correction); - if (auprobe->fixups & UPROBE_FIX_IP) - regs->ip += correction; - - if (auprobe->fixups & UPROBE_FIX_CALL) - result = adjust_ret_addr(regs->sp, correction); + if (auprobe->ops->post_xol) { + int err = auprobe->ops->post_xol(auprobe, regs); + if (err) { + arch_uprobe_abort_xol(auprobe, regs); + /* + * Restart the probed insn. ->post_xol() must ensure + * this is really possible if it returns -ERESTART. + */ + if (err == -ERESTART) + return 0; + return err; + } + } + current->thread.trap_nr = utask->autask.saved_trap_nr; /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP * so we can get an extra SIGTRAP if we do not clear TF. We need @@ -618,7 +781,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) else if (!(auprobe->fixups & UPROBE_FIX_SETF)) regs->flags &= ~X86_EFLAGS_TF; - return result; + return 0; } /* callback routine for handling exceptions. */ @@ -652,8 +815,9 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, /* * This function gets called when XOL instruction either gets trapped or - * the thread has a fatal signal, so reset the instruction pointer to its - * probed address. + * the thread has a fatal signal, or if arch_uprobe_post_xol() failed. + * Reset the instruction pointer to its probed address for the potential + * restart or for post mortem analysis. */ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { @@ -668,25 +832,10 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) regs->flags &= ~X86_EFLAGS_TF; } -/* - * Skip these instructions as per the currently known x86 ISA. - * rep=0x66*; nop=0x90 - */ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - int i; - - for (i = 0; i < MAX_UINSN_BYTES; i++) { - if (auprobe->insn[i] == 0x66) - continue; - - if (auprobe->insn[i] == 0x90) { - regs->ip += i + 1; - return true; - } - - break; - } + if (auprobe->ops->emulate) + return auprobe->ops->emulate(auprobe, regs); return false; } @@ -701,23 +850,21 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { - int rasize, ncopied; + int rasize = sizeof_long(), nleft; unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */ - rasize = is_ia32_task() ? 4 : 8; - ncopied = copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize); - if (unlikely(ncopied)) + if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize)) return -1; /* check whether address has been already hijacked */ if (orig_ret_vaddr == trampoline_vaddr) return orig_ret_vaddr; - ncopied = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); - if (likely(!ncopied)) + nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); + if (likely(!nleft)) return orig_ret_vaddr; - if (ncopied != rasize) { + if (nleft != rasize) { pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, " "%%ip=%#lx\n", current->pid, regs->sp, regs->ip); |