diff options
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/aperture_64.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/apic/io_apic.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 8 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 37 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mtrr/cleanup.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event.c | 11 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event.h | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_intel.c | 145 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/perf_event_intel_ds.c | 9 | ||||
-rw-r--r-- | arch/x86/kernel/entry_32.S | 13 | ||||
-rw-r--r-- | arch/x86/kernel/entry_64.S | 44 | ||||
-rw-r--r-- | arch/x86/kernel/ftrace.c | 102 | ||||
-rw-r--r-- | arch/x86/kernel/hpet.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/kvmclock.c | 5 | ||||
-rw-r--r-- | arch/x86/kernel/nmi.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/nmi_selftest.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/pci-dma.c | 3 | ||||
-rw-r--r-- | arch/x86/kernel/ptrace.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/reboot.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/signal.c | 62 | ||||
-rw-r--r-- | arch/x86/kernel/smpboot.c | 26 | ||||
-rw-r--r-- | arch/x86/kernel/traps.c | 8 |
22 files changed, 342 insertions, 169 deletions
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 6e76c191a83..d5fd66f0d4c 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -20,7 +20,6 @@ #include <linux/bitops.h> #include <linux/ioport.h> #include <linux/suspend.h> -#include <linux/kmemleak.h> #include <asm/e820.h> #include <asm/io.h> #include <asm/iommu.h> @@ -95,11 +94,6 @@ static u32 __init allocate_aperture(void) return 0; } memblock_reserve(addr, aper_size); - /* - * Kmemleak should not scan this block as it may not be mapped via the - * kernel direct mapping. - */ - kmemleak_ignore(phys_to_virt(addr)); printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", aper_size >> 10, addr); insert_aperture_resource((u32)addr, aper_size); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ac96561d1a9..5f0ff597437 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1195,7 +1195,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) BUG_ON(!cfg->vector); vector = cfg->vector; - for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) + for_each_cpu(cpu, cfg->domain) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; @@ -1203,7 +1203,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) if (likely(!cfg->move_in_progress)) return; - for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { + for_each_cpu(cpu, cfg->old_domain) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 82f29e70d05..6b9333b429b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1101,14 +1101,20 @@ int is_debug_stack(unsigned long addr) addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); } +static DEFINE_PER_CPU(u32, debug_stack_use_ctr); + void debug_stack_set_zero(void) { + this_cpu_inc(debug_stack_use_ctr); load_idt((const struct desc_ptr *)&nmi_idt_descr); } void debug_stack_reset(void) { - load_idt((const struct desc_ptr *)&idt_descr); + if (WARN_ON(!this_cpu_read(debug_stack_use_ctr))) + return; + if (this_cpu_dec_return(debug_stack_use_ctr) == 0) + load_idt((const struct desc_ptr *)&idt_descr); } #else /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b772dd6ad45..da27c5d2168 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1251,15 +1251,15 @@ void mce_log_therm_throt_event(__u64 status) * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). */ -static int check_interval = 5 * 60; /* 5 minutes */ +static unsigned long check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ +static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mce_start_timer(unsigned long data) +static void mce_timer_fn(unsigned long data) { - struct timer_list *t = &per_cpu(mce_timer, data); - int *n; + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned long iv; WARN_ON(smp_processor_id() != data); @@ -1272,13 +1272,14 @@ static void mce_start_timer(unsigned long data) * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. */ - n = &__get_cpu_var(mce_next_interval); + iv = __this_cpu_read(mce_next_interval); if (mce_notify_irq()) - *n = max(*n/2, HZ/100); + iv = max(iv / 2, (unsigned long) HZ/100); else - *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); + iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); + __this_cpu_write(mce_next_interval, iv); - t->expires = jiffies + *n; + t->expires = jiffies + iv; add_timer_on(t, smp_processor_id()); } @@ -1472,9 +1473,9 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) rdmsrl(msrs[i], val); /* CntP bit set? */ - if (val & BIT(62)) { - val &= ~BIT(62); - wrmsrl(msrs[i], val); + if (val & BIT_64(62)) { + val &= ~BIT_64(62); + wrmsrl(msrs[i], val); } } @@ -1556,17 +1557,17 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) static void __mcheck_cpu_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(mce_next_interval); + unsigned long iv = check_interval * HZ; - setup_timer(t, mce_start_timer, smp_processor_id()); + setup_timer(t, mce_timer_fn, smp_processor_id()); if (mce_ignore_ce) return; - *n = check_interval * HZ; - if (!*n) + __this_cpu_write(mce_next_interval, iv); + if (!iv) return; - t->expires = round_jiffies(jiffies + *n); + t->expires = round_jiffies(jiffies + iv); add_timer_on(t, smp_processor_id()); } @@ -2276,7 +2277,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED_FROZEN: if (!mce_ignore_ce && check_interval) { t->expires = round_jiffies(jiffies + - __get_cpu_var(mce_next_interval)); + per_cpu(mce_next_interval, cpu)); add_timer_on(t, cpu); } smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index ac140c7be39..bdda2e6c673 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -266,7 +266,7 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, if (align > max_align) align = max_align; - sizek = 1 << align; + sizek = 1UL << align; if (debug_print) { char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e049d6da018..c4706cf9c01 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1496,6 +1496,7 @@ static struct cpu_hw_events *allocate_fake_cpuc(void) if (!cpuc->shared_regs) goto error; } + cpuc->is_fake = 1; return cpuc; error: free_fake_cpuc(cpuc); @@ -1756,6 +1757,12 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); } +static inline int +valid_user_frame(const void __user *fp, unsigned long size) +{ + return (__range_not_ok(fp, size, TASK_SIZE) == 0); +} + #ifdef CONFIG_COMPAT #include <asm/compat.h> @@ -1780,7 +1787,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) if (bytes != sizeof(frame)) break; - if (fp < compat_ptr(regs->sp)) + if (!valid_user_frame(fp, sizeof(frame))) break; perf_callchain_store(entry, frame.return_address); @@ -1826,7 +1833,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) if (bytes != sizeof(frame)) break; - if ((unsigned long)fp < regs->sp) + if (!valid_user_frame(fp, sizeof(frame))) break; perf_callchain_store(entry, frame.return_address); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 6638aaf5449..7241e2fc3c1 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -117,6 +117,7 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ unsigned int group_flag; + int is_fake; /* * Intel DebugStore bits @@ -364,6 +365,7 @@ struct x86_pmu { int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; + void (*pebs_aliases)(struct perf_event *event); /* * Intel LBR diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 166546ec6ae..187c294bc65 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1119,27 +1119,33 @@ intel_bts_constraints(struct perf_event *event) return NULL; } -static bool intel_try_alt_er(struct perf_event *event, int orig_idx) +static int intel_alt_er(int idx) { if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) - return false; + return idx; - if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) { - event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= 0x01bb; - event->hw.extra_reg.idx = EXTRA_REG_RSP_1; - event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; - } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) { + if (idx == EXTRA_REG_RSP_0) + return EXTRA_REG_RSP_1; + + if (idx == EXTRA_REG_RSP_1) + return EXTRA_REG_RSP_0; + + return idx; +} + +static void intel_fixup_er(struct perf_event *event, int idx) +{ + event->hw.extra_reg.idx = idx; + + if (idx == EXTRA_REG_RSP_0) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; event->hw.config |= 0x01b7; - event->hw.extra_reg.idx = EXTRA_REG_RSP_0; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } else if (idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= 0x01bb; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; } - - if (event->hw.extra_reg.idx == orig_idx) - return false; - - return true; } /* @@ -1157,14 +1163,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, struct event_constraint *c = &emptyconstraint; struct er_account *era; unsigned long flags; - int orig_idx = reg->idx; + int idx = reg->idx; - /* already allocated shared msr */ - if (reg->alloc) + /* + * reg->alloc can be set due to existing state, so for fake cpuc we + * need to ignore this, otherwise we might fail to allocate proper fake + * state for this extra reg constraint. Also see the comment below. + */ + if (reg->alloc && !cpuc->is_fake) return NULL; /* call x86_get_event_constraint() */ again: - era = &cpuc->shared_regs->regs[reg->idx]; + era = &cpuc->shared_regs->regs[idx]; /* * we use spin_lock_irqsave() to avoid lockdep issues when * passing a fake cpuc @@ -1173,6 +1183,29 @@ again: if (!atomic_read(&era->ref) || era->config == reg->config) { + /* + * If its a fake cpuc -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + * + * Not doing the ER fixup will only result in era->reg being + * wrong, but since we won't actually try and program hardware + * this isn't a problem either. + */ + if (!cpuc->is_fake) { + if (idx != reg->idx) + intel_fixup_er(event, idx); + + /* + * x86_schedule_events() can call get_event_constraints() + * multiple times on events in the case of incremental + * scheduling(). reg->alloc ensures we only do the ER + * allocation once. + */ + reg->alloc = 1; + } + /* lock in msr value */ era->config = reg->config; era->reg = reg->reg; @@ -1180,17 +1213,17 @@ again: /* one more user */ atomic_inc(&era->ref); - /* no need to reallocate during incremental event scheduling */ - reg->alloc = 1; - /* * need to call x86_get_event_constraint() * to check if associated event has constraints */ c = NULL; - } else if (intel_try_alt_er(event, orig_idx)) { - raw_spin_unlock_irqrestore(&era->lock, flags); - goto again; + } else { + idx = intel_alt_er(idx); + if (idx != reg->idx) { + raw_spin_unlock_irqrestore(&era->lock, flags); + goto again; + } } raw_spin_unlock_irqrestore(&era->lock, flags); @@ -1204,11 +1237,14 @@ __intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, struct er_account *era; /* - * only put constraint if extra reg was actually - * allocated. Also takes care of event which do - * not use an extra shared reg + * Only put constraint if extra reg was actually allocated. Also takes + * care of event which do not use an extra shared reg. + * + * Also, if this is a fake cpuc we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent cpuc state + * either since it'll be thrown out. */ - if (!reg->alloc) + if (!reg->alloc || cpuc->is_fake) return; era = &cpuc->shared_regs->regs[reg->idx]; @@ -1300,15 +1336,9 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc, intel_put_shared_regs_event_constraints(cpuc, event); } -static int intel_pmu_hw_config(struct perf_event *event) +static void intel_pebs_aliases_core2(struct perf_event *event) { - int ret = x86_pmu_hw_config(event); - - if (ret) - return ret; - - if (event->attr.precise_ip && - (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { /* * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P * (0x003c) so that we can use it with PEBS. @@ -1329,10 +1359,48 @@ static int intel_pmu_hw_config(struct perf_event *event) */ u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_snb(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use UOPS_RETIRED.ALL + * (0x01c2), which is a PEBS capable event, to get the same + * count. + * + * UOPS_RETIRED.ALL counts the number of cycles that retires + * CNTMASK micro-ops. By setting CNTMASK to a value (16) + * larger than the maximum number of micro-ops that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less micro-ops, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16); alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); event->hw.config = alt_config; } +} + +static int intel_pmu_hw_config(struct perf_event *event) +{ + int ret = x86_pmu_hw_config(event); + + if (ret) + return ret; + + if (event->attr.precise_ip && x86_pmu.pebs_aliases) + x86_pmu.pebs_aliases(event); if (intel_pmu_needs_lbr_smpl(event)) { ret = intel_pmu_setup_lbr_filter(event); @@ -1607,6 +1675,7 @@ static __initconst const struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .get_event_constraints = intel_get_event_constraints, .put_event_constraints = intel_put_event_constraints, + .pebs_aliases = intel_pebs_aliases_core2, .format_attrs = intel_arch3_formats_attr, @@ -1840,8 +1909,9 @@ __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ - x86_add_quirk(intel_sandybridge_quirk); case 45: /* SandyBridge, "Romely-EP" */ + x86_add_quirk(intel_sandybridge_quirk); + case 58: /* IvyBridge */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -1849,6 +1919,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ x86_pmu.er_flags |= ERF_HAS_RSP_1; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 5a3edc27f6e..35e2192df9f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -400,14 +400,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */ - INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */ - INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */ - INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 01ccf9b7147..623f2883747 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -316,7 +316,6 @@ ret_from_exception: preempt_stop(CLBR_ANY) ret_from_intr: GET_THREAD_INFO(%ebp) -resume_userspace_sig: #ifdef CONFIG_VM86 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS movb PT_CS(%esp), %al @@ -615,9 +614,13 @@ work_notifysig: # deal with pending signals and # vm86-space TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) + movb PT_CS(%esp), %bl + andb $SEGMENT_RPL_MASK, %bl + cmpb $USER_RPL, %bl + jb resume_kernel xorl %edx, %edx call do_notify_resume - jmp resume_userspace_sig + jmp resume_userspace ALIGN work_notifysig_v86: @@ -630,9 +633,13 @@ work_notifysig_v86: #endif TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) + movb PT_CS(%esp), %bl + andb $SEGMENT_RPL_MASK, %bl + cmpb $USER_RPL, %bl + jb resume_kernel xorl %edx, %edx call do_notify_resume - jmp resume_userspace_sig + jmp resume_userspace END(work_pending) # perform syscall exit tracing diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 320852d0202..7d65133b51b 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -191,6 +191,44 @@ ENDPROC(native_usergs_sysret64) .endm /* + * When dynamic function tracer is enabled it will add a breakpoint + * to all locations that it is about to modify, sync CPUs, update + * all the code, sync CPUs, then remove the breakpoints. In this time + * if lockdep is enabled, it might jump back into the debug handler + * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). + * + * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to + * make sure the stack pointer does not get reset back to the top + * of the debug stack, and instead just reuses the current stack. + */ +#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) + +.macro TRACE_IRQS_OFF_DEBUG + call debug_stack_set_zero + TRACE_IRQS_OFF + call debug_stack_reset +.endm + +.macro TRACE_IRQS_ON_DEBUG + call debug_stack_set_zero + TRACE_IRQS_ON + call debug_stack_reset +.endm + +.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON_DEBUG +1: +.endm + +#else +# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF +# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON +# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ +#endif + +/* * C code is not supposed to know about undefined top of stack. Every time * a C function with an pt_regs argument is called from the SYSCALL based * fast path FIXUP_TOP_OF_STACK is needed. @@ -1098,7 +1136,7 @@ ENTRY(\sym) subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid - TRACE_IRQS_OFF + TRACE_IRQS_OFF_DEBUG movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) @@ -1393,7 +1431,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip) ENTRY(paranoid_exit) DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF + TRACE_IRQS_OFF_DEBUG testl %ebx,%ebx /* swapgs needed? */ jnz paranoid_restore testl $3,CS(%rsp) @@ -1404,7 +1442,7 @@ paranoid_swapgs: RESTORE_ALL 8 jmp irq_return paranoid_restore: - TRACE_IRQS_IRETQ 0 + TRACE_IRQS_IRETQ_DEBUG 0 RESTORE_ALL 8 jmp irq_return paranoid_userspace: diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 32ff36596ab..c3a7cb4bf6e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -100,7 +100,7 @@ static const unsigned char *ftrace_nop_replace(void) } static int -ftrace_modify_code(unsigned long ip, unsigned const char *old_code, +ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, unsigned const char *new_code) { unsigned char replaced[MCOUNT_INSN_SIZE]; @@ -141,7 +141,20 @@ int ftrace_make_nop(struct module *mod, old = ftrace_call_replace(ip, addr); new = ftrace_nop_replace(); - return ftrace_modify_code(rec->ip, old, new); + /* + * On boot up, and when modules are loaded, the MCOUNT_ADDR + * is converted to a nop, and will never become MCOUNT_ADDR + * again. This code is either running before SMP (on boot up) + * or before the code will ever be executed (module load). + * We do not want to use the breakpoint version in this case, + * just modify the code directly. + */ + if (addr == MCOUNT_ADDR) + return ftrace_modify_code_direct(rec->ip, old, new); + + /* Normal cases use add_brk_on_nop */ + WARN_ONCE(1, "invalid use of ftrace_make_nop"); + return -EINVAL; } int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) @@ -152,9 +165,47 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) old = ftrace_nop_replace(); new = ftrace_call_replace(ip, addr); - return ftrace_modify_code(rec->ip, old, new); + /* Should only be called when module is loaded */ + return ftrace_modify_code_direct(rec->ip, old, new); } +/* + * The modifying_ftrace_code is used to tell the breakpoint + * handler to call ftrace_int3_handler(). If it fails to + * call this handler for a breakpoint added by ftrace, then + * the kernel may crash. + * + * As atomic_writes on x86 do not need a barrier, we do not + * need to add smp_mb()s for this to work. It is also considered + * that we can not read the modifying_ftrace_code before + * executing the breakpoint. That would be quite remarkable if + * it could do that. Here's the flow that is required: + * + * CPU-0 CPU-1 + * + * atomic_inc(mfc); + * write int3s + * <trap-int3> // implicit (r)mb + * if (atomic_read(mfc)) + * call ftrace_int3_handler() + * + * Then when we are finished: + * + * atomic_dec(mfc); + * + * If we hit a breakpoint that was not set by ftrace, it does not + * matter if ftrace_int3_handler() is called or not. It will + * simply be ignored. But it is crucial that a ftrace nop/caller + * breakpoint is handled. No other user should ever place a + * breakpoint on an ftrace nop/caller location. It must only + * be done by this code. + */ +atomic_t modifying_ftrace_code __read_mostly; + +static int +ftrace_modify_code(unsigned long ip, unsigned const char *old_code, + unsigned const char *new_code); + int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); @@ -163,13 +214,17 @@ int ftrace_update_ftrace_func(ftrace_func_t func) memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, (unsigned long)func); + + /* See comment above by declaration of modifying_ftrace_code */ + atomic_inc(&modifying_ftrace_code); + ret = ftrace_modify_code(ip, old, new); + atomic_dec(&modifying_ftrace_code); + return ret; } -int modifying_ftrace_code __read_mostly; - /* * A breakpoint was added to the code address we are about to * modify, and this is the handle that will just skip over it. @@ -489,13 +544,46 @@ void ftrace_replace_code(int enable) } } +static int +ftrace_modify_code(unsigned long ip, unsigned const char *old_code, + unsigned const char *new_code) +{ + int ret; + + ret = add_break(ip, old_code); + if (ret) + goto out; + + run_sync(); + + ret = add_update_code(ip, new_code); + if (ret) + goto fail_update; + + run_sync(); + + ret = ftrace_write(ip, new_code, 1); + if (ret) { + ret = -EPERM; + goto out; + } + run_sync(); + out: + return ret; + + fail_update: + probe_kernel_write((void *)ip, &old_code[0], 1); + goto out; +} + void arch_ftrace_update_code(int command) { - modifying_ftrace_code++; + /* See comment above by declaration of modifying_ftrace_code */ + atomic_inc(&modifying_ftrace_code); ftrace_modify_all_code(command); - modifying_ftrace_code--; + atomic_dec(&modifying_ftrace_code); } int __init ftrace_dyn_arch_init(void *data) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 9cc7b4392f7..1460a5df92f 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -870,7 +870,7 @@ int __init hpet_enable(void) else pr_warn("HPET initial state will not be saved\n"); cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_Tn_CFG(i)); + hpet_writel(cfg, HPET_CFG); if (cfg) pr_warn("HPET: Unrecognized bits %#x set in global cfg\n", cfg); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 086eb58c6e8..f1b42b3a186 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -120,11 +120,6 @@ bool kvm_check_and_clear_guest_paused(void) bool ret = false; struct pvclock_vcpu_time_info *src; - /* - * per_cpu() is safe here because this function is only called from - * timer functions where preemption is already disabled. - */ - WARN_ON(!in_atomic()); src = &__get_cpu_var(hv_clock); if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 90875279ef3..a0b2f84457b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -444,14 +444,16 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs) */ if (unlikely(is_debug_stack(regs->sp))) { debug_stack_set_zero(); - __get_cpu_var(update_debug_stack) = 1; + this_cpu_write(update_debug_stack, 1); } } static inline void nmi_nesting_postprocess(void) { - if (unlikely(__get_cpu_var(update_debug_stack))) + if (unlikely(this_cpu_read(update_debug_stack))) { debug_stack_reset(); + this_cpu_write(update_debug_stack, 0); + } } #endif diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index e31bf8d5c4d..149b8d9c6ad 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -42,7 +42,7 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) static void __init init_nmi_testsuite(void) { /* trap all the unknown NMIs we may generate */ - register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); + register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); } static void __init cleanup_nmi_testsuite(void) @@ -64,7 +64,7 @@ static void __init test_nmi_ipi(struct cpumask *mask) { unsigned long timeout; - if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, + if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback, NMI_FLAG_FIRST, "nmi_selftest")) { nmi_fail = FAILURE; return; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 62c9457ccd2..c0f420f76cd 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -100,7 +100,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, struct dma_attrs *attrs) { unsigned long dma_mask; - struct page *page = NULL; + struct page *page; unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; dma_addr_t addr; @@ -108,6 +108,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, flag |= __GFP_ZERO; again: + page = NULL; if (!(flag & GFP_ATOMIC)) page = dma_alloc_from_contiguous(dev, count, get_order(size)); if (!page) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 13b1990c7c5..c4c6a5c2bf0 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1211,12 +1211,6 @@ static long x32_arch_ptrace(struct task_struct *child, 0, sizeof(struct user_i387_struct), datap); - /* normal 64bit interface to access TLS data. - Works just like arch_prctl, except that the arguments - are reversed. */ - case PTRACE_ARCH_PRCTL: - return do_arch_prctl(child, data, addr); - default: return compat_ptrace_request(child, request, addr, data); } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 79c45af8160..25b48edb847 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -639,9 +639,11 @@ void native_machine_shutdown(void) set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); /* - * O.K Now that I'm on the appropriate processor, - * stop all of the others. + * O.K Now that I'm on the appropriate processor, stop all of the + * others. Also disable the local irq to not receive the per-cpu + * timer interrupt which may trigger scheduler's load balance. */ + local_irq_disable(); stop_other_cpus(); #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 965dfda0fd5..21af737053a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -555,7 +555,6 @@ unsigned long sys_sigreturn(struct pt_regs *regs) sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc, &ax)) @@ -581,7 +580,6 @@ long sys_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) @@ -647,42 +645,28 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs) { int usig = signr_convert(sig); - sigset_t *set = ¤t->blocked; - int ret; - - if (current_thread_info()->status & TS_RESTORE_SIGMASK) - set = ¤t->saved_sigmask; + sigset_t *set = sigmask_to_save(); /* Set up the stack frame */ if (is_ia32) { if (ka->sa.sa_flags & SA_SIGINFO) - ret = ia32_setup_rt_frame(usig, ka, info, set, regs); + return ia32_setup_rt_frame(usig, ka, info, set, regs); else - ret = ia32_setup_frame(usig, ka, set, regs); + return ia32_setup_frame(usig, ka, set, regs); #ifdef CONFIG_X86_X32_ABI } else if (is_x32) { - ret = x32_setup_rt_frame(usig, ka, info, + return x32_setup_rt_frame(usig, ka, info, (compat_sigset_t *)set, regs); #endif } else { - ret = __setup_rt_frame(sig, ka, info, set, regs); - } - - if (ret) { - force_sigsegv(sig, current); - return -EFAULT; + return __setup_rt_frame(sig, ka, info, set, regs); } - - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - return ret; } -static int +static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { - int ret; - /* Are we from a system call? */ if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ @@ -713,10 +697,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; - ret = setup_rt_frame(sig, ka, info, regs); - - if (ret) - return ret; + if (setup_rt_frame(sig, ka, info, regs) < 0) { + force_sigsegv(sig, current); + return; + } /* * Clear the direction flag as per the ABI for function entry. @@ -731,12 +715,8 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ regs->flags &= ~X86_EFLAGS_TF; - block_sigmask(ka, sig); - - tracehook_signal_handler(sig, info, ka, regs, - test_thread_flag(TIF_SINGLESTEP)); - - return 0; + signal_delivered(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); } #ifdef CONFIG_X86_32 @@ -757,16 +737,6 @@ static void do_signal(struct pt_regs *regs) siginfo_t info; int signr; - /* - * We want the common case to go fast, which is why we may in certain - * cases get here from kernel mode. Just return without doing anything - * if so. - * X86_32: vm86 regs switched out by assembly code before reaching - * here, so testing against kernel CS suffices. - */ - if (!user_mode(regs)) - return; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal. */ @@ -796,10 +766,7 @@ static void do_signal(struct pt_regs *regs) * If there's no signal to deliver, we just put the saved sigmask * back. */ - if (current_thread_info()->status & TS_RESTORE_SIGMASK) { - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - set_current_blocked(¤t->saved_sigmask); - } + restore_saved_sigmask(); } /* @@ -827,8 +794,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) fire_user_return_notifiers(); @@ -936,7 +901,6 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f56f96da77f..7bd8a082365 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -349,9 +349,12 @@ static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (c->phys_proc_id == o->phys_proc_id) - return topology_sane(c, o, "mc"); + if (c->phys_proc_id == o->phys_proc_id) { + if (cpu_has(c, X86_FEATURE_AMD_DCM)) + return true; + return topology_sane(c, o, "mc"); + } return false; } @@ -382,6 +385,15 @@ void __cpuinit set_cpu_sibling_map(int cpu) if ((i == cpu) || (has_mc && match_llc(c, o))) link_mask(llc_shared, cpu, i); + } + + /* + * This needs a separate iteration over the cpus because we rely on all + * cpu_sibling_mask links to be set-up. + */ + for_each_cpu(i, cpu_sibling_setup_mask) { + o = &cpu_data(i); + if ((i == cpu) || (has_mc && match_mc(c, o))) { link_mask(core, cpu, i); @@ -410,15 +422,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) /* maps the cpu to the sched domain representing multi-core */ const struct cpumask *cpu_coregroup_mask(int cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - /* - * For perf, we return last level cache shared map. - * And for power savings, we return cpu_core_map - */ - if (!(cpu_has(c, X86_FEATURE_AMD_DCM))) - return cpu_core_mask(cpu); - else - return cpu_llc_shared_mask(cpu); + return cpu_llc_shared_mask(cpu); } static void impress_friends(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ff08457a025..05b31d92f69 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -303,8 +303,12 @@ gp_in_kernel: dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) { #ifdef CONFIG_DYNAMIC_FTRACE - /* ftrace must be first, everything else may cause a recursive crash */ - if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs)) + /* + * ftrace must be first, everything else may cause a recursive crash. + * See note by declaration of modifying_ftrace_code in ftrace.c + */ + if (unlikely(atomic_read(&modifying_ftrace_code)) && + ftrace_int3_handler(regs)) return; #endif #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP |