summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/aperture_64.c6
-rw-r--r--arch/x86/kernel/apic/io_apic.c4
-rw-r--r--arch/x86/kernel/cpu/common.c8
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c37
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event.c11
-rw-r--r--arch/x86/kernel/cpu/perf_event.h2
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c145
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c9
-rw-r--r--arch/x86/kernel/entry_32.S13
-rw-r--r--arch/x86/kernel/entry_64.S44
-rw-r--r--arch/x86/kernel/ftrace.c102
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/kvmclock.c5
-rw-r--r--arch/x86/kernel/nmi.c6
-rw-r--r--arch/x86/kernel/nmi_selftest.c4
-rw-r--r--arch/x86/kernel/pci-dma.c3
-rw-r--r--arch/x86/kernel/ptrace.c6
-rw-r--r--arch/x86/kernel/reboot.c6
-rw-r--r--arch/x86/kernel/signal.c62
-rw-r--r--arch/x86/kernel/smpboot.c26
-rw-r--r--arch/x86/kernel/traps.c8
22 files changed, 342 insertions, 169 deletions
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 6e76c191a83..d5fd66f0d4c 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -20,7 +20,6 @@
#include <linux/bitops.h>
#include <linux/ioport.h>
#include <linux/suspend.h>
-#include <linux/kmemleak.h>
#include <asm/e820.h>
#include <asm/io.h>
#include <asm/iommu.h>
@@ -95,11 +94,6 @@ static u32 __init allocate_aperture(void)
return 0;
}
memblock_reserve(addr, aper_size);
- /*
- * Kmemleak should not scan this block as it may not be mapped via the
- * kernel direct mapping.
- */
- kmemleak_ignore(phys_to_virt(addr));
printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
aper_size >> 10, addr);
insert_aperture_resource((u32)addr, aper_size);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ac96561d1a9..5f0ff597437 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1195,7 +1195,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
BUG_ON(!cfg->vector);
vector = cfg->vector;
- for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+ for_each_cpu(cpu, cfg->domain)
per_cpu(vector_irq, cpu)[vector] = -1;
cfg->vector = 0;
@@ -1203,7 +1203,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
if (likely(!cfg->move_in_progress))
return;
- for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+ for_each_cpu(cpu, cfg->old_domain) {
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
vector++) {
if (per_cpu(vector_irq, cpu)[vector] != irq)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 82f29e70d05..6b9333b429b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1101,14 +1101,20 @@ int is_debug_stack(unsigned long addr)
addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
}
+static DEFINE_PER_CPU(u32, debug_stack_use_ctr);
+
void debug_stack_set_zero(void)
{
+ this_cpu_inc(debug_stack_use_ctr);
load_idt((const struct desc_ptr *)&nmi_idt_descr);
}
void debug_stack_reset(void)
{
- load_idt((const struct desc_ptr *)&idt_descr);
+ if (WARN_ON(!this_cpu_read(debug_stack_use_ctr)))
+ return;
+ if (this_cpu_dec_return(debug_stack_use_ctr) == 0)
+ load_idt((const struct desc_ptr *)&idt_descr);
}
#else /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index b772dd6ad45..da27c5d2168 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1251,15 +1251,15 @@ void mce_log_therm_throt_event(__u64 status)
* poller finds an MCE, poll 2x faster. When the poller finds no more
* errors, poll 2x slower (up to check_interval seconds).
*/
-static int check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = 5 * 60; /* 5 minutes */
-static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
+static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static void mce_start_timer(unsigned long data)
+static void mce_timer_fn(unsigned long data)
{
- struct timer_list *t = &per_cpu(mce_timer, data);
- int *n;
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned long iv;
WARN_ON(smp_processor_id() != data);
@@ -1272,13 +1272,14 @@ static void mce_start_timer(unsigned long data)
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
- n = &__get_cpu_var(mce_next_interval);
+ iv = __this_cpu_read(mce_next_interval);
if (mce_notify_irq())
- *n = max(*n/2, HZ/100);
+ iv = max(iv / 2, (unsigned long) HZ/100);
else
- *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
+ iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+ __this_cpu_write(mce_next_interval, iv);
- t->expires = jiffies + *n;
+ t->expires = jiffies + iv;
add_timer_on(t, smp_processor_id());
}
@@ -1472,9 +1473,9 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
rdmsrl(msrs[i], val);
/* CntP bit set? */
- if (val & BIT(62)) {
- val &= ~BIT(62);
- wrmsrl(msrs[i], val);
+ if (val & BIT_64(62)) {
+ val &= ~BIT_64(62);
+ wrmsrl(msrs[i], val);
}
}
@@ -1556,17 +1557,17 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
static void __mcheck_cpu_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
- int *n = &__get_cpu_var(mce_next_interval);
+ unsigned long iv = check_interval * HZ;
- setup_timer(t, mce_start_timer, smp_processor_id());
+ setup_timer(t, mce_timer_fn, smp_processor_id());
if (mce_ignore_ce)
return;
- *n = check_interval * HZ;
- if (!*n)
+ __this_cpu_write(mce_next_interval, iv);
+ if (!iv)
return;
- t->expires = round_jiffies(jiffies + *n);
+ t->expires = round_jiffies(jiffies + iv);
add_timer_on(t, smp_processor_id());
}
@@ -2276,7 +2277,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DOWN_FAILED_FROZEN:
if (!mce_ignore_ce && check_interval) {
t->expires = round_jiffies(jiffies +
- __get_cpu_var(mce_next_interval));
+ per_cpu(mce_next_interval, cpu));
add_timer_on(t, cpu);
}
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ac140c7be39..bdda2e6c673 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -266,7 +266,7 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
if (align > max_align)
align = max_align;
- sizek = 1 << align;
+ sizek = 1UL << align;
if (debug_print) {
char start_factor = 'K', size_factor = 'K';
unsigned long start_base, size_base;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e049d6da018..c4706cf9c01 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1496,6 +1496,7 @@ static struct cpu_hw_events *allocate_fake_cpuc(void)
if (!cpuc->shared_regs)
goto error;
}
+ cpuc->is_fake = 1;
return cpuc;
error:
free_fake_cpuc(cpuc);
@@ -1756,6 +1757,12 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
}
+static inline int
+valid_user_frame(const void __user *fp, unsigned long size)
+{
+ return (__range_not_ok(fp, size, TASK_SIZE) == 0);
+}
+
#ifdef CONFIG_COMPAT
#include <asm/compat.h>
@@ -1780,7 +1787,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
if (bytes != sizeof(frame))
break;
- if (fp < compat_ptr(regs->sp))
+ if (!valid_user_frame(fp, sizeof(frame)))
break;
perf_callchain_store(entry, frame.return_address);
@@ -1826,7 +1833,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
if (bytes != sizeof(frame))
break;
- if ((unsigned long)fp < regs->sp)
+ if (!valid_user_frame(fp, sizeof(frame)))
break;
perf_callchain_store(entry, frame.return_address);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 6638aaf5449..7241e2fc3c1 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -117,6 +117,7 @@ struct cpu_hw_events {
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
unsigned int group_flag;
+ int is_fake;
/*
* Intel DebugStore bits
@@ -364,6 +365,7 @@ struct x86_pmu {
int pebs_record_size;
void (*drain_pebs)(struct pt_regs *regs);
struct event_constraint *pebs_constraints;
+ void (*pebs_aliases)(struct perf_event *event);
/*
* Intel LBR
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 166546ec6ae..187c294bc65 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1119,27 +1119,33 @@ intel_bts_constraints(struct perf_event *event)
return NULL;
}
-static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
+static int intel_alt_er(int idx)
{
if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
- return false;
+ return idx;
- if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
- event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
- event->hw.config |= 0x01bb;
- event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
- event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
- } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
+ if (idx == EXTRA_REG_RSP_0)
+ return EXTRA_REG_RSP_1;
+
+ if (idx == EXTRA_REG_RSP_1)
+ return EXTRA_REG_RSP_0;
+
+ return idx;
+}
+
+static void intel_fixup_er(struct perf_event *event, int idx)
+{
+ event->hw.extra_reg.idx = idx;
+
+ if (idx == EXTRA_REG_RSP_0) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
event->hw.config |= 0x01b7;
- event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
+ } else if (idx == EXTRA_REG_RSP_1) {
+ event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+ event->hw.config |= 0x01bb;
+ event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
}
-
- if (event->hw.extra_reg.idx == orig_idx)
- return false;
-
- return true;
}
/*
@@ -1157,14 +1163,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
struct event_constraint *c = &emptyconstraint;
struct er_account *era;
unsigned long flags;
- int orig_idx = reg->idx;
+ int idx = reg->idx;
- /* already allocated shared msr */
- if (reg->alloc)
+ /*
+ * reg->alloc can be set due to existing state, so for fake cpuc we
+ * need to ignore this, otherwise we might fail to allocate proper fake
+ * state for this extra reg constraint. Also see the comment below.
+ */
+ if (reg->alloc && !cpuc->is_fake)
return NULL; /* call x86_get_event_constraint() */
again:
- era = &cpuc->shared_regs->regs[reg->idx];
+ era = &cpuc->shared_regs->regs[idx];
/*
* we use spin_lock_irqsave() to avoid lockdep issues when
* passing a fake cpuc
@@ -1173,6 +1183,29 @@ again:
if (!atomic_read(&era->ref) || era->config == reg->config) {
+ /*
+ * If its a fake cpuc -- as per validate_{group,event}() we
+ * shouldn't touch event state and we can avoid doing so
+ * since both will only call get_event_constraints() once
+ * on each event, this avoids the need for reg->alloc.
+ *
+ * Not doing the ER fixup will only result in era->reg being
+ * wrong, but since we won't actually try and program hardware
+ * this isn't a problem either.
+ */
+ if (!cpuc->is_fake) {
+ if (idx != reg->idx)
+ intel_fixup_er(event, idx);
+
+ /*
+ * x86_schedule_events() can call get_event_constraints()
+ * multiple times on events in the case of incremental
+ * scheduling(). reg->alloc ensures we only do the ER
+ * allocation once.
+ */
+ reg->alloc = 1;
+ }
+
/* lock in msr value */
era->config = reg->config;
era->reg = reg->reg;
@@ -1180,17 +1213,17 @@ again:
/* one more user */
atomic_inc(&era->ref);
- /* no need to reallocate during incremental event scheduling */
- reg->alloc = 1;
-
/*
* need to call x86_get_event_constraint()
* to check if associated event has constraints
*/
c = NULL;
- } else if (intel_try_alt_er(event, orig_idx)) {
- raw_spin_unlock_irqrestore(&era->lock, flags);
- goto again;
+ } else {
+ idx = intel_alt_er(idx);
+ if (idx != reg->idx) {
+ raw_spin_unlock_irqrestore(&era->lock, flags);
+ goto again;
+ }
}
raw_spin_unlock_irqrestore(&era->lock, flags);
@@ -1204,11 +1237,14 @@ __intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
struct er_account *era;
/*
- * only put constraint if extra reg was actually
- * allocated. Also takes care of event which do
- * not use an extra shared reg
+ * Only put constraint if extra reg was actually allocated. Also takes
+ * care of event which do not use an extra shared reg.
+ *
+ * Also, if this is a fake cpuc we shouldn't touch any event state
+ * (reg->alloc) and we don't care about leaving inconsistent cpuc state
+ * either since it'll be thrown out.
*/
- if (!reg->alloc)
+ if (!reg->alloc || cpuc->is_fake)
return;
era = &cpuc->shared_regs->regs[reg->idx];
@@ -1300,15 +1336,9 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
intel_put_shared_regs_event_constraints(cpuc, event);
}
-static int intel_pmu_hw_config(struct perf_event *event)
+static void intel_pebs_aliases_core2(struct perf_event *event)
{
- int ret = x86_pmu_hw_config(event);
-
- if (ret)
- return ret;
-
- if (event->attr.precise_ip &&
- (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
/*
* Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
* (0x003c) so that we can use it with PEBS.
@@ -1329,10 +1359,48 @@ static int intel_pmu_hw_config(struct perf_event *event)
*/
u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+}
+
+static void intel_pebs_aliases_snb(struct perf_event *event)
+{
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use UOPS_RETIRED.ALL
+ * (0x01c2), which is a PEBS capable event, to get the same
+ * count.
+ *
+ * UOPS_RETIRED.ALL counts the number of cycles that retires
+ * CNTMASK micro-ops. By setting CNTMASK to a value (16)
+ * larger than the maximum number of micro-ops that can be
+ * retired per cycle (4) and then inverting the condition, we
+ * count all cycles that retire 16 or less micro-ops, which
+ * is every cycle.
+ *
+ * Thereby we gain a PEBS capable cycle counter.
+ */
+ u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
event->hw.config = alt_config;
}
+}
+
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+ int ret = x86_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+
+ if (event->attr.precise_ip && x86_pmu.pebs_aliases)
+ x86_pmu.pebs_aliases(event);
if (intel_pmu_needs_lbr_smpl(event)) {
ret = intel_pmu_setup_lbr_filter(event);
@@ -1607,6 +1675,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.max_period = (1ULL << 31) - 1,
.get_event_constraints = intel_get_event_constraints,
.put_event_constraints = intel_put_event_constraints,
+ .pebs_aliases = intel_pebs_aliases_core2,
.format_attrs = intel_arch3_formats_attr,
@@ -1840,8 +1909,9 @@ __init int intel_pmu_init(void)
break;
case 42: /* SandyBridge */
- x86_add_quirk(intel_sandybridge_quirk);
case 45: /* SandyBridge, "Romely-EP" */
+ x86_add_quirk(intel_sandybridge_quirk);
+ case 58: /* IvyBridge */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -1849,6 +1919,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_snb_event_constraints;
x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
x86_pmu.extra_regs = intel_snb_extra_regs;
/* all extra regs are per-cpu when HT is on */
x86_pmu.er_flags |= ERF_HAS_RSP_1;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 5a3edc27f6e..35e2192df9f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -400,14 +400,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
- INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
- INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
- INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
+ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 01ccf9b7147..623f2883747 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -316,7 +316,6 @@ ret_from_exception:
preempt_stop(CLBR_ANY)
ret_from_intr:
GET_THREAD_INFO(%ebp)
-resume_userspace_sig:
#ifdef CONFIG_VM86
movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
movb PT_CS(%esp), %al
@@ -615,9 +614,13 @@ work_notifysig: # deal with pending signals and
# vm86-space
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
+ movb PT_CS(%esp), %bl
+ andb $SEGMENT_RPL_MASK, %bl
+ cmpb $USER_RPL, %bl
+ jb resume_kernel
xorl %edx, %edx
call do_notify_resume
- jmp resume_userspace_sig
+ jmp resume_userspace
ALIGN
work_notifysig_v86:
@@ -630,9 +633,13 @@ work_notifysig_v86:
#endif
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
+ movb PT_CS(%esp), %bl
+ andb $SEGMENT_RPL_MASK, %bl
+ cmpb $USER_RPL, %bl
+ jb resume_kernel
xorl %edx, %edx
call do_notify_resume
- jmp resume_userspace_sig
+ jmp resume_userspace
END(work_pending)
# perform syscall exit tracing
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 320852d0202..7d65133b51b 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -191,6 +191,44 @@ ENDPROC(native_usergs_sysret64)
.endm
/*
+ * When dynamic function tracer is enabled it will add a breakpoint
+ * to all locations that it is about to modify, sync CPUs, update
+ * all the code, sync CPUs, then remove the breakpoints. In this time
+ * if lockdep is enabled, it might jump back into the debug handler
+ * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
+ *
+ * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
+ * make sure the stack pointer does not get reset back to the top
+ * of the debug stack, and instead just reuses the current stack.
+ */
+#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
+
+.macro TRACE_IRQS_OFF_DEBUG
+ call debug_stack_set_zero
+ TRACE_IRQS_OFF
+ call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_ON_DEBUG
+ call debug_stack_set_zero
+ TRACE_IRQS_ON
+ call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
+ bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
+ jnc 1f
+ TRACE_IRQS_ON_DEBUG
+1:
+.endm
+
+#else
+# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
+# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
+# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
+#endif
+
+/*
* C code is not supposed to know about undefined top of stack. Every time
* a C function with an pt_regs argument is called from the SYSCALL based
* fast path FIXUP_TOP_OF_STACK is needed.
@@ -1098,7 +1136,7 @@ ENTRY(\sym)
subq $ORIG_RAX-R15, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
- TRACE_IRQS_OFF
+ TRACE_IRQS_OFF_DEBUG
movq %rsp,%rdi /* pt_regs pointer */
xorl %esi,%esi /* no error code */
subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
@@ -1393,7 +1431,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
ENTRY(paranoid_exit)
DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
+ TRACE_IRQS_OFF_DEBUG
testl %ebx,%ebx /* swapgs needed? */
jnz paranoid_restore
testl $3,CS(%rsp)
@@ -1404,7 +1442,7 @@ paranoid_swapgs:
RESTORE_ALL 8
jmp irq_return
paranoid_restore:
- TRACE_IRQS_IRETQ 0
+ TRACE_IRQS_IRETQ_DEBUG 0
RESTORE_ALL 8
jmp irq_return
paranoid_userspace:
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 32ff36596ab..c3a7cb4bf6e 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -100,7 +100,7 @@ static const unsigned char *ftrace_nop_replace(void)
}
static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
unsigned const char *new_code)
{
unsigned char replaced[MCOUNT_INSN_SIZE];
@@ -141,7 +141,20 @@ int ftrace_make_nop(struct module *mod,
old = ftrace_call_replace(ip, addr);
new = ftrace_nop_replace();
- return ftrace_modify_code(rec->ip, old, new);
+ /*
+ * On boot up, and when modules are loaded, the MCOUNT_ADDR
+ * is converted to a nop, and will never become MCOUNT_ADDR
+ * again. This code is either running before SMP (on boot up)
+ * or before the code will ever be executed (module load).
+ * We do not want to use the breakpoint version in this case,
+ * just modify the code directly.
+ */
+ if (addr == MCOUNT_ADDR)
+ return ftrace_modify_code_direct(rec->ip, old, new);
+
+ /* Normal cases use add_brk_on_nop */
+ WARN_ONCE(1, "invalid use of ftrace_make_nop");
+ return -EINVAL;
}
int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
@@ -152,9 +165,47 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
old = ftrace_nop_replace();
new = ftrace_call_replace(ip, addr);
- return ftrace_modify_code(rec->ip, old, new);
+ /* Should only be called when module is loaded */
+ return ftrace_modify_code_direct(rec->ip, old, new);
}
+/*
+ * The modifying_ftrace_code is used to tell the breakpoint
+ * handler to call ftrace_int3_handler(). If it fails to
+ * call this handler for a breakpoint added by ftrace, then
+ * the kernel may crash.
+ *
+ * As atomic_writes on x86 do not need a barrier, we do not
+ * need to add smp_mb()s for this to work. It is also considered
+ * that we can not read the modifying_ftrace_code before
+ * executing the breakpoint. That would be quite remarkable if
+ * it could do that. Here's the flow that is required:
+ *
+ * CPU-0 CPU-1
+ *
+ * atomic_inc(mfc);
+ * write int3s
+ * <trap-int3> // implicit (r)mb
+ * if (atomic_read(mfc))
+ * call ftrace_int3_handler()
+ *
+ * Then when we are finished:
+ *
+ * atomic_dec(mfc);
+ *
+ * If we hit a breakpoint that was not set by ftrace, it does not
+ * matter if ftrace_int3_handler() is called or not. It will
+ * simply be ignored. But it is crucial that a ftrace nop/caller
+ * breakpoint is handled. No other user should ever place a
+ * breakpoint on an ftrace nop/caller location. It must only
+ * be done by this code.
+ */
+atomic_t modifying_ftrace_code __read_mostly;
+
+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code);
+
int ftrace_update_ftrace_func(ftrace_func_t func)
{
unsigned long ip = (unsigned long)(&ftrace_call);
@@ -163,13 +214,17 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
new = ftrace_call_replace(ip, (unsigned long)func);
+
+ /* See comment above by declaration of modifying_ftrace_code */
+ atomic_inc(&modifying_ftrace_code);
+
ret = ftrace_modify_code(ip, old, new);
+ atomic_dec(&modifying_ftrace_code);
+
return ret;
}
-int modifying_ftrace_code __read_mostly;
-
/*
* A breakpoint was added to the code address we are about to
* modify, and this is the handle that will just skip over it.
@@ -489,13 +544,46 @@ void ftrace_replace_code(int enable)
}
}
+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code)
+{
+ int ret;
+
+ ret = add_break(ip, old_code);
+ if (ret)
+ goto out;
+
+ run_sync();
+
+ ret = add_update_code(ip, new_code);
+ if (ret)
+ goto fail_update;
+
+ run_sync();
+
+ ret = ftrace_write(ip, new_code, 1);
+ if (ret) {
+ ret = -EPERM;
+ goto out;
+ }
+ run_sync();
+ out:
+ return ret;
+
+ fail_update:
+ probe_kernel_write((void *)ip, &old_code[0], 1);
+ goto out;
+}
+
void arch_ftrace_update_code(int command)
{
- modifying_ftrace_code++;
+ /* See comment above by declaration of modifying_ftrace_code */
+ atomic_inc(&modifying_ftrace_code);
ftrace_modify_all_code(command);
- modifying_ftrace_code--;
+ atomic_dec(&modifying_ftrace_code);
}
int __init ftrace_dyn_arch_init(void *data)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 9cc7b4392f7..1460a5df92f 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -870,7 +870,7 @@ int __init hpet_enable(void)
else
pr_warn("HPET initial state will not be saved\n");
cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
- hpet_writel(cfg, HPET_Tn_CFG(i));
+ hpet_writel(cfg, HPET_CFG);
if (cfg)
pr_warn("HPET: Unrecognized bits %#x set in global cfg\n",
cfg);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 086eb58c6e8..f1b42b3a186 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -120,11 +120,6 @@ bool kvm_check_and_clear_guest_paused(void)
bool ret = false;
struct pvclock_vcpu_time_info *src;
- /*
- * per_cpu() is safe here because this function is only called from
- * timer functions where preemption is already disabled.
- */
- WARN_ON(!in_atomic());
src = &__get_cpu_var(hv_clock);
if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
__this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED);
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 90875279ef3..a0b2f84457b 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -444,14 +444,16 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
*/
if (unlikely(is_debug_stack(regs->sp))) {
debug_stack_set_zero();
- __get_cpu_var(update_debug_stack) = 1;
+ this_cpu_write(update_debug_stack, 1);
}
}
static inline void nmi_nesting_postprocess(void)
{
- if (unlikely(__get_cpu_var(update_debug_stack)))
+ if (unlikely(this_cpu_read(update_debug_stack))) {
debug_stack_reset();
+ this_cpu_write(update_debug_stack, 0);
+ }
}
#endif
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index e31bf8d5c4d..149b8d9c6ad 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -42,7 +42,7 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
static void __init init_nmi_testsuite(void)
{
/* trap all the unknown NMIs we may generate */
- register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
+ register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
}
static void __init cleanup_nmi_testsuite(void)
@@ -64,7 +64,7 @@ static void __init test_nmi_ipi(struct cpumask *mask)
{
unsigned long timeout;
- if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
+ if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback,
NMI_FLAG_FIRST, "nmi_selftest")) {
nmi_fail = FAILURE;
return;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 62c9457ccd2..c0f420f76cd 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -100,7 +100,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
struct dma_attrs *attrs)
{
unsigned long dma_mask;
- struct page *page = NULL;
+ struct page *page;
unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
dma_addr_t addr;
@@ -108,6 +108,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
flag |= __GFP_ZERO;
again:
+ page = NULL;
if (!(flag & GFP_ATOMIC))
page = dma_alloc_from_contiguous(dev, count, get_order(size));
if (!page)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 13b1990c7c5..c4c6a5c2bf0 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1211,12 +1211,6 @@ static long x32_arch_ptrace(struct task_struct *child,
0, sizeof(struct user_i387_struct),
datap);
- /* normal 64bit interface to access TLS data.
- Works just like arch_prctl, except that the arguments
- are reversed. */
- case PTRACE_ARCH_PRCTL:
- return do_arch_prctl(child, data, addr);
-
default:
return compat_ptrace_request(child, request, addr, data);
}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 79c45af8160..25b48edb847 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -639,9 +639,11 @@ void native_machine_shutdown(void)
set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
/*
- * O.K Now that I'm on the appropriate processor,
- * stop all of the others.
+ * O.K Now that I'm on the appropriate processor, stop all of the
+ * others. Also disable the local irq to not receive the per-cpu
+ * timer interrupt which may trigger scheduler's load balance.
*/
+ local_irq_disable();
stop_other_cpus();
#endif
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 965dfda0fd5..21af737053a 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -555,7 +555,6 @@ unsigned long sys_sigreturn(struct pt_regs *regs)
sizeof(frame->extramask))))
goto badframe;
- sigdelsetmask(&set, ~_BLOCKABLE);
set_current_blocked(&set);
if (restore_sigcontext(regs, &frame->sc, &ax))
@@ -581,7 +580,6 @@ long sys_rt_sigreturn(struct pt_regs *regs)
if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
goto badframe;
- sigdelsetmask(&set, ~_BLOCKABLE);
set_current_blocked(&set);
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
@@ -647,42 +645,28 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
struct pt_regs *regs)
{
int usig = signr_convert(sig);
- sigset_t *set = &current->blocked;
- int ret;
-
- if (current_thread_info()->status & TS_RESTORE_SIGMASK)
- set = &current->saved_sigmask;
+ sigset_t *set = sigmask_to_save();
/* Set up the stack frame */
if (is_ia32) {
if (ka->sa.sa_flags & SA_SIGINFO)
- ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
+ return ia32_setup_rt_frame(usig, ka, info, set, regs);
else
- ret = ia32_setup_frame(usig, ka, set, regs);
+ return ia32_setup_frame(usig, ka, set, regs);
#ifdef CONFIG_X86_X32_ABI
} else if (is_x32) {
- ret = x32_setup_rt_frame(usig, ka, info,
+ return x32_setup_rt_frame(usig, ka, info,
(compat_sigset_t *)set, regs);
#endif
} else {
- ret = __setup_rt_frame(sig, ka, info, set, regs);
- }
-
- if (ret) {
- force_sigsegv(sig, current);
- return -EFAULT;
+ return __setup_rt_frame(sig, ka, info, set, regs);
}
-
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- return ret;
}
-static int
+static void
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
struct pt_regs *regs)
{
- int ret;
-
/* Are we from a system call? */
if (syscall_get_nr(current, regs) >= 0) {
/* If so, check system call restarting.. */
@@ -713,10 +697,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
regs->flags &= ~X86_EFLAGS_TF;
- ret = setup_rt_frame(sig, ka, info, regs);
-
- if (ret)
- return ret;
+ if (setup_rt_frame(sig, ka, info, regs) < 0) {
+ force_sigsegv(sig, current);
+ return;
+ }
/*
* Clear the direction flag as per the ABI for function entry.
@@ -731,12 +715,8 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
*/
regs->flags &= ~X86_EFLAGS_TF;
- block_sigmask(ka, sig);
-
- tracehook_signal_handler(sig, info, ka, regs,
- test_thread_flag(TIF_SINGLESTEP));
-
- return 0;
+ signal_delivered(sig, info, ka, regs,
+ test_thread_flag(TIF_SINGLESTEP));
}
#ifdef CONFIG_X86_32
@@ -757,16 +737,6 @@ static void do_signal(struct pt_regs *regs)
siginfo_t info;
int signr;
- /*
- * We want the common case to go fast, which is why we may in certain
- * cases get here from kernel mode. Just return without doing anything
- * if so.
- * X86_32: vm86 regs switched out by assembly code before reaching
- * here, so testing against kernel CS suffices.
- */
- if (!user_mode(regs))
- return;
-
signr = get_signal_to_deliver(&info, &ka, regs, NULL);
if (signr > 0) {
/* Whee! Actually deliver the signal. */
@@ -796,10 +766,7 @@ static void do_signal(struct pt_regs *regs)
* If there's no signal to deliver, we just put the saved sigmask
* back.
*/
- if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- set_current_blocked(&current->saved_sigmask);
- }
+ restore_saved_sigmask();
}
/*
@@ -827,8 +794,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
if (thread_info_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
- if (current->replacement_session_keyring)
- key_replace_session_keyring();
}
if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
fire_user_return_notifiers();
@@ -936,7 +901,6 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs)
if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
goto badframe;
- sigdelsetmask(&set, ~_BLOCKABLE);
set_current_blocked(&set);
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f56f96da77f..7bd8a082365 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -349,9 +349,12 @@ static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- if (c->phys_proc_id == o->phys_proc_id)
- return topology_sane(c, o, "mc");
+ if (c->phys_proc_id == o->phys_proc_id) {
+ if (cpu_has(c, X86_FEATURE_AMD_DCM))
+ return true;
+ return topology_sane(c, o, "mc");
+ }
return false;
}
@@ -382,6 +385,15 @@ void __cpuinit set_cpu_sibling_map(int cpu)
if ((i == cpu) || (has_mc && match_llc(c, o)))
link_mask(llc_shared, cpu, i);
+ }
+
+ /*
+ * This needs a separate iteration over the cpus because we rely on all
+ * cpu_sibling_mask links to be set-up.
+ */
+ for_each_cpu(i, cpu_sibling_setup_mask) {
+ o = &cpu_data(i);
+
if ((i == cpu) || (has_mc && match_mc(c, o))) {
link_mask(core, cpu, i);
@@ -410,15 +422,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
/* maps the cpu to the sched domain representing multi-core */
const struct cpumask *cpu_coregroup_mask(int cpu)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- /*
- * For perf, we return last level cache shared map.
- * And for power savings, we return cpu_core_map
- */
- if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
- return cpu_core_mask(cpu);
- else
- return cpu_llc_shared_mask(cpu);
+ return cpu_llc_shared_mask(cpu);
}
static void impress_friends(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ff08457a025..05b31d92f69 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -303,8 +303,12 @@ gp_in_kernel:
dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
{
#ifdef CONFIG_DYNAMIC_FTRACE
- /* ftrace must be first, everything else may cause a recursive crash */
- if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs))
+ /*
+ * ftrace must be first, everything else may cause a recursive crash.
+ * See note by declaration of modifying_ftrace_code in ftrace.c
+ */
+ if (unlikely(atomic_read(&modifying_ftrace_code)) &&
+ ftrace_int3_handler(regs))
return;
#endif
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP