summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/boot.c2
-rw-r--r--arch/x86/kernel/acpi/sleep.c15
-rw-r--r--arch/x86/kernel/alternative.c111
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c67
-rw-r--r--arch/x86/kernel/cpu/bugs.c7
-rw-r--r--arch/x86/kernel/cpu/common.c63
-rw-r--r--arch/x86/kernel/cpu/intel.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c8
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h12
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c94
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c168
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c30
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h6
-rw-r--r--arch/x86/kernel/cpu/proc.c5
-rw-r--r--arch/x86/kernel/cpuid.c5
-rw-r--r--arch/x86/kernel/devicetree.c51
-rw-r--r--arch/x86/kernel/entry_32.S100
-rw-r--r--arch/x86/kernel/entry_64.S172
-rw-r--r--arch/x86/kernel/ftrace.c73
-rw-r--r--arch/x86/kernel/head_32.S31
-rw-r--r--arch/x86/kernel/i387.c292
-rw-r--r--arch/x86/kernel/i8259.c2
-rw-r--r--arch/x86/kernel/irq.c4
-rw-r--r--arch/x86/kernel/kprobes.c67
-rw-r--r--arch/x86/kernel/microcode_amd.c357
-rw-r--r--arch/x86/kernel/microcode_core.c67
-rw-r--r--arch/x86/kernel/microcode_intel.c3
-rw-r--r--arch/x86/kernel/msr.c5
-rw-r--r--arch/x86/kernel/perf_regs.c105
-rw-r--r--arch/x86/kernel/probe_roms.c2
-rw-r--r--arch/x86/kernel/process.c22
-rw-r--r--arch/x86/kernel/process_32.c4
-rw-r--r--arch/x86/kernel/process_64.c4
-rw-r--r--arch/x86/kernel/ptrace.c8
-rw-r--r--arch/x86/kernel/setup.c4
-rw-r--r--arch/x86/kernel/signal.c231
-rw-r--r--arch/x86/kernel/smpboot.c20
-rw-r--r--arch/x86/kernel/step.c53
-rw-r--r--arch/x86/kernel/traps.c174
-rw-r--r--arch/x86/kernel/uprobes.c52
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c6
-rw-r--r--arch/x86/kernel/x86_init.c4
-rw-r--r--arch/x86/kernel/xsave.c517
45 files changed, 1904 insertions, 1127 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8215e5652d9..8d7a619718b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -100,6 +100,8 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
obj-$(CONFIG_OF) += devicetree.o
obj-$(CONFIG_UPROBES) += uprobes.o
+obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index b2297e58c6e..e651f7a589a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -656,7 +656,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
acpi_register_lapic(physid, ACPI_MADT_ENABLED);
/*
- * If mp_register_lapic successfully generates a new logical cpu
+ * If acpi_register_lapic successfully generates a new logical cpu
* number, then the following will get us exactly what was mapped
*/
cpumask_andnot(new_map, cpu_present_mask, tmp_map);
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 1b8e5a03d94..11676cf65ae 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -43,17 +43,22 @@ int acpi_suspend_lowlevel(void)
header->video_mode = saved_video_mode;
+ header->pmode_behavior = 0;
+
#ifndef CONFIG_64BIT
store_gdt((struct desc_ptr *)&header->pmode_gdt);
- if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
- &header->pmode_efer_high))
- header->pmode_efer_low = header->pmode_efer_high = 0;
+ if (!rdmsr_safe(MSR_EFER,
+ &header->pmode_efer_low,
+ &header->pmode_efer_high))
+ header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER);
#endif /* !CONFIG_64BIT */
header->pmode_cr0 = read_cr0();
- header->pmode_cr4 = read_cr4_safe();
- header->pmode_behavior = 0;
+ if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
+ header->pmode_cr4 = read_cr4();
+ header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
+ }
if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
&header->pmode_misc_en_low,
&header->pmode_misc_en_high))
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ced4534baed..ef5ccca79a6 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -23,19 +23,6 @@
#define MAX_PATCH_LEN (255-1)
-#ifdef CONFIG_HOTPLUG_CPU
-static int smp_alt_once;
-
-static int __init bootonly(char *str)
-{
- smp_alt_once = 1;
- return 1;
-}
-__setup("smp-alt-boot", bootonly);
-#else
-#define smp_alt_once 1
-#endif
-
static int __initdata_or_module debug_alternative;
static int __init debug_alt(char *str)
@@ -317,7 +304,7 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
/* turn DS segment override prefix into lock prefix */
if (*ptr == 0x3e)
text_poke(ptr, ((unsigned char []){0xf0}), 1);
- };
+ }
mutex_unlock(&text_mutex);
}
@@ -326,9 +313,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
{
const s32 *poff;
- if (noreplace_smp)
- return;
-
mutex_lock(&text_mutex);
for (poff = start; poff < end; poff++) {
u8 *ptr = (u8 *)poff + *poff;
@@ -338,7 +322,7 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
/* turn lock prefix into DS segment override prefix */
if (*ptr == 0xf0)
text_poke(ptr, ((unsigned char []){0x3E}), 1);
- };
+ }
mutex_unlock(&text_mutex);
}
@@ -359,7 +343,7 @@ struct smp_alt_module {
};
static LIST_HEAD(smp_alt_modules);
static DEFINE_MUTEX(smp_alt);
-static int smp_mode = 1; /* protected by smp_alt */
+static bool uniproc_patched = false; /* protected by smp_alt */
void __init_or_module alternatives_smp_module_add(struct module *mod,
char *name,
@@ -368,19 +352,18 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
{
struct smp_alt_module *smp;
- if (noreplace_smp)
- return;
+ mutex_lock(&smp_alt);
+ if (!uniproc_patched)
+ goto unlock;
- if (smp_alt_once) {
- if (boot_cpu_has(X86_FEATURE_UP))
- alternatives_smp_unlock(locks, locks_end,
- text, text_end);
- return;
- }
+ if (num_possible_cpus() == 1)
+ /* Don't bother remembering, we'll never have to undo it. */
+ goto smp_unlock;
smp = kzalloc(sizeof(*smp), GFP_KERNEL);
if (NULL == smp)
- return; /* we'll run the (safe but slow) SMP code then ... */
+ /* we'll run the (safe but slow) SMP code then ... */
+ goto unlock;
smp->mod = mod;
smp->name = name;
@@ -392,11 +375,10 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
__func__, smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name);
- mutex_lock(&smp_alt);
list_add_tail(&smp->next, &smp_alt_modules);
- if (boot_cpu_has(X86_FEATURE_UP))
- alternatives_smp_unlock(smp->locks, smp->locks_end,
- smp->text, smp->text_end);
+smp_unlock:
+ alternatives_smp_unlock(locks, locks_end, text, text_end);
+unlock:
mutex_unlock(&smp_alt);
}
@@ -404,24 +386,18 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
{
struct smp_alt_module *item;
- if (smp_alt_once || noreplace_smp)
- return;
-
mutex_lock(&smp_alt);
list_for_each_entry(item, &smp_alt_modules, next) {
if (mod != item->mod)
continue;
list_del(&item->next);
- mutex_unlock(&smp_alt);
- DPRINTK("%s: %s\n", __func__, item->name);
kfree(item);
- return;
+ break;
}
mutex_unlock(&smp_alt);
}
-bool skip_smp_alternatives;
-void alternatives_smp_switch(int smp)
+void alternatives_enable_smp(void)
{
struct smp_alt_module *mod;
@@ -436,34 +412,21 @@ void alternatives_smp_switch(int smp)
pr_info("lockdep: fixing up alternatives\n");
#endif
- if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
- return;
- BUG_ON(!smp && (num_online_cpus() > 1));
+ /* Why bother if there are no other CPUs? */
+ BUG_ON(num_possible_cpus() == 1);
mutex_lock(&smp_alt);
- /*
- * Avoid unnecessary switches because it forces JIT based VMs to
- * throw away all cached translations, which can be quite costly.
- */
- if (smp == smp_mode) {
- /* nothing */
- } else if (smp) {
+ if (uniproc_patched) {
pr_info("switching to SMP code\n");
+ BUG_ON(num_online_cpus() != 1);
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
list_for_each_entry(mod, &smp_alt_modules, next)
alternatives_smp_lock(mod->locks, mod->locks_end,
mod->text, mod->text_end);
- } else {
- pr_info("switching to UP code\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
- set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
- list_for_each_entry(mod, &smp_alt_modules, next)
- alternatives_smp_unlock(mod->locks, mod->locks_end,
- mod->text, mod->text_end);
+ uniproc_patched = false;
}
- smp_mode = smp;
mutex_unlock(&smp_alt);
}
@@ -540,40 +503,22 @@ void __init alternative_instructions(void)
apply_alternatives(__alt_instructions, __alt_instructions_end);
- /* switch to patch-once-at-boottime-only mode and free the
- * tables in case we know the number of CPUs will never ever
- * change */
-#ifdef CONFIG_HOTPLUG_CPU
- if (num_possible_cpus() < 2)
- smp_alt_once = 1;
-#endif
-
#ifdef CONFIG_SMP
- if (smp_alt_once) {
- if (1 == num_possible_cpus()) {
- pr_info("switching to UP code\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
- set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
-
- alternatives_smp_unlock(__smp_locks, __smp_locks_end,
- _text, _etext);
- }
- } else {
+ /* Patch to UP if other cpus not imminent. */
+ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
+ uniproc_patched = true;
alternatives_smp_module_add(NULL, "core kernel",
__smp_locks, __smp_locks_end,
_text, _etext);
-
- /* Only switch to UP mode if we don't immediately boot others */
- if (num_present_cpus() == 1 || setup_max_cpus <= 1)
- alternatives_smp_switch(0);
}
-#endif
- apply_paravirt(__parainstructions, __parainstructions_end);
- if (smp_alt_once)
+ if (!uniproc_patched || num_possible_cpus() == 1)
free_init_pages("SMP alternatives",
(unsigned long)__smp_locks,
(unsigned long)__smp_locks_end);
+#endif
+
+ apply_paravirt(__parainstructions, __parainstructions_end);
restart_nmi();
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 24deb308232..b17416e72fb 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1934,7 +1934,7 @@ void smp_error_interrupt(struct pt_regs *regs)
apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
i++;
v1 >>= 1;
- };
+ }
apic_printk(APIC_DEBUG, KERN_CONT "\n");
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 9d92e19039f..f7e98a2c0d1 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -737,6 +737,72 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c,
}
#endif
+/*
+ * Choose tlb_flushall_shift for this CPU. The value is left untouched when
+ * cpu_has_invlpg is false; otherwise it is set to 5, lowered to 4 for
+ * family 0x11 and older.
+ */
+static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has_invlpg)
+ return;
+
+ tlb_flushall_shift = 5;
+
+ if (c->x86 <= 0x11)
+ tlb_flushall_shift = 4;
+}
+
+/*
+ * Read the last-level TLB sizes from CPUID and store them in the ENTRIES
+ * slot of tlb_lli_4k/2m/4m (instruction) and tlb_lld_4k/2m/4m (data), then
+ * pick tlb_flushall_shift.
+ *
+ * Does nothing on families below 0xf or when extended CPUID function
+ * 0x80000006 is not available.
+ */
+static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
+{
+ u32 ebx, eax, ecx, edx;
+ u16 mask = 0xfff;
+
+ if (c->x86 < 0xf)
+ return;
+
+ if (c->extended_cpuid_level < 0x80000006)
+ return;
+
+ cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+
+ tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
+ tlb_lli_4k[ENTRIES] = ebx & mask;
+
+ /*
+ * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
+ * characteristics from the CPUID function 0x80000005 instead.
+ */
+ if (c->x86 == 0xf) {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ mask = 0xff;
+ }
+
+ /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!((eax >> 16) & mask)) {
+ /*
+ * Only EAX of function 0x80000005 is needed here; cpuid_eax()
+ * avoids a second set of locals that would shadow @c.
+ */
+ tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ } else {
+ tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
+ }
+
+ /* a 4M entry uses two 2M entries */
+ tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+
+ /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!(eax & mask)) {
+ /* Erratum 658 */
+ if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
+ tlb_lli_2m[ENTRIES] = 1024;
+ } else {
+ /* Leave eax..edx alone; only the L1 EAX value is used */
+ tlb_lli_2m[ENTRIES] = cpuid_eax(0x80000005) & 0xff;
+ }
+ } else
+ tlb_lli_2m[ENTRIES] = eax & mask;
+
+ /* as for the DTLB: a 4M entry uses two 2M entries */
+ tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+
+ cpu_set_tlb_flushall_shift(c);
+}
+
static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
@@ -756,6 +822,7 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
.c_size_cache = amd_size_cache,
#endif
.c_early_init = early_init_amd,
+ .c_detect_tlb = cpu_detect_tlb_amd,
.c_bsp_init = bsp_init_amd,
.c_init = init_amd,
.c_x86_vendor = X86_VENDOR_AMD,
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c97bb7b5a9f..d0e910da16c 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -165,10 +165,15 @@ void __init check_bugs(void)
print_cpu_info(&boot_cpu_data);
#endif
check_config();
- check_fpu();
check_hlt();
check_popad();
init_utsname()->machine[1] =
'0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
alternative_instructions();
+
+ /*
+ * kernel_fpu_begin/end() in check_fpu() relies on the patched
+ * alternative instructions.
+ */
+ check_fpu();
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a5fbc3c5fcc..7505f7b13e7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -259,23 +259,36 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
}
#endif
-static int disable_smep __cpuinitdata;
static __init int setup_disable_smep(char *arg)
{
- disable_smep = 1;
+ setup_clear_cpu_cap(X86_FEATURE_SMEP);
return 1;
}
__setup("nosmep", setup_disable_smep);
-static __cpuinit void setup_smep(struct cpuinfo_x86 *c)
+static __always_inline void setup_smep(struct cpuinfo_x86 *c)
{
- if (cpu_has(c, X86_FEATURE_SMEP)) {
- if (unlikely(disable_smep)) {
- setup_clear_cpu_cap(X86_FEATURE_SMEP);
- clear_in_cr4(X86_CR4_SMEP);
- } else
- set_in_cr4(X86_CR4_SMEP);
- }
+ if (cpu_has(c, X86_FEATURE_SMEP))
+ set_in_cr4(X86_CR4_SMEP);
+}
+
+/*
+ * Handler for the "nosmap" boot parameter: clears X86_FEATURE_SMAP through
+ * setup_clear_cpu_cap(). Always returns 1.
+ */
+static __init int setup_disable_smap(char *arg)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SMAP);
+ return 1;
+}
+__setup("nosmap", setup_disable_smap);
+
+/*
+ * Set X86_CR4_SMAP in CR4 when @c has X86_FEATURE_SMAP. BUGs if EFLAGS.AC
+ * is found set on entry.
+ */
+static __always_inline void setup_smap(struct cpuinfo_x86 *c)
+{
+ unsigned long eflags;
+
+ /* This should have been cleared long ago */
+ raw_local_save_flags(eflags);
+ BUG_ON(eflags & X86_EFLAGS_AC);
+
+ if (cpu_has(c, X86_FEATURE_SMAP))
+ set_in_cr4(X86_CR4_SMAP);
+}
/*
@@ -476,7 +489,7 @@ void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c)
printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
- "tlb_flushall_shift is 0x%x\n",
+ "tlb_flushall_shift: %d\n",
tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
@@ -712,8 +725,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
c->cpu_index = 0;
filter_cpuid_features(c, false);
- setup_smep(c);
-
if (this_cpu->c_bsp_init)
this_cpu->c_bsp_init(c);
}
@@ -798,8 +809,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
c->phys_proc_id = c->initial_apicid;
}
- setup_smep(c);
-
get_model_name(c); /* Default name */
detect_nopl(c);
@@ -864,6 +873,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
+ /* Set up SMEP/SMAP */
+ setup_smep(c);
+ setup_smap(c);
+
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
@@ -942,8 +955,7 @@ void __init identify_boot_cpu(void)
#else
vgetcpu_set_mode();
#endif
- if (boot_cpu_data.cpuid_level >= 2)
- cpu_detect_tlb(&boot_cpu_data);
+ cpu_detect_tlb(&boot_cpu_data);
}
void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -1023,14 +1035,16 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
printk(KERN_CONT "%s ", vendor);
if (c->x86_model_id[0])
- printk(KERN_CONT "%s", c->x86_model_id);
+ printk(KERN_CONT "%s", strim(c->x86_model_id));
else
printk(KERN_CONT "%d86", c->x86);
+ printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model);
+
if (c->x86_mask || c->cpuid_level >= 0)
- printk(KERN_CONT " stepping %02x\n", c->x86_mask);
+ printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask);
else
- printk(KERN_CONT "\n");
+ printk(KERN_CONT ")\n");
print_cpu_msr(c);
}
@@ -1113,11 +1127,10 @@ void syscall_init(void)
/* Flags to clear on syscall */
wrmsrl(MSR_SYSCALL_MASK,
- X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
+ X86_EFLAGS_IOPL|X86_EFLAGS_AC);
}
-unsigned long kernel_eflags;
-
/*
* Copies of the original ist values from the tss are only accessed during
* debugging, no special alignment required.
@@ -1297,9 +1310,6 @@ void __cpuinit cpu_init(void)
dbg_restore_debug_regs();
fpu_init();
- xsave_init();
-
- raw_local_save_flags(kernel_eflags);
if (is_uv_system())
uv_cpu_init();
@@ -1352,6 +1362,5 @@ void __cpuinit cpu_init(void)
dbg_restore_debug_regs();
fpu_init();
- xsave_init();
}
#endif
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 0a4ce2980a5..198e019a531 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -648,6 +648,10 @@ static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c)
int i, j, n;
unsigned int regs[4];
unsigned char *desc = (unsigned char *)regs;
+
+ if (c->cpuid_level < 2)
+ return;
+
/* Number of times to iterate */
n = cpuid_eax(2) & 0xFF;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index fc4beb39357..ddc72f83933 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -78,6 +78,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
}
static cpumask_var_t mce_inject_cpumask;
+static DEFINE_MUTEX(mce_inject_mutex);
static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
{
@@ -194,7 +195,11 @@ static void raise_mce(struct mce *m)
put_online_cpus();
} else
#endif
+ {
+ preempt_disable();
raise_local();
+ preempt_enable();
+ }
}
/* Error injection interface */
@@ -225,7 +230,10 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
* so do it a jiffie or two later everywhere.
*/
schedule_timeout(2);
+
+ mutex_lock(&mce_inject_mutex);
raise_mce(&m);
+ mutex_unlock(&mce_inject_mutex);
return usize;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index ed44c8a6585..6a05c1d327a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,6 +28,18 @@ extern int mce_ser;
extern struct mce_bank *mce_banks;
+#ifdef CONFIG_X86_MCE_INTEL
+unsigned long mce_intel_adjust_timer(unsigned long interval);
+void mce_intel_cmci_poll(void);
+void mce_intel_hcpu_update(unsigned long cpu);
+#else
+# define mce_intel_adjust_timer mce_adjust_timer_default
+static inline void mce_intel_cmci_poll(void) { }
+static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+#endif
+
+void mce_timer_kick(unsigned long interval);
+
#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
ssize_t apei_read_mce(struct mce *m, u64 *record_id);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 292d0258311..29e87d3b284 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -83,6 +83,7 @@ static int mce_dont_log_ce __read_mostly;
int mce_cmci_disabled __read_mostly;
int mce_ignore_ce __read_mostly;
int mce_ser __read_mostly;
+int mce_bios_cmci_threshold __read_mostly;
struct mce_bank *mce_banks __read_mostly;
@@ -1266,6 +1267,14 @@ static unsigned long check_interval = 5 * 60; /* 5 minutes */
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
+/* Default interval adjustment: hands @interval back unchanged. */
+static unsigned long mce_adjust_timer_default(unsigned long interval)
+{
+ return interval;
+}
+
+/*
+ * Hook applied to the poll interval by the timer code. Starts out as the
+ * default above; __mcheck_cpu_init_vendor() replaces it for Intel CPUs.
+ */
+static unsigned long (*mce_adjust_timer)(unsigned long interval) =
+ mce_adjust_timer_default;
+
static void mce_timer_fn(unsigned long data)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
@@ -1276,6 +1285,7 @@ static void mce_timer_fn(unsigned long data)
if (mce_available(__this_cpu_ptr(&cpu_info))) {
machine_check_poll(MCP_TIMESTAMP,
&__get_cpu_var(mce_poll_banks));
+ mce_intel_cmci_poll();
}
/*
@@ -1283,14 +1293,38 @@ static void mce_timer_fn(unsigned long data)
* polling interval, otherwise increase the polling interval.
*/
iv = __this_cpu_read(mce_next_interval);
- if (mce_notify_irq())
+ if (mce_notify_irq()) {
iv = max(iv / 2, (unsigned long) HZ/100);
- else
+ } else {
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+ iv = mce_adjust_timer(iv);
+ }
__this_cpu_write(mce_next_interval, iv);
+ /* Might have become 0 after CMCI storm subsided */
+ if (iv) {
+ t->expires = jiffies + iv;
+ add_timer_on(t, smp_processor_id());
+ }
+}
- t->expires = jiffies + iv;
- add_timer_on(t, smp_processor_id());
+/*
+ * Ensure that the timer is firing in @interval from now.
+ *
+ * Operates on the current CPU's mce_timer. A pending timer is only moved
+ * when that makes it expire earlier; an idle timer is armed on this CPU.
+ * The CPU's mce_next_interval is lowered to @interval if it was larger.
+ */
+void mce_timer_kick(unsigned long interval)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned long when = jiffies + interval;
+ unsigned long iv = __this_cpu_read(mce_next_interval);
+
+ if (timer_pending(t)) {
+ /* Never push an already pending expiry further out */
+ if (time_before(when, t->expires))
+ mod_timer_pinned(t, when);
+ } else {
+ t->expires = round_jiffies(when);
+ add_timer_on(t, smp_processor_id());
+ }
+ /* Only ever shorten the stored interval here */
+ if (interval < iv)
+ __this_cpu_write(mce_next_interval, interval);
}
/* Must not be called in IRQ context where del_timer_sync() can deadlock */
@@ -1585,6 +1619,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
+ mce_adjust_timer = mce_intel_adjust_timer;
break;
case X86_VENDOR_AMD:
mce_amd_feature_init(c);
@@ -1594,23 +1629,28 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
}
}
-static void __mcheck_cpu_init_timer(void)
+/*
+ * Arm the MCE poll timer @t on @cpu.
+ *
+ * The initial interval is check_interval seconds, passed through
+ * mce_adjust_timer(), and is stored in @cpu's mce_next_interval. The timer
+ * is left idle when mce_ignore_ce is set or the interval is zero.
+ *
+ * @cpu is not necessarily the CPU we run on (the hotplug callback passes
+ * the CPU whose offlining failed), so the per-CPU interval and the timer
+ * are addressed through @cpu rather than through the current CPU.
+ * NOTE(review): mce_adjust_timer() may still look at state of the calling
+ * CPU - confirm that is acceptable for the remote case.
+ */
+static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
- struct timer_list *t = &__get_cpu_var(mce_timer);
- unsigned long iv = check_interval * HZ;
+ unsigned long iv = mce_adjust_timer(check_interval * HZ);
- setup_timer(t, mce_timer_fn, smp_processor_id());
+ /* Record the interval for the CPU that owns @t, not for the caller */
+ per_cpu(mce_next_interval, cpu) = iv;
- if (mce_ignore_ce)
+ if (mce_ignore_ce || !iv)
return;
- __this_cpu_write(mce_next_interval, iv);
- if (!iv)
- return;
t->expires = round_jiffies(jiffies + iv);
- add_timer_on(t, smp_processor_id());
+ add_timer_on(t, cpu);
}
+/*
+ * Initialise the current CPU's mce_timer with mce_timer_fn (the CPU number
+ * is the timer data) and hand it to mce_start_timer().
+ */
+static void __mcheck_cpu_init_timer(void)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned int cpu = smp_processor_id();
+
+ setup_timer(t, mce_timer_fn, cpu);
+ mce_start_timer(cpu, t);
+}
+
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
@@ -1907,6 +1947,7 @@ static struct miscdevice mce_chrdev_device = {
* check, or 0 to not wait
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting.
+ * mce=bios_cmci_threshold Don't program the CMCI threshold
*/
static int __init mcheck_enable(char *str)
{
@@ -1926,6 +1967,8 @@ static int __init mcheck_enable(char *str)
mce_ignore_ce = 1;
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
mce_bootlog = (str[0] == 'b');
+ else if (!strcmp(str, "bios_cmci_threshold"))
+ mce_bios_cmci_threshold = 1;
else if (isdigit(str[0])) {
get_option(&str, &tolerant);
if (*str == ',') {
@@ -2166,6 +2209,11 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
&mce_cmci_disabled
};
+static struct dev_ext_attribute dev_attr_bios_cmci_threshold = {
+ __ATTR(bios_cmci_threshold, 0444, device_show_int, NULL),
+ &mce_bios_cmci_threshold
+};
+
static struct device_attribute *mce_device_attrs[] = {
&dev_attr_tolerant.attr,
&dev_attr_check_interval.attr,
@@ -2174,6 +2222,7 @@ static struct device_attribute *mce_device_attrs[] = {
&dev_attr_dont_log_ce.attr,
&dev_attr_ignore_ce.attr,
&dev_attr_cmci_disabled.attr,
+ &dev_attr_bios_cmci_threshold.attr,
NULL
};
@@ -2294,38 +2343,33 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
unsigned int cpu = (unsigned long)hcpu;
struct timer_list *t = &per_cpu(mce_timer, cpu);
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
mce_device_create(cpu);
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
break;
case CPU_DEAD:
- case CPU_DEAD_FROZEN:
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
mce_device_remove(cpu);
+ mce_intel_hcpu_update(cpu);
break;
case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- del_timer_sync(t);
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+ del_timer_sync(t);
break;
case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- if (!mce_ignore_ce && check_interval) {
- t->expires = round_jiffies(jiffies +
- per_cpu(mce_next_interval, cpu));
- add_timer_on(t, cpu);
- }
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+ mce_start_timer(cpu, t);
break;
- case CPU_POST_DEAD:
+ }
+
+ if (action == CPU_POST_DEAD) {
/* intentionally ignoring frozen here */
cmci_rediscover(cpu);
- break;
}
+
return NOTIFY_OK;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 38e49bc95ff..5f88abf07e9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -15,6 +15,8 @@
#include <asm/msr.h>
#include <asm/mce.h>
+#include "mce-internal.h"
+
/*
* Support for Intel Correct Machine Check Interrupts. This allows
* the CPU to raise an interrupt when a corrected machine check happened.
@@ -30,7 +32,22 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
*/
static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
-#define CMCI_THRESHOLD 1
+#define CMCI_THRESHOLD 1
+/* Poll interval used while a CPU is in, or recovering from, a CMCI storm */
+#define CMCI_POLL_INTERVAL (30 * HZ)
+/* Window, counted from cmci_time_stamp, in which CMCIs are accumulated */
+#define CMCI_STORM_INTERVAL (1 * HZ)
+/* More than this many CMCIs inside one window is treated as a storm */
+#define CMCI_STORM_THRESHOLD 15
+
+/* Per-CPU: start of the current counting window, in jiffies */
+static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
+/* Per-CPU: CMCIs counted in the current window */
+static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
+/* Per-CPU: one of the CMCI_STORM_* values below */
+static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
+
+/* Values of cmci_storm_state */
+enum {
+ CMCI_STORM_NONE,
+ CMCI_STORM_ACTIVE,
+ CMCI_STORM_SUBSIDED,
+};
+
+/* Number of CPUs currently counted as being in CMCI_STORM_ACTIVE */
+static atomic_t cmci_storm_on_cpus;
static int cmci_supported(int *banks)
{
@@ -53,6 +70,93 @@ static int cmci_supported(int *banks)
return !!(cap & MCG_CMCI_P);
}
+/*
+ * Poll the banks in this CPU's mce_banks_owned set. Does nothing while the
+ * CPU's cmci_storm_state is CMCI_STORM_NONE.
+ */
+void mce_intel_cmci_poll(void)
+{
+ if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
+ return;
+ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+}
+
+/*
+ * Reset the CMCI storm bookkeeping of @cpu: drop it from the
+ * cmci_storm_on_cpus count if it was in CMCI_STORM_ACTIVE, then mark it
+ * CMCI_STORM_NONE.
+ */
+void mce_intel_hcpu_update(unsigned long cpu)
+{
+ if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
+ atomic_dec(&cmci_storm_on_cpus);
+
+ per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
+}
+
+/*
+ * Adjust the poll @interval according to this CPU's CMCI storm state and
+ * return the interval to use.
+ *
+ * Intervals below CMCI_POLL_INTERVAL are returned unchanged. Otherwise a
+ * CPU in CMCI_STORM_ACTIVE moves to CMCI_STORM_SUBSIDED, and a subsided CPU
+ * goes back to CMCI_STORM_NONE and re-enables CMCI once cmci_storm_on_cpus
+ * has dropped to zero; both cases return CMCI_POLL_INTERVAL. With no storm
+ * state set, @interval is returned as is.
+ */
+unsigned long mce_intel_adjust_timer(unsigned long interval)
+{
+ int r;
+
+ if (interval < CMCI_POLL_INTERVAL)
+ return interval;
+
+ switch (__this_cpu_read(cmci_storm_state)) {
+ case CMCI_STORM_ACTIVE:
+ /*
+ * We switch back to interrupt mode once the poll timer has
+ * silenced itself. That means no events recorded and the
+ * timer interval is back to our poll interval.
+ */
+ __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
+ r = atomic_sub_return(1, &cmci_storm_on_cpus);
+ /* Only the last CPU leaving the storm logs the message */
+ if (r == 0)
+ pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+ /* FALLTHROUGH */
+
+ case CMCI_STORM_SUBSIDED:
+ /*
+ * We wait for all cpus to go back to SUBSIDED
+ * state. When that happens we switch back to
+ * interrupt mode.
+ */
+ if (!atomic_read(&cmci_storm_on_cpus)) {
+ __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
+ cmci_reenable();
+ cmci_recheck();
+ }
+ return CMCI_POLL_INTERVAL;
+ default:
+ /*
+ * We have shiny weather. Let the poll do whatever it
+ * thinks.
+ */
+ return interval;
+ }
+}
+
+/*
+ * Storm check run at the start of the CMCI handler. Returns true when this
+ * CPU is, or has just been put, in storm mode; false otherwise.
+ *
+ * Calls arriving within CMCI_STORM_INTERVAL of cmci_time_stamp are counted.
+ * Once the count exceeds CMCI_STORM_THRESHOLD, cmci_clear() is called, the
+ * CPU is marked CMCI_STORM_ACTIVE and the poll timer is kicked with
+ * CMCI_POLL_INTERVAL.
+ */
+static bool cmci_storm_detect(void)
+{
+ unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
+ unsigned long ts = __this_cpu_read(cmci_time_stamp);
+ unsigned long now = jiffies;
+ int r;
+
+ /* Any state other than NONE already counts as storm mode */
+ if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
+ return true;
+
+ if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
+ cnt++;
+ } else {
+ /* Window expired: start a new one with this event */
+ cnt = 1;
+ __this_cpu_write(cmci_time_stamp, now);
+ }
+ __this_cpu_write(cmci_storm_cnt, cnt);
+
+ if (cnt <= CMCI_STORM_THRESHOLD)
+ return false;
+
+ cmci_clear();
+ __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
+ r = atomic_add_return(1, &cmci_storm_on_cpus);
+ mce_timer_kick(CMCI_POLL_INTERVAL);
+
+ /* Only the first CPU entering the storm logs the message */
+ if (r == 1)
+ pr_notice("CMCI storm detected: switching to poll mode\n");
+ return true;
+}
+
/*
* The interrupt handler. This is called on every event.
* Just call the poller directly to log any events.
@@ -61,33 +165,28 @@ static int cmci_supported(int *banks)
*/
static void intel_threshold_interrupt(void)
{
+ if (cmci_storm_detect())
+ return;
machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
mce_notify_irq();
}
-static void print_update(char *type, int *hdr, int num)
-{
- if (*hdr == 0)
- printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
- *hdr = 1;
- printk(KERN_CONT " %s:%d", type, num);
-}
-
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
* banks.
*/
-static void cmci_discover(int banks, int boot)
+static void cmci_discover(int banks)
{
unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
unsigned long flags;
- int hdr = 0;
int i;
+ int bios_wrong_thresh = 0;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
+ int bios_zero_thresh = 0;
if (test_bit(i, owned))
continue;
@@ -96,29 +195,52 @@ static void cmci_discover(int banks, int boot)
/* Already owned by someone else? */
if (val & MCI_CTL2_CMCI_EN) {
- if (test_and_clear_bit(i, owned) && !boot)
- print_update("SHD", &hdr, i);
+ clear_bit(i, owned);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
continue;
}
- val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
- val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
+ if (!mce_bios_cmci_threshold) {
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= CMCI_THRESHOLD;
+ } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
+ /*
+ * If bios_cmci_threshold boot option was specified
+ * but the threshold is zero, we'll try to initialize
+ * it to 1.
+ */
+ bios_zero_thresh = 1;
+ val |= CMCI_THRESHOLD;
+ }
+
+ val |= MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Did the enable bit stick? -- the bank supports CMCI */
if (val & MCI_CTL2_CMCI_EN) {
- if (!test_and_set_bit(i, owned) && !boot)
- print_update("CMCI", &hdr, i);
+ set_bit(i, owned);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
+ /*
+ * We are able to set thresholds for some banks that
+ * had a threshold of 0. This means the BIOS has not
+ * set the thresholds properly or does not work with
+ * this boot option. Note down now and report later.
+ */
+ if (mce_bios_cmci_threshold && bios_zero_thresh &&
+ (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
+ bios_wrong_thresh = 1;
} else {
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
}
}
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
- if (hdr)
- printk(KERN_CONT "\n");
+ if (mce_bios_cmci_threshold && bios_wrong_thresh) {
+ pr_info_once(
+ "bios_cmci_threshold: Some banks do not have valid thresholds set\n");
+ pr_info_once(
+ "bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
+ }
}
/*
@@ -156,7 +278,7 @@ void cmci_clear(void)
continue;
/* Disable CMCI */
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
- val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
+ val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
__clear_bit(i, __get_cpu_var(mce_banks_owned));
}
@@ -186,7 +308,7 @@ void cmci_rediscover(int dying)
continue;
/* Recheck banks in case CPUs don't all have the same */
if (cmci_supported(&banks))
- cmci_discover(banks, 0);
+ cmci_discover(banks);
}
set_cpus_allowed_ptr(current, old);
@@ -200,7 +322,7 @@ void cmci_reenable(void)
{
int banks;
if (cmci_supported(&banks))
- cmci_discover(banks, 0);
+ cmci_discover(banks);
}
static void intel_init_cmci(void)
@@ -211,7 +333,7 @@ static void intel_init_cmci(void)
return;
mce_threshold_vector = intel_threshold_interrupt;
- cmci_discover(banks, 1);
+ cmci_discover(banks);
/*
* For CPU #0 this runs with still disabled APIC, but that's
* ok because only the vector is set up. We still do another
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 38e4894165b..99d96a4978b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -1950,7 +1950,7 @@ struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cp
static struct intel_uncore_box *
uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
{
- static struct intel_uncore_box *box;
+ struct intel_uncore_box *box;
box = *per_cpu_ptr(pmu->box, cpu);
if (box)
@@ -2347,6 +2347,27 @@ int uncore_pmu_event_init(struct perf_event *event)
return ret;
}
+static ssize_t uncore_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask);
+
+ buf[n++] = '\n';
+ buf[n] = '\0';
+ return n;
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
+
+static struct attribute *uncore_pmu_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group uncore_pmu_attr_group = {
+ .attrs = uncore_pmu_attrs,
+};
+
static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
{
int ret;
@@ -2384,8 +2405,8 @@ static void __init uncore_type_exit(struct intel_uncore_type *type)
free_percpu(type->pmus[i].box);
kfree(type->pmus);
type->pmus = NULL;
- kfree(type->attr_groups[1]);
- type->attr_groups[1] = NULL;
+ kfree(type->events_group);
+ type->events_group = NULL;
}
static void __init uncore_types_exit(struct intel_uncore_type **types)
@@ -2437,9 +2458,10 @@ static int __init uncore_type_init(struct intel_uncore_type *type)
for (j = 0; j < i; j++)
attrs[j] = &type->event_descs[j].attr.attr;
- type->attr_groups[1] = events_group;
+ type->events_group = events_group;
}
+ type->pmu_group = &uncore_pmu_attr_group;
type->pmus = pmus;
return 0;
fail:
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 5b81c1856aa..e68a4550e95 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -369,10 +369,12 @@ struct intel_uncore_type {
struct intel_uncore_pmu *pmus;
struct intel_uncore_ops *ops;
struct uncore_event_desc *event_descs;
- const struct attribute_group *attr_groups[3];
+ const struct attribute_group *attr_groups[4];
};
-#define format_group attr_groups[0]
+#define pmu_group attr_groups[0]
+#define format_group attr_groups[1]
+#define events_group attr_groups[2]
struct intel_uncore_ops {
void (*init_box)(struct intel_uncore_box *);
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 8022c668148..fbd89556229 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -140,10 +140,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
static void *c_start(struct seq_file *m, loff_t *pos)
{
- if (*pos == 0) /* just in case, cpu 0 is not the first */
- *pos = cpumask_first(cpu_online_mask);
- else
- *pos = cpumask_next(*pos - 1, cpu_online_mask);
+ *pos = cpumask_next(*pos - 1, cpu_online_mask);
if ((*pos) < nr_cpu_ids)
return &cpu_data(*pos);
return NULL;
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 39472dd2323..60c78917190 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -199,12 +199,14 @@ static int __init cpuid_init(void)
goto out_chrdev;
}
cpuid_class->devnode = cpuid_devnode;
+ get_online_cpus();
for_each_online_cpu(i) {
err = cpuid_device_create(i);
if (err != 0)
goto out_class;
}
register_hotcpu_notifier(&cpuid_class_cpu_notifier);
+ put_online_cpus();
err = 0;
goto out;
@@ -214,6 +216,7 @@ out_class:
for_each_online_cpu(i) {
cpuid_device_destroy(i);
}
+ put_online_cpus();
class_destroy(cpuid_class);
out_chrdev:
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
@@ -225,11 +228,13 @@ static void __exit cpuid_exit(void)
{
int cpu = 0;
+ get_online_cpus();
for_each_online_cpu(cpu)
cpuid_device_destroy(cpu);
class_destroy(cpuid_class);
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
+ put_online_cpus();
}
module_init(cpuid_init);
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 3ae2ced4a87..b1581527a23 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -342,6 +342,47 @@ const struct irq_domain_ops ioapic_irq_domain_ops = {
.xlate = ioapic_xlate,
};
+static void dt_add_ioapic_domain(unsigned int ioapic_num,
+ struct device_node *np)
+{
+ struct irq_domain *id;
+ struct mp_ioapic_gsi *gsi_cfg;
+ int ret;
+ int num;
+
+ gsi_cfg = mp_ioapic_gsi_routing(ioapic_num);
+ num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
+
+ id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops,
+ (void *)ioapic_num);
+ BUG_ON(!id);
+ if (gsi_cfg->gsi_base == 0) {
+ /*
+ * The first NR_IRQS_LEGACY irq descs are allocated in
+ * early_irq_init() and need just a mapping. The
+ * remaining irqs need both. All of them are preallocated
+ * and assigned, so we can keep the 1:1 mapping that the ioapic
+ * already uses.
+ */
+ ret = irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY);
+ if (ret)
+ pr_err("Error mapping legacy IRQs: %d\n", ret);
+
+ if (num > NR_IRQS_LEGACY) {
+ ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY,
+ NR_IRQS_LEGACY, num - NR_IRQS_LEGACY);
+ if (ret)
+ pr_err("Error creating mapping for the "
+ "remaining IRQs: %d\n", ret);
+ }
+ irq_set_default_host(id);
+ } else {
+ ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num);
+ if (ret)
+ pr_err("Error creating IRQ mapping: %d\n", ret);
+ }
+}
+
static void __init ioapic_add_ofnode(struct device_node *np)
{
struct resource r;
@@ -356,15 +397,7 @@ static void __init ioapic_add_ofnode(struct device_node *np)
for (i = 0; i < nr_ioapics; i++) {
if (r.start == mpc_ioapic_addr(i)) {
- struct irq_domain *id;
- struct mp_ioapic_gsi *gsi_cfg;
-
- gsi_cfg = mp_ioapic_gsi_routing(i);
-
- id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0,
- &ioapic_irq_domain_ops,
- (void*)i);
- BUG_ON(!id);
+ dt_add_ioapic_domain(i, np);
return;
}
}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 623f2883747..0750e3ba87c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -57,6 +57,7 @@
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
+#include <asm/smap.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
@@ -407,7 +408,9 @@ sysenter_past_esp:
*/
cmpl $__PAGE_OFFSET-3,%ebp
jae syscall_fault
+ ASM_STAC
1: movl (%ebp),%ebp
+ ASM_CLAC
movl %ebp,PT_EBP(%esp)
_ASM_EXTABLE(1b,syscall_fault)
@@ -488,6 +491,7 @@ ENDPROC(ia32_sysenter_target)
# system call handler stub
ENTRY(system_call)
RING0_INT_FRAME # can't unwind into user space anyway
+ ASM_CLAC
pushl_cfi %eax # save orig_eax
SAVE_ALL
GET_THREAD_INFO(%ebp)
@@ -670,6 +674,7 @@ END(syscall_exit_work)
RING0_INT_FRAME # can't unwind into user space anyway
syscall_fault:
+ ASM_CLAC
GET_THREAD_INFO(%ebp)
movl $-EFAULT,PT_EAX(%esp)
jmp resume_userspace
@@ -825,6 +830,7 @@ END(interrupt)
*/
.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
+ ASM_CLAC
addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
SAVE_ALL
TRACE_IRQS_OFF
@@ -841,6 +847,7 @@ ENDPROC(common_interrupt)
#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
RING0_INT_FRAME; \
+ ASM_CLAC; \
pushl_cfi $~(nr); \
SAVE_ALL; \
TRACE_IRQS_OFF \
@@ -857,6 +864,7 @@ ENDPROC(name)
ENTRY(coprocessor_error)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
pushl_cfi $do_coprocessor_error
jmp error_code
@@ -865,6 +873,7 @@ END(coprocessor_error)
ENTRY(simd_coprocessor_error)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
#ifdef CONFIG_X86_INVD_BUG
/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
@@ -886,6 +895,7 @@ END(simd_coprocessor_error)
ENTRY(device_not_available)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $-1 # mark this as an int
pushl_cfi $do_device_not_available
jmp error_code
@@ -906,6 +916,7 @@ END(native_irq_enable_sysexit)
ENTRY(overflow)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
pushl_cfi $do_overflow
jmp error_code
@@ -914,6 +925,7 @@ END(overflow)
ENTRY(bounds)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
pushl_cfi $do_bounds
jmp error_code
@@ -922,6 +934,7 @@ END(bounds)
ENTRY(invalid_op)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
pushl_cfi $do_invalid_op
jmp error_code
@@ -930,6 +943,7 @@ END(invalid_op)
ENTRY(coprocessor_segment_overrun)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
pushl_cfi $do_coprocessor_segment_overrun
jmp error_code
@@ -938,6 +952,7 @@ END(coprocessor_segment_overrun)
ENTRY(invalid_TSS)
RING0_EC_FRAME
+ ASM_CLAC
pushl_cfi $do_invalid_TSS
jmp error_code
CFI_ENDPROC
@@ -945,6 +960,7 @@ END(invalid_TSS)
ENTRY(segment_not_present)
RING0_EC_FRAME
+ ASM_CLAC
pushl_cfi $do_segment_not_present
jmp error_code
CFI_ENDPROC
@@ -952,6 +968,7 @@ END(segment_not_present)
ENTRY(stack_segment)
RING0_EC_FRAME
+ ASM_CLAC
pushl_cfi $do_stack_segment
jmp error_code
CFI_ENDPROC
@@ -959,6 +976,7 @@ END(stack_segment)
ENTRY(alignment_check)
RING0_EC_FRAME
+ ASM_CLAC
pushl_cfi $do_alignment_check
jmp error_code
CFI_ENDPROC
@@ -966,6 +984,7 @@ END(alignment_check)
ENTRY(divide_error)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0 # no error code
pushl_cfi $do_divide_error
jmp error_code
@@ -975,6 +994,7 @@ END(divide_error)
#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
pushl_cfi machine_check_vector
jmp error_code
@@ -984,6 +1004,7 @@ END(machine_check)
ENTRY(spurious_interrupt_bug)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
pushl_cfi $do_spurious_interrupt_bug
jmp error_code
@@ -1109,17 +1130,21 @@ ENTRY(ftrace_caller)
pushl %eax
pushl %ecx
pushl %edx
- movl 0xc(%esp), %eax
+ pushl $0 /* Pass NULL as regs pointer */
+ movl 4*4(%esp), %eax
movl 0x4(%ebp), %edx
+ leal function_trace_op, %ecx
subl $MCOUNT_INSN_SIZE, %eax
.globl ftrace_call
ftrace_call:
call ftrace_stub
+ addl $4,%esp /* skip NULL pointer */
popl %edx
popl %ecx
popl %eax
+ftrace_ret:
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
.globl ftrace_graph_call
ftrace_graph_call:
@@ -1131,6 +1156,71 @@ ftrace_stub:
ret
END(ftrace_caller)
+ENTRY(ftrace_regs_caller)
+ pushf /* push flags before compare (in cs location) */
+ cmpl $0, function_trace_stop
+ jne ftrace_restore_flags
+
+ /*
+ * i386 does not save SS and ESP when coming from kernel.
+ * Instead, to get sp, &regs->sp is used (see ptrace.h).
+ * Unfortunately, that means eflags must be at the same location
+ * as the current return ip is. We move the return ip into the
+ * ip location, and move flags into the return ip location.
+ */
+ pushl 4(%esp) /* save return ip into ip slot */
+
+ pushl $0 /* Load 0 into orig_ax */
+ pushl %gs
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+
+ movl 13*4(%esp), %eax /* Get the saved flags */
+ movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */
+ /* clobbering return ip */
+ movl $__KERNEL_CS,13*4(%esp)
+
+ movl 12*4(%esp), %eax /* Load ip (1st parameter) */
+ subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
+ movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
+ movl function_trace_op, %ecx /* Load the ftrace_ops pointer itself, not the address of function_trace_op (3rd parameter) */
+ pushl %esp /* Save pt_regs as 4th parameter */
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ addl $4, %esp /* Skip pt_regs */
+ movl 14*4(%esp), %eax /* Move flags back into cs */
+ movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */
+ movl 12*4(%esp), %eax /* Get return ip from regs->ip */
+ movl %eax, 14*4(%esp) /* Put return ip back for ret */
+
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+ popl %ds
+ popl %es
+ popl %fs
+ popl %gs
+ addl $8, %esp /* Skip orig_ax and ip */
+ popf /* Pop flags at end (no addl to corrupt flags) */
+ jmp ftrace_ret
+
+ftrace_restore_flags:
+ popf
+ jmp ftrace_stub
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
@@ -1171,9 +1261,6 @@ END(mcount)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
pushl %eax
pushl %ecx
pushl %edx
@@ -1207,6 +1294,7 @@ return_to_handler:
ENTRY(page_fault)
RING0_EC_FRAME
+ ASM_CLAC
pushl_cfi $do_page_fault
ALIGN
error_code:
@@ -1279,6 +1367,7 @@ END(page_fault)
ENTRY(debug)
RING0_INT_FRAME
+ ASM_CLAC
cmpl $ia32_sysenter_target,(%esp)
jne debug_stack_correct
FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
@@ -1303,6 +1392,7 @@ END(debug)
*/
ENTRY(nmi)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi %eax
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
@@ -1373,6 +1463,7 @@ END(nmi)
ENTRY(int3)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $-1 # mark this as an int
SAVE_ALL
TRACE_IRQS_OFF
@@ -1393,6 +1484,7 @@ END(general_protection)
#ifdef CONFIG_KVM_GUEST
ENTRY(async_page_fault)
RING0_EC_FRAME
+ ASM_CLAC
pushl_cfi $do_async_page_fault
jmp error_code
CFI_ENDPROC
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 69babd8c834..44531acd9a8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -56,6 +56,8 @@
#include <asm/ftrace.h>
#include <asm/percpu.h>
#include <asm/asm.h>
+#include <asm/rcu.h>
+#include <asm/smap.h>
#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -68,25 +70,51 @@
.section .entry.text, "ax"
#ifdef CONFIG_FUNCTION_TRACER
+
+#ifdef CC_USING_FENTRY
+# define function_hook __fentry__
+#else
+# define function_hook mcount
+#endif
+
#ifdef CONFIG_DYNAMIC_FTRACE
-ENTRY(mcount)
+
+ENTRY(function_hook)
retq
-END(mcount)
+END(function_hook)
+
+/* skip is set if stack has been adjusted */
+.macro ftrace_caller_setup skip=0
+ MCOUNT_SAVE_FRAME \skip
+
+ /* Load the ftrace_ops pointer (the value held in function_trace_op, not its address) into the 3rd parameter */
+ movq function_trace_op(%rip), %rdx
+
+ /* Load ip into the first parameter */
+ movq RIP(%rsp), %rdi
+ subq $MCOUNT_INSN_SIZE, %rdi
+ /* Load the parent_ip into the second parameter */
+#ifdef CC_USING_FENTRY
+ movq SS+16(%rsp), %rsi
+#else
+ movq 8(%rbp), %rsi
+#endif
+.endm
ENTRY(ftrace_caller)
+ /* Check if tracing was disabled (quick check) */
cmpl $0, function_trace_stop
jne ftrace_stub
- MCOUNT_SAVE_FRAME
-
- movq 0x38(%rsp), %rdi
- movq 8(%rbp), %rsi
- subq $MCOUNT_INSN_SIZE, %rdi
+ ftrace_caller_setup
+ /* regs go into 4th parameter (but make it NULL) */
+ movq $0, %rcx
GLOBAL(ftrace_call)
call ftrace_stub
MCOUNT_RESTORE_FRAME
+ftrace_return:
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
GLOBAL(ftrace_graph_call)
@@ -97,8 +125,78 @@ GLOBAL(ftrace_stub)
retq
END(ftrace_caller)
+ENTRY(ftrace_regs_caller)
+ /* Save the current flags before compare (in SS location)*/
+ pushfq
+
+ /* Check if tracing was disabled (quick check) */
+ cmpl $0, function_trace_stop
+ jne ftrace_restore_flags
+
+ /* skip=8 to skip flags saved in SS */
+ ftrace_caller_setup 8
+
+ /* Save the rest of pt_regs */
+ movq %r15, R15(%rsp)
+ movq %r14, R14(%rsp)
+ movq %r13, R13(%rsp)
+ movq %r12, R12(%rsp)
+ movq %r11, R11(%rsp)
+ movq %r10, R10(%rsp)
+ movq %rbp, RBP(%rsp)
+ movq %rbx, RBX(%rsp)
+ /* Copy saved flags */
+ movq SS(%rsp), %rcx
+ movq %rcx, EFLAGS(%rsp)
+ /* Kernel segments */
+ movq $__KERNEL_DS, %rcx
+ movq %rcx, SS(%rsp)
+ movq $__KERNEL_CS, %rcx
+ movq %rcx, CS(%rsp)
+ /* Stack - skipping return address */
+ leaq SS+16(%rsp), %rcx
+ movq %rcx, RSP(%rsp)
+
+ /* regs go into 4th parameter */
+ leaq (%rsp), %rcx
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ /* Copy flags back to SS, to restore them */
+ movq EFLAGS(%rsp), %rax
+ movq %rax, SS(%rsp)
+
+ /* Handlers can change the RIP */
+ movq RIP(%rsp), %rax
+ movq %rax, SS+8(%rsp)
+
+ /* restore the rest of pt_regs */
+ movq R15(%rsp), %r15
+ movq R14(%rsp), %r14
+ movq R13(%rsp), %r13
+ movq R12(%rsp), %r12
+ movq R10(%rsp), %r10
+ movq RBP(%rsp), %rbp
+ movq RBX(%rsp), %rbx
+
+ /* skip=8 to skip flags saved in SS */
+ MCOUNT_RESTORE_FRAME 8
+
+ /* Restore flags */
+ popfq
+
+ jmp ftrace_return
+ftrace_restore_flags:
+ popfq
+ jmp ftrace_stub
+
+END(ftrace_regs_caller)
+
+
#else /* ! CONFIG_DYNAMIC_FTRACE */
-ENTRY(mcount)
+
+ENTRY(function_hook)
cmpl $0, function_trace_stop
jne ftrace_stub
@@ -119,8 +217,12 @@ GLOBAL(ftrace_stub)
trace:
MCOUNT_SAVE_FRAME
- movq 0x38(%rsp), %rdi
+ movq RIP(%rsp), %rdi
+#ifdef CC_USING_FENTRY
+ movq SS+16(%rsp), %rsi
+#else
movq 8(%rbp), %rsi
+#endif
subq $MCOUNT_INSN_SIZE, %rdi
call *ftrace_trace_function
@@ -128,20 +230,22 @@ trace:
MCOUNT_RESTORE_FRAME
jmp ftrace_stub
-END(mcount)
+END(function_hook)
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
MCOUNT_SAVE_FRAME
+#ifdef CC_USING_FENTRY
+ leaq SS+16(%rsp), %rdi
+ movq $0, %rdx /* No framepointers needed */
+#else
leaq 8(%rbp), %rdi
- movq 0x38(%rsp), %rsi
movq (%rbp), %rdx
+#endif
+ movq RIP(%rsp), %rsi
subq $MCOUNT_INSN_SIZE, %rsi
call prepare_ftrace_return
@@ -342,15 +446,15 @@ ENDPROC(native_usergs_sysret64)
.macro SAVE_ARGS_IRQ
cld
/* start from rbp in pt_regs and jump over */
- movq_cfi rdi, RDI-RBP
- movq_cfi rsi, RSI-RBP
- movq_cfi rdx, RDX-RBP
- movq_cfi rcx, RCX-RBP
- movq_cfi rax, RAX-RBP
- movq_cfi r8, R8-RBP
- movq_cfi r9, R9-RBP
- movq_cfi r10, R10-RBP
- movq_cfi r11, R11-RBP
+ movq_cfi rdi, (RDI-RBP)
+ movq_cfi rsi, (RSI-RBP)
+ movq_cfi rdx, (RDX-RBP)
+ movq_cfi rcx, (RCX-RBP)
+ movq_cfi rax, (RAX-RBP)
+ movq_cfi r8, (R8-RBP)
+ movq_cfi r9, (R9-RBP)
+ movq_cfi r10, (R10-RBP)
+ movq_cfi r11, (R11-RBP)
/* Save rbp so that we can unwind from get_irq_regs() */
movq_cfi rbp, 0
@@ -384,7 +488,7 @@ ENDPROC(native_usergs_sysret64)
.endm
ENTRY(save_rest)
- PARTIAL_FRAME 1 REST_SKIP+8
+ PARTIAL_FRAME 1 (REST_SKIP+8)
movq 5*8+16(%rsp), %r11 /* save return address */
movq_cfi rbx, RBX+16
movq_cfi rbp, RBP+16
@@ -440,7 +544,7 @@ ENTRY(ret_from_fork)
LOCK ; btr $TIF_FORK,TI_flags(%r8)
- pushq_cfi kernel_eflags(%rip)
+ pushq_cfi $0x0002
popfq_cfi # reset kernel eflags
call schedule_tail # rdi: 'prev' task parameter
@@ -465,7 +569,8 @@ END(ret_from_fork)
* System call entry. Up to 6 arguments in registers are supported.
*
* SYSCALL does not save anything on the stack and does not change the
- * stack pointer.
+ * stack pointer. However, it does mask the flags register for us, so
+ * CLD and CLAC are not needed.
*/
/*
@@ -565,7 +670,7 @@ sysret_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
- call schedule
+ SCHEDULE_USER
popq_cfi %rdi
jmp sysret_check
@@ -678,7 +783,7 @@ int_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
- call schedule
+ SCHEDULE_USER
popq_cfi %rdi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
@@ -884,6 +989,7 @@ END(interrupt)
*/
.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
+ ASM_CLAC
XCPT_FRAME
addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
interrupt do_IRQ
@@ -974,7 +1080,7 @@ retint_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
- call schedule
+ SCHEDULE_USER
popq_cfi %rdi
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -1023,6 +1129,7 @@ END(common_interrupt)
*/
.macro apicinterrupt num sym do_sym
ENTRY(\sym)
+ ASM_CLAC
INTR_FRAME
pushq_cfi $~(\num)
.Lcommon_\sym:
@@ -1077,6 +1184,7 @@ apicinterrupt IRQ_WORK_VECTOR \
*/
.macro zeroentry sym do_sym
ENTRY(\sym)
+ ASM_CLAC
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
@@ -1094,6 +1202,7 @@ END(\sym)
.macro paranoidzeroentry sym do_sym
ENTRY(\sym)
+ ASM_CLAC
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
@@ -1112,6 +1221,7 @@ END(\sym)
#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
.macro paranoidzeroentry_ist sym do_sym ist
ENTRY(\sym)
+ ASM_CLAC
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
@@ -1131,6 +1241,7 @@ END(\sym)
.macro errorentry sym do_sym
ENTRY(\sym)
+ ASM_CLAC
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
subq $ORIG_RAX-R15, %rsp
@@ -1149,6 +1260,7 @@ END(\sym)
/* error code is on the stack already */
.macro paranoiderrorentry sym do_sym
ENTRY(\sym)
+ ASM_CLAC
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
subq $ORIG_RAX-R15, %rsp
@@ -1449,7 +1561,7 @@ paranoid_userspace:
paranoid_schedule:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
+ SCHEDULE_USER
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
jmp paranoid_userspace
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index c3a7cb4bf6e..1d414029f1d 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -206,6 +206,21 @@ static int
ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
unsigned const char *new_code);
+/*
+ * Should never be called:
+ * As it is only called by __ftrace_replace_code() which is called by
+ * ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
+ * which is called to turn mcount into nops or nops into function calls
+ * but not to convert a function from not using regs to one that uses
+ * regs, which ftrace_modify_call() is for.
+ */
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+ unsigned long addr)
+{
+ WARN_ON(1);
+ return -EINVAL;
+}
+
int ftrace_update_ftrace_func(ftrace_func_t func)
{
unsigned long ip = (unsigned long)(&ftrace_call);
@@ -220,6 +235,14 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
ret = ftrace_modify_code(ip, old, new);
+ /* Also update the regs callback function */
+ if (!ret) {
+ ip = (unsigned long)(&ftrace_regs_call);
+ memcpy(old, &ftrace_regs_call, MCOUNT_INSN_SIZE);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ ret = ftrace_modify_code(ip, old, new);
+ }
+
atomic_dec(&modifying_ftrace_code);
return ret;
@@ -299,6 +322,32 @@ static int add_brk_on_nop(struct dyn_ftrace *rec)
return add_break(rec->ip, old);
}
+/*
+ * If the record has the FTRACE_FL_REGS set, that means that it
+ * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
+ * is not set, then it wants to convert to the normal callback.
+ */
+static unsigned long get_ftrace_addr(struct dyn_ftrace *rec)
+{
+ if (rec->flags & FTRACE_FL_REGS)
+ return (unsigned long)FTRACE_REGS_ADDR;
+ else
+ return (unsigned long)FTRACE_ADDR;
+}
+
+/*
+ * The FTRACE_FL_REGS_EN is set when the record already points to
+ * a function that saves all the regs. Basically the '_EN' version
+ * represents the current state of the function.
+ */
+static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec)
+{
+ if (rec->flags & FTRACE_FL_REGS_EN)
+ return (unsigned long)FTRACE_REGS_ADDR;
+ else
+ return (unsigned long)FTRACE_ADDR;
+}
+
static int add_breakpoints(struct dyn_ftrace *rec, int enable)
{
unsigned long ftrace_addr;
@@ -306,7 +355,7 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable)
ret = ftrace_test_record(rec, enable);
- ftrace_addr = (unsigned long)FTRACE_ADDR;
+ ftrace_addr = get_ftrace_addr(rec);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
@@ -316,6 +365,10 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable)
/* converting nop to call */
return add_brk_on_nop(rec);
+ case FTRACE_UPDATE_MODIFY_CALL_REGS:
+ case FTRACE_UPDATE_MODIFY_CALL:
+ ftrace_addr = get_ftrace_old_addr(rec);
+ /* fall through */
case FTRACE_UPDATE_MAKE_NOP:
/* converting a call to a nop */
return add_brk_on_call(rec, ftrace_addr);
@@ -360,13 +413,21 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
* If not, don't touch the breakpoint, we make just create
* a disaster.
*/
- ftrace_addr = (unsigned long)FTRACE_ADDR;
+ ftrace_addr = get_ftrace_addr(rec);
+ nop = ftrace_call_replace(ip, ftrace_addr);
+
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
+ goto update;
+
+ /* Check both ftrace_addr and ftrace_old_addr */
+ ftrace_addr = get_ftrace_old_addr(rec);
nop = ftrace_call_replace(ip, ftrace_addr);
if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
return -EINVAL;
}
+ update:
return probe_kernel_write((void *)ip, &nop[0], 1);
}
@@ -405,12 +466,14 @@ static int add_update(struct dyn_ftrace *rec, int enable)
ret = ftrace_test_record(rec, enable);
- ftrace_addr = (unsigned long)FTRACE_ADDR;
+ ftrace_addr = get_ftrace_addr(rec);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
+ case FTRACE_UPDATE_MODIFY_CALL_REGS:
+ case FTRACE_UPDATE_MODIFY_CALL:
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
return add_update_call(rec, ftrace_addr);
@@ -455,12 +518,14 @@ static int finish_update(struct dyn_ftrace *rec, int enable)
ret = ftrace_update_record(rec, enable);
- ftrace_addr = (unsigned long)FTRACE_ADDR;
+ ftrace_addr = get_ftrace_addr(rec);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
+ case FTRACE_UPDATE_MODIFY_CALL_REGS:
+ case FTRACE_UPDATE_MODIFY_CALL:
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
return finish_update_call(rec, ftrace_addr);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index d42ab17b739..957a47aec64 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -287,27 +287,28 @@ ENTRY(startup_32_smp)
leal -__PAGE_OFFSET(%ecx),%esp
default_entry:
-
/*
* New page tables may be in 4Mbyte page mode and may
* be using the global pages.
*
* NOTE! If we are on a 486 we may have no cr4 at all!
- * So we do not try to touch it unless we really have
- * some bits in it to set. This won't work if the BSP
- * implements cr4 but this AP does not -- very unlikely
- * but be warned! The same applies to the pse feature
- * if not equally supported. --macro
- *
- * NOTE! We have to correct for the fact that we're
- * not yet offset PAGE_OFFSET..
+ * Specifically, cr4 exists if and only if CPUID exists,
+ * which in turn exists if and only if EFLAGS.ID exists.
*/
-#define cr4_bits pa(mmu_cr4_features)
- movl cr4_bits,%edx
- andl %edx,%edx
- jz 6f
- movl %cr4,%eax # Turn on paging options (PSE,PAE,..)
- orl %edx,%eax
+ movl $X86_EFLAGS_ID,%ecx
+ pushl %ecx
+ popfl
+ pushfl
+ popl %eax
+ pushl $0
+ popfl
+ pushfl
+ popl %edx
+ xorl %edx,%eax
+ testl %ecx,%eax
+ jz 6f # No ID flag = no CPUID = no CR4
+
+ movl pa(mmu_cr4_features),%eax
movl %eax,%cr4
testb $X86_CR4_PAE, %al # check if PAE is enabled
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index f250431fb50..675a0501244 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -19,24 +19,17 @@
#include <asm/fpu-internal.h>
#include <asm/user.h>
-#ifdef CONFIG_X86_64
-# include <asm/sigcontext32.h>
-# include <asm/user32.h>
-#else
-# define save_i387_xstate_ia32 save_i387_xstate
-# define restore_i387_xstate_ia32 restore_i387_xstate
-# define _fpstate_ia32 _fpstate
-# define _xstate_ia32 _xstate
-# define sig_xstate_ia32_size sig_xstate_size
-# define fx_sw_reserved_ia32 fx_sw_reserved
-# define user_i387_ia32_struct user_i387_struct
-# define user32_fxsr_struct user_fxsr_struct
-#endif
-
/*
* Were we in an interrupt that interrupted kernel mode?
*
- * We can do a kernel_fpu_begin/end() pair *ONLY* if that
+ * For now, with eagerfpu we will return interrupted kernel FPU
+ * state as not-idle. TBD: Ideally we can change the return value
+ * to something like __thread_has_fpu(current). But we need to
+ * be careful of doing __thread_clear_has_fpu() before saving
+ * the FPU etc for supporting nested uses etc. For now, take
+ * the simple route!
+ *
+ * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that
* pair does nothing at all: the thread must not have fpu (so
* that we don't try to save the FPU state), and TS must
* be set (so that the clts/stts pair does nothing that is
@@ -44,6 +37,9 @@
*/
static inline bool interrupted_kernel_fpu_idle(void)
{
+ if (use_eager_fpu())
+ return 0;
+
return !__thread_has_fpu(current) &&
(read_cr0() & X86_CR0_TS);
}
@@ -77,29 +73,29 @@ bool irq_fpu_usable(void)
}
EXPORT_SYMBOL(irq_fpu_usable);
-void kernel_fpu_begin(void)
+void __kernel_fpu_begin(void)
{
struct task_struct *me = current;
- WARN_ON_ONCE(!irq_fpu_usable());
- preempt_disable();
if (__thread_has_fpu(me)) {
__save_init_fpu(me);
__thread_clear_has_fpu(me);
- /* We do 'stts()' in kernel_fpu_end() */
- } else {
+ /* We do 'stts()' in __kernel_fpu_end() */
+ } else if (!use_eager_fpu()) {
this_cpu_write(fpu_owner_task, NULL);
clts();
}
}
-EXPORT_SYMBOL(kernel_fpu_begin);
+EXPORT_SYMBOL(__kernel_fpu_begin);
-void kernel_fpu_end(void)
+void __kernel_fpu_end(void)
{
- stts();
- preempt_enable();
+ if (use_eager_fpu())
+ math_state_restore();
+ else
+ stts();
}
-EXPORT_SYMBOL(kernel_fpu_end);
+EXPORT_SYMBOL(__kernel_fpu_end);
void unlazy_fpu(struct task_struct *tsk)
{
@@ -113,23 +109,15 @@ void unlazy_fpu(struct task_struct *tsk)
}
EXPORT_SYMBOL(unlazy_fpu);
-#ifdef CONFIG_MATH_EMULATION
-# define HAVE_HWFP (boot_cpu_data.hard_math)
-#else
-# define HAVE_HWFP 1
-#endif
-
-static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
+unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
unsigned int xstate_size;
EXPORT_SYMBOL_GPL(xstate_size);
-unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
static struct i387_fxsave_struct fx_scratch __cpuinitdata;
static void __cpuinit mxcsr_feature_mask_init(void)
{
unsigned long mask = 0;
- clts();
if (cpu_has_fxsr) {
memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct));
asm volatile("fxsave %0" : : "m" (fx_scratch));
@@ -138,7 +126,6 @@ static void __cpuinit mxcsr_feature_mask_init(void)
mask = 0x0000ffbf;
}
mxcsr_feature_mask &= mask;
- stts();
}
static void __cpuinit init_thread_xstate(void)
@@ -192,9 +179,8 @@ void __cpuinit fpu_init(void)
init_thread_xstate();
mxcsr_feature_mask_init();
- /* clean state in init */
- current_thread_info()->status = 0;
- clear_used_math();
+ xsave_init();
+ eager_fpu_init();
}
void fpu_finit(struct fpu *fpu)
@@ -205,12 +191,7 @@ void fpu_finit(struct fpu *fpu)
}
if (cpu_has_fxsr) {
- struct i387_fxsave_struct *fx = &fpu->state->fxsave;
-
- memset(fx, 0, xstate_size);
- fx->cwd = 0x37f;
- if (cpu_has_xmm)
- fx->mxcsr = MXCSR_DEFAULT;
+ fx_finit(&fpu->state->fxsave);
} else {
struct i387_fsave_struct *fp = &fpu->state->fsave;
memset(fp, 0, xstate_size);
@@ -454,7 +435,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
* FXSR floating point environment conversions.
*/
-static void
+void
convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
{
struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
@@ -491,8 +472,8 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
memcpy(&to[i], &from[i], sizeof(to[0]));
}
-static void convert_to_fxsr(struct task_struct *tsk,
- const struct user_i387_ia32_struct *env)
+void convert_to_fxsr(struct task_struct *tsk,
+ const struct user_i387_ia32_struct *env)
{
struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
@@ -589,223 +570,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
}
/*
- * Signal frame handlers.
- */
-
-static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
-{
- struct task_struct *tsk = current;
- struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave;
-
- fp->status = fp->swd;
- if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
- return -1;
- return 1;
-}
-
-static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
-{
- struct task_struct *tsk = current;
- struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
- struct user_i387_ia32_struct env;
- int err = 0;
-
- convert_from_fxsr(&env, tsk);
- if (__copy_to_user(buf, &env, sizeof(env)))
- return -1;
-
- err |= __put_user(fx->swd, &buf->status);
- err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
- if (err)
- return -1;
-
- if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size))
- return -1;
- return 1;
-}
-
-static int save_i387_xsave(void __user *buf)
-{
- struct task_struct *tsk = current;
- struct _fpstate_ia32 __user *fx = buf;
- int err = 0;
-
-
- sanitize_i387_state(tsk);
-
- /*
- * For legacy compatible, we always set FP/SSE bits in the bit
- * vector while saving the state to the user context.
- * This will enable us capturing any changes(during sigreturn) to
- * the FP/SSE bits by the legacy applications which don't touch
- * xstate_bv in the xsave header.
- *
- * xsave aware applications can change the xstate_bv in the xsave
- * header as well as change any contents in the memory layout.
- * xrestore as part of sigreturn will capture all the changes.
- */
- tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
-
- if (save_i387_fxsave(fx) < 0)
- return -1;
-
- err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32,
- sizeof(struct _fpx_sw_bytes));
- err |= __put_user(FP_XSTATE_MAGIC2,
- (__u32 __user *) (buf + sig_xstate_ia32_size
- - FP_XSTATE_MAGIC2_SIZE));
- if (err)
- return -1;
-
- return 1;
-}
-
-int save_i387_xstate_ia32(void __user *buf)
-{
- struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
- struct task_struct *tsk = current;
-
- if (!used_math())
- return 0;
-
- if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size))
- return -EACCES;
- /*
- * This will cause a "finit" to be triggered by the next
- * attempted FPU operation by the 'current' process.
- */
- clear_used_math();
-
- if (!HAVE_HWFP) {
- return fpregs_soft_get(current, NULL,
- 0, sizeof(struct user_i387_ia32_struct),
- NULL, fp) ? -1 : 1;
- }
-
- unlazy_fpu(tsk);
-
- if (cpu_has_xsave)
- return save_i387_xsave(fp);
- if (cpu_has_fxsr)
- return save_i387_fxsave(fp);
- else
- return save_i387_fsave(fp);
-}
-
-static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
-{
- struct task_struct *tsk = current;
-
- return __copy_from_user(&tsk->thread.fpu.state->fsave, buf,
- sizeof(struct i387_fsave_struct));
-}
-
-static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
- unsigned int size)
-{
- struct task_struct *tsk = current;
- struct user_i387_ia32_struct env;
- int err;
-
- err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0],
- size);
- /* mxcsr reserved bits must be masked to zero for security reasons */
- tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
- if (err || __copy_from_user(&env, buf, sizeof(env)))
- return 1;
- convert_to_fxsr(tsk, &env);
-
- return 0;
-}
-
-static int restore_i387_xsave(void __user *buf)
-{
- struct _fpx_sw_bytes fx_sw_user;
- struct _fpstate_ia32 __user *fx_user =
- ((struct _fpstate_ia32 __user *) buf);
- struct i387_fxsave_struct __user *fx =
- (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
- struct xsave_hdr_struct *xsave_hdr =
- &current->thread.fpu.state->xsave.xsave_hdr;
- u64 mask;
- int err;
-
- if (check_for_xstate(fx, buf, &fx_sw_user))
- goto fx_only;
-
- mask = fx_sw_user.xstate_bv;
-
- err = restore_i387_fxsave(buf, fx_sw_user.xstate_size);
-
- xsave_hdr->xstate_bv &= pcntxt_mask;
- /*
- * These bits must be zero.
- */
- xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
-
- /*
- * Init the state that is not present in the memory layout
- * and enabled by the OS.
- */
- mask = ~(pcntxt_mask & ~mask);
- xsave_hdr->xstate_bv &= mask;
-
- return err;
-fx_only:
- /*
- * Couldn't find the extended state information in the memory
- * layout. Restore the FP/SSE and init the other extended state
- * enabled by the OS.
- */
- xsave_hdr->xstate_bv = XSTATE_FPSSE;
- return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct));
-}
-
-int restore_i387_xstate_ia32(void __user *buf)
-{
- int err;
- struct task_struct *tsk = current;
- struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
-
- if (HAVE_HWFP)
- clear_fpu(tsk);
-
- if (!buf) {
- if (used_math()) {
- clear_fpu(tsk);
- clear_used_math();
- }
-
- return 0;
- } else
- if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size))
- return -EACCES;
-
- if (!used_math()) {
- err = init_fpu(tsk);
- if (err)
- return err;
- }
-
- if (HAVE_HWFP) {
- if (cpu_has_xsave)
- err = restore_i387_xsave(buf);
- else if (cpu_has_fxsr)
- err = restore_i387_fxsave(fp, sizeof(struct
- i387_fxsave_struct));
- else
- err = restore_i387_fsave(fp);
- } else {
- err = fpregs_soft_set(current, NULL,
- 0, sizeof(struct user_i387_ia32_struct),
- NULL, fp) != 0;
- }
- set_used_math();
-
- return err;
-}
-
-/*
* FPU state for core dumps.
* This is only used for a.out dumps now.
* It is declared generically using elf_fpregset_t (which is
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 36d1853e91a..9a5c460404d 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -263,7 +263,7 @@ static void i8259A_shutdown(void)
* out of.
*/
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
- outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
+ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
}
static struct syscore_ops i8259_syscore_ops = {
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d44f7829968..e4595f10591 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -92,7 +92,8 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, " Rescheduling interrupts\n");
seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
+ seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
+ irq_stats(j)->irq_tlb_count);
seq_printf(p, " Function call interrupts\n");
seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
@@ -147,7 +148,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
- sum += irq_stats(cpu)->irq_tlb_count;
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
sum += irq_stats(cpu)->irq_thermal_count;
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index e2f751efb7b..57916c0d3cf 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -541,6 +541,23 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb
return 1;
}
+#ifdef KPROBES_CAN_USE_FTRACE
+/*
+ * Finish a kprobe hit without single-stepping: point regs->ip
+ * MCOUNT_INSN_SIZE bytes past the probe address, run the post_handler
+ * (if one is set) with the status set to KPROBE_HIT_SSDONE, and finally
+ * clear this CPU's current_kprobe.
+ */
+static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ /*
+ * Emulate singlestep (and also recover regs->ip)
+ * as if there is a 5byte nop
+ */
+ regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
+ if (unlikely(p->post_handler)) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ p->post_handler(p, regs, 0);
+ }
+ __this_cpu_write(current_kprobe, NULL);
+}
+#endif
+
/*
* Interrupts are disabled on entry as trap3 is an interrupt gate and they
* remain disabled throughout this function.
@@ -599,6 +616,12 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
} else if (kprobe_running()) {
p = __this_cpu_read(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
+#ifdef KPROBES_CAN_USE_FTRACE
+ if (kprobe_ftrace(p)) {
+ skip_singlestep(p, regs, kcb);
+ return 1;
+ }
+#endif
setup_singlestep(p, regs, kcb, 0);
return 1;
}
@@ -1052,6 +1075,50 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
return 0;
}
+#ifdef KPROBES_CAN_USE_FTRACE
+/*
+ * Ftrace callback handler for kprobes.
+ *
+ * Runs with local interrupts disabled for its whole body. Looks up the
+ * kprobe registered at @ip and returns early if there is none or it is
+ * disabled. If another kprobe is already running on this CPU the hit is
+ * only counted as missed. Otherwise the probe becomes the current kprobe,
+ * its pre_handler (if any) is called, and when there is no pre_handler or
+ * it returns 0 the hit is completed through skip_singlestep().
+ * @parent_ip and @ops are not used here.
+ */
+void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
+{
+ struct kprobe *p;
+ struct kprobe_ctlblk *kcb;
+ unsigned long flags;
+
+ /* Disable irq for emulating a breakpoint and avoiding preempt */
+ local_irq_save(flags);
+
+ p = get_kprobe((kprobe_opcode_t *)ip);
+ if (unlikely(!p) || kprobe_disabled(p))
+ goto end;
+
+ kcb = get_kprobe_ctlblk();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(p);
+ } else {
+ /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
+ regs->ip = ip + sizeof(kprobe_opcode_t);
+
+ __this_cpu_write(current_kprobe, p);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ if (!p->pre_handler || !p->pre_handler(p, regs))
+ skip_singlestep(p, regs, kcb);
+ /*
+ * If pre_handler returns !0, it sets regs->ip and
+ * resets current kprobe.
+ */
+ }
+end:
+ local_irq_restore(flags);
+}
+
+/*
+ * Arch-side preparation of a kprobe that is serviced through ftrace:
+ * no instruction copy is kept (ainsn.insn is NULL) and ainsn.boostable
+ * is set to -1.  Always returns 0.
+ */
+int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+	p->ainsn.boostable = -1;
+	p->ainsn.insn = NULL;
+
+	return 0;
+}
+#endif
+
int __init arch_init_kprobes(void)
{
return arch_init_optprobes();
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 82746f942cd..7720ff5a9ee 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -75,20 +75,113 @@ struct microcode_amd {
static struct equiv_cpu_entry *equiv_cpu_table;
-/* page-sized ucode patch buffer */
-void *patch;
+/*
+ * One cached microcode patch, linked into the pcache list.
+ *
+ * plist:     list linkage used by pcache
+ * data:      kzalloc'ed copy of the patch, starting at its header
+ * patch_id:  patch_id copied from the patch header
+ * equiv_cpu: processor_rev_id copied from the patch header
+ */
+struct ucode_patch {
+ struct list_head plist;
+ void *data;
+ u32 patch_id;
+ u16 equiv_cpu;
+};
+
+static LIST_HEAD(pcache);
+
+/*
+ * Translate the recorded signature of @cpu into its equivalence ID.
+ *
+ * Walks equiv_cpu_table until the entry whose installed_cpu is 0 and
+ * returns the equiv_cpu of the first entry matching the CPU's signature.
+ * Returns 0 when no table is installed or nothing matches.
+ */
+static u16 find_equiv_id(unsigned int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	struct equiv_cpu_entry *entry;
+
+	if (!equiv_cpu_table)
+		return 0;
+
+	for (entry = equiv_cpu_table; entry->installed_cpu != 0; entry++) {
+		if (entry->installed_cpu == uci->cpu_sig.sig)
+			return entry->equiv_cpu;
+	}
+
+	return 0;
+}
+
+/*
+ * Reverse lookup in equiv_cpu_table: return the installed_cpu value of
+ * the first entry whose equiv_cpu equals @equiv_cpu, or 0 if the walk
+ * reaches an entry with equiv_cpu == 0 first.  The table must already be
+ * installed (BUG otherwise).
+ */
+static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu)
+{
+	struct equiv_cpu_entry *entry;
+
+	BUG_ON(!equiv_cpu_table);
+
+	for (entry = equiv_cpu_table; entry->equiv_cpu != 0; entry++) {
+		if (entry->equiv_cpu == equiv_cpu)
+			return entry->installed_cpu;
+	}
+
+	return 0;
+}
+
+/*
+ * A small, trivial cache of per-family ucode patches.
+ *
+ * Return the cached patch whose equiv_cpu equals @equiv_cpu, or NULL if
+ * pcache holds none.
+ */
+static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
+{
+	struct ucode_patch *cached;
+
+	list_for_each_entry(cached, &pcache, plist) {
+		if (cached->equiv_cpu != equiv_cpu)
+			continue;
+
+		return cached;
+	}
+
+	return NULL;
+}
+
+/*
+ * Insert @new_patch into pcache, taking ownership of it.
+ *
+ * - No cached patch for new_patch->equiv_cpu: @new_patch is appended.
+ * - Cached patch is older (lower patch_id): it is replaced by @new_patch
+ *   and the old entry and its data are freed.
+ * - Cached patch is the same or newer: @new_patch and its data are freed,
+ *   so the caller must not touch @new_patch after this call.
+ */
+static void update_cache(struct ucode_patch *new_patch)
+{
+	struct ucode_patch *p;
+
+	list_for_each_entry(p, &pcache, plist) {
+		if (p->equiv_cpu != new_patch->equiv_cpu)
+			continue;
+
+		if (p->patch_id >= new_patch->patch_id) {
+			/*
+			 * We already have the latest patch.  The new one was
+			 * never linked into the cache, so free it here rather
+			 * than leaking it.
+			 */
+			kfree(new_patch->data);
+			kfree(new_patch);
+			return;
+		}
+
+		list_replace(&p->plist, &new_patch->plist);
+		kfree(p->data);
+		kfree(p);
+		return;
+	}
+
+	/* no patch found, add it */
+	list_add_tail(&new_patch->plist, &pcache);
+}
+
+/* Unlink every patch held in pcache and free it together with its data. */
+static void free_cache(void)
+{
+	struct ucode_patch *cur, *next;
+
+	list_for_each_entry_safe(cur, next, &pcache, plist) {
+		__list_del(cur->plist.prev, cur->plist.next);
+		kfree(cur->data);
+		kfree(cur);
+	}
+}
+
+/*
+ * Return the cached patch for @cpu, or NULL when the CPU has no
+ * equivalence ID or no patch is cached for that ID.
+ */
+static struct ucode_patch *find_patch(unsigned int cpu)
+{
+	u16 id = find_equiv_id(cpu);
+
+	return id ? cache_find_patch(id) : NULL;
+}
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ csig->sig = cpuid_eax(0x00000001);
csig->rev = c->microcode;
pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
return 0;
}
-static unsigned int verify_ucode_size(int cpu, u32 patch_size,
+static unsigned int verify_patch_size(int cpu, u32 patch_size,
unsigned int size)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -118,95 +211,37 @@ static unsigned int verify_ucode_size(int cpu, u32 patch_size,
return patch_size;
}
-static u16 find_equiv_id(void)
+static int apply_microcode_amd(int cpu)
{
- unsigned int current_cpu_id, i = 0;
-
- BUG_ON(equiv_cpu_table == NULL);
-
- current_cpu_id = cpuid_eax(0x00000001);
-
- while (equiv_cpu_table[i].installed_cpu != 0) {
- if (current_cpu_id == equiv_cpu_table[i].installed_cpu)
- return equiv_cpu_table[i].equiv_cpu;
-
- i++;
- }
- return 0;
-}
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct microcode_amd *mc_amd;
+ struct ucode_cpu_info *uci;
+ struct ucode_patch *p;
+ u32 rev, dummy;
-/*
- * we signal a good patch is found by returning its size > 0
- */
-static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
- unsigned int leftover_size, int rev,
- unsigned int *current_size)
-{
- struct microcode_header_amd *mc_hdr;
- unsigned int actual_size, patch_size;
- u16 equiv_cpu_id;
+ BUG_ON(raw_smp_processor_id() != cpu);
- /* size of the current patch we're staring at */
- patch_size = *(u32 *)(ucode_ptr + 4);
- *current_size = patch_size + SECTION_HDR_SIZE;
+ uci = ucode_cpu_info + cpu;
- equiv_cpu_id = find_equiv_id();
- if (!equiv_cpu_id)
+ p = find_patch(cpu);
+ if (!p)
return 0;
- /*
- * let's look at the patch header itself now
- */
- mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE);
+ mc_amd = p->data;
+ uci->mc = p->data;
- if (mc_hdr->processor_rev_id != equiv_cpu_id)
- return 0;
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- /* ucode might be chipset specific -- currently we don't support this */
- if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
- pr_err("CPU%d: chipset specific code not yet supported\n",
- cpu);
+ /* need to apply patch? */
+ if (rev >= mc_amd->hdr.patch_id) {
+ c->microcode = rev;
return 0;
}
- if (mc_hdr->patch_id <= rev)
- return 0;
-
- /*
- * now that the header looks sane, verify its size
- */
- actual_size = verify_ucode_size(cpu, patch_size, leftover_size);
- if (!actual_size)
- return 0;
-
- /* clear the patch buffer */
- memset(patch, 0, PAGE_SIZE);
-
- /* all looks ok, get the binary patch */
- get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size);
-
- return actual_size;
-}
-
-static int apply_microcode_amd(int cpu)
-{
- u32 rev, dummy;
- int cpu_num = raw_smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- struct microcode_amd *mc_amd = uci->mc;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- /* We should bind the task to the CPU */
- BUG_ON(cpu_num != cpu);
-
- if (mc_amd == NULL)
- return 0;
-
wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
- /* get patch id after patching */
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- /* check current patch id and patch's id for match */
+ /* verify patch application was successful */
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
if (rev != mc_amd->hdr.patch_id) {
pr_err("CPU%d: update failed for patch_level=0x%08x\n",
cpu, mc_amd->hdr.patch_id);
@@ -238,7 +273,7 @@ static int install_equiv_cpu_table(const u8 *buf)
return -ENOMEM;
}
- get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
+ memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
/* add header length */
return size + CONTAINER_HDR_SZ;
@@ -250,61 +285,113 @@ static void free_equiv_cpu_table(void)
equiv_cpu_table = NULL;
}
-static enum ucode_state
-generic_load_microcode(int cpu, const u8 *data, size_t size)
+static void cleanup(void)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- struct microcode_header_amd *mc_hdr = NULL;
- unsigned int mc_size, leftover, current_size = 0;
+ free_equiv_cpu_table();
+ free_cache();
+}
+
+/*
+ * Validate the patch section at @fw and, if it is usable on the family
+ * of @cpu, copy it into the patch cache.
+ *
+ * @cpu:      CPU whose family the patch is checked against
+ * @fw:       start of a patch section (section header, then patch header)
+ * @leftover: container bytes remaining from @fw on, passed to
+ *            verify_patch_size()
+ *
+ * We return the current size (patch size plus section header) even if
+ * some of the checks failed so that the caller can skip over to the next
+ * patch. If we return a negative value (-ENOMEM), we signal a grave
+ * error: a memory allocation has failed and the driver cannot continue
+ * functioning normally. In such cases, the callers tear down everything
+ * set up so far and exit.
+ */
+static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	struct microcode_header_amd *mc_hdr;
+	struct ucode_patch *patch;
+	unsigned int patch_size, crnt_size, ret;
+	u32 proc_fam;
+	u16 proc_id;
+
+	patch_size = *(u32 *)(fw + 4);
+	crnt_size = patch_size + SECTION_HDR_SIZE;
+	mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
+	proc_id = mc_hdr->processor_rev_id;
+
+	proc_fam = find_cpu_family_by_equiv_cpu(proc_id);
+	if (!proc_fam) {
+		pr_err("No patch family for equiv ID: 0x%04x\n", proc_id);
+		return crnt_size;
+	}
+
+	/*
+	 * check if patch is for the current family: bits 11:8 plus
+	 * bits 27:20 of the installed_cpu signature give the family
+	 */
+	proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff);
+	if (proc_fam != c->x86)
+		return crnt_size;
+
+	if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
+		pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n",
+		       mc_hdr->patch_id);
+		return crnt_size;
+	}
+
+	ret = verify_patch_size(cpu, patch_size, leftover);
+	if (!ret) {
+		pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id);
+		return crnt_size;
+	}
+
+	/* Allocation failures are out-of-memory, not invalid input. */
+	patch = kzalloc(sizeof(*patch), GFP_KERNEL);
+	if (!patch) {
+		pr_err("Patch allocation failure.\n");
+		return -ENOMEM;
+	}
+
+	patch->data = kzalloc(patch_size, GFP_KERNEL);
+	if (!patch->data) {
+		pr_err("Patch data allocation failure.\n");
+		kfree(patch);
+		return -ENOMEM;
+	}
+
+	/* All looks ok, copy patch... */
+	memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size);
+	INIT_LIST_HEAD(&patch->plist);
+	patch->patch_id = mc_hdr->patch_id;
+	patch->equiv_cpu = proc_id;
+
+	/* ... and add to cache. */
+	update_cache(patch);
+
+	return crnt_size;
+}
+
+static enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size)
+{
+ enum ucode_state ret = UCODE_ERROR;
+ unsigned int leftover;
+ u8 *fw = (u8 *)data;
+ int crnt_size = 0;
int offset;
- const u8 *ucode_ptr = data;
- void *new_mc = NULL;
- unsigned int new_rev = uci->cpu_sig.rev;
- enum ucode_state state = UCODE_ERROR;
- offset = install_equiv_cpu_table(ucode_ptr);
+ offset = install_equiv_cpu_table(data);
if (offset < 0) {
pr_err("failed to create equivalent cpu table\n");
- goto out;
+ return ret;
}
- ucode_ptr += offset;
+ fw += offset;
leftover = size - offset;
- if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) {
+ if (*(u32 *)fw != UCODE_UCODE_TYPE) {
pr_err("invalid type field in container file section header\n");
- goto free_table;
+ free_equiv_cpu_table();
+ return ret;
}
while (leftover) {
- mc_size = get_matching_microcode(cpu, ucode_ptr, leftover,
- new_rev, &current_size);
- if (mc_size) {
- mc_hdr = patch;
- new_mc = patch;
- new_rev = mc_hdr->patch_id;
- goto out_ok;
- }
-
- ucode_ptr += current_size;
- leftover -= current_size;
- }
+ crnt_size = verify_and_add_patch(cpu, fw, leftover);
+ if (crnt_size < 0)
+ return ret;
- if (!new_mc) {
- state = UCODE_NFOUND;
- goto free_table;
+ fw += crnt_size;
+ leftover -= crnt_size;
}
-out_ok:
- uci->mc = new_mc;
- state = UCODE_OK;
- pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
- cpu, uci->cpu_sig.rev, new_rev);
-
-free_table:
- free_equiv_cpu_table();
-
-out:
- return state;
+ return UCODE_OK;
}
/*
@@ -315,7 +402,7 @@ out:
*
* This legacy file is always smaller than 2K in size.
*
- * Starting at family 15h they are in family specific firmware files:
+ * Beginning with family 15h, they are in family-specific firmware files:
*
* amd-ucode/microcode_amd_fam15h.bin
* amd-ucode/microcode_amd_fam16h.bin
@@ -323,12 +410,17 @@ out:
*
* These might be larger than 2K.
*/
-static enum ucode_state request_microcode_amd(int cpu, struct device *device)
+static enum ucode_state request_microcode_amd(int cpu, struct device *device,
+ bool refresh_fw)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
- const struct firmware *fw;
- enum ucode_state ret = UCODE_NFOUND;
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ enum ucode_state ret = UCODE_NFOUND;
+ const struct firmware *fw;
+
+ /* reload ucode container only on the boot cpu */
+ if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
+ return UCODE_OK;
if (c->x86 >= 0x15)
snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
@@ -344,12 +436,17 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device)
goto fw_release;
}
- ret = generic_load_microcode(cpu, fw->data, fw->size);
+ /* free old equiv table */
+ free_equiv_cpu_table();
+
+ ret = load_microcode_amd(cpu, fw->data, fw->size);
+ if (ret != UCODE_OK)
+ cleanup();
-fw_release:
+ fw_release:
release_firmware(fw);
-out:
+ out:
return ret;
}
@@ -383,14 +480,10 @@ struct microcode_ops * __init init_amd_microcode(void)
return NULL;
}
- patch = (void *)get_zeroed_page(GFP_KERNEL);
- if (!patch)
- return NULL;
-
return &microcode_amd_ops;
}
void __exit exit_amd_microcode(void)
{
- free_page((unsigned long)patch);
+ cleanup();
}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 9e5bcf1e237..3a04b224d0c 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -279,19 +279,18 @@ static struct platform_device *microcode_pdev;
static int reload_for_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ enum ucode_state ustate;
int err = 0;
- if (uci->valid) {
- enum ucode_state ustate;
-
- ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
- if (ustate == UCODE_OK)
- apply_microcode_on_target(cpu);
- else
- if (ustate == UCODE_ERROR)
- err = -EINVAL;
- }
+ if (!uci->valid)
+ return err;
+ ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, true);
+ if (ustate == UCODE_OK)
+ apply_microcode_on_target(cpu);
+ else
+ if (ustate == UCODE_ERROR)
+ err = -EINVAL;
return err;
}
@@ -373,18 +372,15 @@ static void microcode_fini_cpu(int cpu)
static enum ucode_state microcode_resume_cpu(int cpu)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- if (!uci->mc)
- return UCODE_NFOUND;
-
pr_debug("CPU%d updated upon resume\n", cpu);
- apply_microcode_on_target(cpu);
+
+ if (apply_microcode_on_target(cpu))
+ return UCODE_ERROR;
return UCODE_OK;
}
-static enum ucode_state microcode_init_cpu(int cpu)
+static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
{
enum ucode_state ustate;
@@ -395,7 +391,8 @@ static enum ucode_state microcode_init_cpu(int cpu)
if (system_state != SYSTEM_RUNNING)
return UCODE_NFOUND;
- ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
+ ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev,
+ refresh_fw);
if (ustate == UCODE_OK) {
pr_debug("CPU%d updated upon init\n", cpu);
@@ -408,14 +405,11 @@ static enum ucode_state microcode_init_cpu(int cpu)
static enum ucode_state microcode_update_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- enum ucode_state ustate;
if (uci->valid)
- ustate = microcode_resume_cpu(cpu);
- else
- ustate = microcode_init_cpu(cpu);
+ return microcode_resume_cpu(cpu);
- return ustate;
+ return microcode_init_cpu(cpu, false);
}
static int mc_device_add(struct device *dev, struct subsys_interface *sif)
@@ -431,7 +425,7 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif)
if (err)
return err;
- if (microcode_init_cpu(cpu) == UCODE_ERROR)
+ if (microcode_init_cpu(cpu, true) == UCODE_ERROR)
return -EINVAL;
return err;
@@ -480,34 +474,41 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
struct device *dev;
dev = get_cpu_device(cpu);
- switch (action) {
+
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
microcode_update_cpu(cpu);
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
pr_debug("CPU%d added\n", cpu);
+ /*
+ * "break" is missing on purpose here because we want to fall
+ * through in order to create the sysfs group.
+ */
+
+ case CPU_DOWN_FAILED:
if (sysfs_create_group(&dev->kobj, &mc_attr_group))
pr_err("Failed to create group for CPU%d\n", cpu);
break;
+
case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
/* Suspend is in progress, only remove the interface */
sysfs_remove_group(&dev->kobj, &mc_attr_group);
pr_debug("CPU%d removed\n", cpu);
break;
/*
+ * case CPU_DEAD:
+ *
* When a CPU goes offline, don't free up or invalidate the copy of
* the microcode in kernel memory, so that we can reuse it when the
* CPU comes back online without unnecessarily requesting the userspace
* for it again.
*/
- case CPU_UP_CANCELED_FROZEN:
- /* The CPU refused to come up during a system resume */
- microcode_fini_cpu(cpu);
- break;
}
+
+ /* The CPU refused to come up during a system resume */
+ if (action == CPU_UP_CANCELED_FROZEN)
+ microcode_fini_cpu(cpu);
+
return NOTIFY_OK;
}
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 0327e2b3c40..3544aed3933 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -405,7 +405,8 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
return 0;
}
-static enum ucode_state request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device,
+ bool refresh_fw)
{
char name[30];
struct cpuinfo_x86 *c = &cpu_data(cpu);
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index eb113693f04..a7c5661f849 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -257,12 +257,14 @@ static int __init msr_init(void)
goto out_chrdev;
}
msr_class->devnode = msr_devnode;
+ get_online_cpus();
for_each_online_cpu(i) {
err = msr_device_create(i);
if (err != 0)
goto out_class;
}
register_hotcpu_notifier(&msr_class_cpu_notifier);
+ put_online_cpus();
err = 0;
goto out;
@@ -271,6 +273,7 @@ out_class:
i = 0;
for_each_online_cpu(i)
msr_device_destroy(i);
+ put_online_cpus();
class_destroy(msr_class);
out_chrdev:
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
@@ -281,11 +284,13 @@ out:
static void __exit msr_exit(void)
{
int cpu = 0;
+ get_online_cpus();
for_each_online_cpu(cpu)
msr_device_destroy(cpu);
class_destroy(msr_class);
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
unregister_hotcpu_notifier(&msr_class_cpu_notifier);
+ put_online_cpus();
}
module_init(msr_init);
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
new file mode 100644
index 00000000000..e309cc5c276
--- /dev/null
+++ b/arch/x86/kernel/perf_regs.c
@@ -0,0 +1,105 @@
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_event.h>
+#include <linux/bug.h>
+#include <linux/stddef.h>
+#include <asm/perf_regs.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_X86_32
+#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX
+#else
+#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX
+#endif
+
+#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
+
+/*
+ * Byte offset into struct pt_regs for each PERF_REG_X86_* index; this is
+ * the value perf_reg_value() hands to regs_get_register().  On 64-bit the
+ * four entries that follow PERF_REG_X86_SS (the DS, ES, FS and GS slots)
+ * hold (unsigned int) -1 because pt_regs does not store those registers.
+ */
+static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = {
+ PT_REGS_OFFSET(PERF_REG_X86_AX, ax),
+ PT_REGS_OFFSET(PERF_REG_X86_BX, bx),
+ PT_REGS_OFFSET(PERF_REG_X86_CX, cx),
+ PT_REGS_OFFSET(PERF_REG_X86_DX, dx),
+ PT_REGS_OFFSET(PERF_REG_X86_SI, si),
+ PT_REGS_OFFSET(PERF_REG_X86_DI, di),
+ PT_REGS_OFFSET(PERF_REG_X86_BP, bp),
+ PT_REGS_OFFSET(PERF_REG_X86_SP, sp),
+ PT_REGS_OFFSET(PERF_REG_X86_IP, ip),
+ PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags),
+ PT_REGS_OFFSET(PERF_REG_X86_CS, cs),
+ PT_REGS_OFFSET(PERF_REG_X86_SS, ss),
+#ifdef CONFIG_X86_32
+ PT_REGS_OFFSET(PERF_REG_X86_DS, ds),
+ PT_REGS_OFFSET(PERF_REG_X86_ES, es),
+ PT_REGS_OFFSET(PERF_REG_X86_FS, fs),
+ PT_REGS_OFFSET(PERF_REG_X86_GS, gs),
+#else
+ /*
+ * The pt_regs struct does not store
+ * ds, es, fs, gs in 64 bit mode.
+ */
+ (unsigned int) -1,
+ (unsigned int) -1,
+ (unsigned int) -1,
+ (unsigned int) -1,
+#endif
+#ifdef CONFIG_X86_64
+ PT_REGS_OFFSET(PERF_REG_X86_R8, r8),
+ PT_REGS_OFFSET(PERF_REG_X86_R9, r9),
+ PT_REGS_OFFSET(PERF_REG_X86_R10, r10),
+ PT_REGS_OFFSET(PERF_REG_X86_R11, r11),
+ PT_REGS_OFFSET(PERF_REG_X86_R12, r12),
+ PT_REGS_OFFSET(PERF_REG_X86_R13, r13),
+ PT_REGS_OFFSET(PERF_REG_X86_R14, r14),
+ PT_REGS_OFFSET(PERF_REG_X86_R15, r15),
+#endif
+};
+
+/*
+ * Return the value of the register selected by perf index @idx in @regs.
+ * An index past the end of pt_regs_offset triggers a one-time warning
+ * and yields 0.
+ */
+u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+	unsigned int offset;
+
+	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset)))
+		return 0;
+
+	offset = pt_regs_offset[idx];
+
+	return regs_get_register(regs, offset);
+}
+
+#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL))
+
+#ifdef CONFIG_X86_32
+/*
+ * Check a requested register mask: it must be non-empty and must not
+ * contain any bit at or above PERF_REG_X86_MAX.  Returns 0 if the mask is
+ * acceptable, -EINVAL otherwise.
+ */
+int perf_reg_validate(u64 mask)
+{
+	return (mask && !(mask & REG_RESERVED)) ? 0 : -EINVAL;
+}
+
+/* 32-bit kernel: the register ABI is PERF_SAMPLE_REGS_ABI_32; @task is unused. */
+u64 perf_reg_abi(struct task_struct *task)
+{
+ return PERF_SAMPLE_REGS_ABI_32;
+}
+#else /* CONFIG_X86_64 */
+#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
+ (1ULL << PERF_REG_X86_ES) | \
+ (1ULL << PERF_REG_X86_FS) | \
+ (1ULL << PERF_REG_X86_GS))
+
+/*
+ * Check a requested register mask: it must be non-empty, must not contain
+ * any bit at or above PERF_REG_X86_MAX, and must not ask for DS, ES, FS or
+ * GS (REG_NOSUPPORT).  Returns 0 if the mask is acceptable, -EINVAL
+ * otherwise.
+ */
+int perf_reg_validate(u64 mask)
+{
+	const u64 rejected = REG_RESERVED | REG_NOSUPPORT;
+
+	if (!mask)
+		return -EINVAL;
+
+	return (mask & rejected) ? -EINVAL : 0;
+}
+
+/*
+ * Report the register ABI of @task: PERF_SAMPLE_REGS_ABI_32 when the task
+ * has TIF_IA32 set, PERF_SAMPLE_REGS_ABI_64 otherwise.
+ */
+u64 perf_reg_abi(struct task_struct *task)
+{
+	return test_tsk_thread_flag(task, TIF_IA32) ?
+		PERF_SAMPLE_REGS_ABI_32 : PERF_SAMPLE_REGS_ABI_64;
+}
+#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 0bc72e2069e..d5f15c3f7b2 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -150,7 +150,7 @@ static struct resource *find_oprom(struct pci_dev *pdev)
return oprom;
}
-void *pci_map_biosrom(struct pci_dev *pdev)
+void __iomem *pci_map_biosrom(struct pci_dev *pdev)
{
struct resource *oprom = find_oprom(pdev);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ef6a8456f71..dc3567e083f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -66,15 +66,13 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
int ret;
- unlazy_fpu(src);
-
*dst = *src;
if (fpu_allocated(&src->thread.fpu)) {
memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
ret = fpu_alloc(&dst->thread.fpu);
if (ret)
return ret;
- fpu_copy(&dst->thread.fpu, &src->thread.fpu);
+ fpu_copy(dst, src);
}
return 0;
}
@@ -97,16 +95,6 @@ void arch_task_cache_init(void)
SLAB_PANIC | SLAB_NOTRACK, NULL);
}
-static inline void drop_fpu(struct task_struct *tsk)
-{
- /*
- * Forget coprocessor state..
- */
- tsk->fpu_counter = 0;
- clear_fpu(tsk);
- clear_used_math();
-}
-
/*
* Free current thread data structures etc..
*/
@@ -163,7 +151,13 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- drop_fpu(tsk);
+ drop_init_fpu(tsk);
+ /*
+ * Free the FPU state for non xsave platforms. They get reallocated
+ * lazily at the first use.
+ */
+ if (!use_eager_fpu())
+ free_thread_xstate(tsk);
}
static void hard_disable_TSC(void)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 516fa186121..b9ff83c7135 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -190,10 +190,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
regs->cs = __USER_CS;
regs->ip = new_ip;
regs->sp = new_sp;
- /*
- * Free the old FP and other extended state
- */
- free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 0a980c9d7cb..8a6d20ce197 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -232,10 +232,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
regs->cs = _cs;
regs->ss = _ss;
regs->flags = X86_EFLAGS_IF;
- /*
- * Free the old FP and other extended state
- */
- free_thread_xstate(current);
}
void
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index c4c6a5c2bf0..b00b33a1839 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
#include <linux/signal.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
+#include <linux/rcupdate.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -1332,9 +1333,6 @@ static const struct user_regset_view user_x86_64_view = {
#define genregs32_get genregs_get
#define genregs32_set genregs_set
-#define user_i387_ia32_struct user_i387_struct
-#define user32_fxsr_struct user_fxsr_struct
-
#endif /* CONFIG_X86_64 */
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1463,6 +1461,8 @@ long syscall_trace_enter(struct pt_regs *regs)
{
long ret = 0;
+ rcu_user_exit();
+
/*
* If we stepped into a sysenter/syscall insn, it trapped in
* kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
@@ -1526,4 +1526,6 @@ void syscall_trace_leave(struct pt_regs *regs)
!test_thread_flag(TIF_SYSCALL_EMU);
if (step || test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall_exit(regs, step);
+
+ rcu_user_enter();
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f4b9b80e1b9..4f165479c45 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -961,9 +961,7 @@ void __init setup_arch(char **cmdline_p)
kvmclock_init();
#endif
- x86_init.paging.pagetable_setup_start(swapper_pg_dir);
- paging_init();
- x86_init.paging.pagetable_setup_done(swapper_pg_dir);
+ x86_init.paging.pagetable_init();
if (boot_cpu_data.cpuid_level >= 0) {
/* A CPU has %cr4 if and only if it has CPUID */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b280908a376..b33144c8b30 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -114,11 +114,12 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
regs->orig_ax = -1; /* disable syscall checks */
get_user_ex(buf, &sc->fpstate);
- err |= restore_i387_xstate(buf);
get_user_ex(*pax, &sc->ax);
} get_user_catch(err);
+ err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
+
return err;
}
@@ -206,35 +207,32 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
void __user **fpstate)
{
/* Default to using normal stack */
+ unsigned long math_size = 0;
unsigned long sp = regs->sp;
+ unsigned long buf_fx = 0;
int onsigstack = on_sig_stack(sp);
-#ifdef CONFIG_X86_64
/* redzone */
- sp -= 128;
-#endif /* CONFIG_X86_64 */
+ if (config_enabled(CONFIG_X86_64))
+ sp -= 128;
if (!onsigstack) {
/* This is the X/Open sanctioned signal stack switching. */
if (ka->sa.sa_flags & SA_ONSTACK) {
if (current->sas_ss_size)
sp = current->sas_ss_sp + current->sas_ss_size;
- } else {
-#ifdef CONFIG_X86_32
- /* This is the legacy signal stack switching. */
- if ((regs->ss & 0xffff) != __USER_DS &&
- !(ka->sa.sa_flags & SA_RESTORER) &&
- ka->sa.sa_restorer)
+ } else if (config_enabled(CONFIG_X86_32) &&
+ (regs->ss & 0xffff) != __USER_DS &&
+ !(ka->sa.sa_flags & SA_RESTORER) &&
+ ka->sa.sa_restorer) {
+ /* This is the legacy signal stack switching. */
sp = (unsigned long) ka->sa.sa_restorer;
-#endif /* CONFIG_X86_32 */
}
}
if (used_math()) {
- sp -= sig_xstate_size;
-#ifdef CONFIG_X86_64
- sp = round_down(sp, 64);
-#endif /* CONFIG_X86_64 */
+ sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32),
+ &buf_fx, &math_size);
*fpstate = (void __user *)sp;
}
@@ -247,8 +245,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
if (onsigstack && !likely(on_sig_stack(sp)))
return (void __user *)-1L;
- /* save i387 state */
- if (used_math() && save_i387_xstate(*fpstate) < 0)
+ /* save i387 and extended state */
+ if (used_math() &&
+ save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0)
return (void __user *)-1L;
return (void __user *)sp;
@@ -357,7 +356,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
put_user_ex(sig, &frame->sig);
put_user_ex(&frame->info, &frame->pinfo);
put_user_ex(&frame->uc, &frame->puc);
- err |= copy_siginfo_to_user(&frame->info, info);
/* Create the ucontext. */
if (cpu_has_xsave)
@@ -369,9 +367,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
put_user_ex(sas_ss_flags(regs->sp),
&frame->uc.uc_stack.ss_flags);
put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
- regs, set->sig[0]);
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
/* Set up to return from userspace. */
restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -388,6 +383,11 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
*/
put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
} put_user_catch(err);
+
+ err |= copy_siginfo_to_user(&frame->info, info);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
+ regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
if (err)
return -EFAULT;
@@ -436,8 +436,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
put_user_ex(sas_ss_flags(regs->sp),
&frame->uc.uc_stack.ss_flags);
put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
@@ -450,6 +448,9 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
}
} put_user_catch(err);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
if (err)
return -EFAULT;
@@ -474,6 +475,75 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
}
#endif /* CONFIG_X86_32 */
+static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
+ siginfo_t *info, compat_sigset_t *set,
+ struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_X32_ABI
+ struct rt_sigframe_x32 __user *frame;
+ void __user *restorer;
+ int err = 0;
+ void __user *fpstate = NULL;
+
+ frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return -EFAULT;
+
+ if (ka->sa.sa_flags & SA_SIGINFO) {
+ if (copy_siginfo_to_user32(&frame->info, info))
+ return -EFAULT;
+ }
+
+ put_user_try {
+ /* Create the ucontext. */
+ if (cpu_has_xsave)
+ put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
+ else
+ put_user_ex(0, &frame->uc.uc_flags);
+ put_user_ex(0, &frame->uc.uc_link);
+ put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+ put_user_ex(sas_ss_flags(regs->sp),
+ &frame->uc.uc_stack.ss_flags);
+ put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+ put_user_ex(0, &frame->uc.uc__pad0);
+
+ if (ka->sa.sa_flags & SA_RESTORER) {
+ restorer = ka->sa.sa_restorer;
+ } else {
+ /* could use a vstub here */
+ restorer = NULL;
+ err |= -EFAULT;
+ }
+ put_user_ex(restorer, &frame->pretcode);
+ } put_user_catch(err);
+
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
+ regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+ if (err)
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long) frame;
+ regs->ip = (unsigned long) ka->sa.sa_handler;
+
+ /* We use the x32 calling convention here... */
+ regs->di = sig;
+ regs->si = (unsigned long) &frame->info;
+ regs->dx = (unsigned long) &frame->uc;
+
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+
+ regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
+#endif /* CONFIG_X86_X32_ABI */
+
+ return 0;
+}
+
#ifdef CONFIG_X86_32
/*
* Atomically swap in the new signal mask, and wait for a signal.
@@ -612,55 +682,22 @@ static int signr_convert(int sig)
return sig;
}
-#ifdef CONFIG_X86_32
-
-#define is_ia32 1
-#define ia32_setup_frame __setup_frame
-#define ia32_setup_rt_frame __setup_rt_frame
-
-#else /* !CONFIG_X86_32 */
-
-#ifdef CONFIG_IA32_EMULATION
-#define is_ia32 test_thread_flag(TIF_IA32)
-#else /* !CONFIG_IA32_EMULATION */
-#define is_ia32 0
-#endif /* CONFIG_IA32_EMULATION */
-
-#ifdef CONFIG_X86_X32_ABI
-#define is_x32 test_thread_flag(TIF_X32)
-
-static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
- siginfo_t *info, compat_sigset_t *set,
- struct pt_regs *regs);
-#else /* !CONFIG_X86_X32_ABI */
-#define is_x32 0
-#endif /* CONFIG_X86_X32_ABI */
-
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs);
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
- sigset_t *set, struct pt_regs *regs);
-
-#endif /* CONFIG_X86_32 */
-
static int
setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
struct pt_regs *regs)
{
int usig = signr_convert(sig);
sigset_t *set = sigmask_to_save();
+ compat_sigset_t *cset = (compat_sigset_t *) set;
/* Set up the stack frame */
- if (is_ia32) {
+ if (is_ia32_frame()) {
if (ka->sa.sa_flags & SA_SIGINFO)
- return ia32_setup_rt_frame(usig, ka, info, set, regs);
+ return ia32_setup_rt_frame(usig, ka, info, cset, regs);
else
- return ia32_setup_frame(usig, ka, set, regs);
-#ifdef CONFIG_X86_X32_ABI
- } else if (is_x32) {
- return x32_setup_rt_frame(usig, ka, info,
- (compat_sigset_t *)set, regs);
-#endif
+ return ia32_setup_frame(usig, ka, cset, regs);
+ } else if (is_x32_frame()) {
+ return x32_setup_rt_frame(usig, ka, info, cset, regs);
} else {
return __setup_rt_frame(sig, ka, info, set, regs);
}
@@ -779,6 +816,8 @@ static void do_signal(struct pt_regs *regs)
void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
+ rcu_user_exit();
+
#ifdef CONFIG_X86_MCE
/* notify userspace of pending MCEs */
if (thread_info_flags & _TIF_MCE_NOTIFY)
@@ -804,6 +843,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
#ifdef CONFIG_X86_32
clear_thread_flag(TIF_IRET);
#endif /* CONFIG_X86_32 */
+
+ rcu_user_enter();
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
@@ -824,72 +865,6 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
}
#ifdef CONFIG_X86_X32_ABI
-static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
- siginfo_t *info, compat_sigset_t *set,
- struct pt_regs *regs)
-{
- struct rt_sigframe_x32 __user *frame;
- void __user *restorer;
- int err = 0;
- void __user *fpstate = NULL;
-
- frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
-
- if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- return -EFAULT;
-
- if (ka->sa.sa_flags & SA_SIGINFO) {
- if (copy_siginfo_to_user32(&frame->info, info))
- return -EFAULT;
- }
-
- put_user_try {
- /* Create the ucontext. */
- if (cpu_has_xsave)
- put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
- else
- put_user_ex(0, &frame->uc.uc_flags);
- put_user_ex(0, &frame->uc.uc_link);
- put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
- put_user_ex(sas_ss_flags(regs->sp),
- &frame->uc.uc_stack.ss_flags);
- put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
- put_user_ex(0, &frame->uc.uc__pad0);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
- regs, set->sig[0]);
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
-
- if (ka->sa.sa_flags & SA_RESTORER) {
- restorer = ka->sa.sa_restorer;
- } else {
- /* could use a vstub here */
- restorer = NULL;
- err |= -EFAULT;
- }
- put_user_ex(restorer, &frame->pretcode);
- } put_user_catch(err);
-
- if (err)
- return -EFAULT;
-
- /* Set up registers for signal handler */
- regs->sp = (unsigned long) frame;
- regs->ip = (unsigned long) ka->sa.sa_handler;
-
- /* We use the x32 calling convention here... */
- regs->di = sig;
- regs->si = (unsigned long) &frame->info;
- regs->dx = (unsigned long) &frame->uc;
-
- loadsegment(ds, __USER_DS);
- loadsegment(es, __USER_DS);
-
- regs->cs = __USER_CS;
- regs->ss = __USER_DS;
-
- return 0;
-}
-
asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs)
{
struct rt_sigframe_x32 __user *frame;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7c5a8c314c0..c80a33bc528 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -665,7 +665,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
unsigned long boot_error = 0;
int timeout;
- alternatives_smp_switch(1);
+ /* Just in case we booted with a single CPU. */
+ alternatives_enable_smp();
idle->thread.sp = (unsigned long) (((struct pt_regs *)
(THREAD_SIZE + task_stack_page(idle))) - 1);
@@ -1053,20 +1054,6 @@ out:
preempt_enable();
}
-void arch_disable_nonboot_cpus_begin(void)
-{
- /*
- * Avoid the smp alternatives switch during the disable_nonboot_cpus().
- * In the suspend path, we will be back in the SMP mode shortly anyways.
- */
- skip_smp_alternatives = true;
-}
-
-void arch_disable_nonboot_cpus_end(void)
-{
- skip_smp_alternatives = false;
-}
-
void arch_enable_nonboot_cpus_begin(void)
{
set_mtrr_aps_delayed_init();
@@ -1256,9 +1243,6 @@ void native_cpu_die(unsigned int cpu)
if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
if (system_state == SYSTEM_RUNNING)
pr_info("CPU %u is now offline\n", cpu);
-
- if (1 == num_online_cpus())
- alternatives_smp_switch(0);
return;
}
msleep(100);
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index c346d116148..cd3b2438a98 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -157,6 +157,33 @@ static int enable_single_step(struct task_struct *child)
return 1;
}
+void set_task_blockstep(struct task_struct *task, bool on)
+{
+ unsigned long debugctl;
+
+ /*
+ * Ensure irq/preemption can't change debugctl in between.
+ * Note also that both TIF_BLOCKSTEP and debugctl should
+ * be changed atomically wrt preemption.
+ * FIXME: this means that set/clear TIF_BLOCKSTEP is simply
+ * wrong if task != current, SIGKILL can wakeup the stopped
+ * tracee and set/clear can play with the running task, this
+ * can confuse the next __switch_to_xtra().
+ */
+ local_irq_disable();
+ debugctl = get_debugctlmsr();
+ if (on) {
+ debugctl |= DEBUGCTLMSR_BTF;
+ set_tsk_thread_flag(task, TIF_BLOCKSTEP);
+ } else {
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ clear_tsk_thread_flag(task, TIF_BLOCKSTEP);
+ }
+ if (task == current)
+ update_debugctlmsr(debugctl);
+ local_irq_enable();
+}
+
/*
* Enable single or block step.
*/
@@ -169,19 +196,10 @@ static void enable_step(struct task_struct *child, bool block)
* So no one should try to use debugger block stepping in a program
* that uses user-mode single stepping itself.
*/
- if (enable_single_step(child) && block) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl |= DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- set_tsk_thread_flag(child, TIF_BLOCKSTEP);
- } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl &= ~DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
- }
+ if (enable_single_step(child) && block)
+ set_task_blockstep(child, true);
+ else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
+ set_task_blockstep(child, false);
}
void user_enable_single_step(struct task_struct *child)
@@ -199,13 +217,8 @@ void user_disable_single_step(struct task_struct *child)
/*
* Make sure block stepping (BTF) is disabled.
*/
- if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl &= ~DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
- }
+ if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
+ set_task_blockstep(child, false);
/* Always clear TIF_SINGLESTEP... */
clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b481341c936..8276dc6794c 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -55,6 +55,7 @@
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mce.h>
+#include <asm/rcu.h>
#include <asm/mach_traps.h>
@@ -107,30 +108,45 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
dec_preempt_count();
}
-static void __kprobes
-do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
- long error_code, siginfo_t *info)
+static int __kprobes
+do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
+ struct pt_regs *regs, long error_code)
{
- struct task_struct *tsk = current;
-
#ifdef CONFIG_X86_32
if (regs->flags & X86_VM_MASK) {
/*
- * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
+ * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
* On nmi (interrupt 2), do_trap should not be called.
*/
- if (trapnr < X86_TRAP_UD)
- goto vm86_trap;
- goto trap_signal;
+ if (trapnr < X86_TRAP_UD) {
+ if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
+ error_code, trapnr))
+ return 0;
+ }
+ return -1;
}
#endif
+ if (!user_mode(regs)) {
+ if (!fixup_exception(regs)) {
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = trapnr;
+ die(str, regs, error_code);
+ }
+ return 0;
+ }
- if (!user_mode(regs))
- goto kernel_trap;
+ return -1;
+}
-#ifdef CONFIG_X86_32
-trap_signal:
-#endif
+static void __kprobes
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+ long error_code, siginfo_t *info)
+{
+ struct task_struct *tsk = current;
+
+
+ if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
+ return;
/*
* We want error_code and trap_nr set for userspace faults and
* kernelspace faults which result in die(), but not
@@ -158,33 +174,20 @@ trap_signal:
force_sig_info(signr, info, tsk);
else
force_sig(signr, tsk);
- return;
-
-kernel_trap:
- if (!fixup_exception(regs)) {
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = trapnr;
- die(str, regs, error_code);
- }
- return;
-
-#ifdef CONFIG_X86_32
-vm86_trap:
- if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
- error_code, trapnr))
- goto trap_signal;
- return;
-#endif
}
#define DO_ERROR(trapnr, signr, str, name) \
dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ exception_enter(regs); \
+ if (notify_die(DIE_TRAP, str, regs, error_code, \
+ trapnr, signr) == NOTIFY_STOP) { \
+ exception_exit(regs); \
return; \
+ } \
conditional_sti(regs); \
do_trap(trapnr, signr, str, regs, error_code, NULL); \
+ exception_exit(regs); \
}
#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
@@ -195,11 +198,15 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
info.si_errno = 0; \
info.si_code = sicode; \
info.si_addr = (void __user *)siaddr; \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ exception_enter(regs); \
+ if (notify_die(DIE_TRAP, str, regs, error_code, \
+ trapnr, signr) == NOTIFY_STOP) { \
+ exception_exit(regs); \
return; \
+ } \
conditional_sti(regs); \
do_trap(trapnr, signr, str, regs, error_code, &info); \
+ exception_exit(regs); \
}
DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV,
@@ -222,12 +229,14 @@ DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check,
/* Runs on IST stack */
dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
{
+ exception_enter(regs);
if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
- X86_TRAP_SS, SIGBUS) == NOTIFY_STOP)
- return;
- preempt_conditional_sti(regs);
- do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL);
- preempt_conditional_cli(regs);
+ X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) {
+ preempt_conditional_sti(regs);
+ do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL);
+ preempt_conditional_cli(regs);
+ }
+ exception_exit(regs);
}
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
@@ -235,6 +244,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
static const char str[] = "double fault";
struct task_struct *tsk = current;
+ exception_enter(regs);
/* Return not checked because double check cannot be ignored */
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
@@ -255,16 +265,29 @@ do_general_protection(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk;
+ exception_enter(regs);
conditional_sti(regs);
#ifdef CONFIG_X86_32
- if (regs->flags & X86_VM_MASK)
- goto gp_in_vm86;
+ if (regs->flags & X86_VM_MASK) {
+ local_irq_enable();
+ handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
+ goto exit;
+ }
#endif
tsk = current;
- if (!user_mode(regs))
- goto gp_in_kernel;
+ if (!user_mode(regs)) {
+ if (fixup_exception(regs))
+ goto exit;
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_GP;
+ if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
+ X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
+ die("general protection fault", regs, error_code);
+ goto exit;
+ }
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
@@ -279,25 +302,8 @@ do_general_protection(struct pt_regs *regs, long error_code)
}
force_sig(SIGSEGV, tsk);
- return;
-
-#ifdef CONFIG_X86_32
-gp_in_vm86:
- local_irq_enable();
- handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
- return;
-#endif
-
-gp_in_kernel:
- if (fixup_exception(regs))
- return;
-
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
- if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
- X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
- return;
- die("general protection fault", regs, error_code);
+exit:
+ exception_exit(regs);
}
/* May run on IST stack. */
@@ -312,15 +318,16 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
ftrace_int3_handler(regs))
return;
#endif
+ exception_enter(regs);
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
- return;
+ goto exit;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
- return;
+ goto exit;
/*
* Let others (NMI) know that the debug stack is in use
@@ -331,6 +338,8 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
preempt_conditional_cli(regs);
debug_stack_usage_dec();
+exit:
+ exception_exit(regs);
}
#ifdef CONFIG_X86_64
@@ -391,6 +400,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
unsigned long dr6;
int si_code;
+ exception_enter(regs);
+
get_debugreg(dr6, 6);
/* Filter out all the reserved bits which are preset to 1 */
@@ -406,7 +417,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
/* Catch kmemcheck conditions first of all! */
if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
- return;
+ goto exit;
/* DR6 may or may not be cleared by the CPU */
set_debugreg(0, 6);
@@ -421,7 +432,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
SIGTRAP) == NOTIFY_STOP)
- return;
+ goto exit;
/*
* Let others (NMI) know that the debug stack is in use
@@ -437,7 +448,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
X86_TRAP_DB);
preempt_conditional_cli(regs);
debug_stack_usage_dec();
- return;
+ goto exit;
}
/*
@@ -458,7 +469,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
preempt_conditional_cli(regs);
debug_stack_usage_dec();
- return;
+exit:
+ exception_exit(regs);
}
/*
@@ -555,14 +567,17 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
#ifdef CONFIG_X86_32
ignore_fpu_irq = 1;
#endif
-
+ exception_enter(regs);
math_error(regs, error_code, X86_TRAP_MF);
+ exception_exit(regs);
}
dotraplinkage void
do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
{
+ exception_enter(regs);
math_error(regs, error_code, X86_TRAP_XF);
+ exception_exit(regs);
}
dotraplinkage void
@@ -613,11 +628,12 @@ void math_state_restore(void)
}
__thread_fpu_begin(tsk);
+
/*
* Paranoid restore. send a SIGSEGV if we fail to restore the state.
*/
if (unlikely(restore_fpu_checking(tsk))) {
- __thread_fpu_end(tsk);
+ drop_init_fpu(tsk);
force_sig(SIGSEGV, tsk);
return;
}
@@ -629,6 +645,9 @@ EXPORT_SYMBOL_GPL(math_state_restore);
dotraplinkage void __kprobes
do_device_not_available(struct pt_regs *regs, long error_code)
{
+ exception_enter(regs);
+ BUG_ON(use_eager_fpu());
+
#ifdef CONFIG_MATH_EMULATION
if (read_cr0() & X86_CR0_EM) {
struct math_emu_info info = { };
@@ -637,6 +656,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
info.regs = regs;
math_emulate(&info);
+ exception_exit(regs);
return;
}
#endif
@@ -644,12 +664,15 @@ do_device_not_available(struct pt_regs *regs, long error_code)
#ifdef CONFIG_X86_32
conditional_sti(regs);
#endif
+ exception_exit(regs);
}
#ifdef CONFIG_X86_32
dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
{
siginfo_t info;
+
+ exception_enter(regs);
local_irq_enable();
info.si_signo = SIGILL;
@@ -657,10 +680,11 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
info.si_code = ILL_BADSTK;
info.si_addr = NULL;
if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
- X86_TRAP_IRET, SIGILL) == NOTIFY_STOP)
- return;
- do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
- &info);
+ X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
+ do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
+ &info);
+ }
+ exception_exit(regs);
}
#endif
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 36fd42091fa..9538f00827a 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -41,6 +41,9 @@
/* Adjust the return address of a call insn */
#define UPROBE_FIX_CALL 0x2
+/* Instruction will modify TF, don't change it */
+#define UPROBE_FIX_SETF 0x4
+
#define UPROBE_FIX_RIP_AX 0x8000
#define UPROBE_FIX_RIP_CX 0x4000
@@ -239,6 +242,10 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
insn_get_opcode(insn); /* should be a nop */
switch (OPCODE1(insn)) {
+ case 0x9d:
+ /* popf */
+ auprobe->fixups |= UPROBE_FIX_SETF;
+ break;
case 0xc3: /* ret/lret */
case 0xcb:
case 0xc2:
@@ -646,7 +653,7 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
* Skip these instructions as per the currently known x86 ISA.
* 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 }
*/
-bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
int i;
@@ -673,3 +680,46 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
}
return false;
}
+
+bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ bool ret = __skip_sstep(auprobe, regs);
+ if (ret && (regs->flags & X86_EFLAGS_TF))
+ send_sig(SIGTRAP, current, 0);
+ return ret;
+}
+
+void arch_uprobe_enable_step(struct arch_uprobe *auprobe)
+{
+ struct task_struct *task = current;
+ struct arch_uprobe_task *autask = &task->utask->autask;
+ struct pt_regs *regs = task_pt_regs(task);
+
+ autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF);
+
+ regs->flags |= X86_EFLAGS_TF;
+ if (test_tsk_thread_flag(task, TIF_BLOCKSTEP))
+ set_task_blockstep(task, false);
+}
+
+void arch_uprobe_disable_step(struct arch_uprobe *auprobe)
+{
+ struct task_struct *task = current;
+ struct arch_uprobe_task *autask = &task->utask->autask;
+ bool trapped = (task->utask->state == UTASK_SSTEP_TRAPPED);
+ struct pt_regs *regs = task_pt_regs(task);
+ /*
+ * The state of TIF_BLOCKSTEP was not saved so we can get an extra
+ * SIGTRAP if we do not clear TF. We need to examine the opcode to
+ * make it right.
+ */
+ if (unlikely(trapped)) {
+ if (!autask->saved_tf)
+ regs->flags &= ~X86_EFLAGS_TF;
+ } else {
+ if (autask->saved_tf)
+ send_sig(SIGTRAP, task, 0);
+ else if (!(auprobe->fixups & UPROBE_FIX_SETF))
+ regs->flags &= ~X86_EFLAGS_TF;
+ }
+}
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 6020f6f5927..1330dd10295 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -13,9 +13,13 @@
#include <asm/ftrace.h>
#ifdef CONFIG_FUNCTION_TRACER
-/* mcount is defined in assembly */
+/* mcount and __fentry__ are defined in assembly */
+#ifdef CC_USING_FENTRY
+EXPORT_SYMBOL(__fentry__);
+#else
EXPORT_SYMBOL(mcount);
#endif
+#endif
EXPORT_SYMBOL(__get_user_1);
EXPORT_SYMBOL(__get_user_2);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 9f3167e891e..7a3d075a814 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -26,7 +26,6 @@
void __cpuinit x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
-void __init x86_init_pgd_noop(pgd_t *unused) { }
int __init iommu_init_noop(void) { return 0; }
void iommu_shutdown_noop(void) { }
@@ -68,8 +67,7 @@ struct x86_init_ops x86_init __initdata = {
},
.paging = {
- .pagetable_setup_start = native_pagetable_setup_start,
- .pagetable_setup_done = native_pagetable_setup_done,
+ .pagetable_init = native_pagetable_init,
},
.timers = {
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 3d3e2070911..ada87a329ed 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -10,9 +10,7 @@
#include <linux/compat.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
-#ifdef CONFIG_IA32_EMULATION
-#include <asm/sigcontext32.h>
-#endif
+#include <asm/sigframe.h>
#include <asm/xcr.h>
/*
@@ -23,13 +21,9 @@ u64 pcntxt_mask;
/*
* Represents init state for the supported extended state.
*/
-static struct xsave_struct *init_xstate_buf;
-
-struct _fpx_sw_bytes fx_sw_reserved;
-#ifdef CONFIG_IA32_EMULATION
-struct _fpx_sw_bytes fx_sw_reserved_ia32;
-#endif
+struct xsave_struct *init_xstate_buf;
+static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
/*
@@ -44,9 +38,9 @@ static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
*/
void __sanitize_i387_state(struct task_struct *tsk)
{
- u64 xstate_bv;
- int feature_bit = 0x2;
struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
+ int feature_bit = 0x2;
+ u64 xstate_bv;
if (!fx)
return;
@@ -104,213 +98,326 @@ void __sanitize_i387_state(struct task_struct *tsk)
* Check for the presence of extended state information in the
* user fpstate pointer in the sigcontext.
*/
-int check_for_xstate(struct i387_fxsave_struct __user *buf,
- void __user *fpstate,
- struct _fpx_sw_bytes *fx_sw_user)
+static inline int check_for_xstate(struct i387_fxsave_struct __user *buf,
+ void __user *fpstate,
+ struct _fpx_sw_bytes *fx_sw)
{
int min_xstate_size = sizeof(struct i387_fxsave_struct) +
sizeof(struct xsave_hdr_struct);
unsigned int magic2;
- int err;
- err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
- sizeof(struct _fpx_sw_bytes));
- if (err)
- return -EFAULT;
+ if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw)))
+ return -1;
- /*
- * First Magic check failed.
- */
- if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
- return -EINVAL;
+ /* Check for the first magic field and other error scenarios. */
+ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
+ fx_sw->xstate_size < min_xstate_size ||
+ fx_sw->xstate_size > xstate_size ||
+ fx_sw->xstate_size > fx_sw->extended_size)
+ return -1;
/*
- * Check for error scenarios.
- */
- if (fx_sw_user->xstate_size < min_xstate_size ||
- fx_sw_user->xstate_size > xstate_size ||
- fx_sw_user->xstate_size > fx_sw_user->extended_size)
- return -EINVAL;
-
- err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
- fx_sw_user->extended_size -
- FP_XSTATE_MAGIC2_SIZE));
- if (err)
- return err;
- /*
* Check for the presence of second magic word at the end of memory
* layout. This detects the case where the user just copied the legacy
* fpstate layout with out copying the extended state information
* in the memory layout.
*/
- if (magic2 != FP_XSTATE_MAGIC2)
- return -EFAULT;
+ if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))
+ || magic2 != FP_XSTATE_MAGIC2)
+ return -1;
return 0;
}
-#ifdef CONFIG_X86_64
/*
* Signal frame handlers.
*/
-
-int save_i387_xstate(void __user *buf)
+static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
{
- struct task_struct *tsk = current;
- int err = 0;
-
- if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size))
- return -EACCES;
+ if (use_fxsr()) {
+ struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
+ struct user_i387_ia32_struct env;
+ struct _fpstate_ia32 __user *fp = buf;
- BUG_ON(sig_xstate_size < xstate_size);
+ convert_from_fxsr(&env, tsk);
- if ((unsigned long)buf % 64)
- pr_err("%s: bad fpstate %p\n", __func__, buf);
-
- if (!used_math())
- return 0;
-
- if (user_has_fpu()) {
- if (use_xsave())
- err = xsave_user(buf);
- else
- err = fxsave_user(buf);
-
- if (err)
- return err;
- user_fpu_end();
+ if (__copy_to_user(buf, &env, sizeof(env)) ||
+ __put_user(xsave->i387.swd, &fp->status) ||
+ __put_user(X86_FXSR_MAGIC, &fp->magic))
+ return -1;
} else {
- sanitize_i387_state(tsk);
- if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
- xstate_size))
+ struct i387_fsave_struct __user *fp = buf;
+ u32 swd;
+ if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
return -1;
}
- clear_used_math(); /* trigger finit */
+ return 0;
+}
- if (use_xsave()) {
- struct _fpstate __user *fx = buf;
- struct _xstate __user *x = buf;
- u64 xstate_bv;
+static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
+{
+ struct xsave_struct __user *x = buf;
+ struct _fpx_sw_bytes *sw_bytes;
+ u32 xstate_bv;
+ int err;
- err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
- sizeof(struct _fpx_sw_bytes));
+ /* Setup the bytes not touched by the [f]xsave and reserved for SW. */
+ sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
+ err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
- err |= __put_user(FP_XSTATE_MAGIC2,
- (__u32 __user *) (buf + sig_xstate_size
- - FP_XSTATE_MAGIC2_SIZE));
+ if (!use_xsave())
+ return err;
- /*
- * Read the xstate_bv which we copied (directly from the cpu or
- * from the state in task struct) to the user buffers and
- * set the FP/SSE bits.
- */
- err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
+ err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size));
- /*
- * For legacy compatible, we always set FP/SSE bits in the bit
- * vector while saving the state to the user context. This will
- * enable us capturing any changes(during sigreturn) to
- * the FP/SSE bits by the legacy applications which don't touch
- * xstate_bv in the xsave header.
- *
- * xsave aware apps can change the xstate_bv in the xsave
- * header as well as change any contents in the memory layout.
- * xrestore as part of sigreturn will capture all the changes.
- */
- xstate_bv |= XSTATE_FPSSE;
+ /*
+ * Read the xstate_bv which we copied (directly from the cpu or
+ * from the state in task struct) to the user buffers.
+ */
+ err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv);
- err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv);
+ /*
+ * For legacy compatibility, we always set FP/SSE bits in the bit
+ * vector while saving the state to the user context. This will
+ * enable us to capture any changes (during sigreturn) to
+ * the FP/SSE bits by the legacy applications which don't touch
+ * xstate_bv in the xsave header.
+ *
+ * xsave aware apps can change the xstate_bv in the xsave
+ * header as well as change any contents in the memory layout.
+ * xrestore as part of sigreturn will capture all the changes.
+ */
+ xstate_bv |= XSTATE_FPSSE;
- if (err)
- return err;
- }
+ err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv);
- return 1;
+ return err;
+}
+
+static inline int save_user_xstate(struct xsave_struct __user *buf)
+{
+ int err;
+
+ if (use_xsave())
+ err = xsave_user(buf);
+ else if (use_fxsr())
+ err = fxsave_user((struct i387_fxsave_struct __user *) buf);
+ else
+ err = fsave_user((struct i387_fsave_struct __user *) buf);
+
+ if (unlikely(err) && __clear_user(buf, xstate_size))
+ err = -EFAULT;
+ return err;
}
/*
- * Restore the extended state if present. Otherwise, restore the FP/SSE
- * state.
+ * Save the fpu, extended register state to the user signal frame.
+ *
+ * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
+ * state is copied.
+ * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
+ *
+ * buf == buf_fx for 64-bit frames and 32-bit fsave frame.
+ * buf != buf_fx for 32-bit frames with fxstate.
+ *
+ * If the fpu, extended register state is live, save the state directly
+ * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise,
+ * copy the thread's fpu state to the user frame starting at 'buf_fx'.
+ *
+ * If this is a 32-bit frame with fxstate, put a fsave header before
+ * the aligned state at 'buf_fx'.
+ *
+ * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
+ * indicating the absence/presence of the extended state to the user.
*/
-static int restore_user_xstate(void __user *buf)
+int save_xstate_sig(void __user *buf, void __user *buf_fx, int size)
{
- struct _fpx_sw_bytes fx_sw_user;
- u64 mask;
- int err;
+ struct xsave_struct *xsave = &current->thread.fpu.state->xsave;
+ struct task_struct *tsk = current;
+ int ia32_fxstate = (buf != buf_fx);
- if (((unsigned long)buf % 64) ||
- check_for_xstate(buf, buf, &fx_sw_user))
- goto fx_only;
+ ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
+ config_enabled(CONFIG_IA32_EMULATION));
- mask = fx_sw_user.xstate_bv;
+ if (!access_ok(VERIFY_WRITE, buf, size))
+ return -EACCES;
- /*
- * restore the state passed by the user.
- */
- err = xrestore_user(buf, mask);
- if (err)
- return err;
+ if (!HAVE_HWFP)
+ return fpregs_soft_get(current, NULL, 0,
+ sizeof(struct user_i387_ia32_struct), NULL,
+ (struct _fpstate_ia32 __user *) buf) ? -1 : 1;
- /*
- * init the state skipped by the user.
- */
- mask = pcntxt_mask & ~mask;
- if (unlikely(mask))
- xrstor_state(init_xstate_buf, mask);
+ if (user_has_fpu()) {
+ /* Save the live register state to the user directly. */
+ if (save_user_xstate(buf_fx))
+ return -1;
+ /* Update the thread's fxstate to save the fsave header. */
+ if (ia32_fxstate)
+ fpu_fxsave(&tsk->thread.fpu);
+ } else {
+ sanitize_i387_state(tsk);
+ if (__copy_to_user(buf_fx, xsave, xstate_size))
+ return -1;
+ }
+
+ /* Save the fsave header for the 32-bit frames. */
+ if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
+ return -1;
+
+ if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
+ return -1;
+
+ drop_init_fpu(tsk); /* trigger finit */
return 0;
+}
-fx_only:
- /*
- * couldn't find the extended state information in the
- * memory layout. Restore just the FP/SSE and init all
- * the other extended state.
- */
- xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE);
- return fxrstor_checking((__force struct i387_fxsave_struct *)buf);
+static inline void
+sanitize_restored_xstate(struct task_struct *tsk,
+ struct user_i387_ia32_struct *ia32_env,
+ u64 xstate_bv, int fx_only)
+{
+ struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
+ struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr;
+
+ if (use_xsave()) {
+ /* These bits must be zero. */
+ xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+
+ /*
+ * Init the state that is not present in the memory
+ * layout and not enabled by the OS.
+ */
+ if (fx_only)
+ xsave_hdr->xstate_bv = XSTATE_FPSSE;
+ else
+ xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv);
+ }
+
+ if (use_fxsr()) {
+ /*
+ * mxcsr reserved bits must be masked to zero for security
+ * reasons.
+ */
+ xsave->i387.mxcsr &= mxcsr_feature_mask;
+
+ convert_to_fxsr(tsk, ia32_env);
+ }
}
/*
- * This restores directly out of user space. Exceptions are handled.
+ * Restore the extended state if present. Otherwise, restore the FP/SSE state.
*/
-int restore_i387_xstate(void __user *buf)
+static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only)
{
+ if (use_xsave()) {
+ if ((unsigned long)buf % 64 || fx_only) {
+ u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE;
+ xrstor_state(init_xstate_buf, init_bv);
+ return fxrstor_user(buf);
+ } else {
+ u64 init_bv = pcntxt_mask & ~xbv;
+ if (unlikely(init_bv))
+ xrstor_state(init_xstate_buf, init_bv);
+ return xrestore_user(buf, xbv);
+ }
+ } else if (use_fxsr()) {
+ return fxrstor_user(buf);
+ } else
+ return frstor_user(buf);
+}
+
+int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
+{
+ int ia32_fxstate = (buf != buf_fx);
struct task_struct *tsk = current;
- int err = 0;
+ int state_size = xstate_size;
+ u64 xstate_bv = 0;
+ int fx_only = 0;
+
+ ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
+ config_enabled(CONFIG_IA32_EMULATION));
if (!buf) {
- if (used_math())
- goto clear;
+ drop_init_fpu(tsk);
return 0;
- } else
- if (!access_ok(VERIFY_READ, buf, sig_xstate_size))
- return -EACCES;
+ }
- if (!used_math()) {
- err = init_fpu(tsk);
- if (err)
- return err;
+ if (!access_ok(VERIFY_READ, buf, size))
+ return -EACCES;
+
+ if (!used_math() && init_fpu(tsk))
+ return -1;
+
+ if (!HAVE_HWFP) {
+ return fpregs_soft_set(current, NULL,
+ 0, sizeof(struct user_i387_ia32_struct),
+ NULL, buf) != 0;
}
- user_fpu_begin();
- if (use_xsave())
- err = restore_user_xstate(buf);
- else
- err = fxrstor_checking((__force struct i387_fxsave_struct *)
- buf);
- if (unlikely(err)) {
+ if (use_xsave()) {
+ struct _fpx_sw_bytes fx_sw_user;
+ if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) {
+ /*
+ * Couldn't find the extended state information in the
+ * memory layout. Restore just the FP/SSE and init all
+ * the other extended state.
+ */
+ state_size = sizeof(struct i387_fxsave_struct);
+ fx_only = 1;
+ } else {
+ state_size = fx_sw_user.xstate_size;
+ xstate_bv = fx_sw_user.xstate_bv;
+ }
+ }
+
+ if (ia32_fxstate) {
+ /*
+ * For 32-bit frames with fxstate, copy the user state to the
+ * thread's fpu state, reconstruct fxstate from the fsave
+ * header. Sanitize the copied state etc.
+ */
+ struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
+ struct user_i387_ia32_struct env;
+ int err = 0;
+
+ /*
+ * Drop the current fpu, which clears used_math(). This ensures
+ * that a context switch during the copy of the new state does
+ * not save or restore the intermediate state, which would
+ * corrupt the newly restored state.
+ * We will be ready to restore/save the state only after
+ * set_used_math() is called again.
+ */
+ drop_fpu(tsk);
+
+ if (__copy_from_user(xsave, buf_fx, state_size) ||
+ __copy_from_user(&env, buf, sizeof(env))) {
+ err = -1;
+ } else {
+ sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only);
+ set_used_math();
+ }
+
+ if (use_eager_fpu())
+ math_state_restore();
+
+ return err;
+ } else {
/*
- * Encountered an error while doing the restore from the
- * user buffer, clear the fpu state.
+ * For 64-bit frames and 32-bit fsave frames, restore the user
+ * state to the registers directly (with exceptions handled).
*/
-clear:
- clear_fpu(tsk);
- clear_used_math();
+ user_fpu_begin();
+ if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) {
+ drop_init_fpu(tsk);
+ return -1;
+ }
}
- return err;
+
+ return 0;
}
-#endif
/*
* Prepare the SW reserved portion of the fxsave memory layout, indicating
@@ -321,31 +428,22 @@ clear:
*/
static void prepare_fx_sw_frame(void)
{
- int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) +
- FP_XSTATE_MAGIC2_SIZE;
+ int fsave_header_size = sizeof(struct i387_fsave_struct);
+ int size = xstate_size + FP_XSTATE_MAGIC2_SIZE;
- sig_xstate_size = sizeof(struct _fpstate) + size_extended;
-
-#ifdef CONFIG_IA32_EMULATION
- sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended;
-#endif
-
- memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved));
+ if (config_enabled(CONFIG_X86_32))
+ size += fsave_header_size;
fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
- fx_sw_reserved.extended_size = sig_xstate_size;
+ fx_sw_reserved.extended_size = size;
fx_sw_reserved.xstate_bv = pcntxt_mask;
fx_sw_reserved.xstate_size = xstate_size;
-#ifdef CONFIG_IA32_EMULATION
- memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved,
- sizeof(struct _fpx_sw_bytes));
- fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size;
-#endif
-}
-#ifdef CONFIG_X86_64
-unsigned int sig_xstate_size = sizeof(struct _fpstate);
-#endif
+ if (config_enabled(CONFIG_IA32_EMULATION)) {
+ fx_sw_reserved_ia32 = fx_sw_reserved;
+ fx_sw_reserved_ia32.extended_size += fsave_header_size;
+ }
+}
/*
* Enable the extended processor state save/restore feature
@@ -384,19 +482,21 @@ static void __init setup_xstate_features(void)
/*
* setup the xstate image representing the init state
*/
-static void __init setup_xstate_init(void)
+static void __init setup_init_fpu_buf(void)
{
- setup_xstate_features();
-
/*
* Setup init_xstate_buf to represent the init state of
* all the features managed by the xsave
*/
init_xstate_buf = alloc_bootmem_align(xstate_size,
__alignof__(struct xsave_struct));
- init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
+ fx_finit(&init_xstate_buf->i387);
+
+ if (!cpu_has_xsave)
+ return;
+
+ setup_xstate_features();
- clts();
/*
* Init all the features state with header_bv being 0x0
*/
@@ -406,9 +506,21 @@ static void __init setup_xstate_init(void)
* of any feature which is not represented by all zero's.
*/
xsave_state(init_xstate_buf, -1);
- stts();
}
+static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
+static int __init eager_fpu_setup(char *s)
+{
+ if (!strcmp(s, "on"))
+ eagerfpu = ENABLE;
+ else if (!strcmp(s, "off"))
+ eagerfpu = DISABLE;
+ else if (!strcmp(s, "auto"))
+ eagerfpu = AUTO;
+ return 1;
+}
+__setup("eagerfpu=", eager_fpu_setup);
+
/*
* Enable and initialize the xsave feature.
*/
@@ -445,8 +557,11 @@ static void __init xstate_enable_boot_cpu(void)
update_regset_xstate_info(xstate_size, pcntxt_mask);
prepare_fx_sw_frame();
+ setup_init_fpu_buf();
- setup_xstate_init();
+ /* Auto enable eagerfpu for xsaveopt */
+ if (cpu_has_xsaveopt && eagerfpu != DISABLE)
+ eagerfpu = ENABLE;
pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
pcntxt_mask, xstate_size);
@@ -471,3 +586,43 @@ void __cpuinit xsave_init(void)
next_func = xstate_enable;
this_func();
}
+
+static inline void __init eager_fpu_init_bp(void)
+{
+ current->thread.fpu.state =
+ alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
+ if (!init_xstate_buf)
+ setup_init_fpu_buf();
+}
+
+void __cpuinit eager_fpu_init(void)
+{
+ static __refdata void (*boot_func)(void) = eager_fpu_init_bp;
+
+ clear_used_math();
+ current_thread_info()->status = 0;
+
+ if (eagerfpu == ENABLE)
+ setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
+
+ if (!cpu_has_eager_fpu) {
+ stts();
+ return;
+ }
+
+ if (boot_func) {
+ boot_func();
+ boot_func = NULL;
+ }
+
+ /*
+ * This is the same as math_state_restore(), but use_xsave() is
+ * not yet patched at this point, so math_state_restore() cannot be used.
+ */
+ init_fpu(current);
+ __thread_fpu_begin(current);
+ if (cpu_has_xsave)
+ xrstor_state(init_xstate_buf, -1);
+ else
+ fxrstor_checking(&init_xstate_buf->i387);
+}