diff options
Diffstat (limited to 'arch/x86')
55 files changed, 920 insertions, 612 deletions
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index c026cca5602..f3aaf231b4e 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -341,10 +341,6 @@ config X86_USE_3DNOW def_bool y depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML -config X86_OOSTORE - def_bool y - depends on (MWINCHIP3D || MWINCHIPC6) && MTRR - # # P6_NOPs are a relatively minor optimization that require a family >= # 6 processor, except that it is broken on certain VIA chips. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index eeda43abed6..a414b1471ff 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -82,8 +82,8 @@ else KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 - # Don't autogenerate MMX or SSE instructions - KBUILD_CFLAGS += -mno-mmx -mno-sse + # Don't autogenerate traditional x87, MMX or SSE instructions + KBUILD_CFLAGS += -mno-mmx -mno-sse -mno-80387 -mno-fp-ret-in-387 # Use -mpreferred-stack-boundary=3 if supported. KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 100a9a10076..f0d0b20fe14 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -67,6 +67,13 @@ static int is_transmeta(void) cpu_vendor[2] == A32('M', 'x', '8', '6'); } +static int is_intel(void) +{ + return cpu_vendor[0] == A32('G', 'e', 'n', 'u') && + cpu_vendor[1] == A32('i', 'n', 'e', 'I') && + cpu_vendor[2] == A32('n', 't', 'e', 'l'); +} + /* Returns a bitmask of which words we have error bits in */ static int check_cpuflags(void) { @@ -153,6 +160,19 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx)); err = check_cpuflags(); + } else if (err == 0x01 && + !(err_flags[0] & ~(1 << X86_FEATURE_PAE)) && + is_intel() && cpu.level == 6 && + (cpu.model == 9 || cpu.model == 13)) { + /* PAE is disabled on this Pentium M but can be forced */ + if (cmdline_find_option_bool("forcepae")) { + puts("WARNING: Forcing PAE in CPU flags\n"); + set_bit(X86_FEATURE_PAE, cpu.flags); + err = check_cpuflags(); + } + else { + puts("WARNING: PAE disabled. Use parameter 'forcepae' to enable at your own risk!\n"); + } } if (err_flags_ptr) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 256388260c8..0ca9a5c362b 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -350,7 +350,7 @@ cmd_line_ptr: .long 0 # (Header version 0x0202 or later) # can be located anywhere in # low memory 0x10000 or higher. -ramdisk_max: .long 0x7fffffff +initrd_addr_max: .long 0x7fffffff # (Header version 0x0203 or later) # The highest safe address for # the contents of an initrd diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 7f669853317..a8fee078b92 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -5,3 +5,4 @@ genhdr-y += unistd_64.h genhdr-y += unistd_x32.h generic-y += clkdev.h +generic-y += mcs_spinlock.h diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 1d2091a226b..19b0ebafcd3 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -93,9 +93,6 @@ static inline int is_vsmp_box(void) return 0; } #endif -extern void xapic_wait_icr_idle(void); -extern u32 safe_xapic_wait_icr_idle(void); -extern void xapic_icr_write(u32, u32); extern int setup_profiling_timer(unsigned int); static inline void native_apic_mem_write(u32 reg, u32 v) @@ -184,7 +181,6 @@ extern int x2apic_phys; extern int x2apic_preenabled; extern void check_x2apic(void); extern void enable_x2apic(void); -extern void x2apic_icr_write(u32 low, u32 id); static inline int x2apic_enabled(void) { u64 msr; @@ -221,7 +217,6 @@ static inline void x2apic_force_phys(void) { } -#define nox2apic 0 #define x2apic_preenabled 0 #define x2apic_supported() 0 #endif @@ -351,7 +346,7 @@ struct apic { int trampoline_phys_low; int trampoline_phys_high; - void (*wait_for_init_deassert)(atomic_t *deassert); + bool wait_for_init_deassert; void (*smp_callin_clear_local_apic)(void); void (*inquire_remote_apic)(int apicid); @@ -517,13 +512,6 @@ extern int default_cpu_present_to_apicid(int mps_cpu); extern int default_check_phys_apicid_present(int phys_apicid); #endif -static inline void default_wait_for_init_deassert(atomic_t *deassert) -{ - while (!atomic_read(deassert)) - cpu_relax(); - return; -} - extern void generic_bigsmp_probe(void); diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 04a48903b2e..69bbb484502 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -85,11 +85,7 @@ #else # define smp_rmb() barrier() #endif -#ifdef CONFIG_X86_OOSTORE -# define smp_wmb() wmb() -#else -# define smp_wmb() barrier() -#endif +#define smp_wmb() barrier() #define smp_read_barrier_depends() read_barrier_depends() #define set_mb(var, value) do { (void)xchg(&var, value); } while (0) #else /* !SMP */ @@ -100,7 +96,7 @@ #define set_mb(var, value) do { var = value; barrier(); } while (0) #endif /* SMP */ -#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) +#if defined(CONFIG_X86_PPRO_FENCE) /* * For either of these options x86 doesn't have a strong TSO memory diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e099f9502ac..bc507d7640f 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -217,9 +217,14 @@ #define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ #define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ #define X86_FEATURE_MPX (9*32+14) /* Memory Protection Extension */ +#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */ #define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_CLFLUSHOPT (9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_AVX512PF (9*32+26) /* AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER (9*32+27) /* AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD (9*32+28) /* AVX-512 Conflict Detection */ /* * BUG word(s) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 34f69cb9350..91d9c69a629 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -237,7 +237,7 @@ memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) static inline void flush_write_buffers(void) { -#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) +#if defined(CONFIG_X86_PPRO_FENCE) asm volatile("lock; addl $0,0(%%esp)": : :"memory"); #endif } diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index e139b13f2a3..de36f22eb0b 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -214,6 +214,8 @@ do { \ struct msr *msrs_alloc(void); void msrs_free(struct msr *msrs); +int msr_set_bit(u32 msr, u8 bit); +int msr_clear_bit(u32 msr, u8 bit); #ifdef CONFIG_SMP int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 86f9301903c..5f2fc4441b1 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_NMI_H #define _ASM_X86_NMI_H +#include <linux/irq_work.h> #include <linux/pm.h> #include <asm/irq.h> #include <asm/io.h> @@ -38,6 +39,8 @@ typedef int (*nmi_handler_t)(unsigned int, struct pt_regs *); struct nmiaction { struct list_head list; nmi_handler_t handler; + u64 max_duration; + struct irq_work irq_work; unsigned long flags; const char *name; }; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 938ef1d0458..b459ddf27d6 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -446,20 +446,10 @@ static inline int pte_same(pte_t a, pte_t b) return a.pte == b.pte; } -static inline int pteval_present(pteval_t pteval) -{ - /* - * Yes Linus, _PAGE_PROTNONE == _PAGE_NUMA. Expressing it this - * way clearly states that the intent is that protnone and numa - * hinting ptes are considered present for the purposes of - * pagetable operations like zapping, protection changes, gup etc. - */ - return pteval & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_NUMA); -} - static inline int pte_present(pte_t a) { - return pteval_present(pte_flags(a)); + return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | + _PAGE_NUMA); } #define pte_accessible pte_accessible diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 645cad2c95f..e820c080a4e 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -191,6 +191,14 @@ static inline void clflush(volatile void *__p) asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); } +static inline void clflushopt(volatile void *__p) +{ + alternative_io(".byte " __stringify(NOP_DS_PREFIX) "; clflush %P0", + ".byte 0x66; clflush %P0", + X86_FEATURE_CLFLUSHOPT, + "+m" (*(volatile char __force *)__p)); +} + #define nop() asm volatile ("nop") diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index bf156ded74b..0f62f5482d9 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -26,10 +26,9 @@ # define LOCK_PTR_REG "D" #endif -#if defined(CONFIG_X86_32) && \ - (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)) +#if defined(CONFIG_X86_32) && (defined(CONFIG_X86_PPRO_FENCE)) /* - * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock + * On PPro SMP, we use a locked operation to unlock * (PPro errata 66, 92) */ # define UNLOCK_LOCK_PREFIX LOCK_PREFIX diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index d35f24e231c..b28097e4c8c 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -119,9 +119,10 @@ static inline void setup_node_to_cpumask_map(void) { } extern const struct cpumask *cpu_coregroup_mask(int cpu); -#ifdef ENABLE_TOPO_DEFINES #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) + +#ifdef ENABLE_TOPO_DEFINES #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) #endif @@ -133,12 +134,6 @@ static inline void arch_fix_phys_package_id(int num, u32 slot) struct pci_bus; void x86_pci_root_bus_resources(int bus, struct list_head *resources); -#ifdef CONFIG_SMP -#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \ - (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)) -#define smt_capable() (smp_num_siblings > 1) -#endif - #ifdef CONFIG_NUMA extern int get_mp_bus_to_node(int busnum); extern void set_mp_bus_to_node(int busnum, int node); diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 554738963b2..6c1d7411eb0 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -6,11 +6,14 @@ #define XSTATE_CPUID 0x0000000d -#define XSTATE_FP 0x1 -#define XSTATE_SSE 0x2 -#define XSTATE_YMM 0x4 -#define XSTATE_BNDREGS 0x8 -#define XSTATE_BNDCSR 0x10 +#define XSTATE_FP 0x1 +#define XSTATE_SSE 0x2 +#define XSTATE_YMM 0x4 +#define XSTATE_BNDREGS 0x8 +#define XSTATE_BNDCSR 0x10 +#define XSTATE_OPMASK 0x20 +#define XSTATE_ZMM_Hi256 0x40 +#define XSTATE_Hi16_ZMM 0x80 #define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) @@ -23,7 +26,8 @@ #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) /* Supported features which support lazy state saving */ -#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) +#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ + | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) /* Supported features which require eager state saving */ #define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR) diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index c19fc60ff06..4924f4be2b9 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -368,33 +368,58 @@ #define THERM_LOG_THRESHOLD1 (1 << 9) /* MISC_ENABLE bits: architectural */ -#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) -#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) -#define MSR_IA32_MISC_ENABLE_EMON (1ULL << 7) -#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL (1ULL << 11) -#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1ULL << 12) -#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP (1ULL << 16) -#define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << 18) -#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << 22) -#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << 23) -#define MSR_IA32_MISC_ENABLE_XD_DISABLE (1ULL << 34) +#define MSR_IA32_MISC_ENABLE_FAST_STRING_BIT 0 +#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) +#define MSR_IA32_MISC_ENABLE_TCC_BIT 1 +#define MSR_IA32_MISC_ENABLE_TCC (1ULL << MSR_IA32_MISC_ENABLE_TCC_BIT) +#define MSR_IA32_MISC_ENABLE_EMON_BIT 7 +#define MSR_IA32_MISC_ENABLE_EMON (1ULL << MSR_IA32_MISC_ENABLE_EMON_BIT) +#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT 11 +#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT) +#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT 12 +#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT) +#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT 16 +#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP (1ULL << MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT) +#define MSR_IA32_MISC_ENABLE_MWAIT_BIT 18 +#define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << MSR_IA32_MISC_ENABLE_MWAIT_BIT) +#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT 22 +#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT); +#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT 23 +#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT) +#define MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT 34 +#define MSR_IA32_MISC_ENABLE_XD_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT) /* MISC_ENABLE bits: model-specific, meaning may vary from core to core */ -#define MSR_IA32_MISC_ENABLE_X87_COMPAT (1ULL << 2) -#define MSR_IA32_MISC_ENABLE_TM1 (1ULL << 3) -#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE (1ULL << 4) -#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE (1ULL << 6) -#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK (1ULL << 8) -#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE (1ULL << 9) -#define MSR_IA32_MISC_ENABLE_FERR (1ULL << 10) -#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << 10) -#define MSR_IA32_MISC_ENABLE_TM2 (1ULL << 13) -#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE (1ULL << 19) -#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK (1ULL << 20) -#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT (1ULL << 24) -#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE (1ULL << 37) -#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38) -#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39) +#define MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT 2 +#define MSR_IA32_MISC_ENABLE_X87_COMPAT (1ULL << MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT) +#define MSR_IA32_MISC_ENABLE_TM1_BIT 3 +#define MSR_IA32_MISC_ENABLE_TM1 (1ULL << MSR_IA32_MISC_ENABLE_TM1_BIT) +#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT 4 +#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT) +#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT 6 +#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT) +#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT 8 +#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT) +#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT 9 +#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) +#define MSR_IA32_MISC_ENABLE_FERR_BIT 10 +#define MSR_IA32_MISC_ENABLE_FERR (1ULL << MSR_IA32_MISC_ENABLE_FERR_BIT) +#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT 10 +#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT) +#define MSR_IA32_MISC_ENABLE_TM2_BIT 13 +#define MSR_IA32_MISC_ENABLE_TM2 (1ULL << MSR_IA32_MISC_ENABLE_TM2_BIT) +#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT 19 +#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT) +#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT 20 +#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT) +#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT 24 +#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT (1ULL << MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT) +#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT 37 +#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT) +#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT 38 +#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT) +#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT 39 +#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT) #define MSR_IA32_TSC_DEADLINE 0x000006E0 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 1dac94265b5..9f46f2b1cfc 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -613,10 +613,10 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) int nid; nid = acpi_get_node(handle); - if (nid == -1 || !node_online(nid)) - return; - set_apicid_to_node(physid, nid); - numa_set_node(cpu, nid); + if (nid != -1) { + set_apicid_to_node(physid, nid); + numa_set_node(cpu, nid); + } #endif } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index fd972a3e4cb..9fa8aa051f5 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -18,7 +18,6 @@ #include <linux/pci_ids.h> #include <linux/pci.h> #include <linux/bitops.h> -#include <linux/ioport.h> #include <linux/suspend.h> #include <asm/e820.h> #include <asm/io.h> @@ -54,18 +53,6 @@ int fallback_aper_force __initdata; int fix_aperture __initdata = 1; -static struct resource gart_resource = { - .name = "GART", - .flags = IORESOURCE_MEM, -}; - -static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) -{ - gart_resource.start = aper_base; - gart_resource.end = aper_base + aper_size - 1; - insert_resource(&iomem_resource, &gart_resource); -} - /* This code runs before the PCI subsystem is initialized, so just access the northbridge directly. */ @@ -96,7 +83,6 @@ static u32 __init allocate_aperture(void) memblock_reserve(addr, aper_size); printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", aper_size >> 10, addr); - insert_aperture_resource((u32)addr, aper_size); register_nosave_region(addr >> PAGE_SHIFT, (addr+aper_size) >> PAGE_SHIFT); @@ -444,12 +430,8 @@ int __init gart_iommu_hole_init(void) out: if (!fix && !fallback_aper_force) { - if (last_aper_base) { - unsigned long n = (32 * 1024 * 1024) << last_aper_order; - - insert_aperture_resource((u32)last_aper_base, n); + if (last_aper_base) return 1; - } return 0; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7f26c9a70a9..53e20531470 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -133,6 +133,10 @@ static inline void imcr_apic_to_pic(void) * +1=force-enable */ static int force_enable_local_apic __initdata; + +/* Control whether x2APIC mode is enabled or not */ +static bool nox2apic __initdata; + /* * APIC command line parameters */ @@ -162,8 +166,7 @@ int x2apic_mode; /* x2apic enabled before OS handover */ int x2apic_preenabled; static int x2apic_disabled; -static int nox2apic; -static __init int setup_nox2apic(char *str) +static int __init setup_nox2apic(char *str) { if (x2apic_enabled()) { int apicid = native_apic_msr_read(APIC_ID); @@ -178,7 +181,7 @@ static __init int setup_nox2apic(char *str) } else setup_clear_cpu_cap(X86_FEATURE_X2APIC); - nox2apic = 1; + nox2apic = true; return 0; } @@ -283,8 +286,12 @@ u32 native_safe_apic_wait_icr_idle(void) void native_apic_icr_write(u32 low, u32 id) { + unsigned long flags; + + local_irq_save(flags); apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); apic_write(APIC_ICR, low); + local_irq_restore(flags); } u64 native_apic_icr_read(void) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 2c621a6b901..7c1b2947951 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -198,7 +198,7 @@ static struct apic apic_flat = { .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = NULL, + .wait_for_init_deassert = false, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, @@ -314,7 +314,7 @@ static struct apic apic_physflat = { .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = NULL, + .wait_for_init_deassert = false, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 191ce75c0e5..8c7c98249c2 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -172,8 +172,7 @@ struct apic apic_noop = { .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = NULL, - + .wait_for_init_deassert = false, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 3e67f9e3d7e..a5b45df8bc8 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -248,7 +248,7 @@ static const struct apic apic_numachip __refconst = { .wakeup_secondary_cpu = numachip_wakeup_secondary, .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = NULL, + .wait_for_init_deassert = false, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, /* REMRD not supported */ diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index d50e3640d5a..e4840aa7a25 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -199,8 +199,7 @@ static struct apic apic_bigsmp = { .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = default_wait_for_init_deassert, - + .wait_for_init_deassert = true, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index c55224731b2..6f8f8b348a3 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -394,12 +394,6 @@ static void es7000_enable_apic_mode(void) WARN(1, "Command failed, status = %x\n", mip_status); } -static void es7000_wait_for_init_deassert(atomic_t *deassert) -{ - while (!atomic_read(deassert)) - cpu_relax(); -} - static unsigned int es7000_get_apic_id(unsigned long x) { return (x >> 24) & 0xFF; @@ -658,8 +652,7 @@ static struct apic __refdata apic_es7000_cluster = { .trampoline_phys_low = 0x467, .trampoline_phys_high = 0x469, - .wait_for_init_deassert = NULL, - + .wait_for_init_deassert = false, /* Nothing to do for most platforms, since cleared by the INIT cycle: */ .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, @@ -722,8 +715,7 @@ static struct apic __refdata apic_es7000 = { .trampoline_phys_low = 0x467, .trampoline_phys_high = 0x469, - .wait_for_init_deassert = es7000_wait_for_init_deassert, - + .wait_for_init_deassert = true, /* Nothing to do for most platforms, since cleared by the INIT cycle: */ .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 1e42e8f305e..030ea1c04f7 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -505,8 +505,7 @@ static struct apic __refdata apic_numaq = { .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH, /* We don't do anything here because we use NMI's to boot instead */ - .wait_for_init_deassert = NULL, - + .wait_for_init_deassert = false, .smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic, .inquire_remote_apic = NULL, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index eb35ef9ee63..cceb352c968 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -119,8 +119,7 @@ static struct apic apic_default = { .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = default_wait_for_init_deassert, - + .wait_for_init_deassert = true, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 00146f9b025..b656128611c 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -532,8 +532,7 @@ static struct apic apic_summit = { .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = default_wait_for_init_deassert, - + .wait_for_init_deassert = true, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index cac85ee6913..e66766bf164 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -279,7 +279,7 @@ static struct apic apic_x2apic_cluster = { .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = NULL, + .wait_for_init_deassert = false, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index de231e328ca..6d600ebf6c1 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -133,7 +133,7 @@ static struct apic apic_x2apic_phys = { .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = NULL, + .wait_for_init_deassert = false, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d263b1307de..7834389ba5b 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -396,7 +396,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .wakeup_secondary_cpu = uv_wakeup_secondary, .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = NULL, + .wait_for_init_deassert = false, .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c67ffa68606..ce8b8ff0e0e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -218,7 +218,7 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c) */ WARN_ONCE(1, "WARNING: This combination of AMD" " processors is not suitable for SMP.\n"); - add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); } static void init_amd_k7(struct cpuinfo_x86 *c) @@ -233,9 +233,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c) if (c->x86_model >= 6 && c->x86_model <= 10) { if (!cpu_has(c, X86_FEATURE_XMM)) { printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); - rdmsr(MSR_K7_HWCR, l, h); - l &= ~0x00008000; - wrmsr(MSR_K7_HWCR, l, h); + msr_clear_bit(MSR_K7_HWCR, 15); set_cpu_cap(c, X86_FEATURE_XMM); } } @@ -509,14 +507,8 @@ static void early_init_amd(struct cpuinfo_x86 *c) #endif /* F16h erratum 793, CVE-2013-6885 */ - if (c->x86 == 0x16 && c->x86_model <= 0xf) { - u64 val; - - rdmsrl(MSR_AMD64_LS_CFG, val); - if (!(val & BIT(15))) - wrmsrl(MSR_AMD64_LS_CFG, val | BIT(15)); - } - + if (c->x86 == 0x16 && c->x86_model <= 0xf) + msr_set_bit(MSR_AMD64_LS_CFG, 15); } static const int amd_erratum_383[]; @@ -536,11 +528,8 @@ static void init_amd(struct cpuinfo_x86 *c) * Errata 63 for SH-B3 steppings * Errata 122 for all steppings (F+ have it disabled by default) */ - if (c->x86 == 0xf) { - rdmsrl(MSR_K7_HWCR, value); - value |= 1 << 6; - wrmsrl(MSR_K7_HWCR, value); - } + if (c->x86 == 0xf) + msr_set_bit(MSR_K7_HWCR, 6); #endif early_init_amd(c); @@ -623,14 +612,11 @@ static void init_amd(struct cpuinfo_x86 *c) (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && !cpu_has(c, X86_FEATURE_TOPOEXT)) { - if (!rdmsrl_safe(0xc0011005, &value)) { - value |= 1ULL << 54; - wrmsrl_safe(0xc0011005, value); + if (msr_set_bit(0xc0011005, 54) > 0) { rdmsrl(0xc0011005, value); - if (value & (1ULL << 54)) { + if (value & BIT_64(54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); - printk(KERN_INFO FW_INFO "CPU: Re-enabling " - "disabled Topology Extensions Support\n"); + pr_info(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); } } } @@ -709,19 +695,12 @@ static void init_amd(struct cpuinfo_x86 *c) * Disable GART TLB Walk Errors on Fam10h. We do this here * because this is always needed when GART is enabled, even in a * kernel which has no MCE support built in. - * BIOS should disable GartTlbWlk Errors themself. If - * it doesn't do it here as suggested by the BKDG. + * BIOS should disable GartTlbWlk Errors already. If + * it doesn't, do it here as suggested by the BKDG. * * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 */ - u64 mask; - int err; - - err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); - if (err == 0) { - mask |= (1 << 10); - wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask); - } + msr_set_bit(MSR_AMD64_MCx_MASK(4), 10); /* * On family 10h BIOS may not have properly enabled WC+ support, @@ -733,10 +712,7 @@ static void init_amd(struct cpuinfo_x86 *c) * NOTE: we want to use the _safe accessors so as not to #GP kvm * guests on older kvm hosts. */ - - rdmsrl_safe(MSR_AMD64_BU_CFG2, &value); - value &= ~(1ULL << 24); - wrmsrl_safe(MSR_AMD64_BU_CFG2, value); + msr_clear_bit(MSR_AMD64_BU_CFG2, 24); if (cpu_has_amd_erratum(c, amd_erratum_383)) set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 8779edab684..d8fba5c15fb 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -8,236 +8,6 @@ #include "cpu.h" -#ifdef CONFIG_X86_OOSTORE - -static u32 power2(u32 x) -{ - u32 s = 1; - - while (s <= x) - s <<= 1; - - return s >>= 1; -} - - -/* - * Set up an actual MCR - */ -static void centaur_mcr_insert(int reg, u32 base, u32 size, int key) -{ - u32 lo, hi; - - hi = base & ~0xFFF; - lo = ~(size-1); /* Size is a power of 2 so this makes a mask */ - lo &= ~0xFFF; /* Remove the ctrl value bits */ - lo |= key; /* Attribute we wish to set */ - wrmsr(reg+MSR_IDT_MCR0, lo, hi); - mtrr_centaur_report_mcr(reg, lo, hi); /* Tell the mtrr driver */ -} - -/* - * Figure what we can cover with MCR's - * - * Shortcut: We know you can't put 4Gig of RAM on a winchip - */ -static u32 ramtop(void) -{ - u32 clip = 0xFFFFFFFFUL; - u32 top = 0; - int i; - - for (i = 0; i < e820.nr_map; i++) { - unsigned long start, end; - - if (e820.map[i].addr > 0xFFFFFFFFUL) - continue; - /* - * Don't MCR over reserved space. Ignore the ISA hole - * we frob around that catastrophe already - */ - if (e820.map[i].type == E820_RESERVED) { - if (e820.map[i].addr >= 0x100000UL && - e820.map[i].addr < clip) - clip = e820.map[i].addr; - continue; - } - start = e820.map[i].addr; - end = e820.map[i].addr + e820.map[i].size; - if (start >= end) - continue; - if (end > top) - top = end; - } - /* - * Everything below 'top' should be RAM except for the ISA hole. - * Because of the limited MCR's we want to map NV/ACPI into our - * MCR range for gunk in RAM - * - * Clip might cause us to MCR insufficient RAM but that is an - * acceptable failure mode and should only bite obscure boxes with - * a VESA hole at 15Mb - * - * The second case Clip sometimes kicks in is when the EBDA is marked - * as reserved. Again we fail safe with reasonable results - */ - if (top > clip) - top = clip; - - return top; -} - -/* - * Compute a set of MCR's to give maximum coverage - */ -static int centaur_mcr_compute(int nr, int key) -{ - u32 mem = ramtop(); - u32 root = power2(mem); - u32 base = root; - u32 top = root; - u32 floor = 0; - int ct = 0; - - while (ct < nr) { - u32 fspace = 0; - u32 high; - u32 low; - - /* - * Find the largest block we will fill going upwards - */ - high = power2(mem-top); - - /* - * Find the largest block we will fill going downwards - */ - low = base/2; - - /* - * Don't fill below 1Mb going downwards as there - * is an ISA hole in the way. - */ - if (base <= 1024*1024) - low = 0; - - /* - * See how much space we could cover by filling below - * the ISA hole - */ - - if (floor == 0) - fspace = 512*1024; - else if (floor == 512*1024) - fspace = 128*1024; - - /* And forget ROM space */ - - /* - * Now install the largest coverage we get - */ - if (fspace > high && fspace > low) { - centaur_mcr_insert(ct, floor, fspace, key); - floor += fspace; - } else if (high > low) { - centaur_mcr_insert(ct, top, high, key); - top += high; - } else if (low > 0) { - base -= low; - centaur_mcr_insert(ct, base, low, key); - } else - break; - ct++; - } - /* - * We loaded ct values. We now need to set the mask. The caller - * must do this bit. - */ - return ct; -} - -static void centaur_create_optimal_mcr(void) -{ - int used; - int i; - - /* - * Allocate up to 6 mcrs to mark as much of ram as possible - * as write combining and weak write ordered. - * - * To experiment with: Linux never uses stack operations for - * mmio spaces so we could globally enable stack operation wc - * - * Load the registers with type 31 - full write combining, all - * writes weakly ordered. - */ - used = centaur_mcr_compute(6, 31); - - /* - * Wipe unused MCRs - */ - for (i = used; i < 8; i++) - wrmsr(MSR_IDT_MCR0+i, 0, 0); -} - -static void winchip2_create_optimal_mcr(void) -{ - u32 lo, hi; - int used; - int i; - - /* - * Allocate up to 6 mcrs to mark as much of ram as possible - * as write combining, weak store ordered. - * - * Load the registers with type 25 - * 8 - weak write ordering - * 16 - weak read ordering - * 1 - write combining - */ - used = centaur_mcr_compute(6, 25); - - /* - * Mark the registers we are using. - */ - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - for (i = 0; i < used; i++) - lo |= 1<<(9+i); - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); - - /* - * Wipe unused MCRs - */ - - for (i = used; i < 8; i++) - wrmsr(MSR_IDT_MCR0+i, 0, 0); -} - -/* - * Handle the MCR key on the Winchip 2. - */ -static void winchip2_unprotect_mcr(void) -{ - u32 lo, hi; - u32 key; - - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - lo &= ~0x1C0; /* blank bits 8-6 */ - key = (lo>>17) & 7; - lo |= key<<6; /* replace with unlock key */ - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -} - -static void winchip2_protect_mcr(void) -{ - u32 lo, hi; - - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - lo &= ~0x1C0; /* blank bits 8-6 */ - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -} -#endif /* CONFIG_X86_OOSTORE */ - #define ACE_PRESENT (1 << 6) #define ACE_ENABLED (1 << 7) #define ACE_FCR (1 << 28) /* MSR_VIA_FCR */ @@ -362,20 +132,6 @@ static void init_centaur(struct cpuinfo_x86 *c) fcr_clr = DPDC; printk(KERN_NOTICE "Disabling bugged TSC.\n"); clear_cpu_cap(c, X86_FEATURE_TSC); -#ifdef CONFIG_X86_OOSTORE - centaur_create_optimal_mcr(); - /* - * Enable: - * write combining on non-stack, non-string - * write combining on string, all types - * weak write ordering - * - * The C6 original lacks weak read order - * - * Note 0x120 is write only on Winchip 1 - */ - wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); -#endif break; case 8: switch (c->x86_mask) { @@ -392,40 +148,12 @@ static void init_centaur(struct cpuinfo_x86 *c) fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK| E2MMX|EAMD3D; fcr_clr = DPDC; -#ifdef CONFIG_X86_OOSTORE - winchip2_unprotect_mcr(); - winchip2_create_optimal_mcr(); - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - /* - * Enable: - * write combining on non-stack, non-string - * write combining on string, all types - * weak write ordering - */ - lo |= 31; - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); - winchip2_protect_mcr(); -#endif break; case 9: name = "3"; fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK| E2MMX|EAMD3D; fcr_clr = DPDC; -#ifdef CONFIG_X86_OOSTORE - winchip2_unprotect_mcr(); - winchip2_create_optimal_mcr(); - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - /* - * Enable: - * write combining on non-stack, non-string - * write combining on string, all types - * weak write ordering - */ - lo |= 31; - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); - winchip2_protect_mcr(); -#endif break; default: name = "??"; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 5cd9bfabd64..897d6201ef1 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -31,11 +31,8 @@ static void early_init_intel(struct cpuinfo_x86 *c) /* Unmask CPUID levels if masked: */ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - - if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { - misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID; - wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) { c->cpuid_level = cpuid_eax(0); get_cpu_cap(c); } @@ -129,16 +126,10 @@ static void early_init_intel(struct cpuinfo_x86 *c) * Ingo Molnar reported a Pentium D (model 6) and a Xeon * (model 2) with the same problem. */ - if (c->x86 == 15) { - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - - if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { - printk(KERN_INFO "kmemcheck: Disabling fast string operations\n"); - - misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; - wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - } - } + if (c->x86 == 15) + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0) + pr_info("kmemcheck: Disabling fast string operations\n"); #endif /* @@ -195,10 +186,16 @@ static void intel_smp_check(struct cpuinfo_x86 *c) } } -static void intel_workarounds(struct cpuinfo_x86 *c) +static int forcepae; +static int __init forcepae_setup(char *__unused) { - unsigned long lo, hi; + forcepae = 1; + return 1; +} +__setup("forcepae", forcepae_setup); +static void intel_workarounds(struct cpuinfo_x86 *c) +{ #ifdef CONFIG_X86_F00F_BUG /* * All current models of Pentium and Pentium with MMX technology CPUs @@ -225,16 +222,26 @@ static void intel_workarounds(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_SEP); /* + * PAE CPUID issue: many Pentium M report no PAE but may have a + * functionally usable PAE implementation. + * Forcefully enable PAE if kernel parameter "forcepae" is present. + */ + if (forcepae) { + printk(KERN_WARNING "PAE forced!\n"); + set_cpu_cap(c, X86_FEATURE_PAE); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); + } + + /* * P4 Xeon errata 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. */ if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { - rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); - if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) { - printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); - printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); - lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; - wrmsr(MSR_IA32_MISC_ENABLE, lo, hi); + if (msr_set_bit(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) + > 0) { + pr_info("CPU: C0 stepping P4 Xeon detected.\n"); + pr_info("CPU: Disabling hardware prefetching (Errata 037)\n"); } } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 79f9f848bee..ae407f7226c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -892,7 +892,6 @@ static void x86_pmu_enable(struct pmu *pmu) * hw_perf_group_sched_in() or x86_pmu_enable() * * step1: save events moving to new counters - * step2: reprogram moved events into new counters */ for (i = 0; i < n_running; i++) { event = cpuc->event_list[i]; @@ -918,6 +917,9 @@ static void x86_pmu_enable(struct pmu *pmu) x86_pmu_stop(event, PERF_EF_UPDATE); } + /* + * step2: reprogram moved events into new counters + */ for (i = 0; i < cpuc->n_events; i++) { event = cpuc->event_list[i]; hwc = &event->hw; @@ -1043,7 +1045,7 @@ static int x86_pmu_add(struct perf_event *event, int flags) /* * If group events scheduling transaction was started, * skip the schedulability test here, it will be performed - * at commit time (->commit_txn) as a whole + * at commit time (->commit_txn) as a whole. */ if (cpuc->group_flag & PERF_EVENT_TXN) goto done_collect; @@ -1058,6 +1060,10 @@ static int x86_pmu_add(struct perf_event *event, int flags) memcpy(cpuc->assign, assign, n*sizeof(int)); done_collect: + /* + * Commit the collect_events() state. See x86_pmu_del() and + * x86_pmu_*_txn(). + */ cpuc->n_events = n; cpuc->n_added += n - n0; cpuc->n_txn += n - n0; @@ -1183,28 +1189,38 @@ static void x86_pmu_del(struct perf_event *event, int flags) * If we're called during a txn, we don't need to do anything. * The events never got scheduled and ->cancel_txn will truncate * the event_list. + * + * XXX assumes any ->del() called during a TXN will only be on + * an event added during that same TXN. */ if (cpuc->group_flag & PERF_EVENT_TXN) return; + /* + * Not a TXN, therefore cleanup properly. + */ x86_pmu_stop(event, PERF_EF_UPDATE); for (i = 0; i < cpuc->n_events; i++) { - if (event == cpuc->event_list[i]) { + if (event == cpuc->event_list[i]) + break; + } - if (i >= cpuc->n_events - cpuc->n_added) - --cpuc->n_added; + if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ + return; - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, event); + /* If we have a newly added event; make sure to decrease n_added. */ + if (i >= cpuc->n_events - cpuc->n_added) + --cpuc->n_added; - while (++i < cpuc->n_events) - cpuc->event_list[i-1] = cpuc->event_list[i]; + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, event); + + /* Delete the array entry. */ + while (++i < cpuc->n_events) + cpuc->event_list[i-1] = cpuc->event_list[i]; + --cpuc->n_events; - --cpuc->n_events; - break; - } - } perf_event_update_userpage(event); } @@ -1598,7 +1614,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu) { __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN); /* - * Truncate the collected events. + * Truncate collected array by the number of events added in this + * transaction. See x86_pmu_add() and x86_pmu_*_txn(). */ __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); @@ -1609,6 +1626,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu) * Commit group events scheduling transaction * Perform the group schedulability test as a whole * Return 0 if success + * + * Does not cancel the transaction on failure; expects the caller to do this. */ static int x86_pmu_commit_txn(struct pmu *pmu) { diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 4972c244d0b..3b2f9bdd974 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -130,9 +130,11 @@ struct cpu_hw_events { unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; - int n_events; - int n_added; - int n_txn; + int n_events; /* the # of events in the below arrays */ + int n_added; /* the # last events in the below arrays; + they've never been enabled yet */ + int n_txn; /* the # last events in the below arrays; + added in the current transaction */ int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ u64 tags[X86_PMC_IDX_MAX]; struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index c88f7f4b03e..bd2253d40cf 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -66,6 +66,47 @@ DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4"); DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); +static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); +static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); +static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event); +static void uncore_pmu_event_read(struct perf_event *event); + +static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static struct intel_uncore_box * +uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +{ + struct intel_uncore_box *box; + + box = *per_cpu_ptr(pmu->box, cpu); + if (box) + return box; + + raw_spin_lock(&uncore_box_lock); + list_for_each_entry(box, &pmu->box_list, list) { + if (box->phys_id == topology_physical_package_id(cpu)) { + atomic_inc(&box->refcnt); + *per_cpu_ptr(pmu->box, cpu) = box; + break; + } + } + raw_spin_unlock(&uncore_box_lock); + + return *per_cpu_ptr(pmu->box, cpu); +} + +static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ + /* + * perf core schedules event on the basis of cpu, uncore events are + * collected by one of the cpus inside a physical package. + */ + return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); +} + static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) { u64 count; @@ -1639,6 +1680,349 @@ static struct intel_uncore_type *snb_msr_uncores[] = { &snb_uncore_cbox, NULL, }; + +enum { + SNB_PCI_UNCORE_IMC, +}; + +static struct uncore_event_desc snb_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"), + INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"), + INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"), + + { /* end: all zeroes */ }, +}; + +#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff +#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48 + +/* page size multiple covering all config regs */ +#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000 + +#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1 +#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054 +#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE + +static struct attribute *snb_uncore_imc_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group snb_uncore_imc_format_group = { + .name = "format", + .attrs = snb_uncore_imc_formats_attr, +}; + +static void snb_uncore_imc_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; + resource_size_t addr; + u32 pci_dword; + + pci_read_config_dword(pdev, where, &pci_dword); + addr = pci_dword; + +#ifdef CONFIG_PHYS_ADDR_T_64BIT + pci_read_config_dword(pdev, where + 4, &pci_dword); + addr |= ((resource_size_t)pci_dword << 32); +#endif + + addr &= ~(PAGE_SIZE - 1); + + box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; +} + +static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_disable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + return (u64)*(unsigned int *)(box->io_addr + hwc->event_base); +} + +/* + * custom event_init() function because we define our own fixed, free + * running counters, so we do not want to conflict with generic uncore + * logic. Also simplifies processing + */ +static int snb_uncore_imc_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK; + int idx, base; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + + /* check only supported bits are set */ + if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK) + return -EINVAL; + + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + + event->cpu = box->cpu; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + /* + * check event is known (whitelist, determines counter) + */ + switch (cfg) { + case SNB_UNCORE_PCI_IMC_DATA_READS: + base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE; + idx = UNCORE_PMC_IDX_FIXED; + break; + case SNB_UNCORE_PCI_IMC_DATA_WRITES: + base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; + idx = UNCORE_PMC_IDX_FIXED + 1; + break; + default: + return -EINVAL; + } + + /* must be done before validate_group */ + event->hw.event_base = base; + event->hw.config = cfg; + event->hw.idx = idx; + + /* no group validation needed, we have free running counters */ + + return 0; +} + +static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + return 0; +} + +static void snb_uncore_imc_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + u64 count; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + box->n_active++; + + list_add_tail(&event->active_entry, &box->active_list); + + count = snb_uncore_imc_read_counter(box, event); + local64_set(&event->hw.prev_count, count); + + if (box->n_active == 1) + uncore_pmu_start_hrtimer(box); +} + +static void snb_uncore_imc_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (!(hwc->state & PERF_HES_STOPPED)) { + box->n_active--; + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + list_del(&event->active_entry); + + if (box->n_active == 0) + uncore_pmu_cancel_hrtimer(box); + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int snb_uncore_imc_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (!box) + return -ENODEV; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + snb_uncore_imc_event_start(event, 0); + + box->n_events++; + + return 0; +} + +static void snb_uncore_imc_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + snb_uncore_imc_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + --box->n_events; + break; + } + } +} + +static int snb_pci2phy_map_init(int devid) +{ + struct pci_dev *dev = NULL; + int bus; + + dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); + if (!dev) + return -ENOTTY; + + bus = dev->bus->number; + + pcibus_to_physid[bus] = 0; + + pci_dev_put(dev); + + return 0; +} + +static struct pmu snb_uncore_imc_pmu = { + .task_ctx_nr = perf_invalid_context, + .event_init = snb_uncore_imc_event_init, + .add = snb_uncore_imc_event_add, + .del = snb_uncore_imc_event_del, + .start = snb_uncore_imc_event_start, + .stop = snb_uncore_imc_event_stop, + .read = uncore_pmu_event_read, +}; + +static struct intel_uncore_ops snb_uncore_imc_ops = { + .init_box = snb_uncore_imc_init_box, + .enable_box = snb_uncore_imc_enable_box, + .disable_box = snb_uncore_imc_disable_box, + .disable_event = snb_uncore_imc_disable_event, + .enable_event = snb_uncore_imc_enable_event, + .hw_config = snb_uncore_imc_hw_config, + .read_counter = snb_uncore_imc_read_counter, +}; + +static struct intel_uncore_type snb_uncore_imc = { + .name = "imc", + .num_counters = 2, + .num_boxes = 1, + .fixed_ctr_bits = 32, + .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE, + .event_descs = snb_uncore_imc_events, + .format_group = &snb_uncore_imc_format_group, + .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE, + .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK, + .ops = &snb_uncore_imc_ops, + .pmu = &snb_uncore_imc_pmu, +}; + +static struct intel_uncore_type *snb_pci_uncores[] = { + [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static DEFINE_PCI_DEVICE_TABLE(hsw_uncore_pci_ids) = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static struct pci_driver snb_uncore_pci_driver = { + .name = "snb_uncore", + .id_table = snb_uncore_pci_ids, +}; + +static struct pci_driver ivb_uncore_pci_driver = { + .name = "ivb_uncore", + .id_table = ivb_uncore_pci_ids, +}; + +static struct pci_driver hsw_uncore_pci_driver = { + .name = "hsw_uncore", + .id_table = hsw_uncore_pci_ids, +}; + /* end of Sandy Bridge uncore support */ /* Nehalem uncore support */ @@ -2789,6 +3173,7 @@ again: static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) { struct intel_uncore_box *box; + struct perf_event *event; unsigned long flags; int bit; @@ -2801,19 +3186,27 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) */ local_irq_save(flags); + /* + * handle boxes with an active event list as opposed to active + * counters + */ + list_for_each_entry(event, &box->active_list, active_entry) { + uncore_perf_event_update(box, event); + } + for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) uncore_perf_event_update(box, box->events[bit]); local_irq_restore(flags); - hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL)); + hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration)); return HRTIMER_RESTART; } static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) { __hrtimer_start_range_ns(&box->hrtimer, - ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0, + ns_to_ktime(box->hrtimer_duration), 0, HRTIMER_MODE_REL_PINNED, 0); } @@ -2847,43 +3240,12 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, box->cpu = -1; box->phys_id = -1; - return box; -} - -static struct intel_uncore_box * -uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) -{ - struct intel_uncore_box *box; - - box = *per_cpu_ptr(pmu->box, cpu); - if (box) - return box; - - raw_spin_lock(&uncore_box_lock); - list_for_each_entry(box, &pmu->box_list, list) { - if (box->phys_id == topology_physical_package_id(cpu)) { - atomic_inc(&box->refcnt); - *per_cpu_ptr(pmu->box, cpu) = box; - break; - } - } - raw_spin_unlock(&uncore_box_lock); - - return *per_cpu_ptr(pmu->box, cpu); -} + /* set default hrtimer timeout */ + box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; -static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) -{ - return container_of(event->pmu, struct intel_uncore_pmu, pmu); -} + INIT_LIST_HEAD(&box->active_list); -static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) -{ - /* - * perf core schedules event on the basis of cpu, uncore events are - * collected by one of the cpus inside a physical package. - */ - return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); + return box; } static int @@ -3279,16 +3641,21 @@ static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) { int ret; - pmu->pmu = (struct pmu) { - .attr_groups = pmu->type->attr_groups, - .task_ctx_nr = perf_invalid_context, - .event_init = uncore_pmu_event_init, - .add = uncore_pmu_event_add, - .del = uncore_pmu_event_del, - .start = uncore_pmu_event_start, - .stop = uncore_pmu_event_stop, - .read = uncore_pmu_event_read, - }; + if (!pmu->type->pmu) { + pmu->pmu = (struct pmu) { + .attr_groups = pmu->type->attr_groups, + .task_ctx_nr = perf_invalid_context, + .event_init = uncore_pmu_event_init, + .add = uncore_pmu_event_add, + .del = uncore_pmu_event_del, + .start = uncore_pmu_event_start, + .stop = uncore_pmu_event_stop, + .read = uncore_pmu_event_read, + }; + } else { + pmu->pmu = *pmu->type->pmu; + pmu->pmu.attr_groups = pmu->type->attr_groups; + } if (pmu->type->num_boxes == 1) { if (strlen(pmu->type->name) > 0) @@ -3334,6 +3701,8 @@ static int __init uncore_type_init(struct intel_uncore_type *type) if (!pmus) return -ENOMEM; + type->pmus = pmus; + type->unconstrainted = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, 0, type->num_counters, 0, 0); @@ -3369,7 +3738,6 @@ static int __init uncore_type_init(struct intel_uncore_type *type) } type->pmu_group = &uncore_pmu_attr_group; - type->pmus = pmus; return 0; fail: uncore_type_exit(type); @@ -3501,6 +3869,28 @@ static int __init uncore_pci_init(void) pci_uncores = ivt_pci_uncores; uncore_pci_driver = &ivt_uncore_pci_driver; break; + case 42: /* Sandy Bridge */ + ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC); + if (ret) + return ret; + pci_uncores = snb_pci_uncores; + uncore_pci_driver = &snb_uncore_pci_driver; + break; + case 58: /* Ivy Bridge */ + ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC); + if (ret) + return ret; + pci_uncores = snb_pci_uncores; + uncore_pci_driver = &ivb_uncore_pci_driver; + break; + case 60: /* Haswell */ + case 69: /* Haswell Celeron */ + ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC); + if (ret) + return ret; + pci_uncores = snb_pci_uncores; + uncore_pci_driver = &hsw_uncore_pci_driver; + break; default: return 0; } @@ -3772,7 +4162,7 @@ static void __init uncore_cpu_setup(void *dummy) static int __init uncore_cpu_init(void) { - int ret, cpu, max_cores; + int ret, max_cores; max_cores = boot_cpu_data.x86_max_cores; switch (boot_cpu_data.x86_model) { @@ -3816,29 +4206,6 @@ static int __init uncore_cpu_init(void) if (ret) return ret; - get_online_cpus(); - - for_each_online_cpu(cpu) { - int i, phys_id = topology_physical_package_id(cpu); - - for_each_cpu(i, &uncore_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) { - phys_id = -1; - break; - } - } - if (phys_id < 0) - continue; - - uncore_cpu_prepare(cpu, phys_id); - uncore_event_init_cpu(cpu); - } - on_each_cpu(uncore_cpu_setup, NULL, 1); - - register_cpu_notifier(&uncore_cpu_nb); - - put_online_cpus(); - return 0; } @@ -3867,6 +4234,41 @@ static int __init uncore_pmus_register(void) return 0; } +static void __init uncore_cpumask_init(void) +{ + int cpu; + + /* + * ony invoke once from msr or pci init code + */ + if (!cpumask_empty(&uncore_cpu_mask)) + return; + + get_online_cpus(); + + for_each_online_cpu(cpu) { + int i, phys_id = topology_physical_package_id(cpu); + + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) { + phys_id = -1; + break; + } + } + if (phys_id < 0) + continue; + + uncore_cpu_prepare(cpu, phys_id); + uncore_event_init_cpu(cpu); + } + on_each_cpu(uncore_cpu_setup, NULL, 1); + + register_cpu_notifier(&uncore_cpu_nb); + + put_online_cpus(); +} + + static int __init intel_uncore_init(void) { int ret; @@ -3885,6 +4287,7 @@ static int __init intel_uncore_init(void) uncore_pci_exit(); goto fail; } + uncore_cpumask_init(); uncore_pmus_register(); return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index a80ab71a883..90236f0c94a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -6,6 +6,7 @@ #define UNCORE_PMU_NAME_LEN 32 #define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC) +#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC) #define UNCORE_FIXED_EVENT 0xff #define UNCORE_PMC_IDX_MAX_GENERIC 8 @@ -440,6 +441,7 @@ struct intel_uncore_type { struct intel_uncore_ops *ops; struct uncore_event_desc *event_descs; const struct attribute_group *attr_groups[4]; + struct pmu *pmu; /* for custom pmu ops */ }; #define pmu_group attr_groups[0] @@ -488,8 +490,11 @@ struct intel_uncore_box { u64 tags[UNCORE_PMC_IDX_MAX]; struct pci_dev *pci_dev; struct intel_uncore_pmu *pmu; + u64 hrtimer_duration; /* hrtimer timeout for this box */ struct hrtimer hrtimer; struct list_head list; + struct list_head active_list; + void *io_addr; struct intel_uncore_extra_reg shared_regs[0]; }; diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 3486e666035..5d466b7d860 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1257,7 +1257,24 @@ again: pass++; goto again; } - + /* + * Perf does test runs to see if a whole group can be assigned + * together succesfully. There can be multiple rounds of this. + * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config + * bits, such that the next round of group assignments will + * cause the above p4_should_swap_ts to pass instead of fail. + * This leads to counters exclusive to thread0 being used by + * thread1. + * + * Solve this with a cheap hack, reset the idx back to -1 to + * force a new lookup (p4_next_cntr) to get the right counter + * for the right thread. + * + * This probably doesn't comply with the general spirit of how + * perf wants to work, but P4 is special. :-( + */ + if (p4_should_swap_ts(hwc->config, cpu)) + hwc->idx = -1; p4_pmu_swap_config_ts(hwc, cpu); if (assign) assign[i] = cntr_idx; @@ -1322,6 +1339,7 @@ static __initconst const struct x86_pmu p4_pmu = { __init int p4_pmu_init(void) { unsigned int low, high; + int i, reg; /* If we get stripped -- indexing fails */ BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); @@ -1340,5 +1358,19 @@ __init int p4_pmu_init(void) x86_pmu = p4_pmu; + /* + * Even though the counters are configured to interrupt a particular + * logical processor when an overflow happens, testing has shown that + * on kdump kernels (which uses a single cpu), thread1's counter + * continues to run and will report an NMI on thread0. Due to the + * overflow bug, this leads to a stream of unknown NMIs. + * + * Solve this by zero'ing out the registers to mimic a reset. + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + reg = x86_pmu_config_addr(i); + wrmsrl_safe(reg, 0ULL); + } + return 0; } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index a57902efe2d..507de806659 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -57,9 +57,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) { #ifdef CONFIG_X86_32 struct pt_regs fixed_regs; -#endif -#ifdef CONFIG_X86_32 if (!user_mode_vm(regs)) { crash_fixup_ss_esp(&fixed_regs, regs); regs = &fixed_regs; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index f2a1770ca17..a21d49c071d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -30,7 +30,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long dummy; stack = &dummy; - if (task && task != current) + if (task != current) stack = (unsigned long *)task->thread.sp; } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 81ba27679f1..f36bd42d6f0 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -544,6 +544,10 @@ ENDPROC(early_idt_handlers) /* This is global to keep gas from relaxing the jumps */ ENTRY(early_idt_handler) cld + + cmpl $2,(%esp) # X86_TRAP_NMI + je is_nmi # Ignore NMI + cmpl $2,%ss:early_recursion_flag je hlt_loop incl %ss:early_recursion_flag @@ -594,8 +598,9 @@ ex_entry: pop %edx pop %ecx pop %eax - addl $8,%esp /* drop vector number and error code */ decl %ss:early_recursion_flag +is_nmi: + addl $8,%esp /* drop vector number and error code */ iret ENDPROC(early_idt_handler) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e1aabdb314c..a468c0a65c4 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -343,6 +343,9 @@ early_idt_handlers: ENTRY(early_idt_handler) cld + cmpl $2,(%rsp) # X86_TRAP_NMI + je is_nmi # Ignore NMI + cmpl $2,early_recursion_flag(%rip) jz 1f incl early_recursion_flag(%rip) @@ -405,8 +408,9 @@ ENTRY(early_idt_handler) popq %rdx popq %rcx popq %rax - addq $16,%rsp # drop vector number and error code decl early_recursion_flag(%rip) +is_nmi: + addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN ENDPROC(early_idt_handler) diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index e8368c6dd2a..d5dd8081441 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -86,10 +86,19 @@ EXPORT_SYMBOL(__kernel_fpu_begin); void __kernel_fpu_end(void) { - if (use_eager_fpu()) - math_state_restore(); - else + if (use_eager_fpu()) { + /* + * For eager fpu, most the time, tsk_used_math() is true. + * Restore the user math as we are done with the kernel usage. + * At few instances during thread exit, signal handling etc, + * tsk_used_math() is false. Those few places will take proper + * actions, so we don't need to restore the math here. + */ + if (likely(tsk_used_math(current))) + math_state_restore(); + } else { stts(); + } } EXPORT_SYMBOL(__kernel_fpu_end); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 6fcb49ce50a..b4872b999a7 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -87,6 +87,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic); #define nmi_to_desc(type) (&nmi_desc[type]) static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC; + static int __init nmi_warning_debugfs(void) { debugfs_create_u64("nmi_longest_ns", 0644, @@ -95,6 +96,20 @@ static int __init nmi_warning_debugfs(void) } fs_initcall(nmi_warning_debugfs); +static void nmi_max_handler(struct irq_work *w) +{ + struct nmiaction *a = container_of(w, struct nmiaction, irq_work); + int remainder_ns, decimal_msecs; + u64 whole_msecs = ACCESS_ONCE(a->max_duration); + + remainder_ns = do_div(whole_msecs, (1000 * 1000)); + decimal_msecs = remainder_ns / 1000; + + printk_ratelimited(KERN_INFO + "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", + a->handler, whole_msecs, decimal_msecs); +} + static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) { struct nmi_desc *desc = nmi_to_desc(type); @@ -110,26 +125,20 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2 * to handle those situations. */ list_for_each_entry_rcu(a, &desc->head, list) { - u64 before, delta, whole_msecs; - int remainder_ns, decimal_msecs, thishandled; + int thishandled; + u64 delta; - before = sched_clock(); + delta = sched_clock(); thishandled = a->handler(type, regs); handled += thishandled; - delta = sched_clock() - before; + delta = sched_clock() - delta; trace_nmi_handler(a->handler, (int)delta, thishandled); - if (delta < nmi_longest_ns) + if (delta < nmi_longest_ns || delta < a->max_duration) continue; - nmi_longest_ns = delta; - whole_msecs = delta; - remainder_ns = do_div(whole_msecs, (1000 * 1000)); - decimal_msecs = remainder_ns / 1000; - printk_ratelimited(KERN_INFO - "INFO: NMI handler (%ps) took too long to run: " - "%lld.%03d msecs\n", a->handler, whole_msecs, - decimal_msecs); + a->max_duration = delta; + irq_work_queue(&a->irq_work); } rcu_read_unlock(); @@ -146,6 +155,8 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) if (!action->handler) return -EINVAL; + init_irq_work(&action->irq_work, nmi_max_handler); + spin_lock_irqsave(&desc->lock, flags); /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3fb8d95ab8b..4505e2a950d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -298,10 +298,7 @@ void arch_cpu_idle_dead(void) */ void arch_cpu_idle(void) { - if (cpuidle_idle_call()) - x86_idle(); - else - local_irq_enable(); + x86_idle(); } /* diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 7c6acd4b899..ff898bbf579 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -529,7 +529,7 @@ static void quirk_amd_nb_node(struct pci_dev *dev) return; pci_read_config_dword(nb_ht, 0x60, &val); - node = val & 7; + node = pcibus_to_node(dev->bus) | (val & 7); /* * Some hardware may return an invalid node ID, * so check it first: diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a32da804252..60179ec39d4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -122,8 +122,9 @@ static void smp_callin(void) * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI. */ cpuid = smp_processor_id(); - if (apic->wait_for_init_deassert && cpuid != 0) - apic->wait_for_init_deassert(&init_deasserted); + if (apic->wait_for_init_deassert && cpuid) + while (!atomic_read(&init_deasserted)) + cpu_relax(); /* * (This works even if the APIC is not enabled.) @@ -701,11 +702,15 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, int id; int boot_error; + preempt_disable(); + /* * Wake up AP by INIT, INIT, STARTUP sequence. */ - if (cpu) - return wakeup_secondary_cpu_via_init(apicid, start_ip); + if (cpu) { + boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip); + goto out; + } /* * Wake up BSP by nmi. @@ -725,6 +730,9 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); } +out: + preempt_enable(); + return boot_error; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e81df8fce02..2de1bc09a8d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3002,10 +3002,8 @@ static int cr8_write_interception(struct vcpu_svm *svm) u8 cr8_prev = kvm_get_cr8(&svm->vcpu); /* instruction emulation calls kvm_set_cr8() */ r = cr_interception(svm); - if (irqchip_in_kernel(svm->vcpu.kvm)) { - clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); + if (irqchip_in_kernel(svm->vcpu.kvm)) return r; - } if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) return r; kvm_run->exit_reason = KVM_EXIT_SET_TPR; @@ -3567,6 +3565,8 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) return; + clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); + if (irr == -1) return; diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 8f8eebdca7d..db9db446b71 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -8,7 +8,7 @@ struct msr *msrs_alloc(void) msrs = alloc_percpu(struct msr); if (!msrs) { - pr_warning("%s: error allocating msrs\n", __func__); + pr_warn("%s: error allocating msrs\n", __func__); return NULL; } @@ -21,3 +21,90 @@ void msrs_free(struct msr *msrs) free_percpu(msrs); } EXPORT_SYMBOL(msrs_free); + +/** + * Read an MSR with error handling + * + * @msr: MSR to read + * @m: value to read into + * + * It returns read data only on success, otherwise it doesn't change the output + * argument @m. + * + */ +int msr_read(u32 msr, struct msr *m) +{ + int err; + u64 val; + + err = rdmsrl_safe(msr, &val); + if (!err) + m->q = val; + + return err; +} + +/** + * Write an MSR with error handling + * + * @msr: MSR to write + * @m: value to write + */ +int msr_write(u32 msr, struct msr *m) +{ + return wrmsrl_safe(msr, m->q); +} + +static inline int __flip_bit(u32 msr, u8 bit, bool set) +{ + struct msr m, m1; + int err = -EINVAL; + + if (bit > 63) + return err; + + err = msr_read(msr, &m); + if (err) + return err; + + m1 = m; + if (set) + m1.q |= BIT_64(bit); + else + m1.q &= ~BIT_64(bit); + + if (m1.q == m.q) + return 0; + + err = msr_write(msr, &m); + if (err) + return err; + + return 1; +} + +/** + * Set @bit in a MSR @msr. + * + * Retval: + * < 0: An error was encountered. + * = 0: Bit was already set. + * > 0: Hardware accepted the MSR write. + */ +int msr_set_bit(u32 msr, u8 bit) +{ + return __flip_bit(msr, bit, true); +} + +/** + * Clear @bit in a MSR @msr. + * + * Retval: + * < 0: An error was encountered. + * = 0: Bit was already cleared. + * > 0: Hardware accepted the MSR write. + */ +int msr_clear_bit(u32 msr, u8 bit) +{ + return __flip_bit(msr, bit, false); +} diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a92c4c99978..8e572299267 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1025,8 +1025,12 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. + * + * This function must have noinline because both callers + * {,trace_}do_page_fault() have notrace on. Having this an actual function + * guarantees there's a function trace entry. */ -static void __kprobes +static void __kprobes noinline __do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { @@ -1250,31 +1254,38 @@ good_area: up_read(&mm->mmap_sem); } -dotraplinkage void __kprobes +dotraplinkage void __kprobes notrace do_page_fault(struct pt_regs *regs, unsigned long error_code) { + unsigned long address = read_cr2(); /* Get the faulting address */ enum ctx_state prev_state; - /* Get the faulting address: */ - unsigned long address = read_cr2(); + + /* + * We must have this function tagged with __kprobes, notrace and call + * read_cr2() before calling anything else. To avoid calling any kind + * of tracing machinery before we've observed the CR2 value. + * + * exception_{enter,exit}() contain all sorts of tracepoints. + */ prev_state = exception_enter(); __do_page_fault(regs, error_code, address); exception_exit(prev_state); } -static void trace_page_fault_entries(struct pt_regs *regs, +#ifdef CONFIG_TRACING +static void trace_page_fault_entries(unsigned long address, struct pt_regs *regs, unsigned long error_code) { if (user_mode(regs)) - trace_page_fault_user(read_cr2(), regs, error_code); + trace_page_fault_user(address, regs, error_code); else - trace_page_fault_kernel(read_cr2(), regs, error_code); + trace_page_fault_kernel(address, regs, error_code); } -dotraplinkage void __kprobes +dotraplinkage void __kprobes notrace trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) { - enum ctx_state prev_state; /* * The exception_enter and tracepoint processing could * trigger another page faults (user space callchain @@ -1282,9 +1293,11 @@ trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) * the faulting address now. */ unsigned long address = read_cr2(); + enum ctx_state prev_state; prev_state = exception_enter(); - trace_page_fault_entries(regs, error_code); + trace_page_fault_entries(address, regs, error_code); __do_page_fault(regs, error_code, address); exception_exit(prev_state); } +#endif /* CONFIG_TRACING */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1585da3b9b8..ae242a7c11c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -126,8 +126,8 @@ within(unsigned long addr, unsigned long start, unsigned long end) * @vaddr: virtual start address * @size: number of bytes to flush * - * clflush is an unordered instruction which needs fencing with mfence - * to avoid ordering issues. + * clflushopt is an unordered instruction which needs fencing with mfence or + * sfence to avoid ordering issues. */ void clflush_cache_range(void *vaddr, unsigned int size) { @@ -136,11 +136,11 @@ void clflush_cache_range(void *vaddr, unsigned int size) mb(); for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) - clflush(vaddr); + clflushopt(vaddr); /* * Flush any possible final partial cacheline: */ - clflush(vend); + clflushopt(vend); mb(); } @@ -1393,10 +1393,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cache = cache_attr(mask_set); /* - * On success we use clflush, when the CPU supports it to - * avoid the wbindv. If the CPU does not support it and in the + * On success we use CLFLUSH, when the CPU supports it to + * avoid the WBINVD. If the CPU does not support it and in the * error case we fall back to cpa_flush_all (which uses - * wbindv): + * WBINVD): */ if (!ret && cpu_has_clflush) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S index 877b9a1b215..01495755701 100644 --- a/arch/x86/net/bpf_jit.S +++ b/arch/x86/net/bpf_jit.S @@ -140,7 +140,7 @@ bpf_slow_path_byte_msh: push %r9; \ push SKBDATA; \ /* rsi already has offset */ \ - mov $SIZE,%ecx; /* size */ \ + mov $SIZE,%edx; /* size */ \ call bpf_internal_load_pointer_neg_helper; \ test %rax,%rax; \ pop SKBDATA; \ diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 7d01b8c56c0..cc04e67bfd0 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -40,11 +40,7 @@ #define smp_rmb() barrier() #endif /* CONFIG_X86_PPRO_FENCE */ -#ifdef CONFIG_X86_OOSTORE -#define smp_wmb() wmb() -#else /* CONFIG_X86_OOSTORE */ #define smp_wmb() barrier() -#endif /* CONFIG_X86_OOSTORE */ #define smp_read_barrier_depends() read_barrier_depends() #define set_mb(var, value) do { (void)xchg(&var, value); } while (0) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 256282e7888..2423ef04ffe 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -365,7 +365,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, /* Assume pteval_t is equivalent to all the other *val_t types. */ static pteval_t pte_mfn_to_pfn(pteval_t val) { - if (pteval_present(val)) { + if (val & _PAGE_PRESENT) { unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; unsigned long pfn = mfn_to_pfn(mfn); @@ -381,7 +381,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val) static pteval_t pte_pfn_to_mfn(pteval_t val) { - if (pteval_present(val)) { + if (val & _PAGE_PRESENT) { unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; pteval_t flags = val & PTE_FLAGS_MASK; unsigned long mfn; |