From 819165fb34b9777f852429f2c6d6f79fbb71b9eb Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 20 Jan 2012 16:21:41 +0000 Subject: x86: Adjust asm constraints in atomic64 wrappers Eric pointed out overly restrictive constraints in atomic64_set(), but there are issues throughout the file. In the cited case, %ebx and %ecx are inputs only (don't get changed by either of the two low level implementations). This was also the case elsewhere. Further in many cases early-clobber indicators were missing. Finally, the previous implementation rolled a custom alternative instruction macro from scratch, rather than using alternative_call() (which was introduced with the commit that the description of the change in question actually refers to). Adjusting has the benefit of not hiding referenced symbols from the compiler, which however requires them to be declared not just in the exporting source file (which, as a desirable side effect, in turn allows that exporting file to become a real 5-line stub). This patch does not eliminate the overly restrictive memory clobbers, however: Doing so would occasionally make the compiler set up a second register for accessing the memory object (to satisfy the added "m" constraint), and it's not clear which of the two non-optimal alternatives is better. v2: Re-do the declaration and exporting of the internal symbols. Reported-by: Eric Dumazet Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F19A2A5020000780006E0D9@nat28.tlf.novell.com Cc: Luca Barbieri Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/alternative.h | 6 ++ arch/x86/include/asm/atomic64_32.h | 147 ++++++++++++++++++++----------------- 2 files changed, 86 insertions(+), 67 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 37ad100a221..49331bedc15 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -145,6 +145,12 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define ASM_OUTPUT2(a...) a +/* + * use this macro if you need clobbers but no inputs in + * alternative_{input,io,call}() + */ +#define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr + struct paravirt_patch_site; #ifdef CONFIG_PARAVIRT void apply_paravirt(struct paravirt_patch_site *start, diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index fa13f0ec287..908303f68bb 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -14,13 +14,52 @@ typedef struct { #define ATOMIC64_INIT(val) { (val) } +#define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...) +#ifndef ATOMIC64_EXPORT +#define ATOMIC64_DECL_ONE __ATOMIC64_DECL +#else +#define ATOMIC64_DECL_ONE(sym) __ATOMIC64_DECL(sym); \ + ATOMIC64_EXPORT(atomic64_##sym) +#endif + #ifdef CONFIG_X86_CMPXCHG64 -#define ATOMIC64_ALTERNATIVE_(f, g) "call atomic64_" #g "_cx8" +#define __alternative_atomic64(f, g, out, in...) \ + asm volatile("call %P[func]" \ + : out : [func] "i" (atomic64_##g##_cx8), ## in) + +#define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8) #else -#define ATOMIC64_ALTERNATIVE_(f, g) ALTERNATIVE("call atomic64_" #f "_386", "call atomic64_" #g "_cx8", X86_FEATURE_CX8) +#define __alternative_atomic64(f, g, out, in...) \ + alternative_call(atomic64_##f##_386, atomic64_##g##_cx8, \ + X86_FEATURE_CX8, ASM_OUTPUT2(out), ## in) + +#define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8); \ + ATOMIC64_DECL_ONE(sym##_386) + +ATOMIC64_DECL_ONE(add_386); +ATOMIC64_DECL_ONE(sub_386); +ATOMIC64_DECL_ONE(inc_386); +ATOMIC64_DECL_ONE(dec_386); #endif -#define ATOMIC64_ALTERNATIVE(f) ATOMIC64_ALTERNATIVE_(f, f) +#define alternative_atomic64(f, out, in...) \ + __alternative_atomic64(f, f, ASM_OUTPUT2(out), ## in) + +ATOMIC64_DECL(read); +ATOMIC64_DECL(set); +ATOMIC64_DECL(xchg); +ATOMIC64_DECL(add_return); +ATOMIC64_DECL(sub_return); +ATOMIC64_DECL(inc_return); +ATOMIC64_DECL(dec_return); +ATOMIC64_DECL(dec_if_positive); +ATOMIC64_DECL(inc_not_zero); +ATOMIC64_DECL(add_unless); + +#undef ATOMIC64_DECL +#undef ATOMIC64_DECL_ONE +#undef __ATOMIC64_DECL +#undef ATOMIC64_EXPORT /** * atomic64_cmpxchg - cmpxchg atomic64 variable @@ -50,11 +89,9 @@ static inline long long atomic64_xchg(atomic64_t *v, long long n) long long o; unsigned high = (unsigned)(n >> 32); unsigned low = (unsigned)n; - asm volatile(ATOMIC64_ALTERNATIVE(xchg) - : "=A" (o), "+b" (low), "+c" (high) - : "S" (v) - : "memory" - ); + alternative_atomic64(xchg, "=&A" (o), + "S" (v), "b" (low), "c" (high) + : "memory"); return o; } @@ -69,11 +106,9 @@ static inline void atomic64_set(atomic64_t *v, long long i) { unsigned high = (unsigned)(i >> 32); unsigned low = (unsigned)i; - asm volatile(ATOMIC64_ALTERNATIVE(set) - : "+b" (low), "+c" (high) - : "S" (v) - : "eax", "edx", "memory" - ); + alternative_atomic64(set, /* no output */, + "S" (v), "b" (low), "c" (high) + : "eax", "edx", "memory"); } /** @@ -85,10 +120,7 @@ static inline void atomic64_set(atomic64_t *v, long long i) static inline long long atomic64_read(const atomic64_t *v) { long long r; - asm volatile(ATOMIC64_ALTERNATIVE(read) - : "=A" (r), "+c" (v) - : : "memory" - ); + alternative_atomic64(read, "=&A" (r), "c" (v) : "memory"); return r; } @@ -101,10 +133,9 @@ static inline long long atomic64_read(const atomic64_t *v) */ static inline long long atomic64_add_return(long long i, atomic64_t *v) { - asm volatile(ATOMIC64_ALTERNATIVE(add_return) - : "+A" (i), "+c" (v) - : : "memory" - ); + alternative_atomic64(add_return, + ASM_OUTPUT2("+A" (i), "+c" (v)), + ASM_NO_INPUT_CLOBBER("memory")); return i; } @@ -113,32 +144,25 @@ static inline long long atomic64_add_return(long long i, atomic64_t *v) */ static inline long long atomic64_sub_return(long long i, atomic64_t *v) { - asm volatile(ATOMIC64_ALTERNATIVE(sub_return) - : "+A" (i), "+c" (v) - : : "memory" - ); + alternative_atomic64(sub_return, + ASM_OUTPUT2("+A" (i), "+c" (v)), + ASM_NO_INPUT_CLOBBER("memory")); return i; } static inline long long atomic64_inc_return(atomic64_t *v) { long long a; - asm volatile(ATOMIC64_ALTERNATIVE(inc_return) - : "=A" (a) - : "S" (v) - : "memory", "ecx" - ); + alternative_atomic64(inc_return, "=&A" (a), + "S" (v) : "memory", "ecx"); return a; } static inline long long atomic64_dec_return(atomic64_t *v) { long long a; - asm volatile(ATOMIC64_ALTERNATIVE(dec_return) - : "=A" (a) - : "S" (v) - : "memory", "ecx" - ); + alternative_atomic64(dec_return, "=&A" (a), + "S" (v) : "memory", "ecx"); return a; } @@ -151,10 +175,9 @@ static inline long long atomic64_dec_return(atomic64_t *v) */ static inline long long atomic64_add(long long i, atomic64_t *v) { - asm volatile(ATOMIC64_ALTERNATIVE_(add, add_return) - : "+A" (i), "+c" (v) - : : "memory" - ); + __alternative_atomic64(add, add_return, + ASM_OUTPUT2("+A" (i), "+c" (v)), + ASM_NO_INPUT_CLOBBER("memory")); return i; } @@ -167,10 +190,9 @@ static inline long long atomic64_add(long long i, atomic64_t *v) */ static inline long long atomic64_sub(long long i, atomic64_t *v) { - asm volatile(ATOMIC64_ALTERNATIVE_(sub, sub_return) - : "+A" (i), "+c" (v) - : : "memory" - ); + __alternative_atomic64(sub, sub_return, + ASM_OUTPUT2("+A" (i), "+c" (v)), + ASM_NO_INPUT_CLOBBER("memory")); return i; } @@ -196,10 +218,8 @@ static inline int atomic64_sub_and_test(long long i, atomic64_t *v) */ static inline void atomic64_inc(atomic64_t *v) { - asm volatile(ATOMIC64_ALTERNATIVE_(inc, inc_return) - : : "S" (v) - : "memory", "eax", "ecx", "edx" - ); + __alternative_atomic64(inc, inc_return, /* no output */, + "S" (v) : "memory", "eax", "ecx", "edx"); } /** @@ -210,10 +230,8 @@ static inline void atomic64_inc(atomic64_t *v) */ static inline void atomic64_dec(atomic64_t *v) { - asm volatile(ATOMIC64_ALTERNATIVE_(dec, dec_return) - : : "S" (v) - : "memory", "eax", "ecx", "edx" - ); + __alternative_atomic64(dec, dec_return, /* no output */, + "S" (v) : "memory", "eax", "ecx", "edx"); } /** @@ -263,15 +281,16 @@ static inline int atomic64_add_negative(long long i, atomic64_t *v) * @u: ...unless v is equal to u. * * Atomically adds @a to @v, so long as it was not @u. - * Returns the old value of @v. + * Returns non-zero if the add was done, zero otherwise. */ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) { unsigned low = (unsigned)u; unsigned high = (unsigned)(u >> 32); - asm volatile(ATOMIC64_ALTERNATIVE(add_unless) "\n\t" - : "+A" (a), "+c" (v), "+S" (low), "+D" (high) - : : "memory"); + alternative_atomic64(add_unless, + ASM_OUTPUT2("+A" (a), "+c" (v), + "+S" (low), "+D" (high)), + ASM_NO_INPUT_CLOBBER("memory")); return (int)a; } @@ -279,26 +298,20 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) static inline int atomic64_inc_not_zero(atomic64_t *v) { int r; - asm volatile(ATOMIC64_ALTERNATIVE(inc_not_zero) - : "=a" (r) - : "S" (v) - : "ecx", "edx", "memory" - ); + alternative_atomic64(inc_not_zero, "=&a" (r), + "S" (v) : "ecx", "edx", "memory"); return r; } static inline long long atomic64_dec_if_positive(atomic64_t *v) { long long r; - asm volatile(ATOMIC64_ALTERNATIVE(dec_if_positive) - : "=A" (r) - : "S" (v) - : "ecx", "memory" - ); + alternative_atomic64(dec_if_positive, "=&A" (r), + "S" (v) : "ecx", "memory"); return r; } -#undef ATOMIC64_ALTERNATIVE -#undef ATOMIC64_ALTERNATIVE_ +#undef alternative_atomic64 +#undef __alternative_atomic64 #endif /* _ASM_X86_ATOMIC64_32_H */ -- cgit v1.2.3-70-g09d2 From cb8095bba6d24118135a5683a956f4f4fb5f17bb Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 20 Jan 2012 16:22:04 +0000 Subject: x86: atomic64 assembly improvements In the "xchg" implementation, %ebx and %ecx don't need to be copied into %eax and %edx respectively (this is only necessary when desiring to only read the stored value). In the "add_unless" implementation, swapping the use of %ecx and %esi for passing arguments allows %esi to become an input only (i.e. permitting the register to be re-used to address the same object without reload). In "{add,sub}_return", doing the initial read64 through the passed in %ecx decreases a register dependency. In "inc_not_zero", a branch can be eliminated by or-ing together the two halves of the current (64-bit) value, and code size can be further reduced by adjusting the arithmetic slightly. v2: Undo the folding of "xchg" and "set". Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F19A2BC020000780006E0DC@nat28.tlf.novell.com Cc: Luca Barbieri Cc: Eric Dumazet Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/atomic64_32.h | 5 ++--- arch/x86/lib/atomic64_386_32.S | 6 +++--- arch/x86/lib/atomic64_cx8_32.S | 29 +++++++++++------------------ 3 files changed, 16 insertions(+), 24 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 908303f68bb..198119910da 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -288,9 +288,8 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) unsigned low = (unsigned)u; unsigned high = (unsigned)(u >> 32); alternative_atomic64(add_unless, - ASM_OUTPUT2("+A" (a), "+c" (v), - "+S" (low), "+D" (high)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT2("+A" (a), "+c" (low), "+D" (high)), + "S" (v) : "memory"); return (int)a; } diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index e8e7e0d06f4..00933d5e992 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -137,13 +137,13 @@ BEGIN(dec_return) RET_ENDP #undef v -#define v %ecx +#define v %esi BEGIN(add_unless) - addl %eax, %esi + addl %eax, %ecx adcl %edx, %edi addl (v), %eax adcl 4(v), %edx - cmpl %eax, %esi + cmpl %eax, %ecx je 3f 1: movl %eax, (v) diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index 391a083674b..f5cc9eb1d51 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -55,8 +55,6 @@ ENDPROC(atomic64_set_cx8) ENTRY(atomic64_xchg_cx8) CFI_STARTPROC - movl %ebx, %eax - movl %ecx, %edx 1: LOCK_PREFIX cmpxchg8b (%esi) @@ -78,7 +76,7 @@ ENTRY(atomic64_\func\()_return_cx8) movl %edx, %edi movl %ecx, %ebp - read64 %ebp + read64 %ecx 1: movl %eax, %ebx movl %edx, %ecx @@ -159,23 +157,22 @@ ENTRY(atomic64_add_unless_cx8) SAVE ebx /* these just push these two parameters on the stack */ SAVE edi - SAVE esi + SAVE ecx - movl %ecx, %ebp - movl %eax, %esi + movl %eax, %ebp movl %edx, %edi - read64 %ebp + read64 %esi 1: cmpl %eax, 0(%esp) je 4f 2: movl %eax, %ebx movl %edx, %ecx - addl %esi, %ebx + addl %ebp, %ebx adcl %edi, %ecx LOCK_PREFIX - cmpxchg8b (%ebp) + cmpxchg8b (%esi) jne 1b movl $1, %eax @@ -199,13 +196,13 @@ ENTRY(atomic64_inc_not_zero_cx8) read64 %esi 1: - testl %eax, %eax - je 4f -2: + movl %eax, %ecx + orl %edx, %ecx + jz 3f movl %eax, %ebx - movl %edx, %ecx + xorl %ecx, %ecx addl $1, %ebx - adcl $0, %ecx + adcl %edx, %ecx LOCK_PREFIX cmpxchg8b (%esi) jne 1b @@ -214,9 +211,5 @@ ENTRY(atomic64_inc_not_zero_cx8) 3: RESTORE ebx ret -4: - testl %edx, %edx - jne 2b - jmp 3b CFI_ENDPROC ENDPROC(atomic64_inc_not_zero_cx8) -- cgit v1.2.3-70-g09d2 From 1a8359e411eb5055405412a7da812dae63c64a55 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 26 Jan 2012 17:33:30 +0000 Subject: x86/mid: Remove Intel Moorestown All production devices operate in the Oaktrail configuration with legacy PC elements present and an ACPI BIOS. Continue stripping out the Moorestown elements from the tree leaving Medfield. Signed-off-by: Alan Cox Cc: jacob.jun.pan@linux.intel.com Link: http://lkml.kernel.org/n/tip-fvm1hgpq99jln6l0fbek68ik@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 21 -- arch/x86/include/asm/mrst.h | 4 +- arch/x86/platform/mrst/Makefile | 1 - arch/x86/platform/mrst/mrst.c | 64 ++-- arch/x86/platform/mrst/pmu.c | 817 ---------------------------------------- arch/x86/platform/mrst/pmu.h | 234 ------------ 6 files changed, 26 insertions(+), 1115 deletions(-) delete mode 100644 arch/x86/platform/mrst/pmu.c delete mode 100644 arch/x86/platform/mrst/pmu.h (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 864cc6e6ac8..a13addbcbd5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -418,27 +418,6 @@ if X86_WANT_INTEL_MID config X86_INTEL_MID bool -config X86_MRST - bool "Moorestown MID platform" - depends on PCI - depends on PCI_GOANY - depends on X86_IO_APIC - select X86_INTEL_MID - select SFI - select DW_APB_TIMER - select APB_TIMER - select I2C - select SPI - select INTEL_SCU_IPC - select X86_PLATFORM_DEVICES - ---help--- - Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin - Internet Device(MID) platform. Moorestown consists of two chips: - Lincroft (CPU core, graphics, and memory controller) and Langwell IOH. - Unlike standard x86 PCs, Moorestown does not have many legacy devices - nor standard legacy replacement devices/features. e.g. Moorestown does - not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. - config X86_MDFLD bool "Medfield MID platform" depends on PCI diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 0a0a9546043..fc18bf3ce7c 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -26,8 +26,8 @@ extern struct sfi_rtc_table_entry sfi_mrtc_array[]; * identified via MSRs. */ enum mrst_cpu_type { - MRST_CPU_CHIP_LINCROFT = 1, - MRST_CPU_CHIP_PENWELL, + /* 1 was Moorestown */ + MRST_CPU_CHIP_PENWELL = 2, }; extern enum mrst_cpu_type __mrst_cpu_chip; diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile index 7baed5135e0..af1da7e623f 100644 --- a/arch/x86/platform/mrst/Makefile +++ b/arch/x86/platform/mrst/Makefile @@ -1,4 +1,3 @@ obj-$(CONFIG_X86_INTEL_MID) += mrst.o obj-$(CONFIG_X86_INTEL_MID) += vrtc.o obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_mrst.o -obj-$(CONFIG_X86_MRST) += pmu.o diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 475e2cd0f3c..6743587575a 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -78,16 +78,11 @@ int sfi_mrtc_num; static void mrst_power_off(void) { - if (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) - intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 1); } static void mrst_reboot(void) { - if (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) - intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 0); - else - intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0); + intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0); } /* parse all the mtimer info to a static mtimer array */ @@ -200,34 +195,28 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) static unsigned long __init mrst_calibrate_tsc(void) { - unsigned long flags, fast_calibrate; - if (__mrst_cpu_chip == MRST_CPU_CHIP_PENWELL) { - u32 lo, hi, ratio, fsb; - - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi); - ratio = (hi >> 8) & 0x1f; - pr_debug("ratio is %d\n", ratio); - if (!ratio) { - pr_err("read a zero ratio, should be incorrect!\n"); - pr_err("force tsc ratio to 16 ...\n"); - ratio = 16; - } - rdmsr(MSR_FSB_FREQ, lo, hi); - if ((lo & 0x7) == 0x7) - fsb = PENWELL_FSB_FREQ_83SKU; - else - fsb = PENWELL_FSB_FREQ_100SKU; - fast_calibrate = ratio * fsb; - pr_debug("read penwell tsc %lu khz\n", fast_calibrate); - lapic_timer_frequency = fsb * 1000 / HZ; - /* mark tsc clocksource as reliable */ - set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); - } else { - local_irq_save(flags); - fast_calibrate = apbt_quick_calibrate(); - local_irq_restore(flags); + unsigned long fast_calibrate; + u32 lo, hi, ratio, fsb; + + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi); + ratio = (hi >> 8) & 0x1f; + pr_debug("ratio is %d\n", ratio); + if (!ratio) { + pr_err("read a zero ratio, should be incorrect!\n"); + pr_err("force tsc ratio to 16 ...\n"); + ratio = 16; } + rdmsr(MSR_FSB_FREQ, lo, hi); + if ((lo & 0x7) == 0x7) + fsb = PENWELL_FSB_FREQ_83SKU; + else + fsb = PENWELL_FSB_FREQ_100SKU; + fast_calibrate = ratio * fsb; + pr_debug("read penwell tsc %lu khz\n", fast_calibrate); + lapic_timer_frequency = fsb * 1000 / HZ; + /* mark tsc clocksource as reliable */ + set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); if (fast_calibrate) return fast_calibrate; @@ -261,16 +250,11 @@ static void __cpuinit mrst_arch_setup(void) { if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; - else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) - __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; else { - pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", + pr_err("Unknown Intel MID CPU (%d:%d), default to Penwell\n", boot_cpu_data.x86, boot_cpu_data.x86_model); - __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; } - pr_debug("Moorestown CPU %s identified\n", - (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? - "Lincroft" : "Penwell"); } /* MID systems don't have i8042 controller */ diff --git a/arch/x86/platform/mrst/pmu.c b/arch/x86/platform/mrst/pmu.c deleted file mode 100644 index c0ac06da57a..00000000000 --- a/arch/x86/platform/mrst/pmu.c +++ /dev/null @@ -1,817 +0,0 @@ -/* - * mrst/pmu.c - driver for MRST Power Management Unit - * - * Copyright (c) 2011, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "pmu.h" - -#define IPCMSG_FW_REVISION 0xF4 - -struct mrst_device { - u16 pci_dev_num; /* DEBUG only */ - u16 lss; - u16 latest_request; - unsigned int pci_state_counts[PCI_D3cold + 1]; /* DEBUG only */ -}; - -/* - * comlete list of MRST PCI devices - */ -static struct mrst_device mrst_devs[] = { -/* 0 */ { 0x0800, LSS_SPI0 }, /* Moorestown SPI Ctrl 0 */ -/* 1 */ { 0x0801, LSS_SPI1 }, /* Moorestown SPI Ctrl 1 */ -/* 2 */ { 0x0802, LSS_I2C0 }, /* Moorestown I2C 0 */ -/* 3 */ { 0x0803, LSS_I2C1 }, /* Moorestown I2C 1 */ -/* 4 */ { 0x0804, LSS_I2C2 }, /* Moorestown I2C 2 */ -/* 5 */ { 0x0805, LSS_KBD }, /* Moorestown Keyboard Ctrl */ -/* 6 */ { 0x0806, LSS_USB_HC }, /* Moorestown USB Ctrl */ -/* 7 */ { 0x0807, LSS_SD_HC0 }, /* Moorestown SD Host Ctrl 0 */ -/* 8 */ { 0x0808, LSS_SD_HC1 }, /* Moorestown SD Host Ctrl 1 */ -/* 9 */ { 0x0809, LSS_NAND }, /* Moorestown NAND Ctrl */ -/* 10 */ { 0x080a, LSS_AUDIO }, /* Moorestown Audio Ctrl */ -/* 11 */ { 0x080b, LSS_IMAGING }, /* Moorestown ISP */ -/* 12 */ { 0x080c, LSS_SECURITY }, /* Moorestown Security Controller */ -/* 13 */ { 0x080d, LSS_DISPLAY }, /* Moorestown External Displays */ -/* 14 */ { 0x080e, 0 }, /* Moorestown SCU IPC */ -/* 15 */ { 0x080f, LSS_GPIO }, /* Moorestown GPIO Controller */ -/* 16 */ { 0x0810, 0 }, /* Moorestown Power Management Unit */ -/* 17 */ { 0x0811, LSS_USB_OTG }, /* Moorestown OTG Ctrl */ -/* 18 */ { 0x0812, LSS_SPI2 }, /* Moorestown SPI Ctrl 2 */ -/* 19 */ { 0x0813, 0 }, /* Moorestown SC DMA */ -/* 20 */ { 0x0814, LSS_AUDIO_LPE }, /* Moorestown LPE DMA */ -/* 21 */ { 0x0815, LSS_AUDIO_SSP }, /* Moorestown SSP0 */ - -/* 22 */ { 0x084F, LSS_SD_HC2 }, /* Moorestown SD Host Ctrl 2 */ - -/* 23 */ { 0x4102, 0 }, /* Lincroft */ -/* 24 */ { 0x4110, 0 }, /* Lincroft */ -}; - -/* n.b. We ignore PCI-id 0x815 in LSS9 b/c Linux has no driver for it */ -static u16 mrst_lss9_pci_ids[] = {0x080a, 0x0814, 0}; -static u16 mrst_lss10_pci_ids[] = {0x0800, 0x0801, 0x0802, 0x0803, - 0x0804, 0x0805, 0x080f, 0}; - -/* handle concurrent SMP invokations of pmu_pci_set_power_state() */ -static spinlock_t mrst_pmu_power_state_lock; - -static unsigned int wake_counters[MRST_NUM_LSS]; /* DEBUG only */ -static unsigned int pmu_irq_stats[INT_INVALID + 1]; /* DEBUG only */ - -static int graphics_is_off; -static int lss_s0i3_enabled; -static bool mrst_pmu_s0i3_enable; - -/* debug counters */ -static u32 pmu_wait_ready_calls; -static u32 pmu_wait_ready_udelays; -static u32 pmu_wait_ready_udelays_max; -static u32 pmu_wait_done_calls; -static u32 pmu_wait_done_udelays; -static u32 pmu_wait_done_udelays_max; -static u32 pmu_set_power_state_entry; -static u32 pmu_set_power_state_send_cmd; - -static struct mrst_device *pci_id_2_mrst_dev(u16 pci_dev_num) -{ - int index = 0; - - if ((pci_dev_num >= 0x0800) && (pci_dev_num <= 0x815)) - index = pci_dev_num - 0x800; - else if (pci_dev_num == 0x084F) - index = 22; - else if (pci_dev_num == 0x4102) - index = 23; - else if (pci_dev_num == 0x4110) - index = 24; - - if (pci_dev_num != mrst_devs[index].pci_dev_num) { - WARN_ONCE(1, FW_BUG "Unknown PCI device 0x%04X\n", pci_dev_num); - return 0; - } - - return &mrst_devs[index]; -} - -/** - * mrst_pmu_validate_cstates - * @dev: cpuidle_device - * - * Certain states are not appropriate for governor to pick in some cases. - * This function will be called as cpuidle_device's prepare callback and - * thus tells governor to ignore such states when selecting the next state - * to enter. - */ - -#define IDLE_STATE4_IS_C6 4 -#define IDLE_STATE5_IS_S0I3 5 - -int mrst_pmu_invalid_cstates(void) -{ - int cpu = smp_processor_id(); - - /* - * Demote to C4 if the PMU is busy. - * Since LSS changes leave the busy bit clear... - * busy means either the PMU is waiting for an ACK-C6 that - * isn't coming due to an MWAIT that returned immediately; - * or we returned from S0i3 successfully, and the PMU - * is not done sending us interrupts. - */ - if (pmu_read_busy_status()) - return 1 << IDLE_STATE4_IS_C6 | 1 << IDLE_STATE5_IS_S0I3; - - /* - * Disallow S0i3 if: PMU is not initialized, or CPU1 is active, - * or if device LSS is insufficient, or the GPU is active, - * or if it has been explicitly disabled. - */ - if (!pmu_reg || !cpumask_equal(cpu_online_mask, cpumask_of(cpu)) || - !lss_s0i3_enabled || !graphics_is_off || !mrst_pmu_s0i3_enable) - return 1 << IDLE_STATE5_IS_S0I3; - else - return 0; -} - -/* - * pmu_update_wake_counters(): read PM_WKS, update wake_counters[] - * DEBUG only. - */ -static void pmu_update_wake_counters(void) -{ - int lss; - u32 wake_status; - - wake_status = pmu_read_wks(); - - for (lss = 0; lss < MRST_NUM_LSS; ++lss) { - if (wake_status & (1 << lss)) - wake_counters[lss]++; - } -} - -int mrst_pmu_s0i3_entry(void) -{ - int status; - - /* Clear any possible error conditions */ - pmu_write_ics(0x300); - - /* set wake control to current D-states */ - pmu_write_wssc(S0I3_SSS_TARGET); - - status = mrst_s0i3_entry(PM_S0I3_COMMAND, &pmu_reg->pm_cmd); - pmu_update_wake_counters(); - return status; -} - -/* poll for maximum of 5ms for busy bit to clear */ -static int pmu_wait_ready(void) -{ - int udelays; - - pmu_wait_ready_calls++; - - for (udelays = 0; udelays < 500; ++udelays) { - if (udelays > pmu_wait_ready_udelays_max) - pmu_wait_ready_udelays_max = udelays; - - if (pmu_read_busy_status() == 0) - return 0; - - udelay(10); - pmu_wait_ready_udelays++; - } - - /* - * if this fires, observe - * /sys/kernel/debug/mrst_pmu_wait_ready_calls - * /sys/kernel/debug/mrst_pmu_wait_ready_udelays - */ - WARN_ONCE(1, "SCU not ready for 5ms"); - return -EBUSY; -} -/* poll for maximum of 50ms us for busy bit to clear */ -static int pmu_wait_done(void) -{ - int udelays; - - pmu_wait_done_calls++; - - for (udelays = 0; udelays < 500; ++udelays) { - if (udelays > pmu_wait_done_udelays_max) - pmu_wait_done_udelays_max = udelays; - - if (pmu_read_busy_status() == 0) - return 0; - - udelay(100); - pmu_wait_done_udelays++; - } - - /* - * if this fires, observe - * /sys/kernel/debug/mrst_pmu_wait_done_calls - * /sys/kernel/debug/mrst_pmu_wait_done_udelays - */ - WARN_ONCE(1, "SCU not done for 50ms"); - return -EBUSY; -} - -u32 mrst_pmu_msi_is_disabled(void) -{ - return pmu_msi_is_disabled(); -} - -void mrst_pmu_enable_msi(void) -{ - pmu_msi_enable(); -} - -/** - * pmu_irq - pmu driver interrupt handler - * Context: interrupt context - */ -static irqreturn_t pmu_irq(int irq, void *dummy) -{ - union pmu_pm_ics pmu_ics; - - pmu_ics.value = pmu_read_ics(); - - if (!pmu_ics.bits.pending) - return IRQ_NONE; - - switch (pmu_ics.bits.cause) { - case INT_SPURIOUS: - case INT_CMD_DONE: - case INT_CMD_ERR: - case INT_WAKE_RX: - case INT_SS_ERROR: - case INT_S0IX_MISS: - case INT_NO_ACKC6: - pmu_irq_stats[pmu_ics.bits.cause]++; - break; - default: - pmu_irq_stats[INT_INVALID]++; - } - - pmu_write_ics(pmu_ics.value); /* Clear pending interrupt */ - - return IRQ_HANDLED; -} - -/* - * Translate PCI power management to MRST LSS D-states - */ -static int pci_2_mrst_state(int lss, pci_power_t pci_state) -{ - switch (pci_state) { - case PCI_D0: - if (SSMSK(D0i1, lss) & D0I1_ACG_SSS_TARGET) - return D0i1; - else - return D0; - case PCI_D1: - return D0i1; - case PCI_D2: - return D0i2; - case PCI_D3hot: - case PCI_D3cold: - return D0i3; - default: - WARN(1, "pci_state %d\n", pci_state); - return 0; - } -} - -static int pmu_issue_command(u32 pm_ssc) -{ - union pmu_pm_set_cfg_cmd_t command; - - if (pmu_read_busy_status()) { - pr_debug("pmu is busy, Operation not permitted\n"); - return -1; - } - - /* - * enable interrupts in PMU so that interrupts are - * propagated when ioc bit for a particular set - * command is set - */ - - pmu_irq_enable(); - - /* Configure the sub systems for pmu2 */ - - pmu_write_ssc(pm_ssc); - - /* - * Send the set config command for pmu its configured - * for mode CM_IMMEDIATE & hence with No Trigger - */ - - command.pmu2_params.d_param.cfg_mode = CM_IMMEDIATE; - command.pmu2_params.d_param.cfg_delay = 0; - command.pmu2_params.d_param.rsvd = 0; - - /* construct the command to send SET_CFG to particular PMU */ - command.pmu2_params.d_param.cmd = SET_CFG_CMD; - command.pmu2_params.d_param.ioc = 0; - command.pmu2_params.d_param.mode_id = 0; - command.pmu2_params.d_param.sys_state = SYS_STATE_S0I0; - - /* write the value of PM_CMD into particular PMU */ - pr_debug("pmu command being written %x\n", - command.pmu_pm_set_cfg_cmd_value); - - pmu_write_cmd(command.pmu_pm_set_cfg_cmd_value); - - return 0; -} - -static u16 pmu_min_lss_pci_req(u16 *ids, u16 pci_state) -{ - u16 existing_request; - int i; - - for (i = 0; ids[i]; ++i) { - struct mrst_device *mrst_dev; - - mrst_dev = pci_id_2_mrst_dev(ids[i]); - if (unlikely(!mrst_dev)) - continue; - - existing_request = mrst_dev->latest_request; - if (existing_request < pci_state) - pci_state = existing_request; - } - return pci_state; -} - -/** - * pmu_pci_set_power_state - Callback function is used by all the PCI devices - * for a platform specific device power on/shutdown. - */ - -int pmu_pci_set_power_state(struct pci_dev *pdev, pci_power_t pci_state) -{ - u32 old_sss, new_sss; - int status = 0; - struct mrst_device *mrst_dev; - - pmu_set_power_state_entry++; - - BUG_ON(pdev->vendor != PCI_VENDOR_ID_INTEL); - BUG_ON(pci_state < PCI_D0 || pci_state > PCI_D3cold); - - mrst_dev = pci_id_2_mrst_dev(pdev->device); - if (unlikely(!mrst_dev)) - return -ENODEV; - - mrst_dev->pci_state_counts[pci_state]++; /* count invocations */ - - /* PMU driver calls self as part of PCI initialization, ignore */ - if (pdev->device == PCI_DEV_ID_MRST_PMU) - return 0; - - BUG_ON(!pmu_reg); /* SW bug if called before initialized */ - - spin_lock(&mrst_pmu_power_state_lock); - - if (pdev->d3_delay) { - dev_dbg(&pdev->dev, "d3_delay %d, should be 0\n", - pdev->d3_delay); - pdev->d3_delay = 0; - } - /* - * If Lincroft graphics, simply remember state - */ - if ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY - && !((pdev->class & PCI_SUB_CLASS_MASK) >> 8)) { - if (pci_state == PCI_D0) - graphics_is_off = 0; - else - graphics_is_off = 1; - goto ret; - } - - if (!mrst_dev->lss) - goto ret; /* device with no LSS */ - - if (mrst_dev->latest_request == pci_state) - goto ret; /* no change */ - - mrst_dev->latest_request = pci_state; /* record latest request */ - - /* - * LSS9 and LSS10 contain multiple PCI devices. - * Use the lowest numbered (highest power) state in the LSS - */ - if (mrst_dev->lss == 9) - pci_state = pmu_min_lss_pci_req(mrst_lss9_pci_ids, pci_state); - else if (mrst_dev->lss == 10) - pci_state = pmu_min_lss_pci_req(mrst_lss10_pci_ids, pci_state); - - status = pmu_wait_ready(); - if (status) - goto ret; - - old_sss = pmu_read_sss(); - new_sss = old_sss & ~SSMSK(3, mrst_dev->lss); - new_sss |= SSMSK(pci_2_mrst_state(mrst_dev->lss, pci_state), - mrst_dev->lss); - - if (new_sss == old_sss) - goto ret; /* nothing to do */ - - pmu_set_power_state_send_cmd++; - - status = pmu_issue_command(new_sss); - - if (unlikely(status != 0)) { - dev_err(&pdev->dev, "Failed to Issue a PM command\n"); - goto ret; - } - - if (pmu_wait_done()) - goto ret; - - lss_s0i3_enabled = - ((pmu_read_sss() & S0I3_SSS_TARGET) == S0I3_SSS_TARGET); -ret: - spin_unlock(&mrst_pmu_power_state_lock); - return status; -} - -#ifdef CONFIG_DEBUG_FS -static char *d0ix_names[] = {"D0", "D0i1", "D0i2", "D0i3"}; - -static inline const char *d0ix_name(int state) -{ - return d0ix_names[(int) state]; -} - -static int debug_mrst_pmu_show(struct seq_file *s, void *unused) -{ - struct pci_dev *pdev = NULL; - u32 cur_pmsss; - int lss; - - seq_printf(s, "0x%08X D0I1_ACG_SSS_TARGET\n", D0I1_ACG_SSS_TARGET); - - cur_pmsss = pmu_read_sss(); - - seq_printf(s, "0x%08X S0I3_SSS_TARGET\n", S0I3_SSS_TARGET); - - seq_printf(s, "0x%08X Current SSS ", cur_pmsss); - seq_printf(s, lss_s0i3_enabled ? "\n" : "[BLOCKS s0i3]\n"); - - if (cpumask_equal(cpu_online_mask, cpumask_of(0))) - seq_printf(s, "cpu0 is only cpu online\n"); - else - seq_printf(s, "cpu0 is NOT only cpu online [BLOCKS S0i3]\n"); - - seq_printf(s, "GFX: %s\n", graphics_is_off ? "" : "[BLOCKS s0i3]"); - - - for_each_pci_dev(pdev) { - int pos; - u16 pmcsr; - struct mrst_device *mrst_dev; - int i; - - mrst_dev = pci_id_2_mrst_dev(pdev->device); - - seq_printf(s, "%s %04x/%04X %-16.16s ", - dev_name(&pdev->dev), - pdev->vendor, pdev->device, - dev_driver_string(&pdev->dev)); - - if (unlikely (!mrst_dev)) { - seq_printf(s, " UNKNOWN\n"); - continue; - } - - if (mrst_dev->lss) - seq_printf(s, "LSS %2d %-4s ", mrst_dev->lss, - d0ix_name(((cur_pmsss >> - (mrst_dev->lss * 2)) & 0x3))); - else - seq_printf(s, " "); - - /* PCI PM config space setting */ - pos = pci_find_capability(pdev, PCI_CAP_ID_PM); - if (pos != 0) { - pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr); - seq_printf(s, "PCI-%-4s", - pci_power_name(pmcsr & PCI_PM_CTRL_STATE_MASK)); - } else { - seq_printf(s, " "); - } - - seq_printf(s, " %s ", pci_power_name(mrst_dev->latest_request)); - for (i = 0; i <= PCI_D3cold; ++i) - seq_printf(s, "%d ", mrst_dev->pci_state_counts[i]); - - if (mrst_dev->lss) { - unsigned int lssmask; - - lssmask = SSMSK(D0i3, mrst_dev->lss); - - if ((lssmask & S0I3_SSS_TARGET) && - ((lssmask & cur_pmsss) != - (lssmask & S0I3_SSS_TARGET))) - seq_printf(s , "[BLOCKS s0i3]"); - } - - seq_printf(s, "\n"); - } - seq_printf(s, "Wake Counters:\n"); - for (lss = 0; lss < MRST_NUM_LSS; ++lss) - seq_printf(s, "LSS%d %d\n", lss, wake_counters[lss]); - - seq_printf(s, "Interrupt Counters:\n"); - seq_printf(s, - "INT_SPURIOUS \t%8u\n" "INT_CMD_DONE \t%8u\n" - "INT_CMD_ERR \t%8u\n" "INT_WAKE_RX \t%8u\n" - "INT_SS_ERROR \t%8u\n" "INT_S0IX_MISS\t%8u\n" - "INT_NO_ACKC6 \t%8u\n" "INT_INVALID \t%8u\n", - pmu_irq_stats[INT_SPURIOUS], pmu_irq_stats[INT_CMD_DONE], - pmu_irq_stats[INT_CMD_ERR], pmu_irq_stats[INT_WAKE_RX], - pmu_irq_stats[INT_SS_ERROR], pmu_irq_stats[INT_S0IX_MISS], - pmu_irq_stats[INT_NO_ACKC6], pmu_irq_stats[INT_INVALID]); - - seq_printf(s, "mrst_pmu_wait_ready_calls %8d\n", - pmu_wait_ready_calls); - seq_printf(s, "mrst_pmu_wait_ready_udelays %8d\n", - pmu_wait_ready_udelays); - seq_printf(s, "mrst_pmu_wait_ready_udelays_max %8d\n", - pmu_wait_ready_udelays_max); - seq_printf(s, "mrst_pmu_wait_done_calls %8d\n", - pmu_wait_done_calls); - seq_printf(s, "mrst_pmu_wait_done_udelays %8d\n", - pmu_wait_done_udelays); - seq_printf(s, "mrst_pmu_wait_done_udelays_max %8d\n", - pmu_wait_done_udelays_max); - seq_printf(s, "mrst_pmu_set_power_state_entry %8d\n", - pmu_set_power_state_entry); - seq_printf(s, "mrst_pmu_set_power_state_send_cmd %8d\n", - pmu_set_power_state_send_cmd); - seq_printf(s, "SCU busy: %d\n", pmu_read_busy_status()); - - return 0; -} - -static int debug_mrst_pmu_open(struct inode *inode, struct file *file) -{ - return single_open(file, debug_mrst_pmu_show, NULL); -} - -static const struct file_operations devices_state_operations = { - .open = debug_mrst_pmu_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif /* DEBUG_FS */ - -/* - * Validate SCU PCI shim PCI vendor capability byte - * against LSS hard-coded in mrst_devs[] above. - * DEBUG only. - */ -static void pmu_scu_firmware_debug(void) -{ - struct pci_dev *pdev = NULL; - - for_each_pci_dev(pdev) { - struct mrst_device *mrst_dev; - u8 pci_config_lss; - int pos; - - mrst_dev = pci_id_2_mrst_dev(pdev->device); - if (unlikely(!mrst_dev)) { - printk(KERN_ERR FW_BUG "pmu: Unknown " - "PCI device 0x%04X\n", pdev->device); - continue; - } - - if (mrst_dev->lss == 0) - continue; /* no LSS in our table */ - - pos = pci_find_capability(pdev, PCI_CAP_ID_VNDR); - if (!pos != 0) { - printk(KERN_ERR FW_BUG "pmu: 0x%04X " - "missing PCI Vendor Capability\n", - pdev->device); - continue; - } - pci_read_config_byte(pdev, pos + 4, &pci_config_lss); - if (!(pci_config_lss & PCI_VENDOR_CAP_LOG_SS_MASK)) { - printk(KERN_ERR FW_BUG "pmu: 0x%04X " - "invalid PCI Vendor Capability 0x%x " - " expected LSS 0x%X\n", - pdev->device, pci_config_lss, mrst_dev->lss); - continue; - } - pci_config_lss &= PCI_VENDOR_CAP_LOG_ID_MASK; - - if (mrst_dev->lss == pci_config_lss) - continue; - - printk(KERN_ERR FW_BUG "pmu: 0x%04X LSS = %d, expected %d\n", - pdev->device, pci_config_lss, mrst_dev->lss); - } -} - -/** - * pmu_probe - */ -static int __devinit pmu_probe(struct pci_dev *pdev, - const struct pci_device_id *pci_id) -{ - int ret; - struct mrst_pmu_reg *pmu; - - /* Init the device */ - ret = pci_enable_device(pdev); - if (ret) { - dev_err(&pdev->dev, "Unable to Enable PCI device\n"); - return ret; - } - - ret = pci_request_regions(pdev, MRST_PMU_DRV_NAME); - if (ret < 0) { - dev_err(&pdev->dev, "Cannot obtain PCI resources, aborting\n"); - goto out_err1; - } - - /* Map the memory of PMU reg base */ - pmu = pci_iomap(pdev, 0, 0); - if (!pmu) { - dev_err(&pdev->dev, "Unable to map the PMU address space\n"); - ret = -ENOMEM; - goto out_err2; - } - -#ifdef CONFIG_DEBUG_FS - /* /sys/kernel/debug/mrst_pmu */ - (void) debugfs_create_file("mrst_pmu", S_IFREG | S_IRUGO, - NULL, NULL, &devices_state_operations); -#endif - pmu_reg = pmu; /* success */ - - if (request_irq(pdev->irq, pmu_irq, 0, MRST_PMU_DRV_NAME, NULL)) { - dev_err(&pdev->dev, "Registering isr has failed\n"); - ret = -1; - goto out_err3; - } - - pmu_scu_firmware_debug(); - - pmu_write_wkc(S0I3_WAKE_SOURCES); /* Enable S0i3 wakeup sources */ - - pmu_wait_ready(); - - pmu_write_ssc(D0I1_ACG_SSS_TARGET); /* Enable Auto-Clock_Gating */ - pmu_write_cmd(0x201); - - spin_lock_init(&mrst_pmu_power_state_lock); - - /* Enable the hardware interrupt */ - pmu_irq_enable(); - return 0; - -out_err3: - free_irq(pdev->irq, NULL); - pci_iounmap(pdev, pmu_reg); - pmu_reg = NULL; -out_err2: - pci_release_region(pdev, 0); -out_err1: - pci_disable_device(pdev); - return ret; -} - -static void __devexit pmu_remove(struct pci_dev *pdev) -{ - dev_err(&pdev->dev, "Mid PM pmu_remove called\n"); - - /* Freeing up the irq */ - free_irq(pdev->irq, NULL); - - pci_iounmap(pdev, pmu_reg); - pmu_reg = NULL; - - /* disable the current PCI device */ - pci_release_region(pdev, 0); - pci_disable_device(pdev); -} - -static DEFINE_PCI_DEVICE_TABLE(pmu_pci_ids) = { - { PCI_VDEVICE(INTEL, PCI_DEV_ID_MRST_PMU), 0 }, - { } -}; - -MODULE_DEVICE_TABLE(pci, pmu_pci_ids); - -static struct pci_driver driver = { - .name = MRST_PMU_DRV_NAME, - .id_table = pmu_pci_ids, - .probe = pmu_probe, - .remove = __devexit_p(pmu_remove), -}; - -/** - * pmu_pci_register - register the PMU driver as PCI device - */ -static int __init pmu_pci_register(void) -{ - return pci_register_driver(&driver); -} - -/* Register and probe via fs_initcall() to preceed device_initcall() */ -fs_initcall(pmu_pci_register); - -static void __exit mid_pci_cleanup(void) -{ - pci_unregister_driver(&driver); -} - -static int ia_major; -static int ia_minor; - -static int pmu_sfi_parse_oem(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - - sb = (struct sfi_table_simple *)table; - ia_major = (sb->pentry[1] >> 0) & 0xFFFF; - ia_minor = (sb->pentry[1] >> 16) & 0xFFFF; - printk(KERN_INFO "mrst_pmu: IA FW version v%x.%x\n", - ia_major, ia_minor); - - return 0; -} - -static int __init scu_fw_check(void) -{ - int ret; - u32 fw_version; - - if (!pmu_reg) - return 0; /* this driver didn't probe-out */ - - sfi_table_parse("OEMB", NULL, NULL, pmu_sfi_parse_oem); - - if (ia_major < 0x6005 || ia_minor < 0x1525) { - WARN(1, "mrst_pmu: IA FW version too old\n"); - return -1; - } - - ret = intel_scu_ipc_command(IPCMSG_FW_REVISION, 0, NULL, 0, - &fw_version, 1); - - if (ret) { - WARN(1, "mrst_pmu: IPC FW version? %d\n", ret); - } else { - int scu_major = (fw_version >> 8) & 0xFF; - int scu_minor = (fw_version >> 0) & 0xFF; - - printk(KERN_INFO "mrst_pmu: firmware v%x\n", fw_version); - - if ((scu_major >= 0xC0) && (scu_minor >= 0x49)) { - printk(KERN_INFO "mrst_pmu: enabling S0i3\n"); - mrst_pmu_s0i3_enable = true; - } else { - WARN(1, "mrst_pmu: S0i3 disabled, old firmware %X.%X", - scu_major, scu_minor); - } - } - return 0; -} -late_initcall(scu_fw_check); -module_exit(mid_pci_cleanup); diff --git a/arch/x86/platform/mrst/pmu.h b/arch/x86/platform/mrst/pmu.h deleted file mode 100644 index bfbfe64b167..00000000000 --- a/arch/x86/platform/mrst/pmu.h +++ /dev/null @@ -1,234 +0,0 @@ -/* - * mrst/pmu.h - private definitions for MRST Power Management Unit mrst/pmu.c - * - * Copyright (c) 2011, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#ifndef _MRST_PMU_H_ -#define _MRST_PMU_H_ - -#define PCI_DEV_ID_MRST_PMU 0x0810 -#define MRST_PMU_DRV_NAME "mrst_pmu" -#define PCI_SUB_CLASS_MASK 0xFF00 - -#define PCI_VENDOR_CAP_LOG_ID_MASK 0x7F -#define PCI_VENDOR_CAP_LOG_SS_MASK 0x80 - -#define SUB_SYS_ALL_D0I1 0x01155555 -#define S0I3_WAKE_SOURCES 0x00001FFF - -#define PM_S0I3_COMMAND \ - ((0 << 31) | /* Reserved */ \ - (0 << 30) | /* Core must be idle */ \ - (0xc2 << 22) | /* ACK C6 trigger */ \ - (3 << 19) | /* Trigger on DMI message */ \ - (3 << 16) | /* Enter S0i3 */ \ - (0 << 13) | /* Numeric mode ID (sw) */ \ - (3 << 9) | /* Trigger mode */ \ - (0 << 8) | /* Do not interrupt */ \ - (1 << 0)) /* Set configuration */ - -#define LSS_DMI 0 -#define LSS_SD_HC0 1 -#define LSS_SD_HC1 2 -#define LSS_NAND 3 -#define LSS_IMAGING 4 -#define LSS_SECURITY 5 -#define LSS_DISPLAY 6 -#define LSS_USB_HC 7 -#define LSS_USB_OTG 8 -#define LSS_AUDIO 9 -#define LSS_AUDIO_LPE 9 -#define LSS_AUDIO_SSP 9 -#define LSS_I2C0 10 -#define LSS_I2C1 10 -#define LSS_I2C2 10 -#define LSS_KBD 10 -#define LSS_SPI0 10 -#define LSS_SPI1 10 -#define LSS_SPI2 10 -#define LSS_GPIO 10 -#define LSS_SRAM 11 /* used by SCU, do not touch */ -#define LSS_SD_HC2 12 -/* LSS hardware bits 15,14,13 are hardwired to 0, thus unusable */ -#define MRST_NUM_LSS 13 - -#define MIN(a, b) (((a) < (b)) ? (a) : (b)) - -#define SSMSK(mask, lss) ((mask) << ((lss) * 2)) -#define D0 0 -#define D0i1 1 -#define D0i2 2 -#define D0i3 3 - -#define S0I3_SSS_TARGET ( \ - SSMSK(D0i1, LSS_DMI) | \ - SSMSK(D0i3, LSS_SD_HC0) | \ - SSMSK(D0i3, LSS_SD_HC1) | \ - SSMSK(D0i3, LSS_NAND) | \ - SSMSK(D0i3, LSS_SD_HC2) | \ - SSMSK(D0i3, LSS_IMAGING) | \ - SSMSK(D0i3, LSS_SECURITY) | \ - SSMSK(D0i3, LSS_DISPLAY) | \ - SSMSK(D0i3, LSS_USB_HC) | \ - SSMSK(D0i3, LSS_USB_OTG) | \ - SSMSK(D0i3, LSS_AUDIO) | \ - SSMSK(D0i1, LSS_I2C0)) - -/* - * D0i1 on Langwell is Autonomous Clock Gating (ACG). - * Enable ACG on every LSS except camera and audio - */ -#define D0I1_ACG_SSS_TARGET \ - (SUB_SYS_ALL_D0I1 & ~SSMSK(D0i1, LSS_IMAGING) & ~SSMSK(D0i1, LSS_AUDIO)) - -enum cm_mode { - CM_NOP, /* ignore the config mode value */ - CM_IMMEDIATE, - CM_DELAY, - CM_TRIGGER, - CM_INVALID -}; - -enum sys_state { - SYS_STATE_S0I0, - SYS_STATE_S0I1, - SYS_STATE_S0I2, - SYS_STATE_S0I3, - SYS_STATE_S3, - SYS_STATE_S5 -}; - -#define SET_CFG_CMD 1 - -enum int_status { - INT_SPURIOUS = 0, - INT_CMD_DONE = 1, - INT_CMD_ERR = 2, - INT_WAKE_RX = 3, - INT_SS_ERROR = 4, - INT_S0IX_MISS = 5, - INT_NO_ACKC6 = 6, - INT_INVALID = 7, -}; - -/* PMU register interface */ -static struct mrst_pmu_reg { - u32 pm_sts; /* 0x00 */ - u32 pm_cmd; /* 0x04 */ - u32 pm_ics; /* 0x08 */ - u32 _resv1; /* 0x0C */ - u32 pm_wkc[2]; /* 0x10 */ - u32 pm_wks[2]; /* 0x18 */ - u32 pm_ssc[4]; /* 0x20 */ - u32 pm_sss[4]; /* 0x30 */ - u32 pm_wssc[4]; /* 0x40 */ - u32 pm_c3c4; /* 0x50 */ - u32 pm_c5c6; /* 0x54 */ - u32 pm_msi_disable; /* 0x58 */ -} *pmu_reg; - -static inline u32 pmu_read_sts(void) { return readl(&pmu_reg->pm_sts); } -static inline u32 pmu_read_ics(void) { return readl(&pmu_reg->pm_ics); } -static inline u32 pmu_read_wks(void) { return readl(&pmu_reg->pm_wks[0]); } -static inline u32 pmu_read_sss(void) { return readl(&pmu_reg->pm_sss[0]); } - -static inline void pmu_write_cmd(u32 arg) { writel(arg, &pmu_reg->pm_cmd); } -static inline void pmu_write_ics(u32 arg) { writel(arg, &pmu_reg->pm_ics); } -static inline void pmu_write_wkc(u32 arg) { writel(arg, &pmu_reg->pm_wkc[0]); } -static inline void pmu_write_ssc(u32 arg) { writel(arg, &pmu_reg->pm_ssc[0]); } -static inline void pmu_write_wssc(u32 arg) - { writel(arg, &pmu_reg->pm_wssc[0]); } - -static inline void pmu_msi_enable(void) { writel(0, &pmu_reg->pm_msi_disable); } -static inline u32 pmu_msi_is_disabled(void) - { return readl(&pmu_reg->pm_msi_disable); } - -union pmu_pm_ics { - struct { - u32 cause:8; - u32 enable:1; - u32 pending:1; - u32 reserved:22; - } bits; - u32 value; -}; - -static inline void pmu_irq_enable(void) -{ - union pmu_pm_ics pmu_ics; - - pmu_ics.value = pmu_read_ics(); - pmu_ics.bits.enable = 1; - pmu_write_ics(pmu_ics.value); -} - -union pmu_pm_status { - struct { - u32 pmu_rev:8; - u32 pmu_busy:1; - u32 mode_id:4; - u32 Reserved:19; - } pmu_status_parts; - u32 pmu_status_value; -}; - -static inline int pmu_read_busy_status(void) -{ - union pmu_pm_status result; - - result.pmu_status_value = pmu_read_sts(); - - return result.pmu_status_parts.pmu_busy; -} - -/* pmu set config parameters */ -struct cfg_delay_param_t { - u32 cmd:8; - u32 ioc:1; - u32 cfg_mode:4; - u32 mode_id:3; - u32 sys_state:3; - u32 cfg_delay:8; - u32 rsvd:5; -}; - -struct cfg_trig_param_t { - u32 cmd:8; - u32 ioc:1; - u32 cfg_mode:4; - u32 mode_id:3; - u32 sys_state:3; - u32 cfg_trig_type:3; - u32 cfg_trig_val:8; - u32 cmbi:1; - u32 rsvd1:1; -}; - -union pmu_pm_set_cfg_cmd_t { - union { - struct cfg_delay_param_t d_param; - struct cfg_trig_param_t t_param; - } pmu2_params; - u32 pmu_pm_set_cfg_cmd_value; -}; - -#ifdef FUTURE_PATCH -extern int mrst_s0i3_entry(u32 regval, u32 *regaddr); -#else -static inline int mrst_s0i3_entry(u32 regval, u32 *regaddr) { return -1; } -#endif -#endif -- cgit v1.2.3-70-g09d2 From 644e9cbbe3fc032cc92d0936057e166a994dc246 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 26 Jan 2012 00:09:05 +0100 Subject: Add driver auto probing for x86 features v4 There's a growing number of drivers that support a specific x86 feature or CPU. Currently loading these drivers currently on a generic distribution requires various driver specific hacks and it often doesn't work. This patch adds auto probing for drivers based on the x86 cpuid information, in particular based on vendor/family/model number and also based on CPUID feature bits. For example a common issue is not loading the SSE 4.2 accelerated CRC module: this can significantly lower the performance of BTRFS which relies on fast CRC. Another issue is loading the right CPUFREQ driver for the current CPU. Currently distributions often try all all possible driver until one sticks, which is not really a good way to do this. It works with existing udev without any changes. The code exports the x86 information as a generic string in sysfs that can be matched by udev's pattern matching. This scheme does not support numeric ranges, so if you want to handle e.g. ranges of model numbers they have to be encoded in ASCII or simply all models or families listed. Fixing that would require changing udev. Another issue is that udev will happily load all drivers that match, there is currently no nice way to stop a specific driver from being loaded if it's not needed (e.g. if you don't need fast CRC) But there are not that many cpu specific drivers around and they're all not that bloated, so this isn't a particularly serious issue. Originally this patch added the modalias to the normal cpu sysdevs. However sysdevs don't have all the infrastructure needed for udev, so it couldn't really autoload drivers. This patch instead adds the CPU modaliases to the cpuid devices, which are real devices with full support for udev. This implies that the cpuid driver has to be loaded to use this. This patch just adds infrastructure, some driver conversions in followups. Thanks to Kay for helping with some sysfs magic. v2: Constifcation, some updates v4: (trenn@suse.de): - Use kzalloc instead of kmalloc to terminate modalias buffer - Use uppercase hex values to match correctly against hex values containing letters Cc: Dave Jones Cc: Kay Sievers Cc: Jen Axboe Cc: Herbert Xu Cc: Huang Ying Cc: Len Brown Signed-off-by: Andi Kleen Signed-off-by: Thomas Renninger Acked-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpu_device_id.h | 13 ++++++++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/match.c | 48 +++++++++++++++++++++++++++++ arch/x86/kernel/cpuid.c | 59 +++++++++++++++++++++++++++++++++++- include/linux/mod_devicetable.h | 21 +++++++++++++ scripts/mod/file2alias.c | 24 +++++++++++++++ 6 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/cpu_device_id.h create mode 100644 arch/x86/kernel/cpu/match.c (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h new file mode 100644 index 00000000000..ff501e511d9 --- /dev/null +++ b/arch/x86/include/asm/cpu_device_id.h @@ -0,0 +1,13 @@ +#ifndef _CPU_DEVICE_ID +#define _CPU_DEVICE_ID 1 + +/* + * Declare drivers belonging to specific x86 CPUs + * Similar in spirit to pci_device_id and related PCI functions + */ + +#include + +extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); + +#endif diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 25f24dccdcf..6ab6aa2fdfd 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -16,6 +16,7 @@ obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o obj-y += rdrand.o +obj-y += match.o obj-$(CONFIG_X86_32) += bugs.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c new file mode 100644 index 00000000000..7acc961422e --- /dev/null +++ b/arch/x86/kernel/cpu/match.c @@ -0,0 +1,48 @@ +#include +#include +#include +#include + +/** + * x86_match_cpu - match current CPU again an array of x86_cpu_ids + * @match: Pointer to array of x86_cpu_ids. Last entry terminated with + * {}. + * + * Return the entry if the current CPU matches the entries in the + * passed x86_cpu_id match table. Otherwise NULL. The match table + * contains vendor (X86_VENDOR_*), family, model and feature bits or + * respective wildcard entries. + * + * A typical table entry would be to match a specific CPU + * { X86_VENDOR_INTEL, 6, 0x12 } + * or to match a specific CPU feature + * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } + * + * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, + * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) + * + * Arrays used to match for this should also be declared using + * MODULE_DEVICE_TABLE(x86_cpu, ...) + * + * This always matches against the boot cpu, assuming models and features are + * consistent over all CPUs. + */ +const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) +{ + const struct x86_cpu_id *m; + struct cpuinfo_x86 *c = &boot_cpu_data; + + for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) + continue; + if (m->family != X86_FAMILY_ANY && c->x86 != m->family) + continue; + if (m->model != X86_MODEL_ANY && c->x86_model != m->model) + continue; + if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) + continue; + return m; + } + return NULL; +} +EXPORT_SYMBOL(x86_match_cpu); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index a524353d93f..7c89880eefd 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -138,13 +139,57 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; +static ssize_t print_cpu_modalias(struct device *dev, + struct device_attribute *attr, + char *bufptr) +{ + int size = PAGE_SIZE; + int i, n; + char *buf = bufptr; + + n = snprintf(buf, size, "x86cpu:vendor:%04X:family:" + "%04X:model:%04X:feature:", + boot_cpu_data.x86_vendor, + boot_cpu_data.x86, + boot_cpu_data.x86_model); + size -= n; + buf += n; + size -= 2; + for (i = 0; i < NCAPINTS*32; i++) { + if (boot_cpu_has(i)) { + n = snprintf(buf, size, ",%04X", i); + if (n < 0) { + WARN(1, "x86 features overflow page\n"); + break; + } + size -= n; + buf += n; + } + } + *buf++ = ','; + *buf++ = '\n'; + return buf - bufptr; +} + +static DEVICE_ATTR(modalias, 0444, print_cpu_modalias, NULL); + static __cpuinit int cpuid_device_create(int cpu) { struct device *dev; + int err; dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, "cpu%d", cpu); - return IS_ERR(dev) ? PTR_ERR(dev) : 0; + if (IS_ERR(dev)) + return PTR_ERR(dev); + + err = device_create_file(dev, &dev_attr_modalias); + if (err) { + /* keep device around on error. attribute is optional. */ + err = 0; + } + + return 0; } static void cpuid_device_destroy(int cpu) @@ -182,6 +227,17 @@ static char *cpuid_devnode(struct device *dev, umode_t *mode) return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } +static int cpuid_dev_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (buf) { + print_cpu_modalias(NULL, NULL, buf); + add_uevent_var(env, "MODALIAS=%s", buf); + kfree(buf); + } + return 0; +} + static int __init cpuid_init(void) { int i, err = 0; @@ -200,6 +256,7 @@ static int __init cpuid_init(void) goto out_chrdev; } cpuid_class->devnode = cpuid_devnode; + cpuid_class->dev_uevent = cpuid_dev_uevent; for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index b29e7f6f8fa..cff2cc08f45 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -571,4 +571,25 @@ struct amba_id { #endif }; +/* + * Match x86 CPUs for CPU specific drivers. + * See documentation of "x86_match_cpu" for details. + */ + +struct x86_cpu_id { + __u16 vendor; + __u16 family; + __u16 model; + __u16 feature; /* bit index */ + kernel_ulong_t driver_data; +}; + +#define X86_FEATURE_MATCH(x) \ + { X86_VENDOR_ANY, X86_FAMILY_ANY, X86_MODEL_ANY, x } + +#define X86_VENDOR_ANY 0xffff +#define X86_FAMILY_ANY 0 +#define X86_MODEL_ANY 0 +#define X86_FEATURE_ANY 0 /* Same as FPU, you can't test for that */ + #endif /* LINUX_MOD_DEVICETABLE_H */ diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index c0e14b3f230..026ba38759c 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1013,6 +1013,30 @@ static int do_amba_entry(const char *filename, } ADD_TO_DEVTABLE("amba", struct amba_id, do_amba_entry); +/* LOOKS like x86cpu:vendor:VVVV:family:FFFF:model:MMMM:feature:*,FEAT,* + * All fields are numbers. It would be nicer to use strings for vendor + * and feature, but getting those out of the build system here is too + * complicated. + */ + +static int do_x86cpu_entry(const char *filename, struct x86_cpu_id *id, + char *alias) +{ + id->feature = TO_NATIVE(id->feature); + id->family = TO_NATIVE(id->family); + id->model = TO_NATIVE(id->model); + id->vendor = TO_NATIVE(id->vendor); + + strcpy(alias, "x86cpu:"); + ADD(alias, "vendor:", id->vendor != X86_VENDOR_ANY, id->vendor); + ADD(alias, ":family:", id->family != X86_FAMILY_ANY, id->family); + ADD(alias, ":model:", id->model != X86_MODEL_ANY, id->model); + ADD(alias, ":feature:*,", id->feature != X86_FEATURE_ANY, id->feature); + strcat(alias, ",*"); + return 1; +} +ADD_TO_DEVTABLE("x86cpu", struct x86_cpu_id, do_x86cpu_entry); + /* Does namelen bytes of name exactly match the symbol? */ static bool sym_is(const char *name, unsigned namelen, const char *symbol) { -- cgit v1.2.3-70-g09d2 From 2f1e097e24defe64a86535b53768f5c8ab0368d1 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Thu, 26 Jan 2012 00:09:11 +0100 Subject: X86: Introduce HW-Pstate scattered cpuid feature It is rather similar to CPB (boot capability) feature and exists since fam10h (can be looked up in AMD's BKDG). The feature is needed for powernow-k8 to cleanup init functions and to provide proper autoloading matching with the new x86cpu modalias feature. Cc: Kay Sievers Cc: Dave Jones Cc: Borislav Petkov Signed-off-by: Thomas Renninger Acked-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 17c5d4bdee5..67b0910ebbb 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -176,6 +176,7 @@ #define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ #define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ #define X86_FEATURE_DTS (7*32+ 7) /* Digital Thermal Sensor */ +#define X86_FEATURE_HW_PSTATE (7*32+ 8) /* AMD HW-PState */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index c7f64e6f537..addf9e82a7f 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -40,6 +40,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, -- cgit v1.2.3-70-g09d2 From 7931d493051ea9b09e4fddee2dc40b2eb88d62b9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 3 Feb 2012 15:06:26 +0000 Subject: x86/spinlocks: Eliminate TICKET_MASK The definition of it being questionable already (unnecessarily including a cast), and it being used in a single place that can be written shorter without it, remove this #define. Along the same lines, simplify __ticket_spin_is_locked()'s main expression, which was the more convoluted way because of needs that went away with the recent type changes by Jeremy. This is pure cleanup, no functional change intended. Signed-off-by: Jan Beulich Acked-by: Jeremy Fitzhardinge Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4F2C06020200007800071066@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/spinlock.h | 4 ++-- arch/x86/include/asm/spinlock_types.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index a82c2bf504b..76bfa2cf301 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -88,14 +88,14 @@ static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); - return !!(tmp.tail ^ tmp.head); + return tmp.tail != tmp.head; } static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) { struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); - return ((tmp.tail - tmp.head) & TICKET_MASK) > 1; + return (__ticket_t)(tmp.tail - tmp.head) > 1; } #ifndef CONFIG_PARAVIRT_SPINLOCKS diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 8ebd5df7451..ad0ad07fc00 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -16,7 +16,6 @@ typedef u32 __ticketpair_t; #endif #define TICKET_SHIFT (sizeof(__ticket_t) * 8) -#define TICKET_MASK ((__ticket_t)((1 << TICKET_SHIFT) - 1)) typedef struct arch_spinlock { union { -- cgit v1.2.3-70-g09d2 From 21c3fcf3e39353d4f21d50e257cc74f3204b1988 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 12 Feb 2012 09:53:57 -0800 Subject: x86/debug: Fix/improve the show_msr= debug print out Found out that show_msr= is broken, when I asked a user to use it to capture debug info about broken MTRR's whose MTRR settings are probably different between CPUs. Only the first CPUs MSRs are printed, but that is not enough to track down the suspected bug. For years we called print_cpu_msr from print_cpu_info(), but this commit: | commit 2eaad1fddd7450a48ad464229775f97fbfe8af36 | Author: Mike Travis | Date: Thu Dec 10 17:19:36 2009 -0800 | | x86: Limit the number of processor bootup messages removed the print_cpu_info() call from all APs. Put it back - it will only print MSRs when the user specifically requests them via show_msr=. Signed-off-by: Yinghai Lu Cc: Mike Travis Link: http://lkml.kernel.org/r/1329069237-11483-1-git-send-email-yinghai@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/common.c | 14 +++++++------- arch/x86/kernel/smpboot.c | 5 +++-- 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index aa9088c2693..8bb062bbcbe 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -162,6 +162,7 @@ extern void early_cpu_init(void); extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); +void print_cpu_msr(struct cpuinfo_x86 *); extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d43cad74f16..8b6a3bb57d8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -933,7 +933,7 @@ static const struct msr_range msr_range_array[] __cpuinitconst = { { 0xc0011000, 0xc001103b}, }; -static void __cpuinit print_cpu_msr(void) +static void __cpuinit __print_cpu_msr(void) { unsigned index_min, index_max; unsigned index; @@ -997,13 +997,13 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) else printk(KERN_CONT "\n"); -#ifdef CONFIG_SMP + __print_cpu_msr(); +} + +void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c) +{ if (c->cpu_index < show_msr) - print_cpu_msr(); -#else - if (show_msr) - print_cpu_msr(); -#endif + __print_cpu_msr(); } static __init int setup_disablecpuid(char *arg) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d1..257049d7c65 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -791,9 +791,10 @@ do_rest: schedule(); } - if (cpumask_test_cpu(cpu, cpu_callin_mask)) + if (cpumask_test_cpu(cpu, cpu_callin_mask)) { + print_cpu_msr(&cpu_data(cpu)); pr_debug("CPU%d: has booted.\n", cpu); - else { + } else { boot_error = 1; if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) == 0xA5A5A5A5) -- cgit v1.2.3-70-g09d2 From 8546c008924d5fd1724fa698eaa92b414bafd50d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 21 Feb 2012 10:25:45 -0800 Subject: i387: Uninline the generic FP helpers that we expose to kernel modules Instead of exporting the very low-level internals of the FPU state save/restore code (ie things like 'fpu_owner_task'), we should export the higher-level interfaces. Inlining these things is pointless anyway: sure, sometimes the end result is small, but while 'stts()' can result in just three x86 instructions, those are not cheap instructions (writing %cr0 is a serializing instruction and a very slow one at that). So the overhead of a function call is not noticeable, and we really don't want random modules mucking about with our internal state save logic anyway. So this unexports 'fpu_owner_task', and instead uninlines and exports the actual functions that modules can use: fpu_kernel_begin/end() and unlazy_fpu(). Signed-off-by: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1202211339590.5354@i5.linux-foundation.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 78 +++--------------------------------------- arch/x86/kernel/cpu/common.c | 2 -- arch/x86/kernel/i387.c | 80 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 76 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 247904945d3..0c1031d354f 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -419,70 +419,9 @@ static inline void __clear_fpu(struct task_struct *tsk) } } -/* - * Were we in an interrupt that interrupted kernel mode? - * - * We can do a kernel_fpu_begin/end() pair *ONLY* if that - * pair does nothing at all: the thread must not have fpu (so - * that we don't try to save the FPU state), and TS must - * be set (so that the clts/stts pair does nothing that is - * visible in the interrupted kernel thread). - */ -static inline bool interrupted_kernel_fpu_idle(void) -{ - return !__thread_has_fpu(current) && - (read_cr0() & X86_CR0_TS); -} - -/* - * Were we in user mode (or vm86 mode) when we were - * interrupted? - * - * Doing kernel_fpu_begin/end() is ok if we are running - * in an interrupt context from user mode - we'll just - * save the FPU state as required. - */ -static inline bool interrupted_user_mode(void) -{ - struct pt_regs *regs = get_irq_regs(); - return regs && user_mode_vm(regs); -} - -/* - * Can we use the FPU in kernel mode with the - * whole "kernel_fpu_begin/end()" sequence? - * - * It's always ok in process context (ie "not interrupt") - * but it is sometimes ok even from an irq. - */ -static inline bool irq_fpu_usable(void) -{ - return !in_interrupt() || - interrupted_user_mode() || - interrupted_kernel_fpu_idle(); -} - -static inline void kernel_fpu_begin(void) -{ - struct task_struct *me = current; - - WARN_ON_ONCE(!irq_fpu_usable()); - preempt_disable(); - if (__thread_has_fpu(me)) { - __save_init_fpu(me); - __thread_clear_has_fpu(me); - /* We do 'stts()' in kernel_fpu_end() */ - } else { - percpu_write(fpu_owner_task, NULL); - clts(); - } -} - -static inline void kernel_fpu_end(void) -{ - stts(); - preempt_enable(); -} +extern bool irq_fpu_usable(void); +extern void kernel_fpu_begin(void); +extern void kernel_fpu_end(void); /* * Some instructions like VIA's padlock instructions generate a spurious @@ -566,16 +505,7 @@ static inline void save_init_fpu(struct task_struct *tsk) preempt_enable(); } -static inline void unlazy_fpu(struct task_struct *tsk) -{ - preempt_disable(); - if (__thread_has_fpu(tsk)) { - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - } else - tsk->fpu_counter = 0; - preempt_enable(); -} +extern void unlazy_fpu(struct task_struct *tsk); static inline void clear_fpu(struct task_struct *tsk) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c0f7d68d318..cb71b01ab66 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1045,7 +1045,6 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = DEFINE_PER_CPU(unsigned int, irq_count) = -1; DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); -EXPORT_PER_CPU_SYMBOL(fpu_owner_task); /* * Special IST stacks which the CPU switches to when it calls @@ -1115,7 +1114,6 @@ void debug_stack_reset(void) DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); -EXPORT_PER_CPU_SYMBOL(fpu_owner_task); #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 739d8598f78..17b7549c413 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -32,6 +32,86 @@ # define user32_fxsr_struct user_fxsr_struct #endif +/* + * Were we in an interrupt that interrupted kernel mode? + * + * We can do a kernel_fpu_begin/end() pair *ONLY* if that + * pair does nothing at all: the thread must not have fpu (so + * that we don't try to save the FPU state), and TS must + * be set (so that the clts/stts pair does nothing that is + * visible in the interrupted kernel thread). + */ +static inline bool interrupted_kernel_fpu_idle(void) +{ + return !__thread_has_fpu(current) && + (read_cr0() & X86_CR0_TS); +} + +/* + * Were we in user mode (or vm86 mode) when we were + * interrupted? + * + * Doing kernel_fpu_begin/end() is ok if we are running + * in an interrupt context from user mode - we'll just + * save the FPU state as required. + */ +static inline bool interrupted_user_mode(void) +{ + struct pt_regs *regs = get_irq_regs(); + return regs && user_mode_vm(regs); +} + +/* + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? + * + * It's always ok in process context (ie "not interrupt") + * but it is sometimes ok even from an irq. + */ +bool irq_fpu_usable(void) +{ + return !in_interrupt() || + interrupted_user_mode() || + interrupted_kernel_fpu_idle(); +} +EXPORT_SYMBOL(irq_fpu_usable); + +void kernel_fpu_begin(void) +{ + struct task_struct *me = current; + + WARN_ON_ONCE(!irq_fpu_usable()); + preempt_disable(); + if (__thread_has_fpu(me)) { + __save_init_fpu(me); + __thread_clear_has_fpu(me); + /* We do 'stts()' in kernel_fpu_end() */ + } else { + percpu_write(fpu_owner_task, NULL); + clts(); + } +} +EXPORT_SYMBOL(kernel_fpu_begin); + +void kernel_fpu_end(void) +{ + stts(); + preempt_enable(); +} +EXPORT_SYMBOL(kernel_fpu_end); + +void unlazy_fpu(struct task_struct *tsk) +{ + preempt_disable(); + if (__thread_has_fpu(tsk)) { + __save_init_fpu(tsk); + __thread_fpu_end(tsk); + } else + tsk->fpu_counter = 0; + preempt_enable(); +} +EXPORT_SYMBOL(unlazy_fpu); + #ifdef CONFIG_MATH_EMULATION # define HAVE_HWFP (boot_cpu_data.hard_math) #else -- cgit v1.2.3-70-g09d2 From 1361b83a13d4d92e53fbb6c877528713e118b821 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 21 Feb 2012 13:19:22 -0800 Subject: i387: Split up into exported and internal interfaces While various modules include to get access to things we actually *intend* for them to use, most of that header file was really pretty low-level internal stuff that we really don't want to expose to others. So split the header file into two: the small exported interfaces remain in , while the internal definitions that are only used by core architecture code are now in . The guiding principle for this was to expose functions that we export to modules, and leave them in , while stuff that is used by task switching or was marked GPL-only is in . The fpu-internal.h file could be further split up too, especially since arch/x86/kvm/ uses some of the remaining stuff for its module. But that kvm usage should probably be abstracted out a bit, and at least now the internal FPU accessor functions are much more contained. Even if it isn't perhaps as contained as it _could_ be. Signed-off-by: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1202211340330.5354@i5.linux-foundation.org Signed-off-by: H. Peter Anvin --- arch/x86/ia32/ia32_signal.c | 1 + arch/x86/include/asm/fpu-internal.h | 520 ++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/i387.h | 512 +---------------------------------- arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/i387.c | 3 +- arch/x86/kernel/process.c | 1 + arch/x86/kernel/process_32.c | 1 + arch/x86/kernel/process_64.c | 1 + arch/x86/kernel/ptrace.c | 1 + arch/x86/kernel/signal.c | 1 + arch/x86/kernel/traps.c | 1 + arch/x86/kernel/xsave.c | 1 + arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 1 + arch/x86/power/cpu.c | 1 + 15 files changed, 540 insertions(+), 508 deletions(-) create mode 100644 arch/x86/include/asm/fpu-internal.h (limited to 'arch/x86/include') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 65577698cab..5563ba1cf51 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h new file mode 100644 index 00000000000..4fa88154e4d --- /dev/null +++ b/arch/x86/include/asm/fpu-internal.h @@ -0,0 +1,520 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes , May 2000 + * x86-64 work by Andi Kleen 2002 + */ + +#ifndef _FPU_INTERNAL_H +#define _FPU_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern unsigned int sig_xstate_size; +extern void fpu_init(void); + +DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); + +extern user_regset_active_fn fpregs_active, xfpregs_active; +extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; +extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, + xstateregs_set; + + +/* + * xstateregs_active == fpregs_active. Please refer to the comment + * at the definition of fpregs_active. + */ +#define xstateregs_active fpregs_active + +extern struct _fpx_sw_bytes fx_sw_reserved; +#ifdef CONFIG_IA32_EMULATION +extern unsigned int sig_xstate_ia32_size; +extern struct _fpx_sw_bytes fx_sw_reserved_ia32; +struct _fpstate_ia32; +struct _xstate_ia32; +extern int save_i387_xstate_ia32(void __user *buf); +extern int restore_i387_xstate_ia32(void __user *buf); +#endif + +#ifdef CONFIG_MATH_EMULATION +extern void finit_soft_fpu(struct i387_soft_struct *soft); +#else +static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} +#endif + +#define X87_FSW_ES (1 << 7) /* Exception Summary */ + +static __always_inline __pure bool use_xsaveopt(void) +{ + return static_cpu_has(X86_FEATURE_XSAVEOPT); +} + +static __always_inline __pure bool use_xsave(void) +{ + return static_cpu_has(X86_FEATURE_XSAVE); +} + +static __always_inline __pure bool use_fxsr(void) +{ + return static_cpu_has(X86_FEATURE_FXSR); +} + +extern void __sanitize_i387_state(struct task_struct *); + +static inline void sanitize_i387_state(struct task_struct *tsk) +{ + if (!use_xsaveopt()) + return; + __sanitize_i387_state(tsk); +} + +#ifdef CONFIG_X86_64 +static inline int fxrstor_checking(struct i387_fxsave_struct *fx) +{ + int err; + + /* See comment in fxsave() below. */ +#ifdef CONFIG_AS_FXSAVEQ + asm volatile("1: fxrstorq %[fx]\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err) + : [fx] "m" (*fx), "0" (0)); +#else + asm volatile("1: rex64/fxrstor (%[fx])\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err) + : [fx] "R" (fx), "m" (*fx), "0" (0)); +#endif + return err; +} + +static inline int fxsave_user(struct i387_fxsave_struct __user *fx) +{ + int err; + + /* + * Clear the bytes not touched by the fxsave and reserved + * for the SW usage. + */ + err = __clear_user(&fx->sw_reserved, + sizeof(struct _fpx_sw_bytes)); + if (unlikely(err)) + return -EFAULT; + + /* See comment in fxsave() below. */ +#ifdef CONFIG_AS_FXSAVEQ + asm volatile("1: fxsaveq %[fx]\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err), [fx] "=m" (*fx) + : "0" (0)); +#else + asm volatile("1: rex64/fxsave (%[fx])\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err), "=m" (*fx) + : [fx] "R" (fx), "0" (0)); +#endif + if (unlikely(err) && + __clear_user(fx, sizeof(struct i387_fxsave_struct))) + err = -EFAULT; + /* No need to clear here because the caller clears USED_MATH */ + return err; +} + +static inline void fpu_fxsave(struct fpu *fpu) +{ + /* Using "rex64; fxsave %0" is broken because, if the memory operand + uses any extended registers for addressing, a second REX prefix + will be generated (to the assembler, rex64 followed by semicolon + is a separate instruction), and hence the 64-bitness is lost. */ + +#ifdef CONFIG_AS_FXSAVEQ + /* Using "fxsaveq %0" would be the ideal choice, but is only supported + starting with gas 2.16. */ + __asm__ __volatile__("fxsaveq %0" + : "=m" (fpu->state->fxsave)); +#else + /* Using, as a workaround, the properly prefixed form below isn't + accepted by any binutils version so far released, complaining that + the same type of prefix is used twice if an extended register is + needed for addressing (fix submitted to mainline 2005-11-21). + asm volatile("rex64/fxsave %0" + : "=m" (fpu->state->fxsave)); + This, however, we can work around by forcing the compiler to select + an addressing mode that doesn't require extended registers. */ + asm volatile("rex64/fxsave (%[fx])" + : "=m" (fpu->state->fxsave) + : [fx] "R" (&fpu->state->fxsave)); +#endif +} + +#else /* CONFIG_X86_32 */ + +/* perform fxrstor iff the processor has extended states, otherwise frstor */ +static inline int fxrstor_checking(struct i387_fxsave_struct *fx) +{ + /* + * The "nop" is needed to make the instructions the same + * length. + */ + alternative_input( + "nop ; frstor %1", + "fxrstor %1", + X86_FEATURE_FXSR, + "m" (*fx)); + + return 0; +} + +static inline void fpu_fxsave(struct fpu *fpu) +{ + asm volatile("fxsave %[fx]" + : [fx] "=m" (fpu->state->fxsave)); +} + +#endif /* CONFIG_X86_64 */ + +/* + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact. + */ +static inline int fpu_save_init(struct fpu *fpu) +{ + if (use_xsave()) { + fpu_xsave(fpu); + + /* + * xsave header may indicate the init state of the FP. + */ + if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) + return 1; + } else if (use_fxsr()) { + fpu_fxsave(fpu); + } else { + asm volatile("fnsave %[fx]; fwait" + : [fx] "=m" (fpu->state->fsave)); + return 0; + } + + /* + * If exceptions are pending, we need to clear them so + * that we don't randomly get exceptions later. + * + * FIXME! Is this perhaps only true for the old-style + * irq13 case? Maybe we could leave the x87 state + * intact otherwise? + */ + if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { + asm volatile("fnclex"); + return 0; + } + return 1; +} + +static inline int __save_init_fpu(struct task_struct *tsk) +{ + return fpu_save_init(&tsk->thread.fpu); +} + +static inline int fpu_fxrstor_checking(struct fpu *fpu) +{ + return fxrstor_checking(&fpu->state->fxsave); +} + +static inline int fpu_restore_checking(struct fpu *fpu) +{ + if (use_xsave()) + return fpu_xrstor_checking(fpu); + else + return fpu_fxrstor_checking(fpu); +} + +static inline int restore_fpu_checking(struct task_struct *tsk) +{ + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception + is pending. Clear the x87 state here by setting it to fixed + values. "m" is a random variable that should be in L1 */ + alternative_input( + ASM_NOP8 ASM_NOP2, + "emms\n\t" /* clear stack tags */ + "fildl %P[addr]", /* set F?P to defined value */ + X86_FEATURE_FXSAVE_LEAK, + [addr] "m" (tsk->thread.fpu.has_fpu)); + + return fpu_restore_checking(&tsk->thread.fpu); +} + +/* + * Software FPU state helpers. Careful: these need to + * be preemption protection *and* they need to be + * properly paired with the CR0.TS changes! + */ +static inline int __thread_has_fpu(struct task_struct *tsk) +{ + return tsk->thread.fpu.has_fpu; +} + +/* Must be paired with an 'stts' after! */ +static inline void __thread_clear_has_fpu(struct task_struct *tsk) +{ + tsk->thread.fpu.has_fpu = 0; + percpu_write(fpu_owner_task, NULL); +} + +/* Must be paired with a 'clts' before! */ +static inline void __thread_set_has_fpu(struct task_struct *tsk) +{ + tsk->thread.fpu.has_fpu = 1; + percpu_write(fpu_owner_task, tsk); +} + +/* + * Encapsulate the CR0.TS handling together with the + * software flag. + * + * These generally need preemption protection to work, + * do try to avoid using these on their own. + */ +static inline void __thread_fpu_end(struct task_struct *tsk) +{ + __thread_clear_has_fpu(tsk); + stts(); +} + +static inline void __thread_fpu_begin(struct task_struct *tsk) +{ + clts(); + __thread_set_has_fpu(tsk); +} + +/* + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state and + * sets the new state of the CR0.TS bit. This is + * done within the context of the old process. + * + * - switch_fpu_finish() restores the new state as + * necessary. + */ +typedef struct { int preload; } fpu_switch_t; + +/* + * FIXME! We could do a totally lazy restore, but we need to + * add a per-cpu "this was the task that last touched the FPU + * on this CPU" variable, and the task needs to have a "I last + * touched the FPU on this CPU" and check them. + * + * We don't do that yet, so "fpu_lazy_restore()" always returns + * false, but some day.. + */ +static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) +{ + return new == percpu_read_stable(fpu_owner_task) && + cpu == new->thread.fpu.last_cpu; +} + +static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) +{ + fpu_switch_t fpu; + + fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; + if (__thread_has_fpu(old)) { + if (!__save_init_fpu(old)) + cpu = ~0; + old->thread.fpu.last_cpu = cpu; + old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ + + /* Don't change CR0.TS if we just switch! */ + if (fpu.preload) { + new->fpu_counter++; + __thread_set_has_fpu(new); + prefetch(new->thread.fpu.state); + } else + stts(); + } else { + old->fpu_counter = 0; + old->thread.fpu.last_cpu = ~0; + if (fpu.preload) { + new->fpu_counter++; + if (fpu_lazy_restore(new, cpu)) + fpu.preload = 0; + else + prefetch(new->thread.fpu.state); + __thread_fpu_begin(new); + } + } + return fpu; +} + +/* + * By the time this gets called, we've already cleared CR0.TS and + * given the process the FPU if we are going to preload the FPU + * state - all we need to do is to conditionally restore the register + * state itself. + */ +static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) +{ + if (fpu.preload) { + if (unlikely(restore_fpu_checking(new))) + __thread_fpu_end(new); + } +} + +/* + * Signal frame handlers... + */ +extern int save_i387_xstate(void __user *buf); +extern int restore_i387_xstate(void __user *buf); + +static inline void __clear_fpu(struct task_struct *tsk) +{ + if (__thread_has_fpu(tsk)) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + __thread_fpu_end(tsk); + } +} + +/* + * The actual user_fpu_begin/end() functions + * need to be preemption-safe. + * + * NOTE! user_fpu_end() must be used only after you + * have saved the FP state, and user_fpu_begin() must + * be used only immediately before restoring it. + * These functions do not do any save/restore on + * their own. + */ +static inline void user_fpu_end(void) +{ + preempt_disable(); + __thread_fpu_end(current); + preempt_enable(); +} + +static inline void user_fpu_begin(void) +{ + preempt_disable(); + if (!user_has_fpu()) + __thread_fpu_begin(current); + preempt_enable(); +} + +/* + * These disable preemption on their own and are safe + */ +static inline void save_init_fpu(struct task_struct *tsk) +{ + WARN_ON_ONCE(!__thread_has_fpu(tsk)); + preempt_disable(); + __save_init_fpu(tsk); + __thread_fpu_end(tsk); + preempt_enable(); +} + +static inline void clear_fpu(struct task_struct *tsk) +{ + preempt_disable(); + __clear_fpu(tsk); + preempt_enable(); +} + +/* + * i387 state interaction + */ +static inline unsigned short get_fpu_cwd(struct task_struct *tsk) +{ + if (cpu_has_fxsr) { + return tsk->thread.fpu.state->fxsave.cwd; + } else { + return (unsigned short)tsk->thread.fpu.state->fsave.cwd; + } +} + +static inline unsigned short get_fpu_swd(struct task_struct *tsk) +{ + if (cpu_has_fxsr) { + return tsk->thread.fpu.state->fxsave.swd; + } else { + return (unsigned short)tsk->thread.fpu.state->fsave.swd; + } +} + +static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) +{ + if (cpu_has_xmm) { + return tsk->thread.fpu.state->fxsave.mxcsr; + } else { + return MXCSR_DEFAULT; + } +} + +static bool fpu_allocated(struct fpu *fpu) +{ + return fpu->state != NULL; +} + +static inline int fpu_alloc(struct fpu *fpu) +{ + if (fpu_allocated(fpu)) + return 0; + fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); + if (!fpu->state) + return -ENOMEM; + WARN_ON((unsigned long)fpu->state & 15); + return 0; +} + +static inline void fpu_free(struct fpu *fpu) +{ + if (fpu->state) { + kmem_cache_free(task_xstate_cachep, fpu->state); + fpu->state = NULL; + } +} + +static inline void fpu_copy(struct fpu *dst, struct fpu *src) +{ + memcpy(dst->state, src->state, xstate_size); +} + +extern void fpu_finit(struct fpu *fpu); + +#endif diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 0c1031d354f..7ce0798b1b2 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -13,411 +13,15 @@ #ifndef __ASSEMBLY__ #include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include + +struct pt_regs; +struct user_i387_struct; -extern unsigned int sig_xstate_size; -extern void fpu_init(void); -extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); -extern void math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); - -DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); - -extern user_regset_active_fn fpregs_active, xfpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, - xstateregs_get; -extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, - xstateregs_set; - -/* - * xstateregs_active == fpregs_active. Please refer to the comment - * at the definition of fpregs_active. - */ -#define xstateregs_active fpregs_active - -extern struct _fpx_sw_bytes fx_sw_reserved; -#ifdef CONFIG_IA32_EMULATION -extern unsigned int sig_xstate_ia32_size; -extern struct _fpx_sw_bytes fx_sw_reserved_ia32; -struct _fpstate_ia32; -struct _xstate_ia32; -extern int save_i387_xstate_ia32(void __user *buf); -extern int restore_i387_xstate_ia32(void __user *buf); -#endif - -#ifdef CONFIG_MATH_EMULATION -extern void finit_soft_fpu(struct i387_soft_struct *soft); -#else -static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} -#endif - -#define X87_FSW_ES (1 << 7) /* Exception Summary */ - -static __always_inline __pure bool use_xsaveopt(void) -{ - return static_cpu_has(X86_FEATURE_XSAVEOPT); -} - -static __always_inline __pure bool use_xsave(void) -{ - return static_cpu_has(X86_FEATURE_XSAVE); -} - -static __always_inline __pure bool use_fxsr(void) -{ - return static_cpu_has(X86_FEATURE_FXSR); -} - -extern void __sanitize_i387_state(struct task_struct *); - -static inline void sanitize_i387_state(struct task_struct *tsk) -{ - if (!use_xsaveopt()) - return; - __sanitize_i387_state(tsk); -} - -#ifdef CONFIG_X86_64 -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) -{ - int err; - - /* See comment in fxsave() below. */ -#ifdef CONFIG_AS_FXSAVEQ - asm volatile("1: fxrstorq %[fx]\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err) - : [fx] "m" (*fx), "0" (0)); -#else - asm volatile("1: rex64/fxrstor (%[fx])\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err) - : [fx] "R" (fx), "m" (*fx), "0" (0)); -#endif - return err; -} - -static inline int fxsave_user(struct i387_fxsave_struct __user *fx) -{ - int err; - - /* - * Clear the bytes not touched by the fxsave and reserved - * for the SW usage. - */ - err = __clear_user(&fx->sw_reserved, - sizeof(struct _fpx_sw_bytes)); - if (unlikely(err)) - return -EFAULT; - - /* See comment in fxsave() below. */ -#ifdef CONFIG_AS_FXSAVEQ - asm volatile("1: fxsaveq %[fx]\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err), [fx] "=m" (*fx) - : "0" (0)); -#else - asm volatile("1: rex64/fxsave (%[fx])\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err), "=m" (*fx) - : [fx] "R" (fx), "0" (0)); -#endif - if (unlikely(err) && - __clear_user(fx, sizeof(struct i387_fxsave_struct))) - err = -EFAULT; - /* No need to clear here because the caller clears USED_MATH */ - return err; -} - -static inline void fpu_fxsave(struct fpu *fpu) -{ - /* Using "rex64; fxsave %0" is broken because, if the memory operand - uses any extended registers for addressing, a second REX prefix - will be generated (to the assembler, rex64 followed by semicolon - is a separate instruction), and hence the 64-bitness is lost. */ - -#ifdef CONFIG_AS_FXSAVEQ - /* Using "fxsaveq %0" would be the ideal choice, but is only supported - starting with gas 2.16. */ - __asm__ __volatile__("fxsaveq %0" - : "=m" (fpu->state->fxsave)); -#else - /* Using, as a workaround, the properly prefixed form below isn't - accepted by any binutils version so far released, complaining that - the same type of prefix is used twice if an extended register is - needed for addressing (fix submitted to mainline 2005-11-21). - asm volatile("rex64/fxsave %0" - : "=m" (fpu->state->fxsave)); - This, however, we can work around by forcing the compiler to select - an addressing mode that doesn't require extended registers. */ - asm volatile("rex64/fxsave (%[fx])" - : "=m" (fpu->state->fxsave) - : [fx] "R" (&fpu->state->fxsave)); -#endif -} - -#else /* CONFIG_X86_32 */ - -/* perform fxrstor iff the processor has extended states, otherwise frstor */ -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) -{ - /* - * The "nop" is needed to make the instructions the same - * length. - */ - alternative_input( - "nop ; frstor %1", - "fxrstor %1", - X86_FEATURE_FXSR, - "m" (*fx)); - - return 0; -} - -static inline void fpu_fxsave(struct fpu *fpu) -{ - asm volatile("fxsave %[fx]" - : [fx] "=m" (fpu->state->fxsave)); -} - -#endif /* CONFIG_X86_64 */ - -/* - * These must be called with preempt disabled. Returns - * 'true' if the FPU state is still intact. - */ -static inline int fpu_save_init(struct fpu *fpu) -{ - if (use_xsave()) { - fpu_xsave(fpu); - - /* - * xsave header may indicate the init state of the FP. - */ - if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) - return 1; - } else if (use_fxsr()) { - fpu_fxsave(fpu); - } else { - asm volatile("fnsave %[fx]; fwait" - : [fx] "=m" (fpu->state->fsave)); - return 0; - } - - /* - * If exceptions are pending, we need to clear them so - * that we don't randomly get exceptions later. - * - * FIXME! Is this perhaps only true for the old-style - * irq13 case? Maybe we could leave the x87 state - * intact otherwise? - */ - if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { - asm volatile("fnclex"); - return 0; - } - return 1; -} - -static inline int __save_init_fpu(struct task_struct *tsk) -{ - return fpu_save_init(&tsk->thread.fpu); -} - -static inline int fpu_fxrstor_checking(struct fpu *fpu) -{ - return fxrstor_checking(&fpu->state->fxsave); -} - -static inline int fpu_restore_checking(struct fpu *fpu) -{ - if (use_xsave()) - return fpu_xrstor_checking(fpu); - else - return fpu_fxrstor_checking(fpu); -} - -static inline int restore_fpu_checking(struct task_struct *tsk) -{ - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. "m" is a random variable that should be in L1 */ - alternative_input( - ASM_NOP8 ASM_NOP2, - "emms\n\t" /* clear stack tags */ - "fildl %P[addr]", /* set F?P to defined value */ - X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (tsk->thread.fpu.has_fpu)); - - return fpu_restore_checking(&tsk->thread.fpu); -} - -/* - * Software FPU state helpers. Careful: these need to - * be preemption protection *and* they need to be - * properly paired with the CR0.TS changes! - */ -static inline int __thread_has_fpu(struct task_struct *tsk) -{ - return tsk->thread.fpu.has_fpu; -} - -/* Must be paired with an 'stts' after! */ -static inline void __thread_clear_has_fpu(struct task_struct *tsk) -{ - tsk->thread.fpu.has_fpu = 0; - percpu_write(fpu_owner_task, NULL); -} - -/* Must be paired with a 'clts' before! */ -static inline void __thread_set_has_fpu(struct task_struct *tsk) -{ - tsk->thread.fpu.has_fpu = 1; - percpu_write(fpu_owner_task, tsk); -} - -/* - * Encapsulate the CR0.TS handling together with the - * software flag. - * - * These generally need preemption protection to work, - * do try to avoid using these on their own. - */ -static inline void __thread_fpu_end(struct task_struct *tsk) -{ - __thread_clear_has_fpu(tsk); - stts(); -} - -static inline void __thread_fpu_begin(struct task_struct *tsk) -{ - clts(); - __thread_set_has_fpu(tsk); -} - -/* - * FPU state switching for scheduling. - * - * This is a two-stage process: - * - * - switch_fpu_prepare() saves the old state and - * sets the new state of the CR0.TS bit. This is - * done within the context of the old process. - * - * - switch_fpu_finish() restores the new state as - * necessary. - */ -typedef struct { int preload; } fpu_switch_t; - -/* - * FIXME! We could do a totally lazy restore, but we need to - * add a per-cpu "this was the task that last touched the FPU - * on this CPU" variable, and the task needs to have a "I last - * touched the FPU on this CPU" and check them. - * - * We don't do that yet, so "fpu_lazy_restore()" always returns - * false, but some day.. - */ -static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) -{ - return new == percpu_read_stable(fpu_owner_task) && - cpu == new->thread.fpu.last_cpu; -} - -static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) -{ - fpu_switch_t fpu; - - fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; - if (__thread_has_fpu(old)) { - if (!__save_init_fpu(old)) - cpu = ~0; - old->thread.fpu.last_cpu = cpu; - old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ - - /* Don't change CR0.TS if we just switch! */ - if (fpu.preload) { - new->fpu_counter++; - __thread_set_has_fpu(new); - prefetch(new->thread.fpu.state); - } else - stts(); - } else { - old->fpu_counter = 0; - old->thread.fpu.last_cpu = ~0; - if (fpu.preload) { - new->fpu_counter++; - if (fpu_lazy_restore(new, cpu)) - fpu.preload = 0; - else - prefetch(new->thread.fpu.state); - __thread_fpu_begin(new); - } - } - return fpu; -} - -/* - * By the time this gets called, we've already cleared CR0.TS and - * given the process the FPU if we are going to preload the FPU - * state - all we need to do is to conditionally restore the register - * state itself. - */ -static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) -{ - if (fpu.preload) { - if (unlikely(restore_fpu_checking(new))) - __thread_fpu_end(new); - } -} - -/* - * Signal frame handlers... - */ -extern int save_i387_xstate(void __user *buf); -extern int restore_i387_xstate(void __user *buf); - -static inline void __clear_fpu(struct task_struct *tsk) -{ - if (__thread_has_fpu(tsk)) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - __thread_fpu_end(tsk); - } -} +extern void math_state_restore(void); extern bool irq_fpu_usable(void); extern void kernel_fpu_begin(void); @@ -463,118 +67,14 @@ static inline void irq_ts_restore(int TS_state) * we can just assume we have FPU access - typically * to save the FP state - we'll just take a #NM * fault and get the FPU access back. - * - * The actual user_fpu_begin/end() functions - * need to be preemption-safe, though. - * - * NOTE! user_fpu_end() must be used only after you - * have saved the FP state, and user_fpu_begin() must - * be used only immediately before restoring it. - * These functions do not do any save/restore on - * their own. */ static inline int user_has_fpu(void) { - return __thread_has_fpu(current); -} - -static inline void user_fpu_end(void) -{ - preempt_disable(); - __thread_fpu_end(current); - preempt_enable(); -} - -static inline void user_fpu_begin(void) -{ - preempt_disable(); - if (!user_has_fpu()) - __thread_fpu_begin(current); - preempt_enable(); -} - -/* - * These disable preemption on their own and are safe - */ -static inline void save_init_fpu(struct task_struct *tsk) -{ - WARN_ON_ONCE(!__thread_has_fpu(tsk)); - preempt_disable(); - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - preempt_enable(); + return current->thread.fpu.has_fpu; } extern void unlazy_fpu(struct task_struct *tsk); -static inline void clear_fpu(struct task_struct *tsk) -{ - preempt_disable(); - __clear_fpu(tsk); - preempt_enable(); -} - -/* - * i387 state interaction - */ -static inline unsigned short get_fpu_cwd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.cwd; - } else { - return (unsigned short)tsk->thread.fpu.state->fsave.cwd; - } -} - -static inline unsigned short get_fpu_swd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.swd; - } else { - return (unsigned short)tsk->thread.fpu.state->fsave.swd; - } -} - -static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) -{ - if (cpu_has_xmm) { - return tsk->thread.fpu.state->fxsave.mxcsr; - } else { - return MXCSR_DEFAULT; - } -} - -static bool fpu_allocated(struct fpu *fpu) -{ - return fpu->state != NULL; -} - -static inline int fpu_alloc(struct fpu *fpu) -{ - if (fpu_allocated(fpu)) - return 0; - fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); - if (!fpu->state) - return -ENOMEM; - WARN_ON((unsigned long)fpu->state & 15); - return 0; -} - -static inline void fpu_free(struct fpu *fpu) -{ - if (fpu->state) { - kmem_cache_free(task_xstate_cachep, fpu->state); - fpu->state = NULL; - } -} - -static inline void fpu_copy(struct fpu *dst, struct fpu *src) -{ - memcpy(dst->state, src->state, xstate_size); -} - -extern void fpu_finit(struct fpu *fpu); - #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_I387_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb71b01ab66..89620b1725d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 17b7549c413..7734bcbb5a3 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #ifdef CONFIG_X86_64 @@ -124,7 +125,7 @@ EXPORT_SYMBOL_GPL(xstate_size); unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); static struct i387_fxsave_struct fx_scratch __cpuinitdata; -void __cpuinit mxcsr_feature_mask_init(void) +static void __cpuinit mxcsr_feature_mask_init(void) { unsigned long mask = 0; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 15763af7bfe..c38d84e0102 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -21,6 +21,7 @@ #include #include #include +#include #include struct kmem_cache *task_xstate_cachep; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c08d1ff12b7..ee32dee7a0a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #ifdef CONFIG_MATH_EMULATION #include diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index cfa5c90c01d..5bad3c71e48 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 50267386b76..78f05e438be 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 46a01bdc27e..25edcfc9ba5 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4bbe04d9674..ec61d4c1b93 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 71109111411..e62728e30b0 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -6,6 +6,7 @@ #include #include #include +#include #ifdef CONFIG_IA32_EMULATION #include #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3b4c8d8ad90..246490f643b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1457,7 +1457,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif - if (__thread_has_fpu(current)) + if (user_has_fpu()) clts(); load_gdt(&__get_cpu_var(host_gdt)); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9cbfc069811..b937b6179d8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -57,6 +57,7 @@ #include #include #include +#include /* Ugh! */ #include #include #include diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index f10c0afa1cb..4889655ba78 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -20,6 +20,7 @@ #include #include #include +#include /* pcntxt_mask */ #ifdef CONFIG_X86_32 static struct saved_context saved_context; -- cgit v1.2.3-70-g09d2 From 513c4ec6e4759aa33c90af0658b82eb4d2027871 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 21 Feb 2012 17:25:50 -0800 Subject: x86, cpufeature: Add CPU features from Intel document 319433-012A Add CPU features from the Intel Archicture Instruction Set Extensions Programming Reference version 012A (Feb 2012), document number 319433-012A. Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 8d67d428b0f..0d3dcc9cbab 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -199,10 +199,13 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ #define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ #define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */ #define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */ #define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ +#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) -- cgit v1.2.3-70-g09d2 From d6126ef5f31ca54980cb067af659a360dfcca037 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 26 Jan 2012 15:49:14 -0800 Subject: x86/mce: Convert static array of pointers to per-cpu variables When I previously fixed up the mce_device code, I used a static array of the pointers. It was (rightfully) pointed out to me that I should be using the per_cpu code instead. This patch converts the code over to that structure, moving the variable back into the per_cpu area, like it used to be for 3.2 and earlier. Signed-off-by: Greg Kroah-Hartman Reviewed-by: Srivatsa S. Bhat Link: https://lkml.org/lkml/2012/1/27/165 Signed-off-by: Tony Luck --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- arch/x86/kernel/cpu/mcheck/mce_amd.c | 9 +++++---- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6aefb14cbbc..441520e4174 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -151,7 +151,7 @@ static inline void enable_p5_mce(void) {} void mce_setup(struct mce *m); void mce_log(struct mce *m); -extern struct device *mce_device[CONFIG_NR_CPUS]; +DECLARE_PER_CPU(struct device *, mce_device); /* * Maximum banks number. diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5a11ae2e9e9..4979a5dfeba 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1859,7 +1859,7 @@ static struct bus_type mce_subsys = { .dev_name = "machinecheck", }; -struct device *mce_device[CONFIG_NR_CPUS]; +DEFINE_PER_CPU(struct device *, mce_device); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); @@ -2038,7 +2038,7 @@ static __cpuinit int mce_device_create(unsigned int cpu) goto error2; } cpumask_set_cpu(cpu, mce_device_initialized); - mce_device[cpu] = dev; + per_cpu(mce_device, cpu) = dev; return 0; error2: @@ -2055,7 +2055,7 @@ error: static __cpuinit void mce_device_remove(unsigned int cpu) { - struct device *dev = mce_device[cpu]; + struct device *dev = per_cpu(mce_device, cpu); int i; if (!cpumask_test_cpu(cpu, mce_device_initialized)) @@ -2069,7 +2069,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu) device_unregister(dev); cpumask_clear_cpu(cpu, mce_device_initialized); - mce_device[cpu] = NULL; + per_cpu(mce_device, cpu) = NULL; } /* Make sure there are no machine checks on offlined CPUs. */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 786e76a8632..a4bf9d23cdb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -523,7 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; - struct device *dev = mce_device[cpu]; + struct device *dev = per_cpu(mce_device, cpu); char name[32]; sprintf(name, "threshold_bank%i", bank); @@ -585,7 +585,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - dev = mce_device[i]; + dev = per_cpu(mce_device, i); if (dev) err = sysfs_create_link(&dev->kobj,b->kobj, name); if (err) @@ -665,7 +665,8 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&mce_device[cpu]->kobj, name); + dev = per_cpu(mce_device, cpu); + sysfs_remove_link(&dev->kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -677,7 +678,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - dev = mce_device[i]; + dev = per_cpu(mce_device, i); if (dev) sysfs_remove_link(&dev->kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; -- cgit v1.2.3-70-g09d2 From b4e518547da042fdc65bd4bdafd046fed13337d5 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Fri, 16 Dec 2011 15:50:17 -0700 Subject: irq_domain/x86: Convert x86 (embedded) to use common irq_domain This patch removes the x86-specific definition of irq_domain and replaces it with the common implementation. Signed-off-by: Grant Likely Acked-by: Sebastian Andrzej Siewior Cc: Rob Herring Cc: Thomas Gleixner --- arch/x86/Kconfig | 2 + arch/x86/include/asm/irq_controller.h | 12 ---- arch/x86/include/asm/prom.h | 10 ---- arch/x86/kernel/devicetree.c | 101 ++++++++++------------------------ drivers/net/phy/mdio-gpio.c | 4 +- 5 files changed, 34 insertions(+), 95 deletions(-) delete mode 100644 arch/x86/include/asm/irq_controller.h (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5bed94e189f..e0829a6a466 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -398,6 +398,7 @@ config X86_INTEL_CE select X86_REBOOTFIXUPS select OF select OF_EARLY_FLATTREE + select IRQ_DOMAIN ---help--- Select for the Intel CE media processor (CE4100) SOC. This option compiles in support for the CE4100 SOC for settop @@ -2076,6 +2077,7 @@ config OLPC select GPIOLIB select OF select OF_PROMTREE + select IRQ_DOMAIN ---help--- Add support for detecting the unique features of the OLPC XO hardware. diff --git a/arch/x86/include/asm/irq_controller.h b/arch/x86/include/asm/irq_controller.h deleted file mode 100644 index 423bbbddf36..00000000000 --- a/arch/x86/include/asm/irq_controller.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef __IRQ_CONTROLLER__ -#define __IRQ_CONTROLLER__ - -struct irq_domain { - int (*xlate)(struct irq_domain *h, const u32 *intspec, u32 intsize, - u32 *out_hwirq, u32 *out_type); - void *priv; - struct device_node *controller; - struct list_head l; -}; - -#endif diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index 644dd885f05..60bef663609 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -21,7 +21,6 @@ #include #include #include -#include #ifdef CONFIG_OF extern int of_ioapic; @@ -43,15 +42,6 @@ extern char cmd_line[COMMAND_LINE_SIZE]; #define pci_address_to_pio pci_address_to_pio unsigned long pci_address_to_pio(phys_addr_t addr); -/** - * irq_dispose_mapping - Unmap an interrupt - * @virq: linux virq number of the interrupt to unmap - * - * FIXME: We really should implement proper virq handling like power, - * but that's going to be major surgery. - */ -static inline void irq_dispose_mapping(unsigned int virq) { } - #define HAVE_ARCH_DEVTREE_FIXUPS #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 52821799a70..3ae2ced4a87 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -17,64 +18,14 @@ #include #include -#include #include #include __initdata u64 initial_dtb; char __initdata cmd_line[COMMAND_LINE_SIZE]; -static LIST_HEAD(irq_domains); -static DEFINE_RAW_SPINLOCK(big_irq_lock); int __initdata of_ioapic; -#ifdef CONFIG_X86_IO_APIC -static void add_interrupt_host(struct irq_domain *ih) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&big_irq_lock, flags); - list_add(&ih->l, &irq_domains); - raw_spin_unlock_irqrestore(&big_irq_lock, flags); -} -#endif - -static struct irq_domain *get_ih_from_node(struct device_node *controller) -{ - struct irq_domain *ih, *found = NULL; - unsigned long flags; - - raw_spin_lock_irqsave(&big_irq_lock, flags); - list_for_each_entry(ih, &irq_domains, l) { - if (ih->controller == controller) { - found = ih; - break; - } - } - raw_spin_unlock_irqrestore(&big_irq_lock, flags); - return found; -} - -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) -{ - struct irq_domain *ih; - u32 virq, type; - int ret; - - ih = get_ih_from_node(controller); - if (!ih) - return 0; - ret = ih->xlate(ih, intspec, intsize, &virq, &type); - if (ret) - return 0; - if (type == IRQ_TYPE_NONE) - return virq; - irq_set_irq_type(virq, type); - return virq; -} -EXPORT_SYMBOL_GPL(irq_create_of_mapping); - unsigned long pci_address_to_pio(phys_addr_t address) { /* @@ -354,36 +305,43 @@ static struct of_ioapic_type of_ioapic_type[] = }, }; -static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, - u32 *out_hwirq, u32 *out_type) +static int ioapic_xlate(struct irq_domain *domain, + struct device_node *controller, + const u32 *intspec, u32 intsize, + irq_hw_number_t *out_hwirq, u32 *out_type) { - struct mp_ioapic_gsi *gsi_cfg; struct io_apic_irq_attr attr; struct of_ioapic_type *it; - u32 line, idx, type; + u32 line, idx; + int rc; - if (intsize < 2) + if (WARN_ON(intsize < 2)) return -EINVAL; - line = *intspec; - idx = (u32) id->priv; - gsi_cfg = mp_ioapic_gsi_routing(idx); - *out_hwirq = line + gsi_cfg->gsi_base; - - intspec++; - type = *intspec; + line = intspec[0]; - if (type >= ARRAY_SIZE(of_ioapic_type)) + if (intspec[1] >= ARRAY_SIZE(of_ioapic_type)) return -EINVAL; - it = of_ioapic_type + type; - *out_type = it->out_type; + it = &of_ioapic_type[intspec[1]]; + idx = (u32) domain->host_data; set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); - return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr); + rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line), + cpu_to_node(0), &attr); + if (rc) + return rc; + + *out_hwirq = line; + *out_type = it->out_type; + return 0; } +const struct irq_domain_ops ioapic_irq_domain_ops = { + .xlate = ioapic_xlate, +}; + static void __init ioapic_add_ofnode(struct device_node *np) { struct resource r; @@ -399,13 +357,14 @@ static void __init ioapic_add_ofnode(struct device_node *np) for (i = 0; i < nr_ioapics; i++) { if (r.start == mpc_ioapic_addr(i)) { struct irq_domain *id; + struct mp_ioapic_gsi *gsi_cfg; + + gsi_cfg = mp_ioapic_gsi_routing(i); - id = kzalloc(sizeof(*id), GFP_KERNEL); + id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0, + &ioapic_irq_domain_ops, + (void*)i); BUG_ON(!id); - id->controller = np; - id->xlate = ioapic_xlate; - id->priv = (void *)i; - add_interrupt_host(id); return; } } diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 50e8e5e7446..7189adf54bd 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -255,13 +255,13 @@ static inline int __init mdio_ofgpio_init(void) return platform_driver_register(&mdio_ofgpio_driver); } -static inline void __exit mdio_ofgpio_exit(void) +static inline void mdio_ofgpio_exit(void) { platform_driver_unregister(&mdio_ofgpio_driver); } #else static inline int __init mdio_ofgpio_init(void) { return 0; } -static inline void __exit mdio_ofgpio_exit(void) { } +static inline void mdio_ofgpio_exit(void) { } #endif /* CONFIG_OF_GPIO */ static struct platform_driver mdio_gpio_driver = { -- cgit v1.2.3-70-g09d2 From 1adbfa3511ee1c1118e16a9a0246870f12fef4e6 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Sun, 12 Feb 2012 13:24:29 -0800 Subject: x86, efi: Allow basic init with mixed 32/64-bit efi/kernel Traditionally the kernel has refused to setup EFI at all if there's been a mismatch in 32/64-bit mode between EFI and the kernel. On some platforms that boot natively through EFI (Chrome OS being one), we still need to get at least some of the static data such as memory configuration out of EFI. Runtime services aren't as critical, and it's a significant amount of work to implement switching between the operating modes to call between kernel and firmware for thise cases. So I'm ignoring it for now. v5: * Fixed some printk strings based on feedback * Renamed 32/64-bit specific types to not have _ prefix * Fixed bug in printout of efi runtime disablement v4: * Some of the earlier cleanup was accidentally reverted by this patch, fixed. * Reworded some messages to not have to line wrap printk strings v3: * Reorganized to a series of patches to make it easier to review, and do some of the cleanups I had left out before. v2: * Added graceful error handling for 32-bit kernel that gets passed EFI data above 4GB. * Removed some warnings that were missed in first version. Signed-off-by: Olof Johansson Link: http://lkml.kernel.org/r/1329081869-20779-6-git-send-email-olof@lixom.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/efi.h | 2 +- arch/x86/kernel/setup.c | 10 ++- arch/x86/platform/efi/efi.c | 164 ++++++++++++++++++++++++++++++++++++++------ include/linux/efi.h | 45 ++++++++++++ 4 files changed, 196 insertions(+), 25 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 844f735fd63..c9dcc181d4d 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -95,7 +95,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, extern int add_efi_memmap; extern void efi_set_executable(efi_memory_desc_t *md, bool executable); -extern void efi_memblock_x86_reserve_range(void); +extern int efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d7d5099fe87..88638883176 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -749,10 +749,16 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - EFI_LOADER_SIGNATURE, 4)) { + "EL32", 4)) { efi_enabled = 1; - efi_memblock_x86_reserve_range(); + efi_64bit = false; + } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + "EL64", 4)) { + efi_enabled = 1; + efi_64bit = true; } + if (efi_enabled && efi_memblock_x86_reserve_range()) + efi_enabled = 0; #endif x86_init.oem.arch_setup(); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 5a053e7737b..92660edaa1e 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -68,6 +68,9 @@ EXPORT_SYMBOL(efi); struct efi_memory_map memmap; +bool efi_64bit; +static bool efi_native; + static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; @@ -339,11 +342,16 @@ static void __init do_add_efi_memmap(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } -void __init efi_memblock_x86_reserve_range(void) +int __init efi_memblock_x86_reserve_range(void) { unsigned long pmap; #ifdef CONFIG_X86_32 + /* Can't handle data above 4GB at this time */ + if (boot_params.efi_info.efi_memmap_hi) { + pr_err("Memory map is above 4GB, disabling EFI.\n"); + return -EINVAL; + } pmap = boot_params.efi_info.efi_memmap; #else pmap = (boot_params.efi_info.efi_memmap | @@ -355,6 +363,8 @@ void __init efi_memblock_x86_reserve_range(void) memmap.desc_version = boot_params.efi_info.efi_memdesc_version; memmap.desc_size = boot_params.efi_info.efi_memdesc_size; memblock_reserve(pmap, memmap.nr_map * memmap.desc_size); + + return 0; } #if EFI_DEBUG @@ -432,14 +442,75 @@ static void __init efi_free_boot_services(void) static int __init efi_systab_init(void *phys) { - efi.systab = early_ioremap((unsigned long)efi_phys.systab, - sizeof(efi_system_table_t)); - if (efi.systab == NULL) { - pr_err("Couldn't map the system table!\n"); - return -ENOMEM; + if (efi_64bit) { + efi_system_table_64_t *systab64; + u64 tmp = 0; + + systab64 = early_ioremap((unsigned long)phys, + sizeof(*systab64)); + if (systab64 == NULL) { + pr_err("Couldn't map the system table!\n"); + return -ENOMEM; + } + + efi_systab.hdr = systab64->hdr; + efi_systab.fw_vendor = systab64->fw_vendor; + tmp |= systab64->fw_vendor; + efi_systab.fw_revision = systab64->fw_revision; + efi_systab.con_in_handle = systab64->con_in_handle; + tmp |= systab64->con_in_handle; + efi_systab.con_in = systab64->con_in; + tmp |= systab64->con_in; + efi_systab.con_out_handle = systab64->con_out_handle; + tmp |= systab64->con_out_handle; + efi_systab.con_out = systab64->con_out; + tmp |= systab64->con_out; + efi_systab.stderr_handle = systab64->stderr_handle; + tmp |= systab64->stderr_handle; + efi_systab.stderr = systab64->stderr; + tmp |= systab64->stderr; + efi_systab.runtime = (void *)(unsigned long)systab64->runtime; + tmp |= systab64->runtime; + efi_systab.boottime = (void *)(unsigned long)systab64->boottime; + tmp |= systab64->boottime; + efi_systab.nr_tables = systab64->nr_tables; + efi_systab.tables = systab64->tables; + tmp |= systab64->tables; + + early_iounmap(systab64, sizeof(*systab64)); +#ifdef CONFIG_X86_32 + if (tmp >> 32) { + pr_err("EFI data located above 4GB, disabling EFI.\n"); + return -EINVAL; + } +#endif + } else { + efi_system_table_32_t *systab32; + + systab32 = early_ioremap((unsigned long)phys, + sizeof(*systab32)); + if (systab32 == NULL) { + pr_err("Couldn't map the system table!\n"); + return -ENOMEM; + } + + efi_systab.hdr = systab32->hdr; + efi_systab.fw_vendor = systab32->fw_vendor; + efi_systab.fw_revision = systab32->fw_revision; + efi_systab.con_in_handle = systab32->con_in_handle; + efi_systab.con_in = systab32->con_in; + efi_systab.con_out_handle = systab32->con_out_handle; + efi_systab.con_out = systab32->con_out; + efi_systab.stderr_handle = systab32->stderr_handle; + efi_systab.stderr = systab32->stderr; + efi_systab.runtime = (void *)(unsigned long)systab32->runtime; + efi_systab.boottime = (void *)(unsigned long)systab32->boottime; + efi_systab.nr_tables = systab32->nr_tables; + efi_systab.tables = systab32->tables; + + early_iounmap(systab32, sizeof(*systab32)); } - memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t)); - early_iounmap(efi.systab, sizeof(efi_system_table_t)); + efi.systab = &efi_systab; /* @@ -460,24 +531,47 @@ static int __init efi_systab_init(void *phys) static int __init efi_config_init(u64 tables, int nr_tables) { - efi_config_table_t *config_tables; - int i, sz = sizeof(efi_config_table_t); + void *config_tables, *tablep; + int i, sz; + + if (efi_64bit) + sz = sizeof(efi_config_table_64_t); + else + sz = sizeof(efi_config_table_32_t); /* * Let's see what config tables the firmware passed to us. */ - config_tables = early_ioremap(efi.systab->tables, - efi.systab->nr_tables * sz); + config_tables = early_ioremap(tables, nr_tables * sz); if (config_tables == NULL) { pr_err("Could not map Configuration table!\n"); return -ENOMEM; } + tablep = config_tables; pr_info(""); for (i = 0; i < efi.systab->nr_tables; i++) { - efi_guid_t guid = config_tables[i].guid; - unsigned long table = config_tables[i].table; - + efi_guid_t guid; + unsigned long table; + + if (efi_64bit) { + u64 table64; + guid = ((efi_config_table_64_t *)tablep)->guid; + table64 = ((efi_config_table_64_t *)tablep)->table; + table = table64; +#ifdef CONFIG_X86_32 + if (table64 >> 32) { + pr_cont("\n"); + pr_err("Table located above 4GB, disabling EFI.\n"); + early_iounmap(config_tables, + efi.systab->nr_tables * sz); + return -EINVAL; + } +#endif + } else { + guid = ((efi_config_table_32_t *)tablep)->guid; + table = ((efi_config_table_32_t *)tablep)->table; + } if (!efi_guidcmp(guid, MPS_TABLE_GUID)) { efi.mps = table; pr_cont(" MPS=0x%lx ", table); @@ -502,10 +596,10 @@ static int __init efi_config_init(u64 tables, int nr_tables) efi.uga = table; pr_cont(" UGA=0x%lx ", table); } + tablep += sz; } pr_cont("\n"); early_iounmap(config_tables, efi.systab->nr_tables * sz); - return 0; } @@ -569,11 +663,19 @@ void __init efi_init(void) void *tmp; #ifdef CONFIG_X86_32 + if (boot_params.efi_info.efi_systab_hi || + boot_params.efi_info.efi_memmap_hi) { + pr_info("Table located above 4GB, disabling EFI.\n"); + efi_enabled = 0; + return; + } efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; + efi_native = !efi_64bit; #else efi_phys.systab = (efi_system_table_t *) - (boot_params.efi_info.efi_systab | - ((__u64)boot_params.efi_info.efi_systab_hi<<32)); + (boot_params.efi_info.efi_systab | + ((__u64)boot_params.efi_info.efi_systab_hi<<32)); + efi_native = efi_64bit; #endif if (efi_systab_init(efi_phys.systab)) { @@ -602,7 +704,14 @@ void __init efi_init(void) return; } - if (efi_runtime_init()) { + /* + * Note: We currently don't support runtime services on an EFI + * that doesn't match the kernel 32/64-bit mode. + */ + + if (!efi_native) + pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); + else if (efi_runtime_init()) { efi_enabled = 0; return; } @@ -611,10 +720,11 @@ void __init efi_init(void) efi_enabled = 0; return; } - #ifdef CONFIG_X86_32 - x86_platform.get_wallclock = efi_get_time; - x86_platform.set_wallclock = efi_set_rtc_mmss; + if (efi_native) { + x86_platform.get_wallclock = efi_get_time; + x86_platform.set_wallclock = efi_set_rtc_mmss; + } #endif #if EFI_DEBUG @@ -672,6 +782,14 @@ void __init efi_enter_virtual_mode(void) efi.systab = NULL; + /* + * We don't do virtual mode, since we don't do runtime services, on + * non-native EFI + */ + + if (!efi_native) + goto out; + /* Merge contiguous regions of the same type and attribute */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { u64 prev_size; @@ -787,6 +905,8 @@ void __init efi_enter_virtual_mode(void) efi.query_capsule_caps = virt_efi_query_capsule_caps; if (__supported_pte_mask & _PAGE_NX) runtime_code_page_mkexec(); + +out: early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); memmap.map = NULL; kfree(new_memmap); diff --git a/include/linux/efi.h b/include/linux/efi.h index 37c300712e0..47fbf6b3dc7 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -313,6 +313,16 @@ typedef efi_status_t efi_query_capsule_caps_t(efi_capsule_header_t **capsules, #define EFI_FILE_SYSTEM_GUID \ EFI_GUID( 0x964e5b22, 0x6459, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b ) +typedef struct { + efi_guid_t guid; + u64 table; +} efi_config_table_64_t; + +typedef struct { + efi_guid_t guid; + u32 table; +} efi_config_table_32_t; + typedef struct { efi_guid_t guid; unsigned long table; @@ -327,6 +337,40 @@ typedef struct { #define EFI_1_10_SYSTEM_TABLE_REVISION ((1 << 16) | (10)) #define EFI_1_02_SYSTEM_TABLE_REVISION ((1 << 16) | (02)) +typedef struct { + efi_table_hdr_t hdr; + u64 fw_vendor; /* physical addr of CHAR16 vendor string */ + u32 fw_revision; + u32 __pad1; + u64 con_in_handle; + u64 con_in; + u64 con_out_handle; + u64 con_out; + u64 stderr_handle; + u64 stderr; + u64 runtime; + u64 boottime; + u32 nr_tables; + u32 __pad2; + u64 tables; +} efi_system_table_64_t; + +typedef struct { + efi_table_hdr_t hdr; + u32 fw_vendor; /* physical addr of CHAR16 vendor string */ + u32 fw_revision; + u32 con_in_handle; + u32 con_in; + u32 con_out_handle; + u32 con_out; + u32 stderr_handle; + u32 stderr; + u32 runtime; + u32 boottime; + u32 nr_tables; + u32 tables; +} efi_system_table_32_t; + typedef struct { efi_table_hdr_t hdr; unsigned long fw_vendor; /* physical addr of CHAR16 vendor string */ @@ -497,6 +541,7 @@ extern int __init efi_setup_pcdp_console(char *); #ifdef CONFIG_EFI # ifdef CONFIG_X86 extern int efi_enabled; + extern bool efi_64bit; # else # define efi_enabled 1 # endif -- cgit v1.2.3-70-g09d2 From c5905afb0ee6550b42c49213da1c22d67316c194 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Feb 2012 08:31:31 +0100 Subject: static keys: Introduce 'struct static_key', static_key_true()/false() and static_key_slow_[inc|dec]() So here's a boot tested patch on top of Jason's series that does all the cleanups I talked about and turns jump labels into a more intuitive to use facility. It should also address the various misconceptions and confusions that surround jump labels. Typical usage scenarios: #include struct static_key key = STATIC_KEY_INIT_TRUE; if (static_key_false(&key)) do unlikely code else do likely code Or: if (static_key_true(&key)) do likely code else do unlikely code The static key is modified via: static_key_slow_inc(&key); ... static_key_slow_dec(&key); The 'slow' prefix makes it abundantly clear that this is an expensive operation. I've updated all in-kernel code to use this everywhere. Note that I (intentionally) have not pushed through the rename blindly through to the lowest levels: the actual jump-label patching arch facility should be named like that, so we want to decouple jump labels from the static-key facility a bit. On non-jump-label enabled architectures static keys default to likely()/unlikely() branches. Signed-off-by: Ingo Molnar Acked-by: Jason Baron Acked-by: Steven Rostedt Cc: a.p.zijlstra@chello.nl Cc: mathieu.desnoyers@efficios.com Cc: davem@davemloft.net Cc: ddaney.cavm@gmail.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20120222085809.GA26397@elte.hu Signed-off-by: Ingo Molnar --- arch/Kconfig | 29 ++++--- arch/ia64/include/asm/paravirt.h | 6 +- arch/ia64/kernel/paravirt.c | 4 +- arch/mips/include/asm/jump_label.h | 2 +- arch/powerpc/include/asm/jump_label.h | 2 +- arch/s390/include/asm/jump_label.h | 2 +- arch/sparc/include/asm/jump_label.h | 2 +- arch/x86/include/asm/jump_label.h | 6 +- arch/x86/include/asm/paravirt.h | 6 +- arch/x86/kernel/kvm.c | 4 +- arch/x86/kernel/paravirt.c | 4 +- arch/x86/kvm/mmu_audit.c | 8 +- include/linux/jump_label.h | 139 ++++++++++++++++++++++++---------- include/linux/netdevice.h | 4 +- include/linux/netfilter.h | 6 +- include/linux/perf_event.h | 12 +-- include/linux/static_key.h | 1 + include/linux/tracepoint.h | 8 +- include/net/sock.h | 6 +- kernel/events/core.c | 16 ++-- kernel/jump_label.c | 128 ++++++++++++++++++------------- kernel/sched/core.c | 18 ++--- kernel/sched/fair.c | 8 +- kernel/sched/sched.h | 14 ++-- kernel/tracepoint.c | 20 ++--- net/core/dev.c | 24 +++--- net/core/net-sysfs.c | 4 +- net/core/sock.c | 4 +- net/core/sysctl_net_core.c | 4 +- net/ipv4/tcp_memcontrol.c | 6 +- net/netfilter/core.c | 6 +- 31 files changed, 298 insertions(+), 205 deletions(-) create mode 100644 include/linux/static_key.h (limited to 'arch/x86/include') diff --git a/arch/Kconfig b/arch/Kconfig index 4f55c736be1..5b448a74d0f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -47,18 +47,29 @@ config KPROBES If in doubt, say "N". config JUMP_LABEL - bool "Optimize trace point call sites" + bool "Optimize very unlikely/likely branches" depends on HAVE_ARCH_JUMP_LABEL help + This option enables a transparent branch optimization that + makes certain almost-always-true or almost-always-false branch + conditions even cheaper to execute within the kernel. + + Certain performance-sensitive kernel code, such as trace points, + scheduler functionality, networking code and KVM have such + branches and include support for this optimization technique. + If it is detected that the compiler has support for "asm goto", - the kernel will compile trace point locations with just a - nop instruction. When trace points are enabled, the nop will - be converted to a jump to the trace function. This technique - lowers overhead and stress on the branch prediction of the - processor. - - On i386, options added to the compiler flags may increase - the size of the kernel slightly. + the kernel will compile such branches with just a nop + instruction. When the condition flag is toggled to true, the + nop will be converted to a jump instruction to execute the + conditional block of instructions. + + This technique lowers overhead and stress on the branch prediction + of the processor and generally makes the kernel faster. The update + of the condition is slower, but those are always very rare. + + ( On 32-bit x86, the necessary options added to the compiler + flags may increase the size of the kernel slightly. ) config OPTPROBES def_bool y diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h index 32551d304cd..b149b88ea79 100644 --- a/arch/ia64/include/asm/paravirt.h +++ b/arch/ia64/include/asm/paravirt.h @@ -281,9 +281,9 @@ paravirt_init_missing_ticks_accounting(int cpu) pv_time_ops.init_missing_ticks_accounting(cpu); } -struct jump_label_key; -extern struct jump_label_key paravirt_steal_enabled; -extern struct jump_label_key paravirt_steal_rq_enabled; +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; static inline int paravirt_do_steal_accounting(unsigned long *new_itm) diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c index 100868216c5..1b22f6de293 100644 --- a/arch/ia64/kernel/paravirt.c +++ b/arch/ia64/kernel/paravirt.c @@ -634,8 +634,8 @@ struct pv_irq_ops pv_irq_ops = { * pv_time_ops * time operations */ -struct jump_label_key paravirt_steal_enabled; -struct jump_label_key paravirt_steal_rq_enabled; +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; static int ia64_native_do_steal_accounting(unsigned long *new_itm) diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index 1881b316ca4..4d6d77ed9b9 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -20,7 +20,7 @@ #define WORD_INSN ".word" #endif -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\tnop\n\t" "nop\n\t" diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h index 938986e412f..ae098c438f0 100644 --- a/arch/powerpc/include/asm/jump_label.h +++ b/arch/powerpc/include/asm/jump_label.h @@ -17,7 +17,7 @@ #define JUMP_ENTRY_TYPE stringify_in_c(FTR_ENTRY_LONG) #define JUMP_LABEL_NOP_SIZE 4 -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\n\t" "nop\n\t" diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index 95a6cf2b5b6..6c32190dc73 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -13,7 +13,7 @@ #define ASM_ALIGN ".balign 4" #endif -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("0: brcl 0,0\n" ".pushsection __jump_table, \"aw\"\n" diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h index fc73a82366f..5080d16a832 100644 --- a/arch/sparc/include/asm/jump_label.h +++ b/arch/sparc/include/asm/jump_label.h @@ -7,7 +7,7 @@ #define JUMP_LABEL_NOP_SIZE 4 -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\n\t" "nop\n\t" diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index a32b18ce6ea..3a16c1483b4 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -9,12 +9,12 @@ #define JUMP_LABEL_NOP_SIZE 5 -#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" +#define STATIC_KEY_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:" - JUMP_LABEL_INITIAL_NOP + STATIC_KEY_INITIAL_NOP ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" _ASM_PTR "1b, %l[l_yes], %c0 \n\t" diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a7d2db9a74f..c0180fd372d 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -230,9 +230,9 @@ static inline unsigned long long paravirt_sched_clock(void) return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } -struct jump_label_key; -extern struct jump_label_key paravirt_steal_enabled; -extern struct jump_label_key paravirt_steal_rq_enabled; +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; static inline u64 paravirt_steal_clock(int cpu) { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f0c6fd6f176..694d801bf60 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -438,9 +438,9 @@ void __init kvm_guest_init(void) static __init int activate_jump_labels(void) { if (has_steal_clock) { - jump_label_inc(¶virt_steal_enabled); + static_key_slow_inc(¶virt_steal_enabled); if (steal_acc) - jump_label_inc(¶virt_steal_rq_enabled); + static_key_slow_inc(¶virt_steal_rq_enabled); } return 0; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index d90272e6bc4..ada2f99388d 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -202,8 +202,8 @@ static void native_flush_tlb_single(unsigned long addr) __native_flush_tlb_single(addr); } -struct jump_label_key paravirt_steal_enabled; -struct jump_label_key paravirt_steal_rq_enabled; +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; static u64 native_steal_clock(int cpu) { diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index fe15dcc07a6..ea7b4fd3467 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -234,7 +234,7 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) } static bool mmu_audit; -static struct jump_label_key mmu_audit_key; +static struct static_key mmu_audit_key; static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { @@ -250,7 +250,7 @@ static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { - if (static_branch((&mmu_audit_key))) + if (static_key_false((&mmu_audit_key))) __kvm_mmu_audit(vcpu, point); } @@ -259,7 +259,7 @@ static void mmu_audit_enable(void) if (mmu_audit) return; - jump_label_inc(&mmu_audit_key); + static_key_slow_inc(&mmu_audit_key); mmu_audit = true; } @@ -268,7 +268,7 @@ static void mmu_audit_disable(void) if (!mmu_audit) return; - jump_label_dec(&mmu_audit_key); + static_key_slow_dec(&mmu_audit_key); mmu_audit = false; } diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index f7c69580fea..2172da2d9bb 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -9,15 +9,15 @@ * * Jump labels provide an interface to generate dynamic branches using * self-modifying code. Assuming toolchain and architecture support the result - * of a "if (static_branch(&key))" statement is a unconditional branch (which + * of a "if (static_key_false(&key))" statement is a unconditional branch (which * defaults to false - and the true block is placed out of line). * - * However at runtime we can change the 'static' branch target using - * jump_label_{inc,dec}(). These function as a 'reference' count on the key + * However at runtime we can change the branch target using + * static_key_slow_{inc,dec}(). These function as a 'reference' count on the key * object and for as long as there are references all branches referring to * that particular key will point to the (out of line) true block. * - * Since this relies on modifying code the jump_label_{inc,dec}() functions + * Since this relies on modifying code the static_key_slow_{inc,dec}() functions * must be considered absolute slow paths (machine wide synchronization etc.). * OTOH, since the affected branches are unconditional their runtime overhead * will be absolutely minimal, esp. in the default (off) case where the total @@ -26,12 +26,26 @@ * * When the control is directly exposed to userspace it is prudent to delay the * decrement to avoid high frequency code modifications which can (and do) - * cause significant performance degradation. Struct jump_label_key_deferred and - * jump_label_dec_deferred() provide for this. + * cause significant performance degradation. Struct static_key_deferred and + * static_key_slow_dec_deferred() provide for this. * * Lacking toolchain and or architecture support, it falls back to a simple * conditional branch. - */ + * + * struct static_key my_key = STATIC_KEY_INIT_TRUE; + * + * if (static_key_true(&my_key)) { + * } + * + * will result in the true case being in-line and starts the key with a single + * reference. Mixing static_key_true() and static_key_false() on the same key is not + * allowed. + * + * Not initializing the key (static data is initialized to 0s anyway) is the + * same as using STATIC_KEY_INIT_FALSE and static_key_false() is + * equivalent with static_branch(). + * +*/ #include #include @@ -39,16 +53,17 @@ #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) -struct jump_label_key { +struct static_key { atomic_t enabled; +/* Set lsb bit to 1 if branch is default true, 0 ot */ struct jump_entry *entries; #ifdef CONFIG_MODULES - struct jump_label_mod *next; + struct static_key_mod *next; #endif }; -struct jump_label_key_deferred { - struct jump_label_key key; +struct static_key_deferred { + struct static_key key; unsigned long timeout; struct delayed_work work; }; @@ -66,13 +81,34 @@ struct module; #ifdef HAVE_JUMP_LABEL -#ifdef CONFIG_MODULES -#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL, NULL} -#else -#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL} -#endif +#define JUMP_LABEL_TRUE_BRANCH 1UL + +static +inline struct jump_entry *jump_label_get_entries(struct static_key *key) +{ + return (struct jump_entry *)((unsigned long)key->entries + & ~JUMP_LABEL_TRUE_BRANCH); +} + +static inline bool jump_label_get_branch_default(struct static_key *key) +{ + if ((unsigned long)key->entries & JUMP_LABEL_TRUE_BRANCH) + return true; + return false; +} + +static __always_inline bool static_key_false(struct static_key *key) +{ + return arch_static_branch(key); +} -static __always_inline bool static_branch(struct jump_label_key *key) +static __always_inline bool static_key_true(struct static_key *key) +{ + return !static_key_false(key); +} + +/* Deprecated. Please use 'static_key_false() instead. */ +static __always_inline bool static_branch(struct static_key *key) { return arch_static_branch(key); } @@ -88,21 +124,24 @@ extern void arch_jump_label_transform(struct jump_entry *entry, extern void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type); extern int jump_label_text_reserved(void *start, void *end); -extern void jump_label_inc(struct jump_label_key *key); -extern void jump_label_dec(struct jump_label_key *key); -extern void jump_label_dec_deferred(struct jump_label_key_deferred *key); -extern bool jump_label_enabled(struct jump_label_key *key); +extern void static_key_slow_inc(struct static_key *key); +extern void static_key_slow_dec(struct static_key *key); +extern void static_key_slow_dec_deferred(struct static_key_deferred *key); +extern bool static_key_enabled(struct static_key *key); extern void jump_label_apply_nops(struct module *mod); -extern void jump_label_rate_limit(struct jump_label_key_deferred *key, - unsigned long rl); +extern void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); + +#define STATIC_KEY_INIT_TRUE ((struct static_key) \ + { .enabled = ATOMIC_INIT(1), .entries = (void *)1 }) +#define STATIC_KEY_INIT_FALSE ((struct static_key) \ + { .enabled = ATOMIC_INIT(0), .entries = (void *)0 }) #else /* !HAVE_JUMP_LABEL */ #include -#define JUMP_LABEL_INIT {ATOMIC_INIT(0)} - -struct jump_label_key { +struct static_key { atomic_t enabled; }; @@ -110,30 +149,45 @@ static __always_inline void jump_label_init(void) { } -struct jump_label_key_deferred { - struct jump_label_key key; +struct static_key_deferred { + struct static_key key; }; -static __always_inline bool static_branch(struct jump_label_key *key) +static __always_inline bool static_key_false(struct static_key *key) +{ + if (unlikely(atomic_read(&key->enabled)) > 0) + return true; + return false; +} + +static __always_inline bool static_key_true(struct static_key *key) { - if (unlikely(atomic_read(&key->enabled))) + if (likely(atomic_read(&key->enabled)) > 0) return true; return false; } -static inline void jump_label_inc(struct jump_label_key *key) +/* Deprecated. Please use 'static_key_false() instead. */ +static __always_inline bool static_branch(struct static_key *key) +{ + if (unlikely(atomic_read(&key->enabled)) > 0) + return true; + return false; +} + +static inline void static_key_slow_inc(struct static_key *key) { atomic_inc(&key->enabled); } -static inline void jump_label_dec(struct jump_label_key *key) +static inline void static_key_slow_dec(struct static_key *key) { atomic_dec(&key->enabled); } -static inline void jump_label_dec_deferred(struct jump_label_key_deferred *key) +static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) { - jump_label_dec(&key->key); + static_key_slow_dec(&key->key); } static inline int jump_label_text_reserved(void *start, void *end) @@ -144,9 +198,9 @@ static inline int jump_label_text_reserved(void *start, void *end) static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} -static inline bool jump_label_enabled(struct jump_label_key *key) +static inline bool static_key_enabled(struct static_key *key) { - return !!atomic_read(&key->enabled); + return (atomic_read(&key->enabled) > 0); } static inline int jump_label_apply_nops(struct module *mod) @@ -154,13 +208,20 @@ static inline int jump_label_apply_nops(struct module *mod) return 0; } -static inline void jump_label_rate_limit(struct jump_label_key_deferred *key, +static inline void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { } + +#define STATIC_KEY_INIT_TRUE ((struct static_key) \ + { .enabled = ATOMIC_INIT(1) }) +#define STATIC_KEY_INIT_FALSE ((struct static_key) \ + { .enabled = ATOMIC_INIT(0) }) + #endif /* HAVE_JUMP_LABEL */ -#define jump_label_key_enabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(1), }) -#define jump_label_key_disabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(0), }) +#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE +#define jump_label_enabled static_key_enabled #endif /* _LINUX_JUMP_LABEL_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0eac07c9525..7dfaae7846a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -214,8 +214,8 @@ enum { #include #ifdef CONFIG_RPS -#include -extern struct jump_label_key rps_needed; +#include +extern struct static_key rps_needed; #endif struct neighbour; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index b809265607d..29734be334c 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -163,13 +163,13 @@ extern struct ctl_path nf_net_ipv4_netfilter_sysctl_path[]; extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #if defined(CONFIG_JUMP_LABEL) -#include -extern struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +#include +extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook) { if (__builtin_constant_p(pf) && __builtin_constant_p(hook)) - return static_branch(&nf_hooks_needed[pf][hook]); + return static_key_false(&nf_hooks_needed[pf][hook]); return !list_empty(&nf_hooks[pf][hook]); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 412b790f5da..0d21e6f1cf5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -514,7 +514,7 @@ struct perf_guest_info_callbacks { #include #include #include -#include +#include #include #include @@ -1038,7 +1038,7 @@ static inline int is_software_event(struct perf_event *event) return event->pmu->task_ctx_nr == perf_sw_context; } -extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); @@ -1066,7 +1066,7 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct pt_regs hot_regs; - if (static_branch(&perf_swevent_enabled[event_id])) { + if (static_key_false(&perf_swevent_enabled[event_id])) { if (!regs) { perf_fetch_caller_regs(&hot_regs); regs = &hot_regs; @@ -1075,12 +1075,12 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) } } -extern struct jump_label_key_deferred perf_sched_events; +extern struct static_key_deferred perf_sched_events; static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { - if (static_branch(&perf_sched_events.key)) + if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_in(prev, task); } @@ -1089,7 +1089,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); - if (static_branch(&perf_sched_events.key)) + if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_out(prev, next); } diff --git a/include/linux/static_key.h b/include/linux/static_key.h new file mode 100644 index 00000000000..27bd3f8a085 --- /dev/null +++ b/include/linux/static_key.h @@ -0,0 +1 @@ +#include diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index fc36da97ff7..bd96ecd0e05 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include struct module; struct tracepoint; @@ -29,7 +29,7 @@ struct tracepoint_func { struct tracepoint { const char *name; /* Tracepoint name */ - struct jump_label_key key; + struct static_key key; void (*regfunc)(void); void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; @@ -145,7 +145,7 @@ static inline void tracepoint_synchronize_unregister(void) extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - if (static_branch(&__tracepoint_##name.key)) \ + if (static_key_false(&__tracepoint_##name.key)) \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ @@ -188,7 +188,7 @@ static inline void tracepoint_synchronize_unregister(void) __attribute__((section("__tracepoints_strings"))) = #name; \ struct tracepoint __tracepoint_##name \ __attribute__((section("__tracepoints"))) = \ - { __tpstrtab_##name, JUMP_LABEL_INIT, reg, unreg, NULL };\ + { __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\ static struct tracepoint * const __tracepoint_ptr_##name __used \ __attribute__((section("__tracepoints_ptrs"))) = \ &__tracepoint_##name; diff --git a/include/net/sock.h b/include/net/sock.h index 91c1c8baf02..dcde2d9268c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include #include @@ -924,13 +924,13 @@ inline void sk_refcnt_debug_release(const struct sock *sk) #endif /* SOCK_REFCNT_DEBUG */ #if defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) && defined(CONFIG_NET) -extern struct jump_label_key memcg_socket_limit_enabled; +extern struct static_key memcg_socket_limit_enabled; static inline struct cg_proto *parent_cg_proto(struct proto *proto, struct cg_proto *cg_proto) { return proto->proto_cgroup(parent_mem_cgroup(cg_proto->memcg)); } -#define mem_cgroup_sockets_enabled static_branch(&memcg_socket_limit_enabled) +#define mem_cgroup_sockets_enabled static_key_false(&memcg_socket_limit_enabled) #else #define mem_cgroup_sockets_enabled 0 static inline struct cg_proto *parent_cg_proto(struct proto *proto, diff --git a/kernel/events/core.c b/kernel/events/core.c index 7c3b9de55f6..5e0f8bb89b2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -128,7 +128,7 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct jump_label_key_deferred perf_sched_events __read_mostly; +struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static atomic_t nr_mmap_events __read_mostly; @@ -2769,7 +2769,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2780,7 +2780,7 @@ static void free_event(struct perf_event *event) put_callchain_buffers(); if (is_cgroup_event(event)) { atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); } } @@ -4982,7 +4982,7 @@ fail: return err; } -struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { @@ -4990,7 +4990,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); - jump_label_dec(&perf_swevent_enabled[event_id]); + static_key_slow_dec(&perf_swevent_enabled[event_id]); swevent_hlist_put(event); } @@ -5020,7 +5020,7 @@ static int perf_swevent_init(struct perf_event *event) if (err) return err; - jump_label_inc(&perf_swevent_enabled[event_id]); + static_key_slow_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } @@ -5843,7 +5843,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -6081,7 +6081,7 @@ SYSCALL_DEFINE5(perf_event_open, * - that may need work on context switch */ atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); } /* diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 543782e7cdd..bf9dcadbb53 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #ifdef HAVE_JUMP_LABEL @@ -29,10 +29,11 @@ void jump_label_unlock(void) mutex_unlock(&jump_label_mutex); } -bool jump_label_enabled(struct jump_label_key *key) +bool static_key_enabled(struct static_key *key) { - return !!atomic_read(&key->enabled); + return (atomic_read(&key->enabled) > 0); } +EXPORT_SYMBOL_GPL(static_key_enabled); static int jump_label_cmp(const void *a, const void *b) { @@ -58,22 +59,26 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static void jump_label_update(struct jump_label_key *key, int enable); +static void jump_label_update(struct static_key *key, int enable); -void jump_label_inc(struct jump_label_key *key) +void static_key_slow_inc(struct static_key *key) { if (atomic_inc_not_zero(&key->enabled)) return; jump_label_lock(); - if (atomic_read(&key->enabled) == 0) - jump_label_update(key, JUMP_LABEL_ENABLE); + if (atomic_read(&key->enabled) == 0) { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_ENABLE); + else + jump_label_update(key, JUMP_LABEL_DISABLE); + } atomic_inc(&key->enabled); jump_label_unlock(); } -EXPORT_SYMBOL_GPL(jump_label_inc); +EXPORT_SYMBOL_GPL(static_key_slow_inc); -static void __jump_label_dec(struct jump_label_key *key, +static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { @@ -85,32 +90,35 @@ static void __jump_label_dec(struct jump_label_key *key, if (rate_limit) { atomic_inc(&key->enabled); schedule_delayed_work(work, rate_limit); - } else - jump_label_update(key, JUMP_LABEL_DISABLE); - + } else { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_DISABLE); + else + jump_label_update(key, JUMP_LABEL_ENABLE); + } jump_label_unlock(); } -EXPORT_SYMBOL_GPL(jump_label_dec); static void jump_label_update_timeout(struct work_struct *work) { - struct jump_label_key_deferred *key = - container_of(work, struct jump_label_key_deferred, work.work); - __jump_label_dec(&key->key, 0, NULL); + struct static_key_deferred *key = + container_of(work, struct static_key_deferred, work.work); + __static_key_slow_dec(&key->key, 0, NULL); } -void jump_label_dec(struct jump_label_key *key) +void static_key_slow_dec(struct static_key *key) { - __jump_label_dec(key, 0, NULL); + __static_key_slow_dec(key, 0, NULL); } +EXPORT_SYMBOL_GPL(static_key_slow_dec); -void jump_label_dec_deferred(struct jump_label_key_deferred *key) +void static_key_slow_dec_deferred(struct static_key_deferred *key) { - __jump_label_dec(&key->key, key->timeout, &key->work); + __static_key_slow_dec(&key->key, key->timeout, &key->work); } +EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); - -void jump_label_rate_limit(struct jump_label_key_deferred *key, +void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { key->timeout = rl; @@ -153,7 +161,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry arch_jump_label_transform(entry, type); } -static void __jump_label_update(struct jump_label_key *key, +static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, int enable) { @@ -170,27 +178,40 @@ static void __jump_label_update(struct jump_label_key *key, } } +static enum jump_label_type jump_label_type(struct static_key *key) +{ + bool true_branch = jump_label_get_branch_default(key); + bool state = static_key_enabled(key); + + if ((!true_branch && state) || (true_branch && !state)) + return JUMP_LABEL_ENABLE; + + return JUMP_LABEL_DISABLE; +} + void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; - struct jump_label_key *key = NULL; + struct static_key *key = NULL; struct jump_entry *iter; jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; + struct static_key *iterk; - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? - JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + iterk = (struct static_key *)(unsigned long)iter->key; + arch_jump_label_transform_static(iter, jump_label_type(iterk)); if (iterk == key) continue; key = iterk; - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. + */ + *((unsigned long *)&key->entries) += (unsigned long)iter; #ifdef CONFIG_MODULES key->next = NULL; #endif @@ -200,8 +221,8 @@ void __init jump_label_init(void) #ifdef CONFIG_MODULES -struct jump_label_mod { - struct jump_label_mod *next; +struct static_key_mod { + struct static_key_mod *next; struct jump_entry *entries; struct module *mod; }; @@ -221,9 +242,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end) start, end); } -static void __jump_label_mod_update(struct jump_label_key *key, int enable) +static void __jump_label_mod_update(struct static_key *key, int enable) { - struct jump_label_mod *mod = key->next; + struct static_key_mod *mod = key->next; while (mod) { struct module *m = mod->mod; @@ -254,11 +275,7 @@ void jump_label_apply_nops(struct module *mod) return; for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; - - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? - JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); } } @@ -267,8 +284,8 @@ static int jump_label_add_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm; + struct static_key *key = NULL; + struct static_key_mod *jlm; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) @@ -277,28 +294,30 @@ static int jump_label_add_module(struct module *mod) jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->key == (jump_label_t)(unsigned long)key) - continue; + struct static_key *iterk; - key = (struct jump_label_key *)(unsigned long)iter->key; + iterk = (struct static_key *)(unsigned long)iter->key; + if (iterk == key) + continue; + key = iterk; if (__module_address(iter->key) == mod) { - atomic_set(&key->enabled, 0); - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. + */ + *((unsigned long *)&key->entries) += (unsigned long)iter; key->next = NULL; continue; } - - jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); + jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; - jlm->mod = mod; jlm->entries = iter; jlm->next = key->next; key->next = jlm; - if (jump_label_enabled(key)) + if (jump_label_type(key) == JUMP_LABEL_ENABLE) __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); } @@ -310,14 +329,14 @@ static void jump_label_del_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm, **prev; + struct static_key *key = NULL; + struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { if (iter->key == (jump_label_t)(unsigned long)key) continue; - key = (struct jump_label_key *)(unsigned long)iter->key; + key = (struct static_key *)(unsigned long)iter->key; if (__module_address(iter->key) == mod) continue; @@ -419,9 +438,10 @@ int jump_label_text_reserved(void *start, void *end) return ret; } -static void jump_label_update(struct jump_label_key *key, int enable) +static void jump_label_update(struct static_key *key, int enable) { - struct jump_entry *entry = key->entries, *stop = __stop___jump_table; + struct jump_entry *stop = __stop___jump_table; + struct jump_entry *entry = jump_label_get_entries(key); #ifdef CONFIG_MODULES struct module *mod = __module_address((unsigned long)key); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e05..112c6824476 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -162,13 +162,13 @@ static int sched_feat_show(struct seq_file *m, void *v) #ifdef HAVE_JUMP_LABEL -#define jump_label_key__true jump_label_key_enabled -#define jump_label_key__false jump_label_key_disabled +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE #define SCHED_FEAT(name, enabled) \ jump_label_key__##enabled , -struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { #include "features.h" }; @@ -176,14 +176,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - if (jump_label_enabled(&sched_feat_keys[i])) - jump_label_dec(&sched_feat_keys[i]); + if (static_key_enabled(&sched_feat_keys[i])) + static_key_slow_dec(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - if (!jump_label_enabled(&sched_feat_keys[i])) - jump_label_inc(&sched_feat_keys[i]); + if (!static_key_enabled(&sched_feat_keys[i])) + static_key_slow_inc(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -894,7 +894,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) delta -= irq_delta; #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - if (static_branch((¶virt_steal_rq_enabled))) { + if (static_key_false((¶virt_steal_rq_enabled))) { u64 st; steal = paravirt_steal_clock(cpu_of(rq)); @@ -2756,7 +2756,7 @@ void account_idle_time(cputime_t cputime) static __always_inline bool steal_account_process_tick(void) { #ifdef CONFIG_PARAVIRT - if (static_branch(¶virt_steal_enabled)) { + if (static_key_false(¶virt_steal_enabled)) { u64 steal, st = 0; steal = paravirt_steal_clock(smp_processor_id()); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c6414fc669..423547ada38 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1399,20 +1399,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) #ifdef CONFIG_CFS_BANDWIDTH #ifdef HAVE_JUMP_LABEL -static struct jump_label_key __cfs_bandwidth_used; +static struct static_key __cfs_bandwidth_used; static inline bool cfs_bandwidth_used(void) { - return static_branch(&__cfs_bandwidth_used); + return static_key_false(&__cfs_bandwidth_used); } void account_cfs_bandwidth_used(int enabled, int was_enabled) { /* only need to count groups transitioning between enabled/!enabled */ if (enabled && !was_enabled) - jump_label_inc(&__cfs_bandwidth_used); + static_key_slow_inc(&__cfs_bandwidth_used); else if (!enabled && was_enabled) - jump_label_dec(&__cfs_bandwidth_used); + static_key_slow_dec(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c2623db..b4cd6d8ea15 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -611,7 +611,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ #ifdef CONFIG_SCHED_DEBUG -# include +# include # define const_debug __read_mostly #else # define const_debug const @@ -630,18 +630,18 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct jump_label_key *key) +static __always_inline bool static_branch__true(struct static_key *key) { - return likely(static_branch(key)); /* Not out of line branch. */ + return static_key_true(key); /* Not out of line branch. */ } -static __always_inline bool static_branch__false(struct jump_label_key *key) +static __always_inline bool static_branch__false(struct static_key *key) { - return unlikely(static_branch(key)); /* Out of line branch. */ + return static_key_false(key); /* Out of line branch. */ } #define SCHED_FEAT(name, enabled) \ -static __always_inline bool static_branch_##name(struct jump_label_key *key) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ { \ return static_branch__##enabled(key); \ } @@ -650,7 +650,7 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \ #undef SCHED_FEAT -extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index f1539decd99..d96ba22dabf 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include extern struct tracepoint * const __start___tracepoints_ptrs[]; extern struct tracepoint * const __stop___tracepoints_ptrs[]; @@ -256,9 +256,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); - if (elem->regfunc && !jump_label_enabled(&elem->key) && active) + if (elem->regfunc && !static_key_enabled(&elem->key) && active) elem->regfunc(); - else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) + else if (elem->unregfunc && static_key_enabled(&elem->key) && !active) elem->unregfunc(); /* @@ -269,10 +269,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. */ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (active && !jump_label_enabled(&elem->key)) - jump_label_inc(&elem->key); - else if (!active && jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (active && !static_key_enabled(&elem->key)) + static_key_slow_inc(&elem->key); + else if (!active && static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); } /* @@ -283,11 +283,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { - if (elem->unregfunc && jump_label_enabled(&elem->key)) + if (elem->unregfunc && static_key_enabled(&elem->key)) elem->unregfunc(); - if (jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); rcu_assign_pointer(elem->funcs, NULL); } diff --git a/net/core/dev.c b/net/core/dev.c index 115dee1d985..da7ce7f0e56 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -134,7 +134,7 @@ #include #include #include -#include +#include #include #include "net-sysfs.h" @@ -1441,11 +1441,11 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); -static struct jump_label_key netstamp_needed __read_mostly; +static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL -/* We are not allowed to call jump_label_dec() from irq context +/* We are not allowed to call static_key_slow_dec() from irq context * If net_disable_timestamp() is called from irq context, defer the - * jump_label_dec() calls. + * static_key_slow_dec() calls. */ static atomic_t netstamp_needed_deferred; #endif @@ -1457,12 +1457,12 @@ void net_enable_timestamp(void) if (deferred) { while (--deferred) - jump_label_dec(&netstamp_needed); + static_key_slow_dec(&netstamp_needed); return; } #endif WARN_ON(in_interrupt()); - jump_label_inc(&netstamp_needed); + static_key_slow_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); @@ -1474,19 +1474,19 @@ void net_disable_timestamp(void) return; } #endif - jump_label_dec(&netstamp_needed); + static_key_slow_dec(&netstamp_needed); } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp.tv64 = 0; - if (static_branch(&netstamp_needed)) + if (static_key_false(&netstamp_needed)) __net_timestamp(skb); } #define net_timestamp_check(COND, SKB) \ - if (static_branch(&netstamp_needed)) { \ + if (static_key_false(&netstamp_needed)) { \ if ((COND) && !(SKB)->tstamp.tv64) \ __net_timestamp(SKB); \ } \ @@ -2660,7 +2660,7 @@ EXPORT_SYMBOL(__skb_get_rxhash); struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); -struct jump_label_key rps_needed __read_mostly; +struct static_key rps_needed __read_mostly; static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, @@ -2945,7 +2945,7 @@ int netif_rx(struct sk_buff *skb) trace_netif_rx(skb); #ifdef CONFIG_RPS - if (static_branch(&rps_needed)) { + if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; @@ -3309,7 +3309,7 @@ int netif_receive_skb(struct sk_buff *skb) return NET_RX_SUCCESS; #ifdef CONFIG_RPS - if (static_branch(&rps_needed)) { + if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu, ret; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a1727cda03d..495586232aa 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -608,10 +608,10 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, spin_unlock(&rps_map_lock); if (map) - jump_label_inc(&rps_needed); + static_key_slow_inc(&rps_needed); if (old_map) { kfree_rcu(old_map, rcu); - jump_label_dec(&rps_needed); + static_key_slow_dec(&rps_needed); } free_cpumask_var(mask); return len; diff --git a/net/core/sock.c b/net/core/sock.c index 3e81fd2e3c7..3a4e5817a2a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -111,7 +111,7 @@ #include #include #include -#include +#include #include #include @@ -184,7 +184,7 @@ void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; -struct jump_label_key memcg_socket_limit_enabled; +struct static_key memcg_socket_limit_enabled; EXPORT_SYMBOL(memcg_socket_limit_enabled); /* diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d05559d4d9c..0c285087425 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -69,9 +69,9 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write, if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); if (sock_table) - jump_label_inc(&rps_needed); + static_key_slow_inc(&rps_needed); if (orig_sock_table) { - jump_label_dec(&rps_needed); + static_key_slow_dec(&rps_needed); synchronize_rcu(); vfree(orig_sock_table); } diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 49978788a9d..602fb305365 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -111,7 +111,7 @@ void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); if (val != RESOURCE_MAX) - jump_label_dec(&memcg_socket_limit_enabled); + static_key_slow_dec(&memcg_socket_limit_enabled); } EXPORT_SYMBOL(tcp_destroy_cgroup); @@ -143,9 +143,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) net->ipv4.sysctl_tcp_mem[i]); if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX) - jump_label_dec(&memcg_socket_limit_enabled); + static_key_slow_dec(&memcg_socket_limit_enabled); else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX) - jump_label_inc(&memcg_socket_limit_enabled); + static_key_slow_inc(&memcg_socket_limit_enabled); return 0; } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b4e8ff05b30..e1b7e051332 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -56,7 +56,7 @@ struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; EXPORT_SYMBOL(nf_hooks); #if defined(CONFIG_JUMP_LABEL) -struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif @@ -77,7 +77,7 @@ int nf_register_hook(struct nf_hook_ops *reg) list_add_rcu(®->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) - jump_label_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif return 0; } @@ -89,7 +89,7 @@ void nf_unregister_hook(struct nf_hook_ops *reg) list_del_rcu(®->list); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) - jump_label_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); } -- cgit v1.2.3-70-g09d2 From d93c4071b78f4676ef70ec8f2d4bae59b6cc5523 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 24 Feb 2012 11:50:27 +0000 Subject: x86/time: Eliminate unused irq0_irqs counter As of v2.6.38 this counter is being maintained without ever being read. Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F4787930200007800074A10@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hardirq.h | 1 - arch/x86/kernel/time.c | 3 --- 2 files changed, 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index da0b3ca815b..382f75d735f 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -7,7 +7,6 @@ typedef struct { unsigned int __softirq_pending; unsigned int __nmi_count; /* arch dependent */ - unsigned int irq0_irqs; #ifdef CONFIG_X86_LOCAL_APIC unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq_spurious_count; diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index dd5fbf4101f..c6eba2b4267 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -57,9 +57,6 @@ EXPORT_SYMBOL(profile_pc); */ static irqreturn_t timer_interrupt(int irq, void *dev_id) { - /* Keep nmi watchdog up to date */ - inc_irq_stat(irq0_irqs); - global_clock_event->event_handler(global_clock_event); /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ -- cgit v1.2.3-70-g09d2 From f649e9388cd46ad1634164e56f96ae092ca59e4a Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 20 Jan 2012 16:24:09 -0500 Subject: x86: relocate get/set debugreg fcns to include/asm/debugreg. Since we already have a debugreg.h header file, move the assoc. get/set functions to it. In addition to it being the logical home for them, it has a secondary advantage. The functions that are moved use BUG(). So we really need to have linux/bug.h in scope. But asm/processor.h is used about 600 times, vs. only about 15 for debugreg.h -- so adding bug.h to the latter reduces the amount of time we'll be processing it during a compile. Signed-off-by: Paul Gortmaker Acked-by: Ingo Molnar CC: Thomas Gleixner CC: "H. Peter Anvin" --- arch/x86/include/asm/debugreg.h | 67 ++++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/processor.h | 63 ------------------------------------- arch/x86/kernel/cpu/common.c | 1 + 3 files changed, 68 insertions(+), 63 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index b903d5ea394..2d91580bf22 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -78,8 +78,75 @@ */ #ifdef __KERNEL__ +#include + DECLARE_PER_CPU(unsigned long, cpu_dr7); +#ifndef CONFIG_PARAVIRT +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = native_get_debugreg(register) +#define set_debugreg(value, register) \ + native_set_debugreg(register, value) +#endif + +static inline unsigned long native_get_debugreg(int regno) +{ + unsigned long val = 0; /* Damn you, gcc! */ + + switch (regno) { + case 0: + asm("mov %%db0, %0" :"=r" (val)); + break; + case 1: + asm("mov %%db1, %0" :"=r" (val)); + break; + case 2: + asm("mov %%db2, %0" :"=r" (val)); + break; + case 3: + asm("mov %%db3, %0" :"=r" (val)); + break; + case 6: + asm("mov %%db6, %0" :"=r" (val)); + break; + case 7: + asm("mov %%db7, %0" :"=r" (val)); + break; + default: + BUG(); + } + return val; +} + +static inline void native_set_debugreg(int regno, unsigned long value) +{ + switch (regno) { + case 0: + asm("mov %0, %%db0" ::"r" (value)); + break; + case 1: + asm("mov %0, %%db1" ::"r" (value)); + break; + case 2: + asm("mov %0, %%db2" ::"r" (value)); + break; + case 3: + asm("mov %0, %%db3" ::"r" (value)); + break; + case 6: + asm("mov %0, %%db6" ::"r" (value)); + break; + case 7: + asm("mov %0, %%db7" ::"r" (value)); + break; + default: + BUG(); + } +} + static inline void hw_breakpoint_disable(void) { /* Zero the control register for HW Breakpoint */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 58545c97d07..30aa6e95f81 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -474,61 +474,6 @@ struct thread_struct { unsigned io_bitmap_max; }; -static inline unsigned long native_get_debugreg(int regno) -{ - unsigned long val = 0; /* Damn you, gcc! */ - - switch (regno) { - case 0: - asm("mov %%db0, %0" :"=r" (val)); - break; - case 1: - asm("mov %%db1, %0" :"=r" (val)); - break; - case 2: - asm("mov %%db2, %0" :"=r" (val)); - break; - case 3: - asm("mov %%db3, %0" :"=r" (val)); - break; - case 6: - asm("mov %%db6, %0" :"=r" (val)); - break; - case 7: - asm("mov %%db7, %0" :"=r" (val)); - break; - default: - BUG(); - } - return val; -} - -static inline void native_set_debugreg(int regno, unsigned long value) -{ - switch (regno) { - case 0: - asm("mov %0, %%db0" ::"r" (value)); - break; - case 1: - asm("mov %0, %%db1" ::"r" (value)); - break; - case 2: - asm("mov %0, %%db2" ::"r" (value)); - break; - case 3: - asm("mov %0, %%db3" ::"r" (value)); - break; - case 6: - asm("mov %0, %%db6" ::"r" (value)); - break; - case 7: - asm("mov %0, %%db7" ::"r" (value)); - break; - default: - BUG(); - } -} - /* * Set IOPL bits in EFLAGS from given mask */ @@ -574,14 +519,6 @@ static inline void native_swapgs(void) #define __cpuid native_cpuid #define paravirt_enabled() 0 -/* - * These special macros can be used to get or set a debugging register - */ -#define get_debugreg(var, register) \ - (var) = native_get_debugreg(register) -#define set_debugreg(value, register) \ - native_set_debugreg(register, value) - static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c0f7d68d318..0d676dd923a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 187f1882b5b0748b3c4c22274663fdb372ac0452 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 23 Nov 2011 20:12:59 -0500 Subject: BUG: headers with BUG/BUG_ON etc. need linux/bug.h If a header file is making use of BUG, BUG_ON, BUILD_BUG_ON, or any other BUG variant in a static inline (i.e. not in a #define) then that header really should be including and not just expecting it to be implicitly present. We can make this change risk-free, since if the files using these headers didn't have exposure to linux/bug.h already, they would have been causing compile failures/warnings. Signed-off-by: Paul Gortmaker --- arch/avr32/include/asm/io.h | 1 + arch/m68k/include/asm/system.h | 1 + arch/sparc/include/asm/vga.h | 1 + arch/x86/include/asm/paravirt.h | 1 + include/asm-generic/dma-mapping-common.h | 1 + include/asm-generic/pgtable.h | 1 + include/asm-generic/tlbflush.h | 2 ++ include/drm/ttm/ttm_memory.h | 1 + include/linux/atmdev.h | 1 + include/linux/bio.h | 1 + include/linux/bit_spinlock.h | 1 + include/linux/ceph/decode.h | 3 ++- include/linux/ceph/libceph.h | 1 + include/linux/ceph/mdsmap.h | 1 + include/linux/cpumask.h | 1 + include/linux/crypto.h | 1 + include/linux/debug_locks.h | 1 + include/linux/dmaengine.h | 1 + include/linux/elfcore.h | 1 + include/linux/ext3_fs.h | 1 + include/linux/fs.h | 1 + include/linux/fsnotify.h | 1 + include/linux/gpio.h | 1 + include/linux/highmem.h | 1 + include/linux/i2o.h | 1 + include/linux/if_vlan.h | 1 + include/linux/io-mapping.h | 1 + include/linux/kprobes.h | 1 + include/linux/kvm_host.h | 1 + include/linux/memory_hotplug.h | 1 + include/linux/mm.h | 1 + include/linux/mtd/cfi.h | 1 + include/linux/netdevice.h | 1 + include/linux/nilfs2_fs.h | 1 + include/linux/page-flags.h | 1 + include/linux/pid_namespace.h | 1 + include/linux/posix_acl.h | 1 + include/linux/ptrace.h | 1 + include/linux/radix-tree.h | 1 + include/linux/rcupdate.h | 1 + include/linux/regset.h | 1 + include/linux/reiserfs_fs.h | 1 + include/linux/relay.h | 1 + include/linux/scatterlist.h | 6 ++++-- include/linux/seq_file.h | 1 + include/linux/skbuff.h | 1 + include/linux/slub_def.h | 1 + include/linux/ssb/ssb_driver_gige.h | 1 + include/linux/swapops.h | 1 + include/linux/syscalls.h | 1 + include/linux/transport_class.h | 1 + include/linux/virtio_config.h | 1 + include/net/cfg80211.h | 1 + include/net/dst.h | 1 + include/net/ip_vs.h | 1 + include/net/mac80211.h | 1 + include/net/netns/generic.h | 1 + include/net/red.h | 1 + include/net/tcp.h | 1 + include/net/timewait_sock.h | 1 + include/net/udp.h | 1 + include/net/wpan-phy.h | 1 + include/scsi/osd_ore.h | 1 + include/scsi/scsi_transport.h | 1 + 64 files changed, 69 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/avr32/include/asm/io.h b/arch/avr32/include/asm/io.h index 22c97ef9220..cf60d0a9f17 100644 --- a/arch/avr32/include/asm/io.h +++ b/arch/avr32/include/asm/io.h @@ -1,6 +1,7 @@ #ifndef __ASM_AVR32_IO_H #define __ASM_AVR32_IO_H +#include #include #include #include diff --git a/arch/m68k/include/asm/system.h b/arch/m68k/include/asm/system.h index 47b01f4726b..8dc68178716 100644 --- a/arch/m68k/include/asm/system.h +++ b/arch/m68k/include/asm/system.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include diff --git a/arch/sparc/include/asm/vga.h b/arch/sparc/include/asm/vga.h index c69d5b2ba19..ec0e9967d93 100644 --- a/arch/sparc/include/asm/vga.h +++ b/arch/sparc/include/asm/vga.h @@ -7,6 +7,7 @@ #ifndef _LINUX_ASM_VGA_H_ #define _LINUX_ASM_VGA_H_ +#include #include #define VT_BUF_HAVE_RW diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a7d2db9a74f..923b07024a0 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -10,6 +10,7 @@ #include #ifndef __ASSEMBLY__ +#include #include #include diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h index 9fa3f96e38c..2e248d8924d 100644 --- a/include/asm-generic/dma-mapping-common.h +++ b/include/asm-generic/dma-mapping-common.h @@ -2,6 +2,7 @@ #define _ASM_GENERIC_DMA_MAPPING_H #include +#include #include #include #include diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 76bff2bff15..236b1056839 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -5,6 +5,7 @@ #ifdef CONFIG_MMU #include +#include #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, diff --git a/include/asm-generic/tlbflush.h b/include/asm-generic/tlbflush.h index c7af037024c..d6d0a88430f 100644 --- a/include/asm-generic/tlbflush.h +++ b/include/asm-generic/tlbflush.h @@ -9,6 +9,8 @@ #error need to implement an architecture specific asm/tlbflush.h #endif +#include + static inline void flush_tlb_mm(struct mm_struct *mm) { BUG(); diff --git a/include/drm/ttm/ttm_memory.h b/include/drm/ttm/ttm_memory.h index 26c1f78d136..d6d1da468c9 100644 --- a/include/drm/ttm/ttm_memory.h +++ b/include/drm/ttm/ttm_memory.h @@ -30,6 +30,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index f4ff882cb2d..42c471afc52 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -217,6 +217,7 @@ struct atm_cirange { #include /* wait_queue_head_t */ #include /* struct timeval */ #include +#include #include /* struct sk_buff */ #include #include diff --git a/include/linux/bio.h b/include/linux/bio.h index 129a9c09795..f54db088f33 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef CONFIG_BLOCK diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h index ac4d9f8b52e..3b5bafce433 100644 --- a/include/linux/bit_spinlock.h +++ b/include/linux/bit_spinlock.h @@ -4,6 +4,7 @@ #include #include #include +#include /* * bit-based spin_lock() diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index c5b6939fb32..220ae21e819 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -1,8 +1,9 @@ #ifndef __CEPH_DECODE_H #define __CEPH_DECODE_H -#include +#include #include +#include #include "types.h" diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 95bd8502e71..e8cf0ccd1a8 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 4c5cb0880bb..9935fac8c10 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -1,6 +1,7 @@ #ifndef _FS_CEPH_MDSMAP_H #define _FS_CEPH_MDSMAP_H +#include #include "types.h" /* diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 4f7a6323747..7b9b75a529b 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -9,6 +9,7 @@ #include #include #include +#include typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 8a94217b298..d870bae81df 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 5033fb88c10..94f20c1488a 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -3,6 +3,7 @@ #include #include +#include #include struct task_struct; diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 679b349d9b6..a5966f691ef 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -23,6 +23,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 394a3e0e4a6..0698c79fbcb 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -6,6 +6,7 @@ #include #ifdef __KERNEL__ #include +#include #endif #include #include diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index f957085d40e..f5a84eef6ed 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -18,6 +18,7 @@ #include #include +#include /* * The second extended filesystem constants/structures diff --git a/include/linux/fs.h b/include/linux/fs.h index 69cd5bb640f..abc92db51e5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -389,6 +389,7 @@ struct inodes_stat_t { #include #include #include +#include #include #include #include diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 2a53f10712b..a6dfe694456 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -14,6 +14,7 @@ #include #include #include +#include /* * fsnotify_d_instantiate - instantiate a dentry for inode diff --git a/include/linux/gpio.h b/include/linux/gpio.h index 38ac48b7d3a..ed5a46707ad 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -34,6 +34,7 @@ struct gpio { #include #include #include +#include struct device; struct gpio_chip; diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 3a93f73a8ac..6ede661e5b8 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/i2o.h b/include/linux/i2o.h index a6deef4f4f6..d23c3c20b20 100644 --- a/include/linux/i2o.h +++ b/include/linux/i2o.h @@ -24,6 +24,7 @@ #define I2O_MAX_DRIVERS 8 #include +#include #include #include #include diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 13aff1e2183..82097f39df1 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -17,6 +17,7 @@ #include #include #include +#include #define VLAN_HLEN 4 /* The additional bytes (on top of the Ethernet header) * that VLAN requires. diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index e44e84f0156..657fab4efab 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -20,6 +20,7 @@ #include #include +#include #include #include diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index dce6e4dbeda..b6e1f8c0057 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 900c76337e8..ca1b153585d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 0b8e2a74260..910550f3b70 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -4,6 +4,7 @@ #include #include #include +#include struct page; struct zone; diff --git a/include/linux/mm.h b/include/linux/mm.h index 17b27cd269c..b7fac5b6acb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -6,6 +6,7 @@ #ifdef __KERNEL__ #include +#include #include #include #include diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index d5d2ec6494b..37ef6b19408 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0eac07c9525..5820638193f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -33,6 +33,7 @@ #ifdef __KERNEL__ #include #include +#include #include #include #include diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 7454ad7451b..89bd4a4dcfb 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -41,6 +41,7 @@ #include #include #include +#include #define NILFS_INODE_BMAP_SIZE 7 diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e90a673be67..3cfa3ad94b1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -6,6 +6,7 @@ #define PAGE_FLAGS_H #include +#include #ifndef __GENERATING_BOUNDS_H #include #include diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index e7cf6669ac3..f5bd679be46 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -2,6 +2,7 @@ #define _LINUX_PID_NS_H #include +#include #include #include #include diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index b7681102a4b..11bad91c443 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -8,6 +8,7 @@ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H +#include #include #include diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index c2f1f6a5fcb..753ee8b6233 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -113,6 +113,7 @@ #include /* For unlikely. */ #include /* For struct task_struct. */ #include /* for IS_ERR_VALUE */ +#include /* For BUG_ON. */ extern long arch_ptrace(struct task_struct *child, long request, diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 07e360b1b28..e9a48234e69 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -22,6 +22,7 @@ #include #include +#include #include #include diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 81c04f4348e..3b657f2bed4 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -42,6 +42,7 @@ #include #include #include +#include #include #ifdef CONFIG_RCU_TORTURE_TEST diff --git a/include/linux/regset.h b/include/linux/regset.h index 8abee655622..6325e099105 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -15,6 +15,7 @@ #include #include +#include #include struct task_struct; struct user_regset; diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 2213ddcce20..6643fb03129 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/relay.h b/include/linux/relay.h index a822fd71fd6..91cacc34c15 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 9aaf5bfdad1..ac9586dadfa 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -1,10 +1,12 @@ #ifndef _LINUX_SCATTERLIST_H #define _LINUX_SCATTERLIST_H +#include +#include +#include + #include #include -#include -#include #include struct sg_table { diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 44f1514b00b..5ff2df6c821 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 50db9b04a55..773ae985ec7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index a32bcfdc783..ca122b36aec 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -8,6 +8,7 @@ */ #include #include +#include #include #include diff --git a/include/linux/ssb/ssb_driver_gige.h b/include/linux/ssb/ssb_driver_gige.h index eba52a10053..6b05dcd927f 100644 --- a/include/linux/ssb/ssb_driver_gige.h +++ b/include/linux/ssb/ssb_driver_gige.h @@ -2,6 +2,7 @@ #define LINUX_SSB_DRIVER_GIGE_H_ #include +#include #include #include diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 2189d3ffc85..792d16d9cbc 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -2,6 +2,7 @@ #define _LINUX_SWAPOPS_H #include +#include /* * swapcache pages are stored in the swapper_space radix tree. We want to diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 8ec1153ff57..3de3acb84a9 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -68,6 +68,7 @@ struct file_handle; #include #include #include +#include #include #include #include diff --git a/include/linux/transport_class.h b/include/linux/transport_class.h index 9ae8da3e640..11087cdd4ad 100644 --- a/include/linux/transport_class.h +++ b/include/linux/transport_class.h @@ -10,6 +10,7 @@ #define _TRANSPORT_CLASS_H_ #include +#include #include struct transport_container; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 5206d6541da..7323a339020 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -53,6 +53,7 @@ #ifdef __KERNEL__ #include +#include #include /** diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a067d30ce73..85b44ca54ac 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/include/net/dst.h b/include/net/dst.h index 344c8dd0287..59c5d18cc38 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ebe517f2da9..2bdee51ba30 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -16,6 +16,7 @@ #include /* for struct atomic_t */ #include #include +#include #include #include /* for union nf_inet_addr */ diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d49928ba5d0..8294f44c425 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -13,6 +13,7 @@ #ifndef MAC80211_H #define MAC80211_H +#include #include #include #include diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h index d55f4344333..0931618c0f7 100644 --- a/include/net/netns/generic.h +++ b/include/net/netns/generic.h @@ -5,6 +5,7 @@ #ifndef __NET_GENERIC_H__ #define __NET_GENERIC_H__ +#include #include /* diff --git a/include/net/red.h b/include/net/red.h index 28068ec614b..77d4c3745cb 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -2,6 +2,7 @@ #define __NET_SCHED_RED_H #include +#include #include #include #include diff --git a/include/net/tcp.h b/include/net/tcp.h index 42c29bfbcee..ad8d0a86555 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h index 053b3cf2c66..8d6689cb2c6 100644 --- a/include/net/timewait_sock.h +++ b/include/net/timewait_sock.h @@ -12,6 +12,7 @@ #define _TIMEWAIT_SOCK_H #include +#include #include struct timewait_sock_ops { diff --git a/include/net/udp.h b/include/net/udp.h index e39592f682c..5d606d9da9e 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -23,6 +23,7 @@ #define _UDP_H #include +#include #include #include #include diff --git a/include/net/wpan-phy.h b/include/net/wpan-phy.h index d86fffd3c03..ff27f1b078d 100644 --- a/include/net/wpan-phy.h +++ b/include/net/wpan-phy.h @@ -23,6 +23,7 @@ #include #include +#include struct wpan_phy { struct mutex pib_lock; diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index f05fa826f89..a5f9b960dfc 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -26,6 +26,7 @@ #include #include #include +#include struct ore_comp { struct osd_obj_id obj; diff --git a/include/scsi/scsi_transport.h b/include/scsi/scsi_transport.h index 0de32cd4e8a..af244f4bba5 100644 --- a/include/scsi/scsi_transport.h +++ b/include/scsi/scsi_transport.h @@ -22,6 +22,7 @@ #include #include +#include #include #include -- cgit v1.2.3-70-g09d2 From 2b036c6b861dc5da295c6fe19a3edcff7093fdeb Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 9 Jan 2012 14:00:35 -0500 Subject: KVM: SVM: Add support for AMD's OSVW feature in guests In some cases guests should not provide workarounds for errata even when the physical processor is affected. For example, because of erratum 400 on family 10h processors a Linux guest will read an MSR (resulting in VMEXIT) before going to idle in order to avoid getting stuck in a non-C0 state. This is not necessary: HLT and IO instructions are intercepted and therefore there is no reason for erratum 400 workaround in the guest. This patch allows us to present a guest with certain errata as fixed, regardless of the state of actual hardware. Signed-off-by: Boris Ostrovsky Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 6 +++++ arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 8 ++++++ arch/x86/kvm/svm.c | 59 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 20 ++++++++++++++ 5 files changed, 94 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 52d6640a5ca..bd69c93da8f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -478,6 +478,12 @@ struct kvm_vcpu_arch { u32 id; bool send_user_only; } apf; + + /* OSVW MSRs (AMD only) */ + struct { + u64 length; + u64 status; + } osvw; }; struct kvm_arch { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 89b02bfaaca..9fed5bedaad 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -236,7 +236,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | + F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); /* cpuid 0xC0000001.edx */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 5b97e1797a6..26d1fb437eb 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -43,4 +43,12 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); } +static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->ecx & bit(X86_FEATURE_OSVW)); +} + #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5fa553babe5..fce3ba0f207 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -110,6 +110,12 @@ struct nested_state { #define MSRPM_OFFSETS 16 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; +/* + * Set osvw_len to higher value when updated Revision Guides + * are published and we know what the new status bits are + */ +static uint64_t osvw_len = 4, osvw_status; + struct vcpu_svm { struct kvm_vcpu vcpu; struct vmcb *vmcb; @@ -556,6 +562,27 @@ static void svm_init_erratum_383(void) erratum_383_found = true; } +static void svm_init_osvw(struct kvm_vcpu *vcpu) +{ + /* + * Guests should see errata 400 and 415 as fixed (assuming that + * HLT and IO instructions are intercepted). + */ + vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3; + vcpu->arch.osvw.status = osvw_status & ~(6ULL); + + /* + * By increasing VCPU's osvw.length to 3 we are telling the guest that + * all osvw.status bits inside that length, including bit 0 (which is + * reserved for erratum 298), are valid. However, if host processor's + * osvw_len is 0 then osvw_status[0] carries no information. We need to + * be conservative here and therefore we tell the guest that erratum 298 + * is present (because we really don't know). + */ + if (osvw_len == 0 && boot_cpu_data.x86 == 0x10) + vcpu->arch.osvw.status |= 1; +} + static int has_svm(void) { const char *msg; @@ -620,6 +647,36 @@ static int svm_hardware_enable(void *garbage) __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; } + + /* + * Get OSVW bits. + * + * Note that it is possible to have a system with mixed processor + * revisions and therefore different OSVW bits. If bits are not the same + * on different processors then choose the worst case (i.e. if erratum + * is present on one processor and not on another then assume that the + * erratum is present everywhere). + */ + if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { + uint64_t len, status = 0; + int err; + + len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); + if (!err) + status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, + &err); + + if (err) + osvw_status = osvw_len = 0; + else { + if (len < osvw_len) + osvw_len = len; + osvw_status |= status; + osvw_status &= (1ULL << osvw_len) - 1; + } + } else + osvw_status = osvw_len = 0; + svm_init_erratum_383(); return 0; @@ -1186,6 +1243,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + svm_init_osvw(&svm->vcpu); + return &svm->vcpu; free_page4: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a3ce196d21f..2bd77a3a41e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1675,6 +1675,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) */ pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); break; + case MSR_AMD64_OSVW_ID_LENGTH: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + vcpu->arch.osvw.length = data; + break; + case MSR_AMD64_OSVW_STATUS: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + vcpu->arch.osvw.status = data; + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1959,6 +1969,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) */ data = 0xbe702111; break; + case MSR_AMD64_OSVW_ID_LENGTH: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + data = vcpu->arch.osvw.length; + break; + case MSR_AMD64_OSVW_STATUS: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + data = vcpu->arch.osvw.status; + break; default: if (kvm_pmu_msr(vcpu, msr)) return kvm_pmu_get_msr(vcpu, msr, pdata); -- cgit v1.2.3-70-g09d2 From b9e5dc8d4511e6a00862a795319569e7fe7f60f4 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 11 Jan 2012 11:20:30 +0100 Subject: KVM: provide synchronous registers in kvm_run On some cpus the overhead for virtualization instructions is in the same range as a system call. Having to call multiple ioctls to get set registers will make certain userspace handled exits more expensive than necessary. Lets provide a section in kvm_run that works as a shared save area for guest registers. We also provide two 64bit flags fields (architecture specific), that will specify 1. which parts of these fields are valid. 2. which registers were modified by userspace Each bit for these flag fields will define a group of registers (like general purpose) or a single register. Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 23 +++++++++++++++++++++++ arch/ia64/include/asm/kvm.h | 4 ++++ arch/powerpc/include/asm/kvm.h | 4 ++++ arch/s390/include/asm/kvm.h | 3 +++ arch/x86/include/asm/kvm.h | 4 ++++ include/linux/kvm.h | 15 +++++++++++++++ 6 files changed, 53 insertions(+) (limited to 'arch/x86/include') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a67fb35993f..7ca696227d3 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1771,6 +1771,29 @@ developer registration required to access it). /* Fix the size of the union. */ char padding[256]; }; + + /* + * shared registers between kvm and userspace. + * kvm_valid_regs specifies the register classes set by the host + * kvm_dirty_regs specified the register classes dirtied by userspace + * struct kvm_sync_regs is architecture specific, as well as the + * bits for kvm_valid_regs and kvm_dirty_regs + */ + __u64 kvm_valid_regs; + __u64 kvm_dirty_regs; + union { + struct kvm_sync_regs regs; + char padding[1024]; + } s; + +If KVM_CAP_SYNC_REGS is defined, these fields allow userspace to access +certain guest registers without having to call SET/GET_*REGS. Thus we can +avoid some system call overhead if userspace has to handle the exit. +Userspace can query the validity of the structure by checking +kvm_valid_regs for specific bits. These bits are architecture specific +and usually define the validity of a groups of registers. (e.g. one bit + for general purpose registers) + }; 6. Capabilities that can be enabled diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index bc90c75adf6..b9f82c84f09 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -261,4 +261,8 @@ struct kvm_debug_exit_arch { struct kvm_guest_debug_arch { }; +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + #endif diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index f7727d91ac6..7d9d4de057e 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -265,6 +265,10 @@ struct kvm_debug_exit_arch { struct kvm_guest_debug_arch { }; +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + #define KVM_REG_MASK 0x001f #define KVM_REG_EXT_MASK 0xffe0 #define KVM_REG_GPR 0x0000 diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h index 82b32a100c7..325560afb77 100644 --- a/arch/s390/include/asm/kvm.h +++ b/arch/s390/include/asm/kvm.h @@ -41,4 +41,7 @@ struct kvm_debug_exit_arch { struct kvm_guest_debug_arch { }; +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; #endif diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 4d8dcbdfc12..e7d1c194d27 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -321,4 +321,8 @@ struct kvm_xcrs { __u64 padding[16]; }; +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 6cf048d9604..245bcb3a0fc 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -279,6 +279,20 @@ struct kvm_run { /* Fix the size of the union. */ char padding[256]; }; + + /* + * shared registers between kvm and userspace. + * kvm_valid_regs specifies the register classes set by the host + * kvm_dirty_regs specified the register classes dirtied by userspace + * struct kvm_sync_regs is architecture specific, as well as the + * bits for kvm_valid_regs and kvm_dirty_regs + */ + __u64 kvm_valid_regs; + __u64 kvm_dirty_regs; + union { + struct kvm_sync_regs regs; + char padding[1024]; + } s; }; /* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */ @@ -570,6 +584,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_S390_GMAP 71 #define KVM_CAP_TSC_DEADLINE_TIMER 72 #define KVM_CAP_S390_UCONTROL 73 +#define KVM_CAP_SYNC_REGS 74 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-70-g09d2 From 3ea8b75e47ac70bdd0a2c0492102682d43bfa3c4 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 17 Jan 2012 19:50:08 +0900 Subject: KVM: MMU: Remove unused kvm_pte_chain Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bd69c93da8f..46101661432 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -181,13 +181,6 @@ struct kvm_mmu_memory_cache { void *objects[KVM_NR_MEM_OBJS]; }; -#define NR_PTE_CHAIN_ENTRIES 5 - -struct kvm_pte_chain { - u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES]; - struct hlist_node link; -}; - /* * kvm_mmu_page_role, below, is defined as: * -- cgit v1.2.3-70-g09d2 From df156f90a0f90649dd38b7667901ef85478f3d2b Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Tue, 7 Feb 2012 15:52:44 +0100 Subject: x86: Introduce x86_cpuinit.early_percpu_clock_init hook When kvm guest uses kvmclock, it may hang on vcpu hot-plug. This is caused by an overflow in pvclock_get_nsec_offset, u64 delta = tsc - shadow->tsc_timestamp; which in turn is caused by an undefined values from percpu hv_clock that hasn't been initialized yet. Uninitialized clock on being booted cpu is accessed from start_secondary -> smp_callin -> smp_store_cpu_info -> identify_secondary_cpu -> mtrr_ap_init -> mtrr_restore -> stop_machine_from_inactive_cpu -> queue_stop_cpus_work ... -> sched_clock -> kvm_clock_read which is well before x86_cpuinit.setup_percpu_clockev call in start_secondary, where percpu clock is initialized. This patch introduces a hook that allows to setup/initialize per_cpu clock early and avoid overflow due to reading - undefined values - old values if cpu was offlined and then onlined again Another possible early user of this clock source is ftrace that accesses it to get timestamps for ring buffer entries. So if mtrr_ap_init is moved from identify_secondary_cpu to past x86_cpuinit.setup_percpu_clockev in start_secondary, ftrace may cause the same overflow/hang on cpu hot-plug anyway. More complete description of the problem: https://lkml.org/lkml/2012/2/2/101 Credits to Marcelo Tosatti for hook idea. Acked-by: Thomas Gleixner Signed-off-by: Igor Mammedov Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/kvmclock.c | 4 +--- arch/x86/kernel/smpboot.c | 1 + arch/x86/kernel/x86_init.c | 1 + 4 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 517d4767ffd..5d0afac2962 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -145,9 +145,11 @@ struct x86_init_ops { /** * struct x86_cpuinit_ops - platform specific cpu hotplug setups * @setup_percpu_clockev: set up the per cpu clock event device + * @early_percpu_clock_init: early init of the per cpu clock event device */ struct x86_cpuinit_ops { void (*setup_percpu_clockev)(void); + void (*early_percpu_clock_init)(void); void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node); }; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 44842d756b2..ca4e735adc5 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -144,8 +144,6 @@ static void __cpuinit kvm_setup_secondary_clock(void) * we shouldn't fail. */ WARN_ON(kvm_register_clock("secondary cpu clock")); - /* ok, done with our trickery, call native */ - setup_secondary_APIC_clock(); } #endif @@ -194,7 +192,7 @@ void __init kvmclock_init(void) x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.setup_percpu_clockev = + x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif machine_ops.shutdown = kvm_shutdown; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d1..a05d6fd5e06 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -255,6 +255,7 @@ notrace static void __cpuinit start_secondary(void *unused) * most necessary things. */ cpu_init(); + x86_cpuinit.early_percpu_clock_init(); preempt_disable(); smp_callin(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 947a06ccc67..6f2ec53deed 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -91,6 +91,7 @@ struct x86_init_ops x86_init __initdata = { }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { + .early_percpu_clock_init = x86_init_noop, .setup_percpu_clockev = setup_secondary_APIC_clock, .fixup_cpu_id = x86_default_fixup_cpu_id, }; -- cgit v1.2.3-70-g09d2 From a59cb29e4d81e025192550c2703f305637f016f6 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 3 Feb 2012 12:28:31 -0200 Subject: KVM: x86: increase recommended max vcpus to 160 Increase recommended max vcpus from 64 to 160 (tested internally at Red Hat). Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 46101661432..782d973b071 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -29,7 +29,7 @@ #include #define KVM_MAX_VCPUS 254 -#define KVM_SOFT_MAX_VCPUS 64 +#define KVM_SOFT_MAX_VCPUS 160 #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 -- cgit v1.2.3-70-g09d2 From 225ce53910edc3c2322b1e4f2ed049a9196cd0b3 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:52 +0100 Subject: perf/x86: Add Intel LBR MSR definitions This patch adds the LBR definitions for NHM/WSM/SNB and Core. It also adds the definitions for the architected LBR MSR: LBR_SELECT, LBRT_TOS. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 7 +++++++ arch/x86/kernel/cpu/perf_event_intel_lbr.c | 18 +++++++++--------- 2 files changed, 16 insertions(+), 9 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index a6962d9161a..ccb805966f6 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -56,6 +56,13 @@ #define MSR_OFFCORE_RSP_0 0x000001a6 #define MSR_OFFCORE_RSP_1 0x000001a7 +#define MSR_LBR_SELECT 0x000001c8 +#define MSR_LBR_TOS 0x000001c9 +#define MSR_LBR_NHM_FROM 0x00000680 +#define MSR_LBR_NHM_TO 0x000006c0 +#define MSR_LBR_CORE_FROM 0x00000040 +#define MSR_LBR_CORE_TO 0x00000060 + #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 309d0cc6916..6710a5116eb 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -203,23 +203,23 @@ void intel_pmu_lbr_read(void) void intel_pmu_lbr_init_core(void) { x86_pmu.lbr_nr = 4; - x86_pmu.lbr_tos = 0x01c9; - x86_pmu.lbr_from = 0x40; - x86_pmu.lbr_to = 0x60; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; } void intel_pmu_lbr_init_nhm(void) { x86_pmu.lbr_nr = 16; - x86_pmu.lbr_tos = 0x01c9; - x86_pmu.lbr_from = 0x680; - x86_pmu.lbr_to = 0x6c0; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; } void intel_pmu_lbr_init_atom(void) { x86_pmu.lbr_nr = 8; - x86_pmu.lbr_tos = 0x01c9; - x86_pmu.lbr_from = 0x40; - x86_pmu.lbr_to = 0x60; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; } -- cgit v1.2.3-70-g09d2 From cc578287e3224d0da196cc1d226bdae6b068faa7 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:50 -0200 Subject: KVM: Infrastructure for software and hardware based TSC rate scaling This requires some restructuring; rather than use 'virtual_tsc_khz' to indicate whether hardware rate scaling is in effect, we consider each VCPU to always have a virtual TSC rate. Instead, there is new logic above the vendor-specific hardware scaling that decides whether it is even necessary to use and updates all rate variables used by common code. This means we can simply query the virtual rate at any point, which is needed for software rate scaling. There is also now a threshold added to the TSC rate scaling; minor differences and variations of measured TSC rate can accidentally provoke rate scaling to be used when it is not needed. Instead, we have a tolerance variable called tsc_tolerance_ppm, which is the maximum variation from user requested rate at which scaling will be used. The default is 250ppm, which is the half the threshold for NTP adjustment, allowing for some hardware variation. In the event that hardware rate scaling is not available, we can kludge a bit by forcing TSC catchup to turn on when a faster than hardware speed has been requested, but there is nothing available yet for the reverse case; this requires a trap and emulate software implementation for RDTSC, which is still forthcoming. [avi: fix 64-bit division on i386] Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 9 +++-- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/svm.c | 20 ++++++---- arch/x86/kvm/vmx.c | 16 +++++--- arch/x86/kvm/x86.c | 82 +++++++++++++++++++++-------------------- 5 files changed, 71 insertions(+), 58 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 782d973b071..ddebbe01fff 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -422,10 +422,11 @@ struct kvm_vcpu_arch { u64 last_kernel_ns; u64 last_tsc_nsec; u64 last_tsc_write; - u32 virtual_tsc_khz; bool tsc_catchup; - u32 tsc_catchup_mult; - s8 tsc_catchup_shift; + bool tsc_always_catchup; + s8 virtual_tsc_shift; + u32 virtual_tsc_mult; + u32 virtual_tsc_khz; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ @@ -651,7 +652,7 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz); + void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3ee1d83c695..72975f758c8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -731,7 +731,7 @@ static void start_apic_timer(struct kvm_lapic *apic) u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; u64 ns = 0; struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu); + unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; if (unlikely(!tscdeadline || !this_tsc_khz)) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7bbd17cc348..e12026e5244 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -964,20 +964,25 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) return _tsc; } -static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { struct vcpu_svm *svm = to_svm(vcpu); u64 ratio; u64 khz; - /* TSC scaling supported? */ - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) + /* Guest TSC same frequency as host TSC? */ + if (!scale) { + svm->tsc_ratio = TSC_RATIO_DEFAULT; return; + } - /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */ - if (user_tsc_khz == 0) { - vcpu->arch.virtual_tsc_khz = 0; - svm->tsc_ratio = TSC_RATIO_DEFAULT; + /* TSC scaling supported? */ + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + } else + WARN(1, "user requested TSC rate below hardware speed\n"); return; } @@ -992,7 +997,6 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) user_tsc_khz); return; } - vcpu->arch.virtual_tsc_khz = user_tsc_khz; svm->tsc_ratio = ratio; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3b4c8d8ad90..e6bf61fa1c0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1817,13 +1817,19 @@ u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) } /* - * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ - * ioctl. In this case the call-back should update internal vmx state to make - * the changes effective. + * Engage any workarounds for mis-matched TSC rates. Currently limited to + * software catchup for faster rates on slower CPUs. */ -static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { - /* Nothing to do here */ + if (!scale) + return; + + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + } else + WARN(1, "user requested TSC rate below hardware speed\n"); } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2bd77a3a41e..41bb90acb23 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -96,6 +96,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control); u32 kvm_max_guest_tsc_khz; EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); +/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ +static u32 tsc_tolerance_ppm = 250; +module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -968,49 +972,50 @@ static inline u64 get_kernel_ns(void) static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); unsigned long max_tsc_khz; -static inline int kvm_tsc_changes_freq(void) +static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { - int cpu = get_cpu(); - int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && - cpufreq_quick_get(cpu) != 0; - put_cpu(); - return ret; + return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, + vcpu->arch.virtual_tsc_shift); } -u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) +static u32 adjust_tsc_khz(u32 khz, s32 ppm) { - if (vcpu->arch.virtual_tsc_khz) - return vcpu->arch.virtual_tsc_khz; - else - return __this_cpu_read(cpu_tsc_khz); + u64 v = (u64)khz * (1000000 + ppm); + do_div(v, 1000000); + return v; } -static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) +static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) { - u64 ret; + u32 thresh_lo, thresh_hi; + int use_scaling = 0; - WARN_ON(preemptible()); - if (kvm_tsc_changes_freq()) - printk_once(KERN_WARNING - "kvm: unreliable cycle conversion on adjustable rate TSC\n"); - ret = nsec * vcpu_tsc_khz(vcpu); - do_div(ret, USEC_PER_SEC); - return ret; -} - -static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz) -{ /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, - &vcpu->arch.tsc_catchup_shift, - &vcpu->arch.tsc_catchup_mult); + &vcpu->arch.virtual_tsc_shift, + &vcpu->arch.virtual_tsc_mult); + vcpu->arch.virtual_tsc_khz = this_tsc_khz; + + /* + * Compute the variation in TSC rate which is acceptable + * within the range of tolerance and decide if the + * rate being applied is within that bounds of the hardware + * rate. If so, no scaling or compensation need be done. + */ + thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); + thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); + if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) { + pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); + use_scaling = 1; + } + kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, - vcpu->arch.tsc_catchup_mult, - vcpu->arch.tsc_catchup_shift); + vcpu->arch.virtual_tsc_mult, + vcpu->arch.virtual_tsc_shift); tsc += vcpu->arch.last_tsc_write; return tsc; } @@ -1077,7 +1082,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_save(flags); tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); kernel_ns = get_kernel_ns(); - this_tsc_khz = vcpu_tsc_khz(v); + this_tsc_khz = __get_cpu_var(cpu_tsc_khz); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -2804,26 +2809,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_tsc_khz; r = -EINVAL; - if (!kvm_has_tsc_control) - break; - user_tsc_khz = (u32)arg; if (user_tsc_khz >= kvm_max_guest_tsc_khz) goto out; - kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); + if (user_tsc_khz == 0) + user_tsc_khz = tsc_khz; + + kvm_set_tsc_khz(vcpu, user_tsc_khz); r = 0; goto out; } case KVM_GET_TSC_KHZ: { - r = -EIO; - if (check_tsc_unstable()) - goto out; - - r = vcpu_tsc_khz(vcpu); - + r = vcpu->arch.virtual_tsc_khz; goto out; } default: @@ -5312,6 +5312,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) profile_hit(KVM_PROFILING, (void *)rip); } + if (unlikely(vcpu->arch.tsc_always_catchup)) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_lapic_sync_from_vapic(vcpu); @@ -6004,7 +6006,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); - kvm_init_tsc_catchup(vcpu, max_tsc_khz); + kvm_set_tsc_khz(vcpu, max_tsc_khz); r = kvm_mmu_create(vcpu); if (r < 0) -- cgit v1.2.3-70-g09d2 From 5d3cb0f6a8e3af018a522ae8d36f8f7d2511b5d8 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:51 -0200 Subject: KVM: Improve TSC offset matching There are a few improvements that can be made to the TSC offset matching code. First, we don't need to call the 128-bit multiply (especially on a constant number), the code works much nicer to do computation in nanosecond units. Second, the way everything is setup with software TSC rate scaling, we currently have per-cpu rates. Obviously this isn't too desirable to use in practice, but if for some reason we do change the rate of all VCPUs at runtime, then reset the TSCs, we will only want to match offsets for VCPUs running at the same rate. Finally, for the case where we have an unstable host TSC, but rate scaling is being done in hardware, we should call the platform code to compute the TSC offset, so the math is reorganized to recompute the base instead, then transform the base into an offset using the existing API. [avi: fix 64-bit division on i386] Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti KVM: Fix 64-bit division in kvm_write_tsc() Breaks i386 build. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 44 +++++++++++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 15 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ddebbe01fff..8a34fca6c57 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -513,6 +513,7 @@ struct kvm_arch { u64 last_tsc_nsec; u64 last_tsc_offset; u64 last_tsc_write; + u32 last_tsc_khz; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41bb90acb23..4390f42b371 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1025,33 +1025,46 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - s64 sdiff; + s64 nsdiff; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; - sdiff = data - kvm->arch.last_tsc_write; - if (sdiff < 0) - sdiff = -sdiff; + + /* n.b - signed multiplication and division required */ + nsdiff = data - kvm->arch.last_tsc_write; +#ifdef CONFIG_X86_64 + nsdiff = (nsdiff * 1000) / vcpu->arch.virtual_tsc_khz; +#else + /* do_div() only does unsigned */ + asm("idivl %2; xor %%edx, %%edx" + : "=A"(nsdiff) + : "A"(nsdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); +#endif + nsdiff -= elapsed; + if (nsdiff < 0) + nsdiff = -nsdiff; /* - * Special case: close write to TSC within 5 seconds of - * another CPU is interpreted as an attempt to synchronize - * The 5 seconds is to accommodate host load / swapping as - * well as any reset of TSC during the boot process. - * - * In that case, for a reliable TSC, we can match TSC offsets, - * or make a best guest using elapsed value. - */ - if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && - elapsed < 5ULL * NSEC_PER_SEC) { + * Special case: TSC write with a small delta (1 second) of virtual + * cycle time against real time is interpreted as an attempt to + * synchronize the CPU. + * + * For a reliable TSC, we can match TSC offsets, and for an unstable + * TSC, we add elapsed time in this computation. We could let the + * compensation code attempt to catch up if we fall behind, but + * it's better to try to match offsets from the beginning. + */ + if (nsdiff < NSEC_PER_SEC && + vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { offset = kvm->arch.last_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { u64 delta = nsec_to_cycles(vcpu, elapsed); - offset += delta; + data += delta; + offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } ns = kvm->arch.last_tsc_nsec; @@ -1059,6 +1072,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) kvm->arch.last_tsc_nsec = ns; kvm->arch.last_tsc_write = data; kvm->arch.last_tsc_offset = offset; + kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; kvm_x86_ops->write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); -- cgit v1.2.3-70-g09d2 From 6f526ec5383dcd5fa5ffc7b3ac1d62099a0b46ad Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:54 -0200 Subject: KVM: Add last_host_tsc tracking back to KVM The variable last_host_tsc was removed from upstream code. I am adding it back for two reasons. First, it is unnecessary to use guest TSC computation to conclude information about the host TSC. The guest may set the TSC backwards (this case handled by the previous patch), but the computation of guest TSC (and fetching an MSR) is significanlty more work and complexity than simply reading the hardware counter. In addition, we don't actually need the guest TSC for any part of the computation, by always recomputing the offset, we can eliminate the need to deal with the current offset and any scaling factors that may apply. The second reason is that later on, we are going to be using the host TSC value to restore TSC offsets after a host S4 suspend, so we need to be reading the host values, not the guest values here. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 11 +++-------- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8a34fca6c57..b23682900f4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -422,6 +422,7 @@ struct kvm_vcpu_arch { u64 last_kernel_ns; u64 last_tsc_nsec; u64 last_tsc_write; + u64 last_host_tsc; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a59f76d96f..39a57dac884 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2253,13 +2253,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->vcpu_load(vcpu, cpu); if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { - /* Make sure TSC doesn't go backwards */ - s64 tsc_delta; - u64 tsc; - - tsc = kvm_x86_ops->read_l1_tsc(vcpu); - tsc_delta = tsc - vcpu->arch.last_guest_tsc; - + s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : + native_read_tsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { @@ -2282,7 +2277,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + vcpu->arch.last_host_tsc = native_read_tsc(); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, -- cgit v1.2.3-70-g09d2 From f1e2b26003c41e581243c09ceed7567677449468 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 3 Feb 2012 15:43:55 -0200 Subject: KVM: Allow adjust_tsc_offset to be in host or guest cycles Redefine the API to take a parameter indicating whether an adjustment is in host or guest cycles. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 13 ++++++++++++- arch/x86/kvm/svm.c | 6 +++++- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b23682900f4..dd439f13df8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -646,7 +646,7 @@ struct kvm_x86_ops { u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); - void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment); + void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -676,6 +676,17 @@ struct kvm_arch_async_pf { extern struct kvm_x86_ops *kvm_x86_ops; +static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, + s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false); +} + +static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true); +} + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e12026e5244..0b7690ee20b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1016,10 +1016,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) { struct vcpu_svm *svm = to_svm(vcpu); + WARN_ON(adjustment < 0); + if (host) + adjustment = svm_scale_tsc(vcpu, adjustment); + svm->vmcb->control.tsc_offset += adjustment; if (is_guest_mode(vcpu)) svm->nested.hsave->control.tsc_offset += adjustment; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e6bf61fa1c0..575fb742a6f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1856,7 +1856,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) } } -static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) { u64 offset = vmcs_read64(TSC_OFFSET); vmcs_write64(TSC_OFFSET, offset + adjustment); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 39a57dac884..3b931302fa5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1116,7 +1116,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (vcpu->tsc_catchup) { u64 tsc = compute_guest_tsc(v, kernel_ns); if (tsc > tsc_timestamp) { - kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); + adjust_tsc_offset_guest(v, tsc - tsc_timestamp); tsc_timestamp = tsc; } } -- cgit v1.2.3-70-g09d2 From 0dd6a6edb0124e6c71931ff575b18e15ed6e8603 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:56 -0200 Subject: KVM: Dont mark TSC unstable due to S4 suspend During a host suspend, TSC may go backwards, which KVM interprets as an unstable TSC. Technically, KVM should not be marking the TSC unstable, which causes the TSC clocksource to go bad, but we need to be adjusting the TSC offsets in such a case. Dealing with this issue is a little tricky as the only place we can reliably do it is before much of the timekeeping infrastructure is up and running. On top of this, we are not in a KVM thread context, so we may not be able to safely access VCPU fields. Instead, we compute our best known hardware offset at power-up and stash it to be applied to all VCPUs when they actually start running. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 93 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 89 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dd439f13df8..4fbeb84b181 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -423,6 +423,7 @@ struct kvm_vcpu_arch { u64 last_tsc_nsec; u64 last_tsc_write; u64 last_host_tsc; + u64 tsc_offset_adjustment; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b931302fa5..4e9bd23d522 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2252,6 +2252,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_x86_ops->vcpu_load(vcpu, cpu); + + /* Apply any externally detected TSC adjustments (due to suspend) */ + if (unlikely(vcpu->arch.tsc_offset_adjustment)) { + adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); + vcpu->arch.tsc_offset_adjustment = 0; + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + } + if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : native_read_tsc() - vcpu->arch.last_host_tsc; @@ -5964,13 +5972,88 @@ int kvm_arch_hardware_enable(void *garbage) struct kvm *kvm; struct kvm_vcpu *vcpu; int i; + int ret; + u64 local_tsc; + u64 max_tsc = 0; + bool stable, backwards_tsc = false; kvm_shared_msr_cpu_online(); - list_for_each_entry(kvm, &vm_list, vm_list) - kvm_for_each_vcpu(i, vcpu, kvm) - if (vcpu->cpu == smp_processor_id()) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - return kvm_x86_ops->hardware_enable(garbage); + ret = kvm_x86_ops->hardware_enable(garbage); + if (ret != 0) + return ret; + + local_tsc = native_read_tsc(); + stable = !check_tsc_unstable(); + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!stable && vcpu->cpu == smp_processor_id()) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + if (stable && vcpu->arch.last_host_tsc > local_tsc) { + backwards_tsc = true; + if (vcpu->arch.last_host_tsc > max_tsc) + max_tsc = vcpu->arch.last_host_tsc; + } + } + } + + /* + * Sometimes, even reliable TSCs go backwards. This happens on + * platforms that reset TSC during suspend or hibernate actions, but + * maintain synchronization. We must compensate. Fortunately, we can + * detect that condition here, which happens early in CPU bringup, + * before any KVM threads can be running. Unfortunately, we can't + * bring the TSCs fully up to date with real time, as we aren't yet far + * enough into CPU bringup that we know how much real time has actually + * elapsed; our helper function, get_kernel_ns() will be using boot + * variables that haven't been updated yet. + * + * So we simply find the maximum observed TSC above, then record the + * adjustment to TSC in each VCPU. When the VCPU later gets loaded, + * the adjustment will be applied. Note that we accumulate + * adjustments, in case multiple suspend cycles happen before some VCPU + * gets a chance to run again. In the event that no KVM threads get a + * chance to run, we will miss the entire elapsed period, as we'll have + * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may + * loose cycle time. This isn't too big a deal, since the loss will be + * uniform across all VCPUs (not to mention the scenario is extremely + * unlikely). It is possible that a second hibernate recovery happens + * much faster than a first, causing the observed TSC here to be + * smaller; this would require additional padding adjustment, which is + * why we set last_host_tsc to the local tsc observed here. + * + * N.B. - this code below runs only on platforms with reliable TSC, + * as that is the only way backwards_tsc is set above. Also note + * that this runs for ALL vcpus, which is not a bug; all VCPUs should + * have the same delta_cyc adjustment applied if backwards_tsc + * is detected. Note further, this adjustment is only done once, + * as we reset last_host_tsc on all VCPUs to stop this from being + * called multiple times (one for each physical CPU bringup). + * + * Platforms with unnreliable TSCs don't have to deal with this, they + * will be compensated by the logic in vcpu_load, which sets the TSC to + * catchup mode. This will catchup all VCPUs to real time, but cannot + * guarantee that they stay in perfect synchronization. + */ + if (backwards_tsc) { + u64 delta_cyc = max_tsc - local_tsc; + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.tsc_offset_adjustment += delta_cyc; + vcpu->arch.last_host_tsc = local_tsc; + } + + /* + * We have to disable TSC offset matching.. if you were + * booting a VM while issuing an S4 host suspend.... + * you may have some problem. Solving this issue is + * left as an exercise to the reader. + */ + kvm->arch.last_tsc_nsec = 0; + kvm->arch.last_tsc_write = 0; + } + + } + return 0; } void kvm_arch_hardware_disable(void *garbage) -- cgit v1.2.3-70-g09d2 From e26101b116a6235bcd80b3a4c38c9fe91286cd79 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:57 -0200 Subject: KVM: Track TSC synchronization in generations This allows us to track the original nanosecond and counter values at each phase of TSC writing by the guest. This gets us perfect offset matching for stable TSC systems, and perfect software computed TSC matching for machines with unstable TSC. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 10 +++++++--- arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++-------- 2 files changed, 40 insertions(+), 11 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4fbeb84b181..c24125cd0c6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -420,10 +420,11 @@ struct kvm_vcpu_arch { u64 last_guest_tsc; u64 last_kernel_ns; - u64 last_tsc_nsec; - u64 last_tsc_write; u64 last_host_tsc; u64 tsc_offset_adjustment; + u64 this_tsc_nsec; + u64 this_tsc_write; + u8 this_tsc_generation; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; @@ -513,9 +514,12 @@ struct kvm_arch { s64 kvmclock_offset; raw_spinlock_t tsc_write_lock; u64 last_tsc_nsec; - u64 last_tsc_offset; u64 last_tsc_write; u32 last_tsc_khz; + u64 cur_tsc_nsec; + u64 cur_tsc_write; + u64 cur_tsc_offset; + u8 cur_tsc_generation; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4e9bd23d522..e86f9b22eac 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1013,10 +1013,10 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { - u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, + u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, vcpu->arch.virtual_tsc_mult, vcpu->arch.virtual_tsc_shift); - tsc += vcpu->arch.last_tsc_write; + tsc += vcpu->arch.this_tsc_write; return tsc; } @@ -1059,7 +1059,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) if (nsdiff < NSEC_PER_SEC && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { - offset = kvm->arch.last_tsc_offset; + offset = kvm->arch.cur_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { u64 delta = nsec_to_cycles(vcpu, elapsed); @@ -1067,20 +1067,45 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } + } else { + /* + * We split periods of matched TSC writes into generations. + * For each generation, we track the original measured + * nanosecond time, offset, and write, so if TSCs are in + * sync, we can match exact offset, and if not, we can match + * exact software computaion in compute_guest_tsc() + * + * These values are tracked in kvm->arch.cur_xxx variables. + */ + kvm->arch.cur_tsc_generation++; + kvm->arch.cur_tsc_nsec = ns; + kvm->arch.cur_tsc_write = data; + kvm->arch.cur_tsc_offset = offset; + pr_debug("kvm: new tsc generation %u, clock %llu\n", + kvm->arch.cur_tsc_generation, data); } + + /* + * We also track th most recent recorded KHZ, write and time to + * allow the matching interval to be extended at each write. + */ kvm->arch.last_tsc_nsec = ns; kvm->arch.last_tsc_write = data; - kvm->arch.last_tsc_offset = offset; kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; - kvm_x86_ops->write_tsc_offset(vcpu, offset); - raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); /* Reset of TSC must disable overshoot protection below */ vcpu->arch.hv_clock.tsc_timestamp = 0; - vcpu->arch.last_tsc_write = data; - vcpu->arch.last_tsc_nsec = ns; vcpu->arch.last_guest_tsc = data; + + /* Keep track of which generation this VCPU has synchronized to */ + vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; + vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; + vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; + + kvm_x86_ops->write_tsc_offset(vcpu, offset); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); } + EXPORT_SYMBOL_GPL(kvm_write_tsc); static int kvm_guest_time_update(struct kvm_vcpu *v) -- cgit v1.2.3-70-g09d2 From db3fe4eb45f3555d91a7124e18cf3a2f2a30eb90 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 8 Feb 2012 13:02:18 +0900 Subject: KVM: Introduce kvm_memory_slot::arch and move lpage_info into it Some members of kvm_memory_slot are not used by every architecture. This patch is the first step to make this difference clear by introducing kvm_memory_slot::arch; lpage_info is moved into it. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_host.h | 3 ++ arch/ia64/kvm/kvm-ia64.c | 10 ++++++ arch/powerpc/include/asm/kvm_host.h | 3 ++ arch/powerpc/kvm/powerpc.c | 10 ++++++ arch/s390/include/asm/kvm_host.h | 3 ++ arch/s390/kvm/kvm-s390.c | 10 ++++++ arch/x86/include/asm/kvm_host.h | 9 +++++ arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/x86.c | 59 +++++++++++++++++++++++++++++++ include/linux/kvm_host.h | 11 +++--- virt/kvm/kvm_main.c | 70 +++++-------------------------------- 11 files changed, 122 insertions(+), 68 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 2689ee54a1c..e35b3a84a40 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -459,6 +459,9 @@ struct kvm_sal_data { unsigned long boot_gp; }; +struct kvm_arch_memory_slot { +}; + struct kvm_arch { spinlock_t dirty_log_lock; diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 8ca7261e7b3..d8ddbba6fe7 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1571,6 +1571,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1843d5d2a3b..52eb9c1f4fe 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -213,6 +213,9 @@ struct revmap_entry { #define KVMPPC_PAGE_WRITETHRU HPTE_R_W /* 0x40 */ #define KVMPPC_GOT_PAGE 0x80 +struct kvm_arch_memory_slot { +}; + struct kvm_arch { #ifdef CONFIG_KVM_BOOK3S_64_HV unsigned long hpt_virt; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0e21d155eea..00d7e345b3f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -281,6 +281,16 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index e6304268ea2..7343872890a 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -245,6 +245,9 @@ struct kvm_vm_stat { u32 remote_tlb_flush; }; +struct kvm_arch_memory_slot { +}; + struct kvm_arch{ struct sca_block *sca; debug_info_t *dbf; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index cf3c0a91d04..17ad69d596f 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -814,6 +814,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c24125cd0c6..74c9edf2bb1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -483,6 +483,15 @@ struct kvm_vcpu_arch { } osvw; }; +struct kvm_lpage_info { + unsigned long rmap_pde; + int write_count; +}; + +struct kvm_arch_memory_slot { + struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 37e7f100a0e..ff053ca3230 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -689,7 +689,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, unsigned long idx; idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->lpage_info[level - 2][idx]; + return &slot->arch.lpage_info[level - 2][idx]; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3df0b7a140b..ca74c1dadf3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6239,6 +6239,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.ept_identity_pagetable); } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + int i; + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { + vfree(free->arch.lpage_info[i]); + free->arch.lpage_info[i] = NULL; + } + } +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + int i; + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + unsigned long ugfn; + int lpages; + int level = i + 2; + + lpages = gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; + + slot->arch.lpage_info[i] = + vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); + if (!slot->arch.lpage_info[i]) + goto out_free; + + if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) + slot->arch.lpage_info[i][0].write_count = 1; + if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) + slot->arch.lpage_info[i][lpages - 1].write_count = 1; + ugfn = slot->userspace_addr >> PAGE_SHIFT; + /* + * If the gfn and userspace address are not aligned wrt each + * other, or if explicitly asked to, disable large page + * support for this slot + */ + if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || + !kvm_largepages_enabled()) { + unsigned long j; + + for (j = 0; j < lpages; ++j) + slot->arch.lpage_info[i][j].write_count = 1; + } + } + + return 0; + +out_free: + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + vfree(slot->arch.lpage_info[i]); + slot->arch.lpage_info[i] = NULL; + } + return -ENOMEM; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7a08496b974..355e44555c3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -171,11 +171,6 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) */ #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) -struct kvm_lpage_info { - unsigned long rmap_pde; - int write_count; -}; - struct kvm_memory_slot { gfn_t base_gfn; unsigned long npages; @@ -184,7 +179,7 @@ struct kvm_memory_slot { unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_head; unsigned long nr_dirty_pages; - struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; + struct kvm_arch_memory_slot arch; unsigned long userspace_addr; int user_alloc; int id; @@ -376,6 +371,9 @@ int kvm_set_memory_region(struct kvm *kvm, int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc); +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont); +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, @@ -385,6 +383,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old, int user_alloc); +bool kvm_largepages_enabled(void); void kvm_disable_largepages(void); void kvm_arch_flush_shadow(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a30447c5eb4..8340e0e6203 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -535,21 +535,13 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) static void kvm_free_physmem_slot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { - int i; - if (!dont || free->rmap != dont->rmap) vfree(free->rmap); if (!dont || free->dirty_bitmap != dont->dirty_bitmap) kvm_destroy_dirty_bitmap(free); - - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { - vfree(free->lpage_info[i]); - free->lpage_info[i] = NULL; - } - } + kvm_arch_free_memslot(free, dont); free->npages = 0; free->rmap = NULL; @@ -685,53 +677,6 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new) slots->generation++; } -#ifndef CONFIG_S390 -static int create_lpage_info(struct kvm_memory_slot *slot, unsigned long npages) -{ - int i; - - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - unsigned long ugfn; - int lpages; - int level = i + 2; - - lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; - - slot->lpage_info[i] = vzalloc(lpages * sizeof(*slot->lpage_info[i])); - if (!slot->lpage_info[i]) - goto out_free; - - if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->lpage_info[i][0].write_count = 1; - if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->lpage_info[i][lpages - 1].write_count = 1; - ugfn = slot->userspace_addr >> PAGE_SHIFT; - /* - * If the gfn and userspace address are not aligned wrt each - * other, or if explicitly asked to, disable large page - * support for this slot - */ - if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || - !largepages_enabled) { - unsigned long j; - - for (j = 0; j < lpages; ++j) - slot->lpage_info[i][j].write_count = 1; - } - } - - return 0; - -out_free: - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - vfree(slot->lpage_info[i]); - slot->lpage_info[i] = NULL; - } - return -ENOMEM; -} -#endif /* not defined CONFIG_S390 */ - /* * Allocate some memory and give it an address in the guest physical address * space. @@ -819,10 +764,9 @@ int __kvm_set_memory_region(struct kvm *kvm, new.rmap = vzalloc(npages * sizeof(*new.rmap)); if (!new.rmap) goto out_free; - - if (create_lpage_info(&new, npages)) - goto out_free; #endif /* not defined CONFIG_S390 */ + if (kvm_arch_create_memslot(&new, npages)) + goto out_free; } /* Allocate page dirty bitmap if needed */ @@ -880,8 +824,7 @@ int __kvm_set_memory_region(struct kvm *kvm, if (!npages) { new.rmap = NULL; new.dirty_bitmap = NULL; - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) - new.lpage_info[i] = NULL; + memset(&new.arch, 0, sizeof(new.arch)); } update_memslots(slots, &new); @@ -968,6 +911,11 @@ out: return r; } +bool kvm_largepages_enabled(void) +{ + return largepages_enabled; +} + void kvm_disable_largepages(void) { largepages_enabled = false; -- cgit v1.2.3-70-g09d2 From 7f3d35fddd173e52886d03bc34b5b5d6f5bea343 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 8 Feb 2012 14:34:38 +0100 Subject: KVM: x86 emulator: Fix task switch privilege checks Currently, all task switches check privileges against the DPL of the TSS. This is only correct for jmp/call to a TSS. If a task gate is used, the DPL of this take gate is used for the check instead. Exceptions, external interrupts and iret shouldn't perform any check. [avi: kill kvm-kmod remnants] Signed-off-by: Kevin Wolf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 +- arch/x86/include/asm/kvm_host.h | 4 +-- arch/x86/kvm/emulate.c | 53 +++++++++++++++++++++++++++++++++----- arch/x86/kvm/svm.c | 5 +++- arch/x86/kvm/vmx.c | 8 +++--- arch/x86/kvm/x86.c | 6 ++--- 6 files changed, 61 insertions(+), 17 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 7b9cfc4878a..df437b68f42 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -388,7 +388,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_INTERCEPTED 2 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74c9edf2bb1..e216ba066e7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -768,8 +768,8 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, - bool has_error_code, u32 error_code); +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, + int reason, bool has_error_code, u32 error_code); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 71450aca3b8..fa310a48591 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1152,6 +1152,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, return 1; } +static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt, + u16 index, struct desc_struct *desc) +{ + struct desc_ptr dt; + ulong addr; + + ctxt->ops->get_idt(ctxt, &dt); + + if (dt.size < index * 8 + 7) + return emulate_gp(ctxt, index << 3 | 0x2); + + addr = dt.address + index * 8; + return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, + &ctxt->exception); +} + static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_ptr *dt) { @@ -2421,7 +2437,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, } static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { struct x86_emulate_ops *ops = ctxt->ops; @@ -2443,12 +2459,35 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, /* FIXME: check that next_tss_desc is tss */ - if (reason != TASK_SWITCH_IRET) { - if ((tss_selector & 3) > next_tss_desc.dpl || - ops->cpl(ctxt) > next_tss_desc.dpl) - return emulate_gp(ctxt, 0); + /* + * Check privileges. The three cases are task switch caused by... + * + * 1. jmp/call/int to task gate: Check against DPL of the task gate + * 2. Exception/IRQ/iret: No check is performed + * 3. jmp/call to TSS: Check agains DPL of the TSS + */ + if (reason == TASK_SWITCH_GATE) { + if (idt_index != -1) { + /* Software interrupts */ + struct desc_struct task_gate_desc; + int dpl; + + ret = read_interrupt_descriptor(ctxt, idt_index, + &task_gate_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + dpl = task_gate_desc.dpl; + if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) + return emulate_gp(ctxt, (idt_index << 3) | 0x2); + } + } else if (reason != TASK_SWITCH_IRET) { + int dpl = next_tss_desc.dpl; + if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) + return emulate_gp(ctxt, tss_selector); } + desc_limit = desc_limit_scaled(&next_tss_desc); if (!next_tss_desc.p || ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || @@ -2501,7 +2540,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, } int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { int rc; @@ -2509,7 +2548,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, ctxt->_eip = ctxt->eip; ctxt->dst.type = OP_NONE; - rc = emulator_do_task_switch(ctxt, tss_selector, reason, + rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); if (rc == X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0b7690ee20b..95cdeaf9c71 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2799,7 +2799,10 @@ static int task_switch_interception(struct vcpu_svm *svm) (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) skip_emulated_instruction(&svm->vcpu); - if (kvm_task_switch(&svm->vcpu, tss_selector, reason, + if (int_type != SVM_EXITINTINFO_TYPE_SOFT) + int_vec = -1; + + if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, has_error_code, error_code) == EMULATE_FAIL) { svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d2bd719925a..124a0952a04 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4658,9 +4658,10 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) bool has_error_code = false; u32 error_code = 0; u16 tss_selector; - int reason, type, idt_v; + int reason, type, idt_v, idt_index; idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -4698,8 +4699,9 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) type != INTR_TYPE_NMI_INTR)) skip_emulated_instruction(vcpu); - if (kvm_task_switch(vcpu, tss_selector, reason, - has_error_code, error_code) == EMULATE_FAIL) { + if (kvm_task_switch(vcpu, tss_selector, + type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, + has_error_code, error_code) == EMULATE_FAIL) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ca74c1dadf3..490a1b1a255 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5655,15 +5655,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return 0; } -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, - bool has_error_code, u32 error_code) +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, + int reason, bool has_error_code, u32 error_code) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); - ret = emulator_task_switch(ctxt, tss_selector, reason, + ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); if (ret) -- cgit v1.2.3-70-g09d2 From 4cee4798a304ee1ea579423ca048f16ceaccdfb5 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 8 Feb 2012 14:34:41 +0100 Subject: KVM: x86 emulator: Allow PM/VM86 switch during task switch Task switches can switch between Protected Mode and VM86. The current mode must be updated during the task switch emulation so that the new segment selectors are interpreted correctly. In order to let privilege checks succeed, rflags needs to be updated in the vcpu struct as this causes a CPL update. Signed-off-by: Kevin Wolf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 20 ++++++++++++++++++++ arch/x86/kvm/svm.c | 4 ++++ arch/x86/kvm/x86.c | 6 ++++++ 4 files changed, 31 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index df437b68f42..c222e1a1b12 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -176,6 +176,7 @@ struct x86_emulate_ops { void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); + void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val); int (*cpl)(struct x86_emulate_ctxt *ctxt); int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b19e9fffe58..83756223f8a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2344,6 +2344,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, return emulate_gp(ctxt, 0); ctxt->_eip = tss->eip; ctxt->eflags = tss->eflags | 2; + + /* General purpose registers */ ctxt->regs[VCPU_REGS_RAX] = tss->eax; ctxt->regs[VCPU_REGS_RCX] = tss->ecx; ctxt->regs[VCPU_REGS_RDX] = tss->edx; @@ -2365,6 +2367,24 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS); set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); + /* + * If we're switching between Protected Mode and VM86, we need to make + * sure to update the mode before loading the segment descriptors so + * that the selectors are interpreted correctly. + * + * Need to get rflags to the vcpu struct immediately because it + * influences the CPL which is checked at least when loading the segment + * descriptors and when pushing an error code to the new kernel stack. + * + * TODO Introduce a separate ctxt->ops->set_cpl callback + */ + if (ctxt->eflags & X86_EFLAGS_VM) + ctxt->mode = X86EMUL_MODE_VM86; + else + ctxt->mode = X86EMUL_MODE_PROT32; + + ctxt->ops->set_rflags(ctxt, ctxt->eflags); + /* * Now load segment descriptors. If fault happenes at this stage * it is handled in a context of new task diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ab39d84dee0..53efd597f39 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1354,7 +1354,11 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags; + to_svm(vcpu)->vmcb->save.rflags = rflags; + if ((old_rflags ^ rflags) & X86_EFLAGS_VM) + svm_update_cpl(vcpu); } static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 490a1b1a255..03a1fd47a6d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4129,6 +4129,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) return res; } +static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) +{ + kvm_set_rflags(emul_to_vcpu(ctxt), val); +} + static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); @@ -4310,6 +4315,7 @@ static struct x86_emulate_ops emulate_ops = { .set_idt = emulator_set_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, + .set_rflags = emulator_set_rflags, .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, -- cgit v1.2.3-70-g09d2 From a7b9d2ccc3d86303ee9314612d301966e04011c7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 26 Feb 2012 16:55:40 +0200 Subject: KVM: PMU: warn when pin control is set in eventsel msr Print warning once if pin control bit is set in eventsel msr since emulation does not support it yet. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/perf_event.h | 1 + arch/x86/kvm/pmu.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 096c975e099..f1f71823f68 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -23,6 +23,7 @@ #define ARCH_PERFMON_EVENTSEL_USR (1ULL << 16) #define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17) #define ARCH_PERFMON_EVENTSEL_EDGE (1ULL << 18) +#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL (1ULL << 19) #define ARCH_PERFMON_EVENTSEL_INT (1ULL << 20) #define ARCH_PERFMON_EVENTSEL_ANY (1ULL << 21) #define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3e48c1d3edc..6af9a542e54 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -210,6 +210,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) unsigned config, type = PERF_TYPE_RAW; u8 event_select, unit_mask; + if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) + printk_once("kvm pmu: pin control bit is ignored\n"); + pmc->eventsel = eventsel; stop_counter(pmc); -- cgit v1.2.3-70-g09d2 From 73c154c60be106b47f15d1111fc2d75cc7a436f2 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 13 Feb 2012 22:26:32 -0500 Subject: xen/enlighten: Expose MWAIT and MWAIT_LEAF if hypervisor OKs it. For the hypervisor to take advantage of the MWAIT support it needs to extract from the ACPI _CST the register address. But the hypervisor does not have the support to parse DSDT so it relies on the initial domain (dom0) to parse the ACPI Power Management information and push it up to the hypervisor. The pushing of the data is done by the processor_harveset_xen module which parses the information that the ACPI parser has graciously exposed in 'struct acpi_processor'. For the ACPI parser to also expose the Cx states for MWAIT, we need to expose the MWAIT capability (leaf 1). Furthermore we also need to expose the MWAIT_LEAF capability (leaf 5) for cstate.c to properly function. The hypervisor could expose these flags when it traps the XEN_EMULATE_PREFIX operations, but it can't do it since it needs to be backwards compatible. Instead we choose to use the native CPUID to figure out if the MWAIT capability exists and use the XEN_SET_PDC query hypercall to figure out if the hypervisor wants us to expose the MWAIT_LEAF capability or not. Note: The XEN_SET_PDC query was implemented in c/s 23783: "ACPI: add _PDC input override mechanism". With this in place, instead of C3 ACPI IOPORT 415 we get now C3:ACPI FFH INTEL MWAIT 0x20 Note: The cpu_idle which would be calling the mwait variants for idling never gets set b/c we set the default pm_idle to be the hypercall variant. Acked-by: Jan Beulich [v2: Fix missing header file include and #ifdef] Signed-off-by: Konrad Rzeszutek Wilk --- arch/ia64/include/asm/xen/interface.h | 1 + arch/x86/include/asm/xen/interface.h | 1 + arch/x86/xen/enlighten.c | 93 ++++++++++++++++++++++++++++++++++- include/xen/interface/platform.h | 3 +- 4 files changed, 96 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/ia64/include/asm/xen/interface.h b/arch/ia64/include/asm/xen/interface.h index fbb519828aa..09d5f7fd9db 100644 --- a/arch/ia64/include/asm/xen/interface.h +++ b/arch/ia64/include/asm/xen/interface.h @@ -77,6 +77,7 @@ DEFINE_GUEST_HANDLE(int); DEFINE_GUEST_HANDLE(long); DEFINE_GUEST_HANDLE(void); DEFINE_GUEST_HANDLE(uint64_t); +DEFINE_GUEST_HANDLE(uint32_t); typedef unsigned long xen_pfn_t; DEFINE_GUEST_HANDLE(xen_pfn_t); diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index a1f2db5f117..cbf0c9d50b9 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -56,6 +56,7 @@ DEFINE_GUEST_HANDLE(int); DEFINE_GUEST_HANDLE(long); DEFINE_GUEST_HANDLE(void); DEFINE_GUEST_HANDLE(uint64_t); +DEFINE_GUEST_HANDLE(uint32_t); #endif #ifndef HYPERVISOR_VIRT_START diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 312c9e3cb63..fe06bf4ef0e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -62,6 +62,15 @@ #include #include #include +#include + +#ifdef CONFIG_ACPI +#include +#include +#include +#include +#include +#endif #include "xen-ops.h" #include "mmu.h" @@ -200,13 +209,17 @@ static void __init xen_banner(void) static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; +static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask; +static __read_mostly unsigned int cpuid_leaf5_ecx_val; +static __read_mostly unsigned int cpuid_leaf5_edx_val; + static void xen_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { unsigned maskebx = ~0; unsigned maskecx = ~0; unsigned maskedx = ~0; - + unsigned setecx = 0; /* * Mask out inconvenient features, to try and disable as many * unsupported kernel subsystems as possible. @@ -214,9 +227,18 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, switch (*ax) { case 1: maskecx = cpuid_leaf1_ecx_mask; + setecx = cpuid_leaf1_ecx_set_mask; maskedx = cpuid_leaf1_edx_mask; break; + case CPUID_MWAIT_LEAF: + /* Synthesize the values.. */ + *ax = 0; + *bx = 0; + *cx = cpuid_leaf5_ecx_val; + *dx = cpuid_leaf5_edx_val; + return; + case 0xb: /* Suppress extended topology stuff */ maskebx = 0; @@ -232,9 +254,75 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, *bx &= maskebx; *cx &= maskecx; + *cx |= setecx; *dx &= maskedx; + } +static bool __init xen_check_mwait(void) +{ +#ifdef CONFIG_ACPI + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .u.set_pminfo.id = -1, + .u.set_pminfo.type = XEN_PM_PDC, + }; + uint32_t buf[3]; + unsigned int ax, bx, cx, dx; + unsigned int mwait_mask; + + /* We need to determine whether it is OK to expose the MWAIT + * capability to the kernel to harvest deeper than C3 states from ACPI + * _CST using the processor_harvest_xen.c module. For this to work, we + * need to gather the MWAIT_LEAF values (which the cstate.c code + * checks against). The hypervisor won't expose the MWAIT flag because + * it would break backwards compatibility; so we will find out directly + * from the hardware and hypercall. + */ + if (!xen_initial_domain()) + return false; + + ax = 1; + cx = 0; + + native_cpuid(&ax, &bx, &cx, &dx); + + mwait_mask = (1 << (X86_FEATURE_EST % 32)) | + (1 << (X86_FEATURE_MWAIT % 32)); + + if ((cx & mwait_mask) != mwait_mask) + return false; + + /* We need to emulate the MWAIT_LEAF and for that we need both + * ecx and edx. The hypercall provides only partial information. + */ + + ax = CPUID_MWAIT_LEAF; + bx = 0; + cx = 0; + dx = 0; + + native_cpuid(&ax, &bx, &cx, &dx); + + /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so, + * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3. + */ + buf[0] = ACPI_PDC_REVISION_ID; + buf[1] = 1; + buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP); + + set_xen_guest_handle(op.u.set_pminfo.pdc, buf); + + if ((HYPERVISOR_dom0_op(&op) == 0) && + (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { + cpuid_leaf5_ecx_val = cx; + cpuid_leaf5_edx_val = dx; + } + return true; +#else + return false; +#endif +} static void __init xen_init_cpuid_mask(void) { unsigned int ax, bx, cx, dx; @@ -261,6 +349,9 @@ static void __init xen_init_cpuid_mask(void) /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ if ((cx & xsave_mask) != xsave_mask) cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ + + if (xen_check_mwait()) + cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32)); } static void xen_set_debugreg(int reg, unsigned long val) diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h index c1684680431..861b359b44a 100644 --- a/include/xen/interface/platform.h +++ b/include/xen/interface/platform.h @@ -200,7 +200,7 @@ DEFINE_GUEST_HANDLE_STRUCT(xenpf_getidletime_t); #define XEN_PM_CX 0 #define XEN_PM_PX 1 #define XEN_PM_TX 2 - +#define XEN_PM_PDC 3 /* Px sub info type */ #define XEN_PX_PCT 1 #define XEN_PX_PSS 2 @@ -293,6 +293,7 @@ struct xenpf_set_processor_pminfo { union { struct xen_processor_power power;/* Cx: _CST/_CSD */ struct xen_processor_performance perf; /* Px: _PPC/_PCT/_PSS/_PSD */ + GUEST_HANDLE(uint32_t) pdc; }; }; DEFINE_GUEST_HANDLE_STRUCT(xenpf_set_processor_pminfo); -- cgit v1.2.3-70-g09d2 From 9993bc635d01a6ee7f6b833b4ee65ce7c06350b1 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Fri, 9 Mar 2012 16:41:01 -0800 Subject: sched/x86: Fix overflow in cyc2ns_offset When a machine boots up, the TSC generally gets reset. However, when kexec is used to boot into a kernel, the TSC value would be carried over from the previous kernel. The computation of cycns_offset in set_cyc2ns_scale is prone to an overflow, if the machine has been up more than 208 days prior to the kexec. The overflow happens when we multiply *scale, even though there is enough room to store the final answer. We fix this issue by decomposing tsc_now into the quotient and remainder of division by CYC2NS_SCALE_FACTOR and then performing the multiplication separately on the two components. Refactor code to share the calculation with the previous fix in __cycles_2_ns(). Signed-off-by: Salman Qazi Acked-by: John Stultz Acked-by: Peter Zijlstra Cc: Paul Turner Cc: john stultz Link: http://lkml.kernel.org/r/20120310004027.19291.88460.stgit@dungbeetle.mtv.corp.google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/timer.h | 8 ++------ arch/x86/kernel/tsc.c | 3 ++- include/linux/kernel.h | 13 +++++++++++++ 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 431793e5d48..34baa0eb5d0 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -57,14 +57,10 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); static inline unsigned long long __cycles_2_ns(unsigned long long cyc) { - unsigned long long quot; - unsigned long long rem; int cpu = smp_processor_id(); unsigned long long ns = per_cpu(cyc2ns_offset, cpu); - quot = (cyc >> CYC2NS_SCALE_FACTOR); - rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1); - ns += quot * per_cpu(cyc2ns, cpu) + - ((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR); + ns += mult_frac(cyc, per_cpu(cyc2ns, cpu), + (1UL << CYC2NS_SCALE_FACTOR)); return ns; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a62c201c97e..183c5925a9f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -620,7 +620,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) if (cpu_khz) { *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; - *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); + *offset = ns_now - mult_frac(tsc_now, *scale, + (1UL << CYC2NS_SCALE_FACTOR)); } sched_clock_idle_wakeup_event(0); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e8343422240..d801acb5e68 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -85,6 +85,19 @@ } \ ) +/* + * Multiplies an integer by a fraction, while avoiding unnecessary + * overflow or loss of precision. + */ +#define mult_frac(x, numer, denom)( \ +{ \ + typeof(x) quot = (x) / (denom); \ + typeof(x) rem = (x) % (denom); \ + (quot * (numer)) + ((rem * (numer)) / (denom)); \ +} \ +) + + #define _RET_IP_ (unsigned long)__builtin_return_address(0) #define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) -- cgit v1.2.3-70-g09d2 From fa63030e9c79e37b4d4e63b39ffb09cfb7aa0fe4 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Wed, 14 Mar 2012 15:17:34 +0800 Subject: x86/platform: Move APIC ID validity check into platform APIC code Move APIC ID validity check into platform APIC code, so it can be overridden when needed. For NumaChip systems, always trust MADT, as it's constructed with high APIC IDs. Behaviour verifies on standard x86 systems and on NumaChip systems with this, and compile-tested with allyesconfig. Signed-off-by: Daniel J Blueman Reviewed-by: Steffen Persvold Cc: Yinghai Lu Cc: H. Peter Anvin Cc: Suresh Siddha Link: http://lkml.kernel.org/r/1331709454-27966-1-git-send-email-daniel@numascale-asia.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 6 ++++++ arch/x86/kernel/apic/apic_flat_64.c | 2 ++ arch/x86/kernel/apic/apic_noop.c | 1 + arch/x86/kernel/apic/apic_numachip.c | 10 +++++++++- arch/x86/kernel/apic/bigsmp_32.c | 1 + arch/x86/kernel/apic/es7000_32.c | 2 ++ arch/x86/kernel/apic/numaq_32.c | 1 + arch/x86/kernel/apic/probe_32.c | 1 + arch/x86/kernel/apic/summit_32.c | 1 + arch/x86/kernel/apic/x2apic_cluster.c | 1 + arch/x86/kernel/apic/x2apic_phys.c | 1 + arch/x86/kernel/apic/x2apic_uv_x.c | 1 + arch/x86/kernel/smpboot.c | 2 +- 13 files changed, 28 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3ab9bdd87e7..a9371c91718 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -288,6 +288,7 @@ struct apic { int (*probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); + int (*apic_id_valid)(int apicid); int (*apic_id_registered)(void); u32 irq_delivery_mode; @@ -532,6 +533,11 @@ static inline unsigned int read_apic_id(void) return apic->get_apic_id(reg); } +static inline int default_apic_id_valid(int apicid) +{ + return x2apic_mode || (apicid < 255); +} + extern void default_setup_apic_routing(void); extern struct apic apic_noop; diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 8c3cdded6f2..359b6899a36 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -180,6 +180,7 @@ static struct apic apic_flat = { .name = "flat", .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, @@ -337,6 +338,7 @@ static struct apic apic_physflat = { .name = "physical flat", .probe = physflat_probe, .acpi_madt_oem_check = physflat_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 775b82bc655..634ae6cdd5c 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -124,6 +124,7 @@ struct apic apic_noop = { .probe = noop_probe, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 09d3d8c1cd9..d9ea5f331ac 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -56,6 +56,12 @@ static unsigned int read_xapic_id(void) return get_apic_id(apic_read(APIC_ID)); } +static int numachip_apic_id_valid(int apicid) +{ + /* Trust what bootloader passes in MADT */ + return 1; +} + static int numachip_apic_id_registered(void) { return physid_isset(read_xapic_id(), phys_cpu_present_map); @@ -223,10 +229,11 @@ static int __init numachip_system_init(void) } early_initcall(numachip_system_init); -static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int __cpuinit numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strncmp(oem_id, "NUMASC", 6)) { numachip_system = 1; + setup_force_cpu_cap(X86_FEATURE_X2APIC); return 1; } @@ -238,6 +245,7 @@ static struct apic apic_numachip __refconst = { .name = "NumaConnect system", .probe = numachip_probe, .acpi_madt_oem_check = numachip_acpi_madt_oem_check, + .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 521bead0113..0cdec7065af 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -198,6 +198,7 @@ static struct apic apic_bigsmp = { .name = "bigsmp", .probe = probe_bigsmp, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = bigsmp_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 5d513bc47b6..e42d1d3b913 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -625,6 +625,7 @@ static struct apic __refdata apic_es7000_cluster = { .name = "es7000", .probe = probe_es7000, .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = es7000_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, @@ -690,6 +691,7 @@ static struct apic __refdata apic_es7000 = { .name = "es7000", .probe = probe_es7000, .acpi_madt_oem_check = es7000_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = es7000_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index c4a61ca1349..00d2422ca7c 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -478,6 +478,7 @@ static struct apic __refdata apic_numaq = { .name = "NUMAQ", .probe = probe_numaq, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = numaq_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 0787bb3412f..ff2c1b9aac4 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -92,6 +92,7 @@ static struct apic apic_default = { .name = "default", .probe = probe_default, .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 19114423c58..fea000b27f0 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -496,6 +496,7 @@ static struct apic apic_summit = { .name = "summit", .probe = probe_summit, .acpi_madt_oem_check = summit_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = summit_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 50079587582..9193713060a 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -213,6 +213,7 @@ static struct apic apic_x2apic_cluster = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index f5373dfde21..bcd1db6eaca 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -119,6 +119,7 @@ static struct apic apic_x2apic_phys = { .name = "physical x2apic", .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 79b05b88aa1..fc477142585 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -351,6 +351,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .name = "UV large system", .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d1..d279e6e1d1b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -847,7 +847,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || !physid_isset(apicid, phys_cpu_present_map) || - (!x2apic_mode && apicid >= 255)) { + !apic->apic_id_valid(apicid)) { printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } -- cgit v1.2.3-70-g09d2 From 31796ac4e8f0e88f5c10f1ad6dab8f19bebe44a4 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 14 Mar 2012 14:27:52 -0700 Subject: x32: Fix alignment fail in struct compat_siginfo Adding struct _sigchld_x32 caused a misalignment cascade in struct siginfo, because union _sifields is located on an 4-byte boundary (8-byte misaligned.) Adding new fields that are 8-byte aligned caused the intermediate structures to also be aligned to 8 bytes, thereby adding padding in unexpected places. Thus, change s64 to compat_s64 here, which makes it "misaligned on paper". In reality these fields *are* actually aligned (there are 3 preceeding ints outside the union and 3 inside struct _sigchld_x32), but because of the intervening union and struct it is not possible for gcc to avoid padding without breaking the ABI. Reported-and-tested-by: H. J. Lu Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1329696488-16970-1-git-send-email-hpa@zytor.com --- arch/x86/include/asm/ia32.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 7d0c1858770..ee52760549f 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -130,8 +130,8 @@ typedef struct compat_siginfo { unsigned int _pid; /* which child */ unsigned int _uid; /* sender's uid */ int _status; /* exit code */ - s64 _utime; - s64 _stime; + compat_s64 _utime; + compat_s64 _stime; } _sigchld_x32; /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ -- cgit v1.2.3-70-g09d2 From 2ab516575f2f273b19d95140d02c54612201e80a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Feb 2012 19:46:04 +0000 Subject: x86: vdso: Use seqcount instead of seqlock The update of the vdso data happens under xtime_lock, so adding a nested lock is pointless. Just use a seqcount to sync the readers. Reviewed-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- arch/x86/include/asm/vgtod.h | 2 +- arch/x86/kernel/vsyscall_64.c | 11 +++-------- arch/x86/vdso/vclock_gettime.c | 16 ++++++++-------- 3 files changed, 12 insertions(+), 17 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 815285bcace..1f007178c81 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -5,7 +5,7 @@ #include struct vsyscall_gtod_data { - seqlock_t lock; + seqcount_t seq; /* open coded 'struct timespec' */ time_t wall_time_sec; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 33385c18e5d..cdc95a707cd 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -52,10 +52,7 @@ #include "vsyscall_trace.h" DEFINE_VVAR(int, vgetcpu_mode); -DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = -{ - .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), -}; +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; @@ -86,9 +83,7 @@ void update_vsyscall_tz(void) void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, struct clocksource *clock, u32 mult) { - unsigned long flags; - - write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + write_seqcount_begin(&vsyscall_gtod_data.seq); /* copy vsyscall data */ vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; @@ -101,7 +96,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, vsyscall_gtod_data.wall_to_monotonic = *wtm; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); - write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); + write_seqcount_end(&vsyscall_gtod_data.seq); } static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 7eeb1f6188e..944c5e5d6b6 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -100,12 +100,12 @@ notrace static noinline int do_realtime(struct timespec *ts) int mode; do { - seq = read_seqbegin(>od->lock); + seq = read_seqcount_begin(>od->seq); mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->wall_time_sec; ts->tv_nsec = gtod->wall_time_nsec; ns = vgetns(); - } while (unlikely(read_seqretry(>od->lock, seq))); + } while (unlikely(read_seqcount_retry(>od->seq, seq))); timespec_add_ns(ts, ns); return mode; @@ -117,13 +117,13 @@ notrace static noinline int do_monotonic(struct timespec *ts) int mode; do { - seq = read_seqbegin(>od->lock); + seq = read_seqcount_begin(>od->seq); mode = gtod->clock.vclock_mode; secs = gtod->wall_time_sec; ns = gtod->wall_time_nsec + vgetns(); secs += gtod->wall_to_monotonic.tv_sec; ns += gtod->wall_to_monotonic.tv_nsec; - } while (unlikely(read_seqretry(>od->lock, seq))); + } while (unlikely(read_seqcount_retry(>od->seq, seq))); /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec * are all guaranteed to be nonnegative. @@ -142,10 +142,10 @@ notrace static noinline int do_realtime_coarse(struct timespec *ts) { unsigned long seq; do { - seq = read_seqbegin(>od->lock); + seq = read_seqcount_begin(>od->seq); ts->tv_sec = gtod->wall_time_coarse.tv_sec; ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; - } while (unlikely(read_seqretry(>od->lock, seq))); + } while (unlikely(read_seqcount_retry(>od->seq, seq))); return 0; } @@ -153,12 +153,12 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts) { unsigned long seq, ns, secs; do { - seq = read_seqbegin(>od->lock); + seq = read_seqcount_begin(>od->seq); secs = gtod->wall_time_coarse.tv_sec; ns = gtod->wall_time_coarse.tv_nsec; secs += gtod->wall_to_monotonic.tv_sec; ns += gtod->wall_to_monotonic.tv_nsec; - } while (unlikely(read_seqretry(>od->lock, seq))); + } while (unlikely(read_seqcount_retry(>od->seq, seq))); /* wall_time_nsec and wall_to_monotonic.tv_nsec are * guaranteed to be between 0 and NSEC_PER_SEC. -- cgit v1.2.3-70-g09d2 From b74f05d61b73af584d0c39121980171389ecfaaa Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 13 Feb 2012 11:07:27 -0200 Subject: x86: kvmclock: abstract save/restore sched_clock_state Upon resume from hibernation, CPU 0's hvclock area contains the old values for system_time and tsc_timestamp. It is necessary for the hypervisor to update these values with uptodate ones before the CPU uses them. Abstract TSC's save/restore sched_clock_state functions and use restore_state to write to KVM_SYSTEM_TIME MSR, forcing an update. Also move restore_sched_clock_state before __restore_processor_state, since the later calls CONFIG_LOCK_STAT's lockstat_clock (also for TSC). Thanks to Igor Mammedov for tracking it down. Fixes suspend-to-disk with kvmclock. Reviewed-by: Thomas Gleixner Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/tsc.h | 4 ++-- arch/x86/include/asm/x86_init.h | 4 ++++ arch/x86/kernel/kvmclock.c | 11 +++++++++++ arch/x86/kernel/tsc.c | 4 ++-- arch/x86/kernel/x86_init.c | 4 +++- arch/x86/power/cpu.c | 4 ++-- 6 files changed, 24 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 15d99153a96..c91e8b9d588 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -61,7 +61,7 @@ extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); extern int notsc_setup(char *); -extern void save_sched_clock_state(void); -extern void restore_sched_clock_state(void); +extern void tsc_save_sched_clock_state(void); +extern void tsc_restore_sched_clock_state(void); #endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 5d0afac2962..baaca8defec 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -162,6 +162,8 @@ struct x86_cpuinit_ops { * @is_untracked_pat_range exclude from PAT logic * @nmi_init enable NMI on cpus * @i8042_detect pre-detect if i8042 controller exists + * @save_sched_clock_state: save state for sched_clock() on suspend + * @restore_sched_clock_state: restore state for sched_clock() on resume */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -173,6 +175,8 @@ struct x86_platform_ops { void (*nmi_init)(void); unsigned char (*get_nmi_reason)(void); int (*i8042_detect)(void); + void (*save_sched_clock_state)(void); + void (*restore_sched_clock_state)(void); }; struct pci_dev; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ca4e735adc5..f8492da65bf 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -136,6 +136,15 @@ int kvm_register_clock(char *txt) return ret; } +static void kvm_save_sched_clock_state(void) +{ +} + +static void kvm_restore_sched_clock_state(void) +{ + kvm_register_clock("primary cpu clock, resume"); +} + #ifdef CONFIG_X86_LOCAL_APIC static void __cpuinit kvm_setup_secondary_clock(void) { @@ -195,6 +204,8 @@ void __init kvmclock_init(void) x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif + x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; + x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a62c201c97e..aed2aa1088f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -629,7 +629,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) static unsigned long long cyc2ns_suspend; -void save_sched_clock_state(void) +void tsc_save_sched_clock_state(void) { if (!sched_clock_stable) return; @@ -645,7 +645,7 @@ void save_sched_clock_state(void) * that sched_clock() continues from the point where it was left off during * suspend. */ -void restore_sched_clock_state(void) +void tsc_restore_sched_clock_state(void) { unsigned long long offset; unsigned long flags; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 6f2ec53deed..e9f265fd79a 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -108,7 +108,9 @@ struct x86_platform_ops x86_platform = { .is_untracked_pat_range = is_ISA_range, .nmi_init = default_nmi_init, .get_nmi_reason = default_get_nmi_reason, - .i8042_detect = default_i8042_detect + .i8042_detect = default_i8042_detect, + .save_sched_clock_state = tsc_save_sched_clock_state, + .restore_sched_clock_state = tsc_restore_sched_clock_state, }; EXPORT_SYMBOL_GPL(x86_platform); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index f10c0afa1cb..0e76a281412 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -114,7 +114,7 @@ static void __save_processor_state(struct saved_context *ctxt) void save_processor_state(void) { __save_processor_state(&saved_context); - save_sched_clock_state(); + x86_platform.save_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(save_processor_state); @@ -230,8 +230,8 @@ static void __restore_processor_state(struct saved_context *ctxt) /* Needed by apm.c */ void restore_processor_state(void) { + x86_platform.restore_sched_clock_state(); __restore_processor_state(&saved_context); - restore_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(restore_processor_state); -- cgit v1.2.3-70-g09d2 From a24401bcf4a67c8fe17e649e74eeb09b08b79ef5 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 26 Nov 2011 10:53:39 +0800 Subject: highmem: kill all __kmap_atomic() [swarren@nvidia.com: highmem: Fix ARM build break due to __kmap_atomic rename] Signed-off-by: Stephen Warren Signed-off-by: Cong Wang --- arch/arm/include/asm/highmem.h | 2 +- arch/arm/mm/highmem.c | 4 ++-- arch/frv/include/asm/highmem.h | 2 +- arch/frv/mm/highmem.c | 4 ++-- arch/mips/include/asm/highmem.h | 2 +- arch/mips/mm/highmem.c | 4 ++-- arch/mn10300/include/asm/highmem.h | 2 +- arch/parisc/include/asm/cacheflush.h | 2 +- arch/powerpc/include/asm/highmem.h | 2 +- arch/sparc/include/asm/highmem.h | 2 +- arch/sparc/mm/highmem.c | 4 ++-- arch/tile/include/asm/highmem.h | 2 +- arch/tile/mm/highmem.c | 4 ++-- arch/x86/include/asm/highmem.h | 2 +- arch/x86/mm/highmem_32.c | 4 ++-- include/linux/highmem.h | 11 +++-------- 16 files changed, 24 insertions(+), 29 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index a4edd19dd3d..8c5e828f484 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -57,7 +57,7 @@ static inline void *kmap_high_get(struct page *page) #ifdef CONFIG_HIGHMEM extern void *kmap(struct page *page); extern void kunmap(struct page *page); -extern void *__kmap_atomic(struct page *page); +extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(const void *ptr); diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 807c0573abb..5a21505d755 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -36,7 +36,7 @@ void kunmap(struct page *page) } EXPORT_SYMBOL(kunmap); -void *__kmap_atomic(struct page *page) +void *kmap_atomic(struct page *page) { unsigned int idx; unsigned long vaddr; @@ -81,7 +81,7 @@ void *__kmap_atomic(struct page *page) return (void *)vaddr; } -EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(kmap_atomic); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/frv/include/asm/highmem.h b/arch/frv/include/asm/highmem.h index a8d6565d415..716956a5317 100644 --- a/arch/frv/include/asm/highmem.h +++ b/arch/frv/include/asm/highmem.h @@ -157,7 +157,7 @@ static inline void kunmap_atomic_primary(void *kvaddr, enum km_type type) pagefault_enable(); } -void *__kmap_atomic(struct page *page); +void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); #endif /* !__ASSEMBLY__ */ diff --git a/arch/frv/mm/highmem.c b/arch/frv/mm/highmem.c index fd7fcd4c2e3..31902c9d5be 100644 --- a/arch/frv/mm/highmem.c +++ b/arch/frv/mm/highmem.c @@ -37,7 +37,7 @@ struct page *kmap_atomic_to_page(void *ptr) return virt_to_page(ptr); } -void *__kmap_atomic(struct page *page) +void *kmap_atomic(struct page *page) { unsigned long paddr; int type; @@ -64,7 +64,7 @@ void *__kmap_atomic(struct page *page) return NULL; } } -EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(kmap_atomic); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 77e644082a3..2d91888c9b7 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -47,7 +47,7 @@ extern void kunmap_high(struct page *page); extern void *kmap(struct page *page); extern void kunmap(struct page *page); -extern void *__kmap_atomic(struct page *page); +extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(void *ptr); diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 3634c7ea06a..aff57057a94 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -41,7 +41,7 @@ EXPORT_SYMBOL(kunmap); * kmaps are appropriate for short, tight code paths only. */ -void *__kmap_atomic(struct page *page) +void *kmap_atomic(struct page *page) { unsigned long vaddr; int idx, type; @@ -62,7 +62,7 @@ void *__kmap_atomic(struct page *page) return (void*) vaddr; } -EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(kmap_atomic); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h index bfe2d88604d..7c137cd8aa3 100644 --- a/arch/mn10300/include/asm/highmem.h +++ b/arch/mn10300/include/asm/highmem.h @@ -70,7 +70,7 @@ static inline void kunmap(struct page *page) * be used in IRQ contexts, so in some (very limited) cases we need * it. */ -static inline unsigned long __kmap_atomic(struct page *page) +static inline unsigned long kmap_atomic(struct page *page) { unsigned long vaddr; int idx, type; diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index da601dd34c0..9f21ab0c02e 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -140,7 +140,7 @@ static inline void *kmap(struct page *page) #define kunmap(page) kunmap_parisc(page_address(page)) -static inline void *__kmap_atomic(struct page *page) +static inline void *kmap_atomic(struct page *page) { pagefault_disable(); return page_address(page); diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index dbc264010d0..caaf6e00630 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -79,7 +79,7 @@ static inline void kunmap(struct page *page) kunmap_high(page); } -static inline void *__kmap_atomic(struct page *page) +static inline void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); } diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 3d7afbb7f4b..3b6e00dd96e 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -70,7 +70,7 @@ static inline void kunmap(struct page *page) kunmap_high(page); } -extern void *__kmap_atomic(struct page *page); +extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern struct page *kmap_atomic_to_page(void *vaddr); diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index 77140a02c86..055c66cf1bf 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -30,7 +30,7 @@ #include #include -void *__kmap_atomic(struct page *page) +void *kmap_atomic(struct page *page) { unsigned long vaddr; long idx, type; @@ -64,7 +64,7 @@ void *__kmap_atomic(struct page *page) return (void*) vaddr; } -EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(kmap_atomic); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/tile/include/asm/highmem.h b/arch/tile/include/asm/highmem.h index b2a6c5de79a..fc8429a31c8 100644 --- a/arch/tile/include/asm/highmem.h +++ b/arch/tile/include/asm/highmem.h @@ -59,7 +59,7 @@ void *kmap_fix_kpte(struct page *page, int finished); /* This macro is used only in map_new_virtual() to map "page". */ #define kmap_prot page_to_kpgprot(page) -void *__kmap_atomic(struct page *page); +void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c index 31dbbd9afe4..ef8e5a62b6e 100644 --- a/arch/tile/mm/highmem.c +++ b/arch/tile/mm/highmem.c @@ -224,12 +224,12 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_prot); -void *__kmap_atomic(struct page *page) +void *kmap_atomic(struct page *page) { /* PAGE_NONE is a magic value that tells us to check immutability. */ return kmap_atomic_prot(page, PAGE_NONE); } -EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(kmap_atomic); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 3bd04022fd0..302a323b3f6 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -61,7 +61,7 @@ void *kmap(struct page *page); void kunmap(struct page *page); void *kmap_atomic_prot(struct page *page, pgprot_t prot); -void *__kmap_atomic(struct page *page); +void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index f4f29b19fac..6f31ee56c00 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -51,11 +51,11 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_prot); -void *__kmap_atomic(struct page *page) +void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); } -EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(kmap_atomic); /* * This is the same as kmap_atomic() but can map memory that doesn't diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 284ec5535f3..6549ed75e0a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -55,12 +55,12 @@ static inline void kunmap(struct page *page) { } -static inline void *__kmap_atomic(struct page *page) +static inline void *kmap_atomic(struct page *page) { pagefault_disable(); return page_address(page); } -#define kmap_atomic_prot(page, prot) __kmap_atomic(page) +#define kmap_atomic_prot(page, prot) kmap_atomic(page) static inline void __kunmap_atomic(void *addr) { @@ -121,15 +121,10 @@ static inline void kmap_atomic_idx_pop(void) #define NARG_(_2, _1, n, ...) n #define NARG(...) NARG_(__VA_ARGS__, 2, 1, :) -static inline void *kmap_atomic(struct page *page) -{ - return __kmap_atomic(page); -} - static inline void __deprecated *kmap_atomic_deprecated(struct page *page, enum km_type km) { - return __kmap_atomic(page); + return kmap_atomic(page); } #define kmap_atomic1(...) kmap_atomic(__VA_ARGS__) -- cgit v1.2.3-70-g09d2 From 639077fb69aec8112e5427210a83d0fb192969f0 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 19 Mar 2012 15:16:48 -0500 Subject: kgdb: x86: Return all segment registers also in 64-bit mode Even if the content is always 0, gdb expects us to return also ds, es, fs, and gs while in x86-64 mode. Do this to avoid ugly errors on "info registers". [jason.wessel@windriver.com: adjust NUMREGBYTES for two new regs] Signed-off-by: Jan Kiszka Signed-off-by: Jason Wessel --- arch/x86/include/asm/kgdb.h | 10 +++++++--- arch/x86/kernel/kgdb.c | 6 ++++-- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h index 77e95f54570..332f98c9111 100644 --- a/arch/x86/include/asm/kgdb.h +++ b/arch/x86/include/asm/kgdb.h @@ -64,11 +64,15 @@ enum regnames { GDB_PS, /* 17 */ GDB_CS, /* 18 */ GDB_SS, /* 19 */ + GDB_DS, /* 20 */ + GDB_ES, /* 21 */ + GDB_FS, /* 22 */ + GDB_GS, /* 23 */ }; #define GDB_ORIG_AX 57 -#define DBG_MAX_REG_NUM 20 -/* 17 64 bit regs and 3 32 bit regs */ -#define NUMREGBYTES ((17 * 8) + (3 * 4)) +#define DBG_MAX_REG_NUM 24 +/* 17 64 bit regs and 5 32 bit regs */ +#define NUMREGBYTES ((17 * 8) + (5 * 4)) #endif /* ! CONFIG_X86_32 */ static inline void arch_kgdb_breakpoint(void) diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index faba5771aca..fdc37b3d0ce 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -67,8 +67,6 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { "ss", 4, offsetof(struct pt_regs, ss) }, { "ds", 4, offsetof(struct pt_regs, ds) }, { "es", 4, offsetof(struct pt_regs, es) }, - { "fs", 4, -1 }, - { "gs", 4, -1 }, #else { "ax", 8, offsetof(struct pt_regs, ax) }, { "bx", 8, offsetof(struct pt_regs, bx) }, @@ -90,7 +88,11 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { "flags", 4, offsetof(struct pt_regs, flags) }, { "cs", 4, offsetof(struct pt_regs, cs) }, { "ss", 4, offsetof(struct pt_regs, ss) }, + { "ds", 4, -1 }, + { "es", 4, -1 }, #endif + { "fs", 4, -1 }, + { "gs", 4, -1 }, }; int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) -- cgit v1.2.3-70-g09d2 From b7157acf429e6aef690646ba964b9ebd25049ec2 Mon Sep 17 00:00:00 2001 From: Steffen Persvold Date: Fri, 16 Mar 2012 20:25:35 +0100 Subject: x86/apic: Add separate apic_id_valid() functions for selected apic drivers As suggested by Suresh Siddha and Yinghai Lu: For x2apic pre-enabled systems, apic driver is set already early through early_acpi_boot_init()/early_acpi_process_madt()/ acpi_parse_madt()/default_acpi_madt_oem_check() path so that apic_id_valid() checking will be sufficient during MADT and SRAT parsing. For non-x2apic pre-enabled systems, all apic ids should be less than 255. This allows us to substitute the checks in arch/x86/kernel/acpi/boot.c::acpi_parse_x2apic() and arch/x86/mm/srat.c::acpi_numa_x2apic_affinity_init() with apic->apic_id_valid(). In addition we can avoid feigning the x2apic cpu feature in the NumaChip apic code. The following apic drivers have separate apic_id_valid() functions which will accept x2apic type IDs : x2apic_phys x2apic_cluster x2apic_uv_x apic_numachip Signed-off-by: Steffen Persvold Cc: Suresh Siddha Cc: Daniel J Blueman Cc: Yinghai Lu Cc: Jack Steiner Link: http://lkml.kernel.org/r/1331925935-13372-1-git-send-email-sp@numascale.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 2 +- arch/x86/include/asm/x2apic.h | 5 +++++ arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/apic/apic_numachip.c | 3 +-- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- arch/x86/kernel/apic/x2apic_phys.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 7 ++++++- arch/x86/mm/srat.c | 2 +- 8 files changed, 17 insertions(+), 8 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a9371c91718..d3eaac44860 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -535,7 +535,7 @@ static inline unsigned int read_apic_id(void) static inline int default_apic_id_valid(int apicid) { - return x2apic_mode || (apicid < 255); + return (apicid < 255); } extern void default_setup_apic_routing(void); diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h index 6bf5b8e478c..92e54abf89e 100644 --- a/arch/x86/include/asm/x2apic.h +++ b/arch/x86/include/asm/x2apic.h @@ -18,6 +18,11 @@ static const struct cpumask *x2apic_target_cpus(void) return cpu_online_mask; } +static int x2apic_apic_id_valid(int apicid) +{ + return 1; +} + static int x2apic_apic_id_registered(void) { return 1; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 406ed77216d..0f42c2f4431 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -239,7 +239,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. */ - if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled) + if (!apic->apic_id_valid(apic_id) && enabled) printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); else acpi_register_lapic(apic_id, enabled); diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index d9ea5f331ac..899803e0321 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -229,11 +229,10 @@ static int __init numachip_system_init(void) } early_initcall(numachip_system_init); -static int __cpuinit numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strncmp(oem_id, "NUMASC", 6)) { numachip_system = 1; - setup_force_cpu_cap(X86_FEATURE_X2APIC); return 1; } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 9193713060a..48f3103b3c9 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -213,7 +213,7 @@ static struct apic apic_x2apic_cluster = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, + .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_LowestPrio, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index bcd1db6eaca..8a778db45e3 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -119,7 +119,7 @@ static struct apic apic_x2apic_phys = { .name = "physical x2apic", .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, + .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index fc477142585..87bfa69e216 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -266,6 +266,11 @@ static void uv_send_IPI_all(int vector) uv_send_IPI_mask(cpu_online_mask, vector); } +static int uv_apic_id_valid(int apicid) +{ + return 1; +} + static int uv_apic_id_registered(void) { return 1; @@ -351,7 +356,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .name = "UV large system", .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, + .apic_id_valid = uv_apic_id_valid, .apic_id_registered = uv_apic_id_registered, .irq_delivery_mode = dest_Fixed, diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 1c1c4f46a7c..efb5b4b9371 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -70,7 +70,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) return; pxm = pa->proximity_domain; apic_id = pa->apic_id; - if (!cpu_has_x2apic && (apic_id >= 0xff)) { + if (!apic->apic_id_valid(apic_id)) { printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n", pxm, apic_id); return; -- cgit v1.2.3-70-g09d2 From 91ec87d57fc38c529034e853687dfb7756de5406 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 22 Mar 2012 21:15:51 -0700 Subject: x86-64: Simplify and optimize vdso clock_gettime monotonic variants We used to store the wall-to-monotonic offset and the realtime base. It's faster to precompute the monotonic base. This is about a 3% speedup on Sandy Bridge for CLOCK_MONOTONIC. It's much more impressive for CLOCK_MONOTONIC_COARSE. Signed-off-by: Andy Lutomirski Signed-off-by: John Stultz --- arch/x86/include/asm/vgtod.h | 15 +++++++++------ arch/x86/kernel/vsyscall_64.c | 10 +++++++++- arch/x86/vdso/vclock_gettime.c | 38 ++++++++------------------------------ 3 files changed, 26 insertions(+), 37 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 1f007178c81..8b38be2de9e 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -7,11 +7,6 @@ struct vsyscall_gtod_data { seqcount_t seq; - /* open coded 'struct timespec' */ - time_t wall_time_sec; - u32 wall_time_nsec; - - struct timezone sys_tz; struct { /* extract of a clocksource struct */ int vclock_mode; cycle_t cycle_last; @@ -19,8 +14,16 @@ struct vsyscall_gtod_data { u32 mult; u32 shift; } clock; - struct timespec wall_to_monotonic; + + /* open coded 'struct timespec' */ + time_t wall_time_sec; + u32 wall_time_nsec; + u32 monotonic_time_nsec; + time_t monotonic_time_sec; + + struct timezone sys_tz; struct timespec wall_time_coarse; + struct timespec monotonic_time_coarse; }; extern struct vsyscall_gtod_data vsyscall_gtod_data; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index cdc95a707cd..4285f1f404c 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -84,6 +84,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, struct clocksource *clock, u32 mult) { write_seqcount_begin(&vsyscall_gtod_data.seq); + struct timespec monotonic; /* copy vsyscall data */ vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; @@ -91,10 +92,17 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, vsyscall_gtod_data.clock.mask = clock->mask; vsyscall_gtod_data.clock.mult = mult; vsyscall_gtod_data.clock.shift = clock->shift; + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = *wtm; + + monotonic = timespec_add(*wall_time, *wtm); + vsyscall_gtod_data.monotonic_time_sec = monotonic.tv_sec; + vsyscall_gtod_data.monotonic_time_nsec = monotonic.tv_nsec; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + vsyscall_gtod_data.monotonic_time_coarse = + timespec_add(vsyscall_gtod_data.wall_time_coarse, *wtm); write_seqcount_end(&vsyscall_gtod_data.seq); } diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 944c5e5d6b6..6eea70b8f38 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -113,27 +113,17 @@ notrace static noinline int do_realtime(struct timespec *ts) notrace static noinline int do_monotonic(struct timespec *ts) { - unsigned long seq, ns, secs; + unsigned long seq, ns; int mode; do { seq = read_seqcount_begin(>od->seq); mode = gtod->clock.vclock_mode; - secs = gtod->wall_time_sec; - ns = gtod->wall_time_nsec + vgetns(); - secs += gtod->wall_to_monotonic.tv_sec; - ns += gtod->wall_to_monotonic.tv_nsec; + ts->tv_sec = gtod->monotonic_time_sec; + ts->tv_nsec = gtod->monotonic_time_nsec; + ns = vgetns(); } while (unlikely(read_seqcount_retry(>od->seq, seq))); - - /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec - * are all guaranteed to be nonnegative. - */ - while (ns >= NSEC_PER_SEC) { - ns -= NSEC_PER_SEC; - ++secs; - } - ts->tv_sec = secs; - ts->tv_nsec = ns; + timespec_add_ns(ts, ns); return mode; } @@ -151,25 +141,13 @@ notrace static noinline int do_realtime_coarse(struct timespec *ts) notrace static noinline int do_monotonic_coarse(struct timespec *ts) { - unsigned long seq, ns, secs; + unsigned long seq; do { seq = read_seqcount_begin(>od->seq); - secs = gtod->wall_time_coarse.tv_sec; - ns = gtod->wall_time_coarse.tv_nsec; - secs += gtod->wall_to_monotonic.tv_sec; - ns += gtod->wall_to_monotonic.tv_nsec; + ts->tv_sec = gtod->monotonic_time_coarse.tv_sec; + ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec; } while (unlikely(read_seqcount_retry(>od->seq, seq))); - /* wall_time_nsec and wall_to_monotonic.tv_nsec are - * guaranteed to be between 0 and NSEC_PER_SEC. - */ - if (ns >= NSEC_PER_SEC) { - ns -= NSEC_PER_SEC; - ++secs; - } - ts->tv_sec = secs; - ts->tv_nsec = ns; - return 0; } -- cgit v1.2.3-70-g09d2 From 90e240142bd31ff10aeda5a280a53153f4eff004 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 25 Mar 2012 23:00:04 +0200 Subject: x86: Merge the x86_32 and x86_64 cpu_idle() functions Both functions are mostly identical. The differences are: - x86_32's cpu_idle() makes use of check_pgt_cache(), which is a nop on both x86_32 and x86_64. - x86_64's cpu_idle() uses enter/__exit_idle/(), on x86_32 these function are a nop. - In contrast to x86_32, x86_64 calls rcu_idle_enter/exit() in the innermost loop because idle notifications need RCU. Calling these function on x86_32 also in the innermost loop does not hurt. So we can merge both functions. Signed-off-by: Richard Weinberger Acked-by: Frederic Weisbecker Cc: paulmck@linux.vnet.ibm.com Cc: josh@joshtriplett.org Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1332709204-22496-1-git-send-email-richard@nod.at Signed-off-by: Ingo Molnar --- arch/x86/include/asm/idle.h | 1 + arch/x86/kernel/process.c | 114 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/process_32.c | 58 ---------------------- arch/x86/kernel/process_64.c | 107 ---------------------------------------- 4 files changed, 115 insertions(+), 165 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h index f49253d7571..c5d1785373e 100644 --- a/arch/x86/include/asm/idle.h +++ b/arch/x86/include/asm/idle.h @@ -14,6 +14,7 @@ void exit_idle(void); #else /* !CONFIG_X86_64 */ static inline void enter_idle(void) { } static inline void exit_idle(void) { } +static inline void __exit_idle(void) { } #endif /* CONFIG_X86_64 */ void amd_e400_remove_cpu(int cpu); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 14baf78d5a1..29309c42b9e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -12,6 +12,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -23,6 +26,24 @@ #include #include #include +#include + +#ifdef CONFIG_X86_64 +static DEFINE_PER_CPU(unsigned char, is_idle); +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); +#endif struct kmem_cache *task_xstate_cachep; EXPORT_SYMBOL_GPL(task_xstate_cachep); @@ -371,6 +392,99 @@ static inline int hlt_use_halt(void) } #endif +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} +#endif + +#ifdef CONFIG_X86_64 +void enter_idle(void) +{ + percpu_write(is_idle, 1); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); +} + +static void __exit_idle(void) +{ + if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) + return; + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + +/* Called from interrupts to signify idle end */ +void exit_idle(void) +{ + /* idle loop has pid 0 */ + if (current->pid) + return; + __exit_idle(); +} +#endif + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle(void) +{ + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. CPU0 already has it initialized but no harm in + * doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); + current_thread_info()->status |= TS_POLLING; + + while (1) { + tick_nohz_idle_enter(); + + while (!need_resched()) { + rmb(); + + if (cpu_is_offline(smp_processor_id())) + play_dead(); + + /* + * Idle routines should keep interrupts disabled + * from here on, until they go to idle. + * Otherwise, idle callbacks can misfire. + */ + local_touch_nmi(); + local_irq_disable(); + + enter_idle(); + + /* Don't trace irqs off for idle */ + stop_critical_timings(); + + /* enter_idle() needs rcu for notifiers */ + rcu_idle_enter(); + + if (cpuidle_idle_call()) + pm_idle(); + + rcu_idle_exit(); + start_critical_timings(); + + /* In many cases the interrupt that ended idle + has already called exit_idle. But some idle + loops can be woken up without interrupt. */ + __exit_idle(); + } + + tick_nohz_idle_exit(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } +} + /* * We use this if we don't have any better * idle routine.. diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9d7d4842bfa..ea207c245aa 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -9,7 +9,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include #include #include #include @@ -31,14 +30,12 @@ #include #include #include -#include #include #include #include #include #include #include -#include #include #include @@ -58,7 +55,6 @@ #include #include #include -#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -70,60 +66,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return ((unsigned long *)tsk->thread.sp)[3]; } -#ifndef CONFIG_SMP -static inline void play_dead(void) -{ - BUG(); -} -#endif - -/* - * The idle thread. There's no useful work to be - * done, so just try to conserve power and have a - * low exit latency (ie sit in a loop waiting for - * somebody to say that they'd like to reschedule) - */ -void cpu_idle(void) -{ - int cpu = smp_processor_id(); - - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. CPU0 already has it initialized but no harm in - * doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). - */ - boot_init_stack_canary(); - - current_thread_info()->status |= TS_POLLING; - - /* endless idle loop with no priority at all */ - while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); - while (!need_resched()) { - - check_pgt_cache(); - rmb(); - - if (cpu_is_offline(cpu)) - play_dead(); - - local_touch_nmi(); - local_irq_disable(); - /* Don't trace irqs off for idle */ - stop_critical_timings(); - if (cpuidle_idle_call()) - pm_idle(); - start_critical_timings(); - } - rcu_idle_exit(); - tick_nohz_idle_exit(); - schedule_preempt_disabled(); - } -} - void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 292da13fc5a..ce5e34f2bec 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -14,7 +14,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include #include #include #include @@ -32,12 +31,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include @@ -52,114 +49,10 @@ #include #include #include -#include asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); -static DEFINE_PER_CPU(unsigned char, is_idle); - -static ATOMIC_NOTIFIER_HEAD(idle_notifier); - -void idle_notifier_register(struct notifier_block *n) -{ - atomic_notifier_chain_register(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_unregister); - -void enter_idle(void) -{ - percpu_write(is_idle, 1); - atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); -} - -static void __exit_idle(void) -{ - if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) - return; - atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); -} - -/* Called from interrupts to signify idle end */ -void exit_idle(void) -{ - /* idle loop has pid 0 */ - if (current->pid) - return; - __exit_idle(); -} - -#ifndef CONFIG_SMP -static inline void play_dead(void) -{ - BUG(); -} -#endif - -/* - * The idle thread. There's no useful work to be - * done, so just try to conserve power and have a - * low exit latency (ie sit in a loop waiting for - * somebody to say that they'd like to reschedule) - */ -void cpu_idle(void) -{ - current_thread_info()->status |= TS_POLLING; - - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. CPU0 already has it initialized but no harm in - * doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). - */ - boot_init_stack_canary(); - - /* endless idle loop with no priority at all */ - while (1) { - tick_nohz_idle_enter(); - while (!need_resched()) { - - rmb(); - - if (cpu_is_offline(smp_processor_id())) - play_dead(); - /* - * Idle routines should keep interrupts disabled - * from here on, until they go to idle. - * Otherwise, idle callbacks can misfire. - */ - local_touch_nmi(); - local_irq_disable(); - enter_idle(); - /* Don't trace irqs off for idle */ - stop_critical_timings(); - - /* enter_idle() needs rcu for notifiers */ - rcu_idle_enter(); - - if (cpuidle_idle_call()) - pm_idle(); - - rcu_idle_exit(); - start_critical_timings(); - - /* In many cases the interrupt that ended idle - has already called exit_idle. But some idle - loops can be woken up without interrupt. */ - __exit_idle(); - } - - tick_nohz_idle_exit(); - schedule_preempt_disabled(); - } -} /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) -- cgit v1.2.3-70-g09d2 From 136d249ef7dbf0fefa292082cc40be1ea864cbd6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 21 Mar 2012 22:58:08 -0400 Subject: x86/ioapic: Add io_apic_ops driver layer to allow interception Xen dom0 needs to paravirtualize IO operations to the IO APIC, so add a io_apic_ops for it to intercept. Do this as ops structure because there's at least some chance that another paravirtualized environment may want to intercept these. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Suresh Siddha Cc: jwboyer@redhat.com Cc: yinghai@kernel.org Link: http://lkml.kernel.org/r/1332385090-18056-2-git-send-email-konrad.wilk@oracle.com [ Made all the affected code easier on the eyes ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io_apic.h | 9 +++++++ arch/x86/kernel/apic/io_apic.c | 58 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 60 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 690d1cc9a87..2c4943de515 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -21,6 +21,15 @@ #define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15) #define IO_APIC_REDIR_MASKED (1 << 16) +struct io_apic_ops { + void (*init) (void); + unsigned int (*read) (unsigned int apic, unsigned int reg); + void (*write) (unsigned int apic, unsigned int reg, unsigned int value); + void (*modify)(unsigned int apic, unsigned int reg, unsigned int value); +}; + +void __init set_io_apic_ops(const struct io_apic_ops *); + /* * The structure of the IO-APIC: */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2c428c5d7ca..e88300d8e80 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -64,9 +64,28 @@ #include #define __apicdebuginit(type) static type __init + #define for_each_irq_pin(entry, head) \ for (entry = head; entry; entry = entry->next) +static void __init __ioapic_init_mappings(void); + +static unsigned int __io_apic_read (unsigned int apic, unsigned int reg); +static void __io_apic_write (unsigned int apic, unsigned int reg, unsigned int val); +static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); + +static struct io_apic_ops io_apic_ops = { + .init = __ioapic_init_mappings, + .read = __io_apic_read, + .write = __io_apic_write, + .modify = __io_apic_modify, +}; + +void __init set_io_apic_ops(const struct io_apic_ops *ops) +{ + io_apic_ops = *ops; +} + /* * Is the SiS APIC rmw bug present ? * -1 = don't know, 0 = no, 1 = yes @@ -294,6 +313,22 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg) irq_free_desc(at); } +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ + return io_apic_ops.read(apic, reg); +} + +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + io_apic_ops.write(apic, reg, value); +} + +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +{ + io_apic_ops.modify(apic, reg, value); +} + + struct io_apic { unsigned int index; unsigned int unused[3]; @@ -314,16 +349,17 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector) writel(vector, &io_apic->eoi); } -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +static unsigned int __io_apic_read(unsigned int apic, unsigned int reg) { struct io_apic __iomem *io_apic = io_apic_base(apic); writel(reg, &io_apic->index); return readl(&io_apic->data); } -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); writel(value, &io_apic->data); } @@ -334,7 +370,7 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i * * Older SiS APIC requires we rewrite the index register */ -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -377,6 +413,7 @@ static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + return eu.entry; } @@ -384,9 +421,11 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { union entry_union eu; unsigned long flags; + raw_spin_lock_irqsave(&ioapic_lock, flags); eu.entry = __ioapic_read_entry(apic, pin); raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; } @@ -396,8 +435,7 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) * the interrupt, and we need to make sure the entry is fully populated * before that happens. */ -static void -__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { union entry_union eu = {{0, 0}}; @@ -409,6 +447,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { unsigned long flags; + raw_spin_lock_irqsave(&ioapic_lock, flags); __ioapic_write_entry(apic, pin, e); raw_spin_unlock_irqrestore(&ioapic_lock, flags); @@ -435,8 +474,7 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static int -__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list **last, *entry; @@ -521,6 +559,7 @@ static void io_apic_sync(struct irq_pin_list *entry) * a dummy read from the IO-APIC */ struct io_apic __iomem *io_apic; + io_apic = io_apic_base(entry->apic); readl(&io_apic->data); } @@ -3893,6 +3932,11 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) } void __init ioapic_and_gsi_init(void) +{ + io_apic_ops.init(); +} + +static void __init __ioapic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; -- cgit v1.2.3-70-g09d2 From baa676fcf8d555269bd0a5a2496782beee55824d Mon Sep 17 00:00:00 2001 From: Andrzej Pietrasiewicz Date: Tue, 27 Mar 2012 14:28:18 +0200 Subject: X86 & IA64: adapt for dma_map_ops changes Adapt core x86 and IA64 architecture code for dma_map_ops changes: replace alloc/free_coherent with generic alloc/free methods. Signed-off-by: Andrzej Pietrasiewicz Acked-by: Kyungmin Park [removed swiotlb related changes and replaced it with wrappers, merged with IA64 patch to avoid inter-patch dependences in intel-iommu code] Signed-off-by: Marek Szyprowski Reviewed-by: Arnd Bergmann Acked-by: Tony Luck --- arch/ia64/hp/common/sba_iommu.c | 11 ++++++----- arch/ia64/include/asm/dma-mapping.h | 18 ++++++++++++------ arch/ia64/kernel/pci-swiotlb.c | 14 +++++++++++--- arch/ia64/sn/pci/pci_dma.c | 9 +++++---- arch/x86/include/asm/dma-mapping.h | 26 ++++++++++++++++---------- arch/x86/kernel/amd_gart_64.c | 11 ++++++----- arch/x86/kernel/pci-calgary_64.c | 9 +++++---- arch/x86/kernel/pci-dma.c | 3 ++- arch/x86/kernel/pci-nommu.c | 6 +++--- arch/x86/kernel/pci-swiotlb.c | 17 +++++++++++++---- arch/x86/xen/pci-swiotlb-xen.c | 4 ++-- drivers/iommu/amd_iommu.c | 10 ++++++---- drivers/iommu/intel-iommu.c | 9 +++++---- drivers/xen/swiotlb-xen.c | 5 +++-- include/xen/swiotlb-xen.h | 6 ++++-- 15 files changed, 99 insertions(+), 59 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index f5f4ef149aa..e5eb9c4b219 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1130,7 +1130,8 @@ void sba_unmap_single_attrs(struct device *dev, dma_addr_t iova, size_t size, * See Documentation/DMA-API-HOWTO.txt */ static void * -sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flags) +sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t flags, struct dma_attrs *attrs) { struct ioc *ioc; void *addr; @@ -1192,8 +1193,8 @@ sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp * * See Documentation/DMA-API-HOWTO.txt */ -static void sba_free_coherent (struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle) +static void sba_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct dma_attrs *attrs) { sba_unmap_single_attrs(dev, dma_handle, size, 0, NULL); free_pages((unsigned long) vaddr, get_order(size)); @@ -2213,8 +2214,8 @@ sba_page_override(char *str) __setup("sbapagesize=",sba_page_override); struct dma_map_ops sba_dma_ops = { - .alloc_coherent = sba_alloc_coherent, - .free_coherent = sba_free_coherent, + .alloc = sba_alloc_coherent, + .free = sba_free_coherent, .map_page = sba_map_page, .unmap_page = sba_unmap_page, .map_sg = sba_map_sg_attrs, diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index 4336d080b24..4f5e8148440 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -23,23 +23,29 @@ extern void machvec_dma_sync_single(struct device *, dma_addr_t, size_t, extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int, enum dma_data_direction); -static inline void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *daddr, gfp_t gfp) +#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) + +static inline void *dma_alloc_attrs(struct device *dev, size_t size, + dma_addr_t *daddr, gfp_t gfp, + struct dma_attrs *attrs) { struct dma_map_ops *ops = platform_dma_get_ops(dev); void *caddr; - caddr = ops->alloc_coherent(dev, size, daddr, gfp); + caddr = ops->alloc(dev, size, daddr, gfp, attrs); debug_dma_alloc_coherent(dev, size, *daddr, caddr); return caddr; } -static inline void dma_free_coherent(struct device *dev, size_t size, - void *caddr, dma_addr_t daddr) +#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL) + +static inline void dma_free_attrs(struct device *dev, size_t size, + void *caddr, dma_addr_t daddr, + struct dma_attrs *attrs) { struct dma_map_ops *ops = platform_dma_get_ops(dev); debug_dma_free_coherent(dev, size, caddr, daddr); - ops->free_coherent(dev, size, caddr, daddr); + ops->free(dev, size, caddr, daddr, attrs); } #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c index d9485d952ed..939260aeac9 100644 --- a/arch/ia64/kernel/pci-swiotlb.c +++ b/arch/ia64/kernel/pci-swiotlb.c @@ -15,16 +15,24 @@ int swiotlb __read_mostly; EXPORT_SYMBOL(swiotlb); static void *ia64_swiotlb_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) + dma_addr_t *dma_handle, gfp_t gfp, + struct dma_attrs *attrs) { if (dev->coherent_dma_mask != DMA_BIT_MASK(64)) gfp |= GFP_DMA; return swiotlb_alloc_coherent(dev, size, dma_handle, gfp); } +static void ia64_swiotlb_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr, + struct dma_attrs *attrs) +{ + swiotlb_free_coherent(dev, size, vaddr, dma_addr); +} + struct dma_map_ops swiotlb_dma_ops = { - .alloc_coherent = ia64_swiotlb_alloc_coherent, - .free_coherent = swiotlb_free_coherent, + .alloc = ia64_swiotlb_alloc_coherent, + .free = ia64_swiotlb_free_coherent, .map_page = swiotlb_map_page, .unmap_page = swiotlb_unmap_page, .map_sg = swiotlb_map_sg_attrs, diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c index a9d310de57d..3290d6e00c3 100644 --- a/arch/ia64/sn/pci/pci_dma.c +++ b/arch/ia64/sn/pci/pci_dma.c @@ -76,7 +76,8 @@ EXPORT_SYMBOL(sn_dma_set_mask); * more information. */ static void *sn_dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t * dma_handle, gfp_t flags) + dma_addr_t * dma_handle, gfp_t flags, + struct dma_attrs *attrs) { void *cpuaddr; unsigned long phys_addr; @@ -137,7 +138,7 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t size, * any associated IOMMU mappings. */ static void sn_dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_handle) + dma_addr_t dma_handle, struct dma_attrs *attrs) { struct pci_dev *pdev = to_pci_dev(dev); struct sn_pcibus_provider *provider = SN_PCIDEV_BUSPROVIDER(pdev); @@ -466,8 +467,8 @@ int sn_pci_legacy_write(struct pci_bus *bus, u16 port, u32 val, u8 size) } static struct dma_map_ops sn_dma_ops = { - .alloc_coherent = sn_dma_alloc_coherent, - .free_coherent = sn_dma_free_coherent, + .alloc = sn_dma_alloc_coherent, + .free = sn_dma_free_coherent, .map_page = sn_dma_map_page, .unmap_page = sn_dma_unmap_page, .map_sg = sn_dma_map_sg, diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index ed3065fd631..4b4331d7193 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -59,7 +59,8 @@ extern int dma_supported(struct device *hwdev, u64 mask); extern int dma_set_mask(struct device *dev, u64 mask); extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag); + dma_addr_t *dma_addr, gfp_t flag, + struct dma_attrs *attrs); static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { @@ -111,9 +112,11 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp) return gfp; } +#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) + static inline void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp) +dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, struct dma_attrs *attrs) { struct dma_map_ops *ops = get_dma_ops(dev); void *memory; @@ -129,18 +132,21 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, if (!is_device_dma_capable(dev)) return NULL; - if (!ops->alloc_coherent) + if (!ops->alloc) return NULL; - memory = ops->alloc_coherent(dev, size, dma_handle, - dma_alloc_coherent_gfp_flags(dev, gfp)); + memory = ops->alloc(dev, size, dma_handle, + dma_alloc_coherent_gfp_flags(dev, gfp), attrs); debug_dma_alloc_coherent(dev, size, *dma_handle, memory); return memory; } -static inline void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus) +#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL) + +static inline void dma_free_attrs(struct device *dev, size_t size, + void *vaddr, dma_addr_t bus, + struct dma_attrs *attrs) { struct dma_map_ops *ops = get_dma_ops(dev); @@ -150,8 +156,8 @@ static inline void dma_free_coherent(struct device *dev, size_t size, return; debug_dma_free_coherent(dev, size, vaddr, bus); - if (ops->free_coherent) - ops->free_coherent(dev, size, vaddr, bus); + if (ops->free) + ops->free(dev, size, vaddr, bus, attrs); } #endif diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index b1e7c7f7a0a..e66311200cb 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -477,7 +477,7 @@ error: /* allocate and map a coherent mapping */ static void * gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, - gfp_t flag) + gfp_t flag, struct dma_attrs *attrs) { dma_addr_t paddr; unsigned long align_mask; @@ -500,7 +500,8 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, } __free_pages(page, get_order(size)); } else - return dma_generic_alloc_coherent(dev, size, dma_addr, flag); + return dma_generic_alloc_coherent(dev, size, dma_addr, flag, + attrs); return NULL; } @@ -508,7 +509,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, /* free a coherent mapping */ static void gart_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr) + dma_addr_t dma_addr, struct dma_attrs *attrs) { gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); free_pages((unsigned long)vaddr, get_order(size)); @@ -700,8 +701,8 @@ static struct dma_map_ops gart_dma_ops = { .unmap_sg = gart_unmap_sg, .map_page = gart_map_page, .unmap_page = gart_unmap_page, - .alloc_coherent = gart_alloc_coherent, - .free_coherent = gart_free_coherent, + .alloc = gart_alloc_coherent, + .free = gart_free_coherent, .mapping_error = gart_mapping_error, }; diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 726494b5834..07b587c5a2d 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -431,7 +431,7 @@ static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr, } static void* calgary_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag) + dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs) { void *ret = NULL; dma_addr_t mapping; @@ -464,7 +464,8 @@ error: } static void calgary_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) + void *vaddr, dma_addr_t dma_handle, + struct dma_attrs *attrs) { unsigned int npages; struct iommu_table *tbl = find_iommu_table(dev); @@ -477,8 +478,8 @@ static void calgary_free_coherent(struct device *dev, size_t size, } static struct dma_map_ops calgary_dma_ops = { - .alloc_coherent = calgary_alloc_coherent, - .free_coherent = calgary_free_coherent, + .alloc = calgary_alloc_coherent, + .free = calgary_free_coherent, .map_sg = calgary_map_sg, .unmap_sg = calgary_unmap_sg, .map_page = calgary_map_page, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1c4d769e21e..75e1cc19e63 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -96,7 +96,8 @@ void __init pci_iommu_alloc(void) } } void *dma_generic_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag) + dma_addr_t *dma_addr, gfp_t flag, + struct dma_attrs *attrs) { unsigned long dma_mask; struct page *page; diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 3af4af810c0..f96050685b4 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -75,7 +75,7 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, } static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr) + dma_addr_t dma_addr, struct dma_attrs *attrs) { free_pages((unsigned long)vaddr, get_order(size)); } @@ -96,8 +96,8 @@ static void nommu_sync_sg_for_device(struct device *dev, } struct dma_map_ops nommu_dma_ops = { - .alloc_coherent = dma_generic_alloc_coherent, - .free_coherent = nommu_free_coherent, + .alloc = dma_generic_alloc_coherent, + .free = nommu_free_coherent, .map_sg = nommu_map_sg, .map_page = nommu_map_page, .sync_single_for_device = nommu_sync_single_for_device, diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 8f972cbddef..6c483ba98b9 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -15,21 +15,30 @@ int swiotlb __read_mostly; static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags) + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs) { void *vaddr; - vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags); + vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags, + attrs); if (vaddr) return vaddr; return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); } +static void x86_swiotlb_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr, + struct dma_attrs *attrs) +{ + swiotlb_free_coherent(dev, size, vaddr, dma_addr); +} + static struct dma_map_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, - .alloc_coherent = x86_swiotlb_alloc_coherent, - .free_coherent = swiotlb_free_coherent, + .alloc = x86_swiotlb_alloc_coherent, + .free = x86_swiotlb_free_coherent, .sync_single_for_cpu = swiotlb_sync_single_for_cpu, .sync_single_for_device = swiotlb_sync_single_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index b480d4207a4..967633ad98c 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -12,8 +12,8 @@ int xen_swiotlb __read_mostly; static struct dma_map_ops xen_swiotlb_dma_ops = { .mapping_error = xen_swiotlb_dma_mapping_error, - .alloc_coherent = xen_swiotlb_alloc_coherent, - .free_coherent = xen_swiotlb_free_coherent, + .alloc = xen_swiotlb_alloc_coherent, + .free = xen_swiotlb_free_coherent, .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, .sync_single_for_device = xen_swiotlb_sync_single_for_device, .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index f75e0608be5..daa333f97b7 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2707,7 +2707,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, * The exported alloc_coherent function for dma_ops. */ static void *alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag) + dma_addr_t *dma_addr, gfp_t flag, + struct dma_attrs *attrs) { unsigned long flags; void *virt_addr; @@ -2765,7 +2766,8 @@ out_free: * The exported free_coherent function for dma_ops. */ static void free_coherent(struct device *dev, size_t size, - void *virt_addr, dma_addr_t dma_addr) + void *virt_addr, dma_addr_t dma_addr, + struct dma_attrs *attrs) { unsigned long flags; struct protection_domain *domain; @@ -2846,8 +2848,8 @@ static void prealloc_protection_domains(void) } static struct dma_map_ops amd_iommu_dma_ops = { - .alloc_coherent = alloc_coherent, - .free_coherent = free_coherent, + .alloc = alloc_coherent, + .free = free_coherent, .map_page = map_page, .unmap_page = unmap_page, .map_sg = map_sg, diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index c9c6053198d..e39bfdc055c 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2938,7 +2938,8 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, } static void *intel_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags) + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs) { void *vaddr; int order; @@ -2970,7 +2971,7 @@ static void *intel_alloc_coherent(struct device *hwdev, size_t size, } static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr, - dma_addr_t dma_handle) + dma_addr_t dma_handle, struct dma_attrs *attrs) { int order; @@ -3115,8 +3116,8 @@ static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr) } struct dma_map_ops intel_dma_ops = { - .alloc_coherent = intel_alloc_coherent, - .free_coherent = intel_free_coherent, + .alloc = intel_alloc_coherent, + .free = intel_free_coherent, .map_sg = intel_map_sg, .unmap_sg = intel_unmap_sg, .map_page = intel_map_page, diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 19e6a204137..1afb4fba11b 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -204,7 +204,8 @@ error: void * xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags) + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs) { void *ret; int order = get_order(size); @@ -253,7 +254,7 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent); void xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, - dma_addr_t dev_addr) + dma_addr_t dev_addr, struct dma_attrs *attrs) { int order = get_order(size); phys_addr_t phys; diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h index 2ea2fdc79c1..4f4d449f00f 100644 --- a/include/xen/swiotlb-xen.h +++ b/include/xen/swiotlb-xen.h @@ -7,11 +7,13 @@ extern void xen_swiotlb_init(int verbose); extern void *xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags); + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs); extern void xen_swiotlb_free_coherent(struct device *hwdev, size_t size, - void *vaddr, dma_addr_t dma_handle); + void *vaddr, dma_addr_t dma_handle, + struct dma_attrs *attrs); extern dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, -- cgit v1.2.3-70-g09d2 From f05e798ad4c09255f590f5b2c00a7ca6c172f983 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Mar 2012 18:11:12 +0100 Subject: Disintegrate asm/system.h for X86 Disintegrate asm/system.h for X86. Signed-off-by: David Howells Acked-by: H. Peter Anvin cc: x86@kernel.org --- arch/x86/ia32/ia32_aout.c | 1 - arch/x86/include/asm/apic.h | 1 - arch/x86/include/asm/auxvec.h | 7 + arch/x86/include/asm/barrier.h | 116 +++++++ arch/x86/include/asm/bug.h | 4 + arch/x86/include/asm/cacheflush.h | 1 + arch/x86/include/asm/elf.h | 1 - arch/x86/include/asm/exec.h | 1 + arch/x86/include/asm/futex.h | 1 - arch/x86/include/asm/i387.h | 1 - arch/x86/include/asm/local.h | 1 - arch/x86/include/asm/mc146818rtc.h | 1 - arch/x86/include/asm/processor.h | 31 +- arch/x86/include/asm/segment.h | 58 +++- arch/x86/include/asm/special_insns.h | 199 ++++++++++++ arch/x86/include/asm/stackprotector.h | 1 - arch/x86/include/asm/switch_to.h | 129 ++++++++ arch/x86/include/asm/system.h | 527 +------------------------------ arch/x86/include/asm/tlbflush.h | 2 +- arch/x86/include/asm/virtext.h | 1 - arch/x86/kernel/acpi/cstate.c | 1 + arch/x86/kernel/apm_32.c | 1 - arch/x86/kernel/cpu/mcheck/p5.c | 1 - arch/x86/kernel/cpu/mcheck/therm_throt.c | 1 - arch/x86/kernel/cpu/mcheck/winchip.c | 1 - arch/x86/kernel/cpu/mtrr/generic.c | 1 - arch/x86/kernel/cpuid.c | 1 - arch/x86/kernel/i8259.c | 1 - arch/x86/kernel/irqinit.c | 1 - arch/x86/kernel/kgdb.c | 1 - arch/x86/kernel/ldt.c | 1 - arch/x86/kernel/machine_kexec_32.c | 1 - arch/x86/kernel/mca_32.c | 1 - arch/x86/kernel/module.c | 1 - arch/x86/kernel/msr.c | 1 - arch/x86/kernel/paravirt.c | 1 + arch/x86/kernel/pci-calgary_64.c | 1 - arch/x86/kernel/process.c | 1 - arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/ptrace.c | 1 - arch/x86/kernel/setup.c | 1 - arch/x86/kernel/tce_64.c | 1 + arch/x86/kernel/tls.c | 1 - arch/x86/kernel/traps.c | 1 - arch/x86/mm/init.c | 1 - arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 1 - arch/x86/mm/pgtable_32.c | 1 - arch/x86/power/hibernate_32.c | 1 - 50 files changed, 554 insertions(+), 562 deletions(-) create mode 100644 arch/x86/include/asm/barrier.h create mode 100644 arch/x86/include/asm/exec.h create mode 100644 arch/x86/include/asm/special_insns.h create mode 100644 arch/x86/include/asm/switch_to.h (limited to 'arch/x86/include') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 4c2e59a420b..d511d951a05 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -26,7 +26,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a9371c91718..4b2caeefe1a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #define ARCH_APICTIMER_STOPS_ON_C3 1 diff --git a/arch/x86/include/asm/auxvec.h b/arch/x86/include/asm/auxvec.h index 1316b4c3542..77203ac352d 100644 --- a/arch/x86/include/asm/auxvec.h +++ b/arch/x86/include/asm/auxvec.h @@ -9,4 +9,11 @@ #endif #define AT_SYSINFO_EHDR 33 +/* entries in ARCH_DLINFO: */ +#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) +# define AT_VECTOR_SIZE_ARCH 2 +#else /* else it's non-compat x86-64 */ +# define AT_VECTOR_SIZE_ARCH 1 +#endif + #endif /* _ASM_X86_AUXVEC_H */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h new file mode 100644 index 00000000000..c6cd358a1ee --- /dev/null +++ b/arch/x86/include/asm/barrier.h @@ -0,0 +1,116 @@ +#ifndef _ASM_X86_BARRIER_H +#define _ASM_X86_BARRIER_H + +#include +#include + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. + */ + +#ifdef CONFIG_X86_32 +/* + * Some non-Intel clones support out of order store. wmb() ceases to be a + * nop for these. + */ +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#else +#define mb() asm volatile("mfence":::"memory") +#define rmb() asm volatile("lfence":::"memory") +#define wmb() asm volatile("sfence" ::: "memory") +#endif + +/** + * read_barrier_depends - Flush all pending reads that subsequents reads + * depend on. + * + * No data-dependent reads from memory-like regions are ever reordered + * over this barrier. All reads preceding this primitive are guaranteed + * to access memory (but not necessarily other CPUs' caches) before any + * reads following this primitive that depend on the data return by + * any of the preceding reads. This primitive is much lighter weight than + * rmb() on most CPUs, and is never heavier weight than is + * rmb(). + * + * These ordering constraints are respected by both the local CPU + * and the compiler. + * + * Ordering is not guaranteed by anything other than these primitives, + * not even by data dependencies. See the documentation for + * memory_barrier() for examples and URLs to more information. + * + * For example, the following code would force ordering (the initial + * value of "a" is zero, "b" is one, and "p" is "&a"): + * + * + * CPU 0 CPU 1 + * + * b = 2; + * memory_barrier(); + * p = &b; q = p; + * read_barrier_depends(); + * d = *q; + * + * + * because the read of "*q" depends on the read of "p" and these + * two reads are separated by a read_barrier_depends(). However, + * the following code, with the same initial values for "a" and "b": + * + * + * CPU 0 CPU 1 + * + * a = 2; + * memory_barrier(); + * b = 3; y = b; + * read_barrier_depends(); + * x = a; + * + * + * does not enforce ordering, since there is no data dependency between + * the read of "a" and the read of "b". Therefore, on some CPUs, such + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() + * in cases like this where there are no data dependencies. + **/ + +#define read_barrier_depends() do { } while (0) + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#ifdef CONFIG_X86_PPRO_FENCE +# define smp_rmb() rmb() +#else +# define smp_rmb() barrier() +#endif +#ifdef CONFIG_X86_OOSTORE +# define smp_wmb() wmb() +#else +# define smp_wmb() barrier() +#endif +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while (0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif + +/* + * Stop RDTSC speculation. This is needed when you need to use RDTSC + * (or get_cycles or vread that possibly accesses the TSC) in a defined + * code region. + * + * (Could use an alternative three way for this if there was one.) + */ +static __always_inline void rdtsc_barrier(void) +{ + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); +} + +#endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index f654d1bb17f..11e1152222d 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -36,4 +36,8 @@ do { \ #endif /* !CONFIG_BUG */ #include + + +extern void show_regs_common(void); + #endif /* _ASM_X86_BUG_H */ diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 4e12668711e..9863ee3747d 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -3,6 +3,7 @@ /* Caches aren't brain-dead on the intel. */ #include +#include #ifdef CONFIG_X86_PAT /* diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 5f962df30d0..f27f79abe02 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -84,7 +84,6 @@ extern unsigned int vdso_enabled; (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486)) #include -#include #ifdef CONFIG_X86_32 #include diff --git a/arch/x86/include/asm/exec.h b/arch/x86/include/asm/exec.h new file mode 100644 index 00000000000..54c2e1db274 --- /dev/null +++ b/arch/x86/include/asm/exec.h @@ -0,0 +1 @@ +/* define arch_align_stack() here */ diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index d09bb03653f..71ecbcba1a4 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -9,7 +9,6 @@ #include #include #include -#include #define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \ asm volatile("1:\t" insn "\n" \ diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 7ce0798b1b2..257d9cca214 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -14,7 +14,6 @@ #include #include -#include struct pt_regs; struct user_i387_struct; diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 9cdae5d47e8..c8bed0da434 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -3,7 +3,6 @@ #include -#include #include #include diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index 0e8e85bb7c5..d354fb781c5 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -5,7 +5,6 @@ #define _ASM_X86_MC146818RTC_H #include -#include #include #include diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 95da14f7ee8..78e30ea492b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -14,13 +14,13 @@ struct mm_struct; #include #include #include -#include #include #include #include #include #include #include +#include #include #include @@ -29,6 +29,15 @@ struct mm_struct; #include #include #include +#include + +/* + * We handle most unaligned accesses in hardware. On the other hand + * unaligned DMA can be quite expensive on some Nehalem processors. + * + * Based on this we disable the IP header alignment in network drivers. + */ +#define NET_IP_ALIGN 0 #define HBP_NUM 4 /* @@ -1022,4 +1031,24 @@ extern bool cpu_has_amd_erratum(const int *); #define cpu_has_amd_erratum(x) (false) #endif /* CONFIG_CPU_SUP_AMD */ +#ifdef CONFIG_X86_32 +/* + * disable hlt during certain critical i/o operations + */ +#define HAVE_DISABLE_HLT +#endif + +void disable_hlt(void); +void enable_hlt(void); + +void cpu_idle_wait(void); + +extern unsigned long arch_align_stack(unsigned long sp); +extern void free_init_pages(char *what, unsigned long begin, unsigned long end); + +void default_idle(void); +bool set_pm_idle_to_default(void); + +void stop_this_cpu(void *dummy); + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 5e641715c3f..165466233ab 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -212,7 +212,61 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10]; -#endif -#endif + +/* + * Load a segment. Fall back on loading the zero + * segment if something goes wrong.. + */ +#define loadsegment(seg, value) \ +do { \ + unsigned short __val = (value); \ + \ + asm volatile(" \n" \ + "1: movl %k0,%%" #seg " \n" \ + \ + ".section .fixup,\"ax\" \n" \ + "2: xorl %k0,%k0 \n" \ + " jmp 1b \n" \ + ".previous \n" \ + \ + _ASM_EXTABLE(1b, 2b) \ + \ + : "+r" (__val) : : "memory"); \ +} while (0) + +/* + * Save a segment register away + */ +#define savesegment(seg, value) \ + asm("mov %%" #seg ",%0":"=r" (value) : : "memory") + +/* + * x86_32 user gs accessors. + */ +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_32_LAZY_GS +#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) +#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) +#define task_user_gs(tsk) ((tsk)->thread.gs) +#define lazy_save_gs(v) savesegment(gs, (v)) +#define lazy_load_gs(v) loadsegment(gs, (v)) +#else /* X86_32_LAZY_GS */ +#define get_user_gs(regs) (u16)((regs)->gs) +#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) +#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) +#define lazy_save_gs(v) do { } while (0) +#define lazy_load_gs(v) do { } while (0) +#endif /* X86_32_LAZY_GS */ +#endif /* X86_32 */ + +static inline unsigned long get_limit(unsigned long segment) +{ + unsigned long __limit; + asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); + return __limit + 1; +} + +#endif /* !__ASSEMBLY__ */ +#endif /* __KERNEL__ */ #endif /* _ASM_X86_SEGMENT_H */ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h new file mode 100644 index 00000000000..41fc93a2e22 --- /dev/null +++ b/arch/x86/include/asm/special_insns.h @@ -0,0 +1,199 @@ +#ifndef _ASM_X86_SPECIAL_INSNS_H +#define _ASM_X86_SPECIAL_INSNS_H + + +#ifdef __KERNEL__ + +static inline void native_clts(void) +{ + asm volatile("clts"); +} + +/* + * Volatile isn't enough to prevent the compiler from reordering the + * read/write functions for the control registers and messing everything up. + * A memory clobber would solve the problem, but would prevent reordering of + * all loads stores around it, which can hurt performance. Solution is to + * use a variable and mimic reads and writes to it to enforce serialization + */ +static unsigned long __force_order; + +static inline unsigned long native_read_cr0(void) +{ + unsigned long val; + asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void native_write_cr0(unsigned long val) +{ + asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long native_read_cr2(void) +{ + unsigned long val; + asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void native_write_cr2(unsigned long val) +{ + asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long native_read_cr3(void) +{ + unsigned long val; + asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void native_write_cr3(unsigned long val) +{ + asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long native_read_cr4(void) +{ + unsigned long val; + asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline unsigned long native_read_cr4_safe(void) +{ + unsigned long val; + /* This could fault if %cr4 does not exist. In x86_64, a cr4 always + * exists, so it will never fail. */ +#ifdef CONFIG_X86_32 + asm volatile("1: mov %%cr4, %0\n" + "2:\n" + _ASM_EXTABLE(1b, 2b) + : "=r" (val), "=m" (__force_order) : "0" (0)); +#else + val = native_read_cr4(); +#endif + return val; +} + +static inline void native_write_cr4(unsigned long val) +{ + asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); +} + +#ifdef CONFIG_X86_64 +static inline unsigned long native_read_cr8(void) +{ + unsigned long cr8; + asm volatile("movq %%cr8,%0" : "=r" (cr8)); + return cr8; +} + +static inline void native_write_cr8(unsigned long val) +{ + asm volatile("movq %0,%%cr8" :: "r" (val) : "memory"); +} +#endif + +static inline void native_wbinvd(void) +{ + asm volatile("wbinvd": : :"memory"); +} + +extern void native_load_gs_index(unsigned); + +#ifdef CONFIG_PARAVIRT +#include +#else + +static inline unsigned long read_cr0(void) +{ + return native_read_cr0(); +} + +static inline void write_cr0(unsigned long x) +{ + native_write_cr0(x); +} + +static inline unsigned long read_cr2(void) +{ + return native_read_cr2(); +} + +static inline void write_cr2(unsigned long x) +{ + native_write_cr2(x); +} + +static inline unsigned long read_cr3(void) +{ + return native_read_cr3(); +} + +static inline void write_cr3(unsigned long x) +{ + native_write_cr3(x); +} + +static inline unsigned long read_cr4(void) +{ + return native_read_cr4(); +} + +static inline unsigned long read_cr4_safe(void) +{ + return native_read_cr4_safe(); +} + +static inline void write_cr4(unsigned long x) +{ + native_write_cr4(x); +} + +static inline void wbinvd(void) +{ + native_wbinvd(); +} + +#ifdef CONFIG_X86_64 + +static inline unsigned long read_cr8(void) +{ + return native_read_cr8(); +} + +static inline void write_cr8(unsigned long x) +{ + native_write_cr8(x); +} + +static inline void load_gs_index(unsigned selector) +{ + native_load_gs_index(selector); +} + +#endif + +/* Clear the 'TS' bit */ +static inline void clts(void) +{ + native_clts(); +} + +#endif/* CONFIG_PARAVIRT */ + +#define stts() write_cr0(read_cr0() | X86_CR0_TS) + +static inline void clflush(volatile void *__p) +{ + asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); +} + +#define nop() asm volatile ("nop") + + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_SPECIAL_INSNS_H */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 15751776356..b5d9533d2c3 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -38,7 +38,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h new file mode 100644 index 00000000000..4ec45b3abba --- /dev/null +++ b/arch/x86/include/asm/switch_to.h @@ -0,0 +1,129 @@ +#ifndef _ASM_X86_SWITCH_TO_H +#define _ASM_X86_SWITCH_TO_H + +struct task_struct; /* one of the stranger aspects of C forward declarations */ +struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); +struct tss_struct; +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss); + +#ifdef CONFIG_X86_32 + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movl %P[task_canary](%[next]), %%ebx\n\t" \ + "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" +#define __switch_canary_oparam \ + , [stack_canary] "=m" (stack_canary.canary) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + +/* + * Saving eflags is important. It switches not only IOPL between tasks, + * it also protects other tasks from NT leaking through sysenter etc. + */ +#define switch_to(prev, next, last) \ +do { \ + /* \ + * Context-switching clobbers all registers, so we clobber \ + * them explicitly, via unused output variables. \ + * (EAX and EBP is not listed because EBP is saved/restored \ + * explicitly for wchan access and EAX is the return value of \ + * __switch_to()) \ + */ \ + unsigned long ebx, ecx, edx, esi, edi; \ + \ + asm volatile("pushfl\n\t" /* save flags */ \ + "pushl %%ebp\n\t" /* save EBP */ \ + "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ + "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ + "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ + "pushl %[next_ip]\n\t" /* restore EIP */ \ + __switch_canary \ + "jmp __switch_to\n" /* regparm call */ \ + "1:\t" \ + "popl %%ebp\n\t" /* restore EBP */ \ + "popfl\n" /* restore flags */ \ + \ + /* output parameters */ \ + : [prev_sp] "=m" (prev->thread.sp), \ + [prev_ip] "=m" (prev->thread.ip), \ + "=a" (last), \ + \ + /* clobbered output registers: */ \ + "=b" (ebx), "=c" (ecx), "=d" (edx), \ + "=S" (esi), "=D" (edi) \ + \ + __switch_canary_oparam \ + \ + /* input parameters: */ \ + : [next_sp] "m" (next->thread.sp), \ + [next_ip] "m" (next->thread.ip), \ + \ + /* regparm parameters for __switch_to(): */ \ + [prev] "a" (prev), \ + [next] "d" (next) \ + \ + __switch_canary_iparam \ + \ + : /* reloaded segment registers */ \ + "memory"); \ +} while (0) + +#else /* CONFIG_X86_32 */ + +/* frame pointer must be last for get_wchan */ +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" + +#define __EXTRA_CLOBBER \ + , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ + "r12", "r13", "r14", "r15" + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movq %P[task_canary](%%rsi),%%r8\n\t" \ + "movq %%r8,"__percpu_arg([gs_canary])"\n\t" +#define __switch_canary_oparam \ + , [gs_canary] "=m" (irq_stack_union.stack_canary) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + +/* Save restore flags to clear handle leaking NT */ +#define switch_to(prev, next, last) \ + asm volatile(SAVE_CONTEXT \ + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ + "call __switch_to\n\t" \ + "movq "__percpu_arg([current_task])",%%rsi\n\t" \ + __switch_canary \ + "movq %P[thread_info](%%rsi),%%r8\n\t" \ + "movq %%rax,%%rdi\n\t" \ + "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ + "jnz ret_from_fork\n\t" \ + RESTORE_CONTEXT \ + : "=a" (last) \ + __switch_canary_oparam \ + : [next] "S" (next), [prev] "D" (prev), \ + [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ + [ti_flags] "i" (offsetof(struct thread_info, flags)), \ + [_tif_fork] "i" (_TIF_FORK), \ + [thread_info] "i" (offsetof(struct task_struct, stack)), \ + [current_task] "m" (current_task) \ + __switch_canary_iparam \ + : "memory", "cc" __EXTRA_CLOBBER) + +#endif /* CONFIG_X86_32 */ + +#endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 2d2f01ce6dc..0d84f9e42fd 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -1,523 +1,6 @@ -#ifndef _ASM_X86_SYSTEM_H -#define _ASM_X86_SYSTEM_H - -#include -#include -#include +/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ +#include #include -#include - -#include -#include - -/* entries in ARCH_DLINFO: */ -#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) -# define AT_VECTOR_SIZE_ARCH 2 -#else /* else it's non-compat x86-64 */ -# define AT_VECTOR_SIZE_ARCH 1 -#endif - -struct task_struct; /* one of the stranger aspects of C forward declarations */ -struct task_struct *__switch_to(struct task_struct *prev, - struct task_struct *next); -struct tss_struct; -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss); -extern void show_regs_common(void); - -#ifdef CONFIG_X86_32 - -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movl %P[task_canary](%[next]), %%ebx\n\t" \ - "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" -#define __switch_canary_oparam \ - , [stack_canary] "=m" (stack_canary.canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define __switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ - -/* - * Saving eflags is important. It switches not only IOPL between tasks, - * it also protects other tasks from NT leaking through sysenter etc. - */ -#define switch_to(prev, next, last) \ -do { \ - /* \ - * Context-switching clobbers all registers, so we clobber \ - * them explicitly, via unused output variables. \ - * (EAX and EBP is not listed because EBP is saved/restored \ - * explicitly for wchan access and EAX is the return value of \ - * __switch_to()) \ - */ \ - unsigned long ebx, ecx, edx, esi, edi; \ - \ - asm volatile("pushfl\n\t" /* save flags */ \ - "pushl %%ebp\n\t" /* save EBP */ \ - "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ - "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ - "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ - "pushl %[next_ip]\n\t" /* restore EIP */ \ - __switch_canary \ - "jmp __switch_to\n" /* regparm call */ \ - "1:\t" \ - "popl %%ebp\n\t" /* restore EBP */ \ - "popfl\n" /* restore flags */ \ - \ - /* output parameters */ \ - : [prev_sp] "=m" (prev->thread.sp), \ - [prev_ip] "=m" (prev->thread.ip), \ - "=a" (last), \ - \ - /* clobbered output registers: */ \ - "=b" (ebx), "=c" (ecx), "=d" (edx), \ - "=S" (esi), "=D" (edi) \ - \ - __switch_canary_oparam \ - \ - /* input parameters: */ \ - : [next_sp] "m" (next->thread.sp), \ - [next_ip] "m" (next->thread.ip), \ - \ - /* regparm parameters for __switch_to(): */ \ - [prev] "a" (prev), \ - [next] "d" (next) \ - \ - __switch_canary_iparam \ - \ - : /* reloaded segment registers */ \ - "memory"); \ -} while (0) - -/* - * disable hlt during certain critical i/o operations - */ -#define HAVE_DISABLE_HLT -#else - -/* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" - -#define __EXTRA_CLOBBER \ - , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15" - -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movq %P[task_canary](%%rsi),%%r8\n\t" \ - "movq %%r8,"__percpu_arg([gs_canary])"\n\t" -#define __switch_canary_oparam \ - , [gs_canary] "=m" (irq_stack_union.stack_canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define __switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ - -/* Save restore flags to clear handle leaking NT */ -#define switch_to(prev, next, last) \ - asm volatile(SAVE_CONTEXT \ - "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ - "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ - "call __switch_to\n\t" \ - "movq "__percpu_arg([current_task])",%%rsi\n\t" \ - __switch_canary \ - "movq %P[thread_info](%%rsi),%%r8\n\t" \ - "movq %%rax,%%rdi\n\t" \ - "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ - "jnz ret_from_fork\n\t" \ - RESTORE_CONTEXT \ - : "=a" (last) \ - __switch_canary_oparam \ - : [next] "S" (next), [prev] "D" (prev), \ - [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ - [ti_flags] "i" (offsetof(struct thread_info, flags)), \ - [_tif_fork] "i" (_TIF_FORK), \ - [thread_info] "i" (offsetof(struct task_struct, stack)), \ - [current_task] "m" (current_task) \ - __switch_canary_iparam \ - : "memory", "cc" __EXTRA_CLOBBER) -#endif - -#ifdef __KERNEL__ - -extern void native_load_gs_index(unsigned); - -/* - * Load a segment. Fall back on loading the zero - * segment if something goes wrong.. - */ -#define loadsegment(seg, value) \ -do { \ - unsigned short __val = (value); \ - \ - asm volatile(" \n" \ - "1: movl %k0,%%" #seg " \n" \ - \ - ".section .fixup,\"ax\" \n" \ - "2: xorl %k0,%k0 \n" \ - " jmp 1b \n" \ - ".previous \n" \ - \ - _ASM_EXTABLE(1b, 2b) \ - \ - : "+r" (__val) : : "memory"); \ -} while (0) - -/* - * Save a segment register away - */ -#define savesegment(seg, value) \ - asm("mov %%" #seg ",%0":"=r" (value) : : "memory") - -/* - * x86_32 user gs accessors. - */ -#ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_32_LAZY_GS -#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) -#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) -#define task_user_gs(tsk) ((tsk)->thread.gs) -#define lazy_save_gs(v) savesegment(gs, (v)) -#define lazy_load_gs(v) loadsegment(gs, (v)) -#else /* X86_32_LAZY_GS */ -#define get_user_gs(regs) (u16)((regs)->gs) -#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) -#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) -#define lazy_save_gs(v) do { } while (0) -#define lazy_load_gs(v) do { } while (0) -#endif /* X86_32_LAZY_GS */ -#endif /* X86_32 */ - -static inline unsigned long get_limit(unsigned long segment) -{ - unsigned long __limit; - asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); - return __limit + 1; -} - -static inline void native_clts(void) -{ - asm volatile("clts"); -} - -/* - * Volatile isn't enough to prevent the compiler from reordering the - * read/write functions for the control registers and messing everything up. - * A memory clobber would solve the problem, but would prevent reordering of - * all loads stores around it, which can hurt performance. Solution is to - * use a variable and mimic reads and writes to it to enforce serialization - */ -static unsigned long __force_order; - -static inline unsigned long native_read_cr0(void) -{ - unsigned long val; - asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline void native_write_cr0(unsigned long val) -{ - asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); -} - -static inline unsigned long native_read_cr2(void) -{ - unsigned long val; - asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline void native_write_cr2(unsigned long val) -{ - asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); -} - -static inline unsigned long native_read_cr3(void) -{ - unsigned long val; - asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline void native_write_cr3(unsigned long val) -{ - asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); -} - -static inline unsigned long native_read_cr4(void) -{ - unsigned long val; - asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline unsigned long native_read_cr4_safe(void) -{ - unsigned long val; - /* This could fault if %cr4 does not exist. In x86_64, a cr4 always - * exists, so it will never fail. */ -#ifdef CONFIG_X86_32 - asm volatile("1: mov %%cr4, %0\n" - "2:\n" - _ASM_EXTABLE(1b, 2b) - : "=r" (val), "=m" (__force_order) : "0" (0)); -#else - val = native_read_cr4(); -#endif - return val; -} - -static inline void native_write_cr4(unsigned long val) -{ - asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); -} - -#ifdef CONFIG_X86_64 -static inline unsigned long native_read_cr8(void) -{ - unsigned long cr8; - asm volatile("movq %%cr8,%0" : "=r" (cr8)); - return cr8; -} - -static inline void native_write_cr8(unsigned long val) -{ - asm volatile("movq %0,%%cr8" :: "r" (val) : "memory"); -} -#endif - -static inline void native_wbinvd(void) -{ - asm volatile("wbinvd": : :"memory"); -} - -#ifdef CONFIG_PARAVIRT -#include -#else - -static inline unsigned long read_cr0(void) -{ - return native_read_cr0(); -} - -static inline void write_cr0(unsigned long x) -{ - native_write_cr0(x); -} - -static inline unsigned long read_cr2(void) -{ - return native_read_cr2(); -} - -static inline void write_cr2(unsigned long x) -{ - native_write_cr2(x); -} - -static inline unsigned long read_cr3(void) -{ - return native_read_cr3(); -} - -static inline void write_cr3(unsigned long x) -{ - native_write_cr3(x); -} - -static inline unsigned long read_cr4(void) -{ - return native_read_cr4(); -} - -static inline unsigned long read_cr4_safe(void) -{ - return native_read_cr4_safe(); -} - -static inline void write_cr4(unsigned long x) -{ - native_write_cr4(x); -} - -static inline void wbinvd(void) -{ - native_wbinvd(); -} - -#ifdef CONFIG_X86_64 - -static inline unsigned long read_cr8(void) -{ - return native_read_cr8(); -} - -static inline void write_cr8(unsigned long x) -{ - native_write_cr8(x); -} - -static inline void load_gs_index(unsigned selector) -{ - native_load_gs_index(selector); -} - -#endif - -/* Clear the 'TS' bit */ -static inline void clts(void) -{ - native_clts(); -} - -#endif/* CONFIG_PARAVIRT */ - -#define stts() write_cr0(read_cr0() | X86_CR0_TS) - -#endif /* __KERNEL__ */ - -static inline void clflush(volatile void *__p) -{ - asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); -} - -#define nop() asm volatile ("nop") - -void disable_hlt(void); -void enable_hlt(void); - -void cpu_idle_wait(void); - -extern unsigned long arch_align_stack(unsigned long sp); -extern void free_init_pages(char *what, unsigned long begin, unsigned long end); - -void default_idle(void); -bool set_pm_idle_to_default(void); - -void stop_this_cpu(void *dummy); - -/* - * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking - * to devices. - */ -#ifdef CONFIG_X86_32 -/* - * Some non-Intel clones support out of order store. wmb() ceases to be a - * nop for these. - */ -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) -#else -#define mb() asm volatile("mfence":::"memory") -#define rmb() asm volatile("lfence":::"memory") -#define wmb() asm volatile("sfence" ::: "memory") -#endif - -/** - * read_barrier_depends - Flush all pending reads that subsequents reads - * depend on. - * - * No data-dependent reads from memory-like regions are ever reordered - * over this barrier. All reads preceding this primitive are guaranteed - * to access memory (but not necessarily other CPUs' caches) before any - * reads following this primitive that depend on the data return by - * any of the preceding reads. This primitive is much lighter weight than - * rmb() on most CPUs, and is never heavier weight than is - * rmb(). - * - * These ordering constraints are respected by both the local CPU - * and the compiler. - * - * Ordering is not guaranteed by anything other than these primitives, - * not even by data dependencies. See the documentation for - * memory_barrier() for examples and URLs to more information. - * - * For example, the following code would force ordering (the initial - * value of "a" is zero, "b" is one, and "p" is "&a"): - * - * - * CPU 0 CPU 1 - * - * b = 2; - * memory_barrier(); - * p = &b; q = p; - * read_barrier_depends(); - * d = *q; - * - * - * because the read of "*q" depends on the read of "p" and these - * two reads are separated by a read_barrier_depends(). However, - * the following code, with the same initial values for "a" and "b": - * - * - * CPU 0 CPU 1 - * - * a = 2; - * memory_barrier(); - * b = 3; y = b; - * read_barrier_depends(); - * x = a; - * - * - * does not enforce ordering, since there is no data dependency between - * the read of "a" and the read of "b". Therefore, on some CPUs, such - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() - * in cases like this where there are no data dependencies. - **/ - -#define read_barrier_depends() do { } while (0) - -#ifdef CONFIG_SMP -#define smp_mb() mb() -#ifdef CONFIG_X86_PPRO_FENCE -# define smp_rmb() rmb() -#else -# define smp_rmb() barrier() -#endif -#ifdef CONFIG_X86_OOSTORE -# define smp_wmb() wmb() -#else -# define smp_wmb() barrier() -#endif -#define smp_read_barrier_depends() read_barrier_depends() -#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while (0) -#define set_mb(var, value) do { var = value; barrier(); } while (0) -#endif - -/* - * Stop RDTSC speculation. This is needed when you need to use RDTSC - * (or get_cycles or vread that possibly accesses the TSC) in a defined - * code region. - * - * (Could use an alternative three way for this if there was one.) - */ -static __always_inline void rdtsc_barrier(void) -{ - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); -} - -/* - * We handle most unaligned accesses in hardware. On the other hand - * unaligned DMA can be quite expensive on some Nehalem processors. - * - * Based on this we disable the IP header alignment in network drivers. - */ -#define NET_IP_ALIGN 0 -#endif /* _ASM_X86_SYSTEM_H */ +#include +#include +#include diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 169be8938b9..c0e108e0807 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -5,7 +5,7 @@ #include #include -#include +#include #ifdef CONFIG_PARAVIRT #include diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index e0f9aa16358..5da71c27cc5 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -16,7 +16,6 @@ #define _ASM_X86_VIRTEX_H #include -#include #include #include diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index f50e7fb2a20..d2b7f27781b 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -14,6 +14,7 @@ #include #include #include +#include /* * Initialize bm_flags based on the CPU cache properties diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5d56931a15b..459e78cbf61 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -231,7 +231,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 5c0e6533d9b..2d5454cd2c4 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 67bb17a37a0..47a1870279a 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -25,7 +25,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 54060f56597..2d7998fb628 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -8,7 +8,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 97b26356e9e..75772ae6c65 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index a524353d93f..39472dd2323 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -43,7 +43,6 @@ #include #include -#include static struct class *cpuid_class; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 610485223bd..36d1853e91a 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -15,7 +15,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 313fb5cddbc..99b85b423bb 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -16,7 +16,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index faba5771aca..4425a12ece4 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -46,7 +46,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ea697263b37..ebc98739892 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -15,7 +15,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index a3fa43ba5d3..5b19e4d78b0 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c index 177183cbb6a..7eb1e2b9782 100644 --- a/arch/x86/kernel/mca_32.c +++ b/arch/x86/kernel/mca_32.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 925179f871d..f21fd94ac89 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -26,7 +26,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 96356762a51..eb113693f04 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -40,7 +40,6 @@ #include #include -#include static struct class *msr_class; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ada2f99388d..2b26485f0c1 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -37,6 +37,7 @@ #include #include #include +#include /* nop stub */ void _paravirt_nop(void) diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 726494b5834..6ac5782f4d6 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 14baf78d5a1..9b24f36eb55 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9d7d4842bfa..aae4f4bbbe8 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -41,7 +41,6 @@ #include #include -#include #include #include #include @@ -59,6 +58,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 292da13fc5a..61270e8d428 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -40,7 +40,6 @@ #include #include -#include #include #include #include @@ -53,6 +52,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 78f05e438be..8a634c88765 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 88638883176..8cbeb7209c3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -90,7 +90,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c index 9e540fee700..ab40954e113 100644 --- a/arch/x86/kernel/tce_64.c +++ b/arch/x86/kernel/tce_64.c @@ -34,6 +34,7 @@ #include #include #include +#include /* flush a tce at 'tceaddr' to main memory */ static inline void flush_tce(void* tceaddr) diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 6bb7b8579e7..73920e4c6dc 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ec61d4c1b93..860f126ca23 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6cabf6570d6..4f0cec7e4ff 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8663f6c47cc..575d86f85ce 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 436a0309db3..fc18be0f6f2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -35,7 +35,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index cac71849925..a69bcb8c762 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 3769079874d..74202c1910c 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -10,7 +10,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3-70-g09d2 From 49a7f04a4b9d45cd794741ce3d5d66524b37bdd0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Mar 2012 18:30:03 +0100 Subject: Move all declarations of free_initmem() to linux/mm.h Move all declarations of free_initmem() to linux/mm.h so that there's only one and it's used by everything. Signed-off-by: David Howells cc: linux-c6x-dev@linux-c6x.org cc: microblaze-uclinux@itee.uq.edu.au cc: linux-sh@vger.kernel.org cc: sparclinux@vger.kernel.org cc: x86@kernel.org cc: linux-mm@kvack.org --- arch/c6x/include/asm/system.h | 1 - arch/frv/include/asm/system.h | 2 -- arch/microblaze/include/asm/system.h | 1 - arch/sh/include/asm/system.h | 1 - arch/sparc/mm/init_64.h | 2 -- arch/x86/include/asm/page_types.h | 1 - include/linux/mm.h | 2 ++ init/main.c | 1 - 8 files changed, 2 insertions(+), 9 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/c6x/include/asm/system.h b/arch/c6x/include/asm/system.h index ccc4f86d16c..0d84f9e42fd 100644 --- a/arch/c6x/include/asm/system.h +++ b/arch/c6x/include/asm/system.h @@ -4,4 +4,3 @@ #include #include #include -extern void free_initmem(void); diff --git a/arch/frv/include/asm/system.h b/arch/frv/include/asm/system.h index 5c707a23540..659bcdb44ec 100644 --- a/arch/frv/include/asm/system.h +++ b/arch/frv/include/asm/system.h @@ -1,6 +1,4 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ #include #include #include #include -extern void free_initmem(void); diff --git a/arch/microblaze/include/asm/system.h b/arch/microblaze/include/asm/system.h index ccc4f86d16c..0d84f9e42fd 100644 --- a/arch/microblaze/include/asm/system.h +++ b/arch/microblaze/include/asm/system.h @@ -4,4 +4,3 @@ #include #include #include -extern void free_initmem(void); diff --git a/arch/sh/include/asm/system.h b/arch/sh/include/asm/system.h index e2042aa32f2..04268aa3b3e 100644 --- a/arch/sh/include/asm/system.h +++ b/arch/sh/include/asm/system.h @@ -6,4 +6,3 @@ #include #include #include -void free_initmem(void); diff --git a/arch/sparc/mm/init_64.h b/arch/sparc/mm/init_64.h index 77d1b313e34..3e1ac8b96ca 100644 --- a/arch/sparc/mm/init_64.h +++ b/arch/sparc/mm/init_64.h @@ -36,8 +36,6 @@ extern unsigned long kern_locked_tte_data; extern void prom_world(int enter); -extern void free_initmem(void); - #ifdef CONFIG_SPARSEMEM_VMEMMAP #define VMEMMAP_CHUNK_SHIFT 22 #define VMEMMAP_CHUNK (1UL << VMEMMAP_CHUNK_SHIFT) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index bce688d54c1..e21fdd10479 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -55,7 +55,6 @@ extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); extern void initmem_init(void); -extern void free_initmem(void); #endif /* !__ASSEMBLY__ */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7330742e797..69f6d7b7eb0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1257,6 +1257,8 @@ static inline void pgtable_page_dtor(struct page *page) extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); +extern void free_initmem(void); + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its diff --git a/init/main.c b/init/main.c index c24805c824b..44c9754e2a5 100644 --- a/init/main.c +++ b/init/main.c @@ -87,7 +87,6 @@ extern void mca_init(void); extern void sbus_init(void); extern void prio_tree_init(void); extern void radix_tree_init(void); -extern void free_initmem(void); #ifndef CONFIG_DEBUG_RODATA static inline void mark_rodata_ro(void) { } #endif -- cgit v1.2.3-70-g09d2 From 141124c02059eee9dbc5c86ea797b1ca888e77f7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Mar 2012 18:30:03 +0100 Subject: Delete all instances of asm/system.h Delete all instances of asm/system.h as they should be redundant by this point. Signed-off-by: David Howells --- arch/alpha/include/asm/system.h | 5 ----- arch/avr32/include/asm/system.h | 6 ------ arch/blackfin/include/asm/system.h | 5 ----- arch/c6x/include/asm/system.h | 6 ------ arch/cris/include/asm/system.h | 5 ----- arch/frv/include/asm/system.h | 4 ---- arch/h8300/include/asm/system.h | 5 ----- arch/hexagon/include/asm/system.h | 5 ----- arch/ia64/include/asm/system.h | 4 ---- arch/m32r/include/asm/system.h | 6 ------ arch/m68k/include/asm/system.h | 5 ----- arch/microblaze/include/asm/system.h | 6 ------ arch/mips/include/asm/system.h | 5 ----- arch/mn10300/include/asm/system.h | 5 ----- arch/openrisc/include/asm/system.h | 5 ----- arch/parisc/include/asm/system.h | 6 ------ arch/powerpc/include/asm/system.h | 6 ------ arch/s390/include/asm/system.h | 7 ------- arch/score/include/asm/system.h | 5 ----- arch/sh/include/asm/system.h | 8 -------- arch/sparc/include/asm/system.h | 6 ------ arch/tile/include/asm/system.h | 4 ---- arch/unicore32/include/asm/system.h | 5 ----- arch/x86/include/asm/system.h | 6 ------ arch/xtensa/include/asm/system.h | 5 ----- include/asm-generic/system.h | 5 ----- 26 files changed, 140 deletions(-) delete mode 100644 arch/alpha/include/asm/system.h delete mode 100644 arch/avr32/include/asm/system.h delete mode 100644 arch/blackfin/include/asm/system.h delete mode 100644 arch/c6x/include/asm/system.h delete mode 100644 arch/cris/include/asm/system.h delete mode 100644 arch/frv/include/asm/system.h delete mode 100644 arch/h8300/include/asm/system.h delete mode 100644 arch/hexagon/include/asm/system.h delete mode 100644 arch/ia64/include/asm/system.h delete mode 100644 arch/m32r/include/asm/system.h delete mode 100644 arch/m68k/include/asm/system.h delete mode 100644 arch/microblaze/include/asm/system.h delete mode 100644 arch/mips/include/asm/system.h delete mode 100644 arch/mn10300/include/asm/system.h delete mode 100644 arch/openrisc/include/asm/system.h delete mode 100644 arch/parisc/include/asm/system.h delete mode 100644 arch/powerpc/include/asm/system.h delete mode 100644 arch/s390/include/asm/system.h delete mode 100644 arch/score/include/asm/system.h delete mode 100644 arch/sh/include/asm/system.h delete mode 100644 arch/sparc/include/asm/system.h delete mode 100644 arch/tile/include/asm/system.h delete mode 100644 arch/unicore32/include/asm/system.h delete mode 100644 arch/x86/include/asm/system.h delete mode 100644 arch/xtensa/include/asm/system.h delete mode 100644 include/asm-generic/system.h (limited to 'arch/x86/include') diff --git a/arch/alpha/include/asm/system.h b/arch/alpha/include/asm/system.h deleted file mode 100644 index c7270dcff32..00000000000 --- a/arch/alpha/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/avr32/include/asm/system.h b/arch/avr32/include/asm/system.h deleted file mode 100644 index 0d84f9e42fd..00000000000 --- a/arch/avr32/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include diff --git a/arch/blackfin/include/asm/system.h b/arch/blackfin/include/asm/system.h deleted file mode 100644 index a7f40578587..00000000000 --- a/arch/blackfin/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/c6x/include/asm/system.h b/arch/c6x/include/asm/system.h deleted file mode 100644 index 0d84f9e42fd..00000000000 --- a/arch/c6x/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include diff --git a/arch/cris/include/asm/system.h b/arch/cris/include/asm/system.h deleted file mode 100644 index a7f40578587..00000000000 --- a/arch/cris/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/frv/include/asm/system.h b/arch/frv/include/asm/system.h deleted file mode 100644 index 659bcdb44ec..00000000000 --- a/arch/frv/include/asm/system.h +++ /dev/null @@ -1,4 +0,0 @@ -#include -#include -#include -#include diff --git a/arch/h8300/include/asm/system.h b/arch/h8300/include/asm/system.h deleted file mode 100644 index a7f40578587..00000000000 --- a/arch/h8300/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/hexagon/include/asm/system.h b/arch/hexagon/include/asm/system.h deleted file mode 100644 index a7f40578587..00000000000 --- a/arch/hexagon/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h deleted file mode 100644 index 5b190b48fcd..00000000000 --- a/arch/ia64/include/asm/system.h +++ /dev/null @@ -1,4 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include diff --git a/arch/m32r/include/asm/system.h b/arch/m32r/include/asm/system.h deleted file mode 100644 index a55c384fdcf..00000000000 --- a/arch/m32r/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include diff --git a/arch/m68k/include/asm/system.h b/arch/m68k/include/asm/system.h deleted file mode 100644 index a7f40578587..00000000000 --- a/arch/m68k/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/microblaze/include/asm/system.h b/arch/microblaze/include/asm/system.h deleted file mode 100644 index 0d84f9e42fd..00000000000 --- a/arch/microblaze/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include diff --git a/arch/mips/include/asm/system.h b/arch/mips/include/asm/system.h deleted file mode 100644 index a7f40578587..00000000000 --- a/arch/mips/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/mn10300/include/asm/system.h b/arch/mn10300/include/asm/system.h deleted file mode 100644 index a7f40578587..00000000000 --- a/arch/mn10300/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/openrisc/include/asm/system.h b/arch/openrisc/include/asm/system.h deleted file mode 100644 index a7f40578587..00000000000 --- a/arch/openrisc/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/parisc/include/asm/system.h b/arch/parisc/include/asm/system.h deleted file mode 100644 index fc2c1261ac1..00000000000 --- a/arch/parisc/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h deleted file mode 100644 index 502c1e0275a..00000000000 --- a/arch/powerpc/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include diff --git a/arch/s390/include/asm/system.h b/arch/s390/include/asm/system.h deleted file mode 100644 index 641c7290327..00000000000 --- a/arch/s390/include/asm/system.h +++ /dev/null @@ -1,7 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include -#include diff --git a/arch/score/include/asm/system.h b/arch/score/include/asm/system.h deleted file mode 100644 index a7f40578587..00000000000 --- a/arch/score/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/sh/include/asm/system.h b/arch/sh/include/asm/system.h deleted file mode 100644 index 04268aa3b3e..00000000000 --- a/arch/sh/include/asm/system.h +++ /dev/null @@ -1,8 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include -#include -#include diff --git a/arch/sparc/include/asm/system.h b/arch/sparc/include/asm/system.h deleted file mode 100644 index ed532ba000b..00000000000 --- a/arch/sparc/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include diff --git a/arch/tile/include/asm/system.h b/arch/tile/include/asm/system.h deleted file mode 100644 index 5b190b48fcd..00000000000 --- a/arch/tile/include/asm/system.h +++ /dev/null @@ -1,4 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include diff --git a/arch/unicore32/include/asm/system.h b/arch/unicore32/include/asm/system.h deleted file mode 100644 index a7f40578587..00000000000 --- a/arch/unicore32/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h deleted file mode 100644 index 0d84f9e42fd..00000000000 --- a/arch/x86/include/asm/system.h +++ /dev/null @@ -1,6 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -#include diff --git a/arch/xtensa/include/asm/system.h b/arch/xtensa/include/asm/system.h deleted file mode 100644 index a7f40578587..00000000000 --- a/arch/xtensa/include/asm/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include diff --git a/include/asm-generic/system.h b/include/asm-generic/system.h deleted file mode 100644 index a7f40578587..00000000000 --- a/include/asm-generic/system.h +++ /dev/null @@ -1,5 +0,0 @@ -/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ -#include -#include -#include -#include -- cgit v1.2.3-70-g09d2 From f6365201d8a21fb347260f89d6e9b3e718d63c70 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 29 Mar 2012 14:49:17 -0700 Subject: x86: Remove the ancient and deprecated disable_hlt() and enable_hlt() facility The X86_32-only disable_hlt/enable_hlt mechanism was used by the 32-bit floppy driver. Its effect was to replace the use of the HLT instruction inside default_idle() with cpu_relax() - essentially it turned off the use of HLT. This workaround was commented in the code as: "disable hlt during certain critical i/o operations" "This halt magic was a workaround for ancient floppy DMA wreckage. It should be safe to remove." H. Peter Anvin additionally adds: "To the best of my knowledge, no-hlt only existed because of flaky power distributions on 386/486 systems which were sold to run DOS. Since DOS did no power management of any kind, including HLT, the power draw was fairly uniform; when exposed to the much hhigher noise levels you got when Linux used HLT caused some of these systems to fail. They were by far in the minority even back then." Alan Cox further says: "Also for the Cyrix 5510 which tended to go castors up if a HLT occurred during a DMA cycle and on a few other boxes HLT during DMA tended to go astray. Do we care ? I doubt it. The 5510 was pretty obscure, the 5520 fixed it, the 5530 is probably the oldest still in any kind of use." So, let's finally drop this. Signed-off-by: Len Brown Signed-off-by: Josh Boyer Signed-off-by: Andrew Morton Acked-by: "H. Peter Anvin" Acked-by: Alan Cox Cc: Stephen Hemminger Cc: Link: http://lkml.kernel.org/n/tip-3rhk9bzf0x9rljkv488tloib@git.kernel.org [ If anyone cares then alternative instruction patching could be used to replace HLT with a one-byte NOP instruction. Much simpler. ] Signed-off-by: Ingo Molnar --- Documentation/feature-removal-schedule.txt | 8 ------- arch/x86/include/asm/processor.h | 10 --------- arch/x86/kernel/process.c | 24 -------------------- drivers/block/floppy.c | 36 ------------------------------ 4 files changed, 78 deletions(-) (limited to 'arch/x86/include') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 0cad4803ffa..7c950d48d76 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -6,14 +6,6 @@ be removed from this file. --------------------------- -What: x86 floppy disable_hlt -When: 2012 -Why: ancient workaround of dubious utility clutters the - code used by everybody else. -Who: Len Brown - ---------------------------- - What: CONFIG_APM_CPU_IDLE, and its ability to call APM BIOS in idle When: 2012 Why: This optional sub-feature of APM is of dubious reliability, diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7284c9a6a0b..4fa7dcceb6c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -974,16 +974,6 @@ extern bool cpu_has_amd_erratum(const int *); #define cpu_has_amd_erratum(x) (false) #endif /* CONFIG_CPU_SUP_AMD */ -#ifdef CONFIG_X86_32 -/* - * disable hlt during certain critical i/o operations - */ -#define HAVE_DISABLE_HLT -#endif - -void disable_hlt(void); -void enable_hlt(void); - void cpu_idle_wait(void); extern unsigned long arch_align_stack(unsigned long sp); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index a33afaa5ddb..1d92a5ab6e8 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -362,34 +362,10 @@ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); #endif -#ifdef CONFIG_X86_32 -/* - * This halt magic was a workaround for ancient floppy DMA - * wreckage. It should be safe to remove. - */ -static int hlt_counter; -void disable_hlt(void) -{ - hlt_counter++; -} -EXPORT_SYMBOL(disable_hlt); - -void enable_hlt(void) -{ - hlt_counter--; -} -EXPORT_SYMBOL(enable_hlt); - -static inline int hlt_use_halt(void) -{ - return (!hlt_counter && boot_cpu_data.hlt_works_ok); -} -#else static inline int hlt_use_halt(void) { return 1; } -#endif #ifndef CONFIG_SMP static inline void play_dead(void) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 76a08236430..b0b00d70c16 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -1030,37 +1030,6 @@ static int fd_wait_for_completion(unsigned long delay, timeout_fn function) return 0; } -static DEFINE_SPINLOCK(floppy_hlt_lock); -static int hlt_disabled; -static void floppy_disable_hlt(void) -{ - unsigned long flags; - - WARN_ONCE(1, "floppy_disable_hlt() scheduled for removal in 2012"); - spin_lock_irqsave(&floppy_hlt_lock, flags); - if (!hlt_disabled) { - hlt_disabled = 1; -#ifdef HAVE_DISABLE_HLT - disable_hlt(); -#endif - } - spin_unlock_irqrestore(&floppy_hlt_lock, flags); -} - -static void floppy_enable_hlt(void) -{ - unsigned long flags; - - spin_lock_irqsave(&floppy_hlt_lock, flags); - if (hlt_disabled) { - hlt_disabled = 0; -#ifdef HAVE_DISABLE_HLT - enable_hlt(); -#endif - } - spin_unlock_irqrestore(&floppy_hlt_lock, flags); -} - static void setup_DMA(void) { unsigned long f; @@ -1105,7 +1074,6 @@ static void setup_DMA(void) fd_enable_dma(); release_dma_lock(f); #endif - floppy_disable_hlt(); } static void show_floppy(void); @@ -1707,7 +1675,6 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id) fd_disable_dma(); release_dma_lock(f); - floppy_enable_hlt(); do_floppy = NULL; if (fdc >= N_FDC || FDCS->address == -1) { /* we don't even know which FDC is the culprit */ @@ -1856,8 +1823,6 @@ static void floppy_shutdown(unsigned long data) show_floppy(); cancel_activity(); - floppy_enable_hlt(); - flags = claim_dma_lock(); fd_disable_dma(); release_dma_lock(flags); @@ -4508,7 +4473,6 @@ static void floppy_release_irq_and_dma(void) #if N_FDC > 1 set_dor(1, ~8, 0); #endif - floppy_enable_hlt(); if (floppy_track_buffer && max_buffer_sectors) { tmpsize = max_buffer_sectors * 1024; -- cgit v1.2.3-70-g09d2 From f68e556e23d1a4176b563bcb25d8baf2c5313f91 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 6 Apr 2012 13:54:56 -0700 Subject: Make the "word-at-a-time" helper functions more commonly usable I have a new optimized x86 "strncpy_from_user()" that will use these same helper functions for all the same reasons the name lookup code uses them. This is preparation for that. This moves them into an architecture-specific header file. It's architecture-specific for two reasons: - some of the functions are likely to want architecture-specific implementations. Even if the current code happens to be "generic" in the sense that it should work on any little-endian machine, it's likely that the "multiply by a big constant and shift" implementation is less than optimal for an architecture that has a guaranteed fast bit count instruction, for example. - I expect that if architectures like sparc want to start playing around with this, we'll need to abstract out a few more details (in particular the actual unaligned accesses). So we're likely to have more architecture-specific stuff if non-x86 architectures start using this. (and if it turns out that non-x86 architectures don't start using this, then having it in an architecture-specific header is still the right thing to do, of course) Signed-off-by: Linus Torvalds --- arch/x86/include/asm/word-at-a-time.h | 46 +++++++++++++++++++++++++++++++++++ fs/namei.c | 35 +++----------------------- 2 files changed, 49 insertions(+), 32 deletions(-) create mode 100644 arch/x86/include/asm/word-at-a-time.h (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h new file mode 100644 index 00000000000..6fe6767b712 --- /dev/null +++ b/arch/x86/include/asm/word-at-a-time.h @@ -0,0 +1,46 @@ +#ifndef _ASM_WORD_AT_A_TIME_H +#define _ASM_WORD_AT_A_TIME_H + +/* + * This is largely generic for little-endian machines, but the + * optimal byte mask counting is probably going to be something + * that is architecture-specific. If you have a reliably fast + * bit count instruction, that might be better than the multiply + * and shift, for example. + */ + +#ifdef CONFIG_64BIT + +/* + * Jan Achrenius on G+: microoptimized version of + * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" + * that works for the bytemasks without having to + * mask them first. + */ +static inline long count_masked_bytes(unsigned long mask) +{ + return mask*0x0001020304050608ul >> 56; +} + +#else /* 32-bit case */ + +/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ +static inline long count_masked_bytes(long mask) +{ + /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ + long a = (0x0ff0001+mask) >> 23; + /* Fix the 1 for 00 case */ + return a & mask; +} + +#endif + +#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) + +/* Return the high bit set in the first byte that is a zero */ +static inline unsigned long has_zero(unsigned long a) +{ + return ((a - REPEAT_BYTE(0x01)) & ~a) & REPEAT_BYTE(0x80); +} + +#endif /* _ASM_WORD_AT_A_TIME_H */ diff --git a/fs/namei.c b/fs/namei.c index 1898198abc3..0062dd17eb5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1407,18 +1407,9 @@ static inline int can_lookup(struct inode *inode) */ #ifdef CONFIG_DCACHE_WORD_ACCESS -#ifdef CONFIG_64BIT +#include -/* - * Jan Achrenius on G+: microoptimized version of - * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" - * that works for the bytemasks without having to - * mask them first. - */ -static inline long count_masked_bytes(unsigned long mask) -{ - return mask*0x0001020304050608ul >> 56; -} +#ifdef CONFIG_64BIT static inline unsigned int fold_hash(unsigned long hash) { @@ -1428,15 +1419,6 @@ static inline unsigned int fold_hash(unsigned long hash) #else /* 32-bit case */ -/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ -static inline long count_masked_bytes(long mask) -{ - /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ - long a = (0x0ff0001+mask) >> 23; - /* Fix the 1 for 00 case */ - return a & mask; -} - #define fold_hash(x) (x) #endif @@ -1464,17 +1446,6 @@ done: } EXPORT_SYMBOL(full_name_hash); -#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) -#define ONEBYTES REPEAT_BYTE(0x01) -#define SLASHBYTES REPEAT_BYTE('/') -#define HIGHBITS REPEAT_BYTE(0x80) - -/* Return the high bit set in the first byte that is a zero */ -static inline unsigned long has_zero(unsigned long a) -{ - return ((a - ONEBYTES) & ~a) & HIGHBITS; -} - /* * Calculate the length and hash of the path component, and * return the length of the component; @@ -1490,7 +1461,7 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) len += sizeof(unsigned long); a = *(unsigned long *)(name+len); /* Do we have any NUL or '/' bytes in this word? */ - mask = has_zero(a) | has_zero(a ^ SLASHBYTES); + mask = has_zero(a) | has_zero(a ^ REPEAT_BYTE('/')); } while (!mask); /* The mask *below* the first high bit set */ -- cgit v1.2.3-70-g09d2