diff options
Diffstat (limited to 'arch/x86')
204 files changed, 3721 insertions, 2713 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b32ebf92b0c..5c0ed72c02a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -16,6 +16,7 @@ config X86_64 def_bool y depends on 64BIT select X86_DEV_DMA_OPS + select ARCH_USE_CMPXCHG_LOCKREF ### Arch settings config X86 @@ -81,7 +82,6 @@ config X86 select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_ARCH_JUMP_LABEL - select HAVE_TEXT_POKE_SMP select HAVE_GENERIC_HARDIRQS select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select SPARSE_IRQ @@ -632,6 +632,7 @@ config PARAVIRT_DEBUG config PARAVIRT_SPINLOCKS bool "Paravirtualization layer for spinlocks" depends on PARAVIRT && SMP + select UNINLINE_SPIN_UNLOCK ---help--- Paravirtualized spinlocks allow a pvops backend to replace the spinlock implementation with something virtualization-friendly @@ -656,6 +657,15 @@ config KVM_GUEST underlying device model, the host provides the guest with timing infrastructure such as time of day, and system time +config KVM_DEBUG_FS + bool "Enable debug information for KVM Guests in debugfs" + depends on KVM_GUEST && DEBUG_FS + default n + ---help--- + This option enables collection of various statistics for KVM guest. + Statistics are displayed in debugfs filesystem. Enabling this option + may incur significant overhead. + source "arch/x86/lguest/Kconfig" config PARAVIRT_TIME_ACCOUNTING @@ -1344,8 +1354,12 @@ config ARCH_SELECT_MEMORY_MODEL depends on ARCH_SPARSEMEM_ENABLE config ARCH_MEMORY_PROBE - def_bool y + bool "Enable sysfs memory/probe interface" depends on X86_64 && MEMORY_HOTPLUG + help + This option enables a sysfs memory/probe interface for testing. + See Documentation/memory-hotplug.txt for more information. + If you are unsure how to answer this question, answer N. config ARCH_PROC_KCORE_TEXT def_bool y @@ -1627,9 +1641,9 @@ config KEXEC It is an ongoing process to be certain the hardware in a machine is properly shutdown, so do not be surprised if this code does not - initially work for you. It may help to enable device hotplugging - support. As of this writing the exact hardware interface is - strongly in flux, so no good recommendation can be made. + initially work for you. As of this writing the exact hardware + interface is strongly in flux, so no good recommendation can be + made. config CRASH_DUMP bool "kernel crash dumps" @@ -1716,9 +1730,10 @@ config X86_NEED_RELOCS depends on X86_32 && RELOCATABLE config PHYSICAL_ALIGN - hex "Alignment value to which kernel should be aligned" if X86_32 + hex "Alignment value to which kernel should be aligned" default "0x1000000" - range 0x2000 0x1000000 + range 0x2000 0x1000000 if X86_32 + range 0x200000 0x1000000 if X86_64 ---help--- This value puts the alignment restrictions on physical address where kernel is loaded and run from. Kernel is compiled for an @@ -1736,6 +1751,9 @@ config PHYSICAL_ALIGN end result is that kernel runs from a physical address meeting above alignment restrictions. + On 32-bit this value must be a multiple of 0x2000. On 64-bit + this value must be a multiple of 0x200000. + Don't change this unless you know what you are doing. config HOTPLUG_CPU @@ -2270,6 +2288,32 @@ config RAPIDIO source "drivers/rapidio/Kconfig" +config X86_SYSFB + bool "Mark VGA/VBE/EFI FB as generic system framebuffer" + help + Firmwares often provide initial graphics framebuffers so the BIOS, + bootloader or kernel can show basic video-output during boot for + user-guidance and debugging. Historically, x86 used the VESA BIOS + Extensions and EFI-framebuffers for this, which are mostly limited + to x86. + This option, if enabled, marks VGA/VBE/EFI framebuffers as generic + framebuffers so the new generic system-framebuffer drivers can be + used on x86. If the framebuffer is not compatible with the generic + modes, it is adverticed as fallback platform framebuffer so legacy + drivers like efifb, vesafb and uvesafb can pick it up. + If this option is not selected, all system framebuffers are always + marked as fallback platform framebuffers as usual. + + Note: Legacy fbdev drivers, including vesafb, efifb, uvesafb, will + not be able to pick up generic system framebuffers if this option + is selected. You are highly encouraged to enable simplefb as + replacement if you select this option. simplefb can correctly deal + with generic system framebuffers. But you should still keep vesafb + and others enabled as fallback if a system framebuffer is + incompatible with simplefb. + + If unsure, say Y. + endmenu @@ -2332,10 +2376,6 @@ config HAVE_ATOMIC_IOMAP def_bool y depends on X86_32 -config HAVE_TEXT_POKE_SMP - bool - select STOP_MACHINE if SMP - config X86_DEV_DMA_OPS bool depends on X86_64 || STA2X11 diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 07639c656fc..41250fb3398 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -16,6 +16,10 @@ endif # e.g.: obj-y += foo_$(BITS).o export BITS +ifdef CONFIG_X86_NEED_RELOCS + LDFLAGS_vmlinux := --emit-relocs +endif + ifeq ($(CONFIG_X86_32),y) BITS := 32 UTS_MACHINE := i386 @@ -25,10 +29,6 @@ ifeq ($(CONFIG_X86_32),y) KBUILD_AFLAGS += $(biarch) KBUILD_CFLAGS += $(biarch) - ifdef CONFIG_RELOCATABLE - LDFLAGS_vmlinux := --emit-relocs - endif - KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return # Never want PIC in a 32-bit kernel, prevent breakage with GCC built diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 5b7531966b8..ef72baeff48 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -355,6 +355,7 @@ int strncmp(const char *cs, const char *ct, size_t count); size_t strnlen(const char *s, size_t maxlen); unsigned int atou(const char *s); unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); +size_t strlen(const char *s); /* tty.c */ void puts(const char *); diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index d606463aa6d..b7388a425f0 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -225,7 +225,7 @@ static void low_free(unsigned long size, unsigned long addr) unsigned long nr_pages; nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; - efi_call_phys2(sys_table->boottime->free_pages, addr, size); + efi_call_phys2(sys_table->boottime->free_pages, addr, nr_pages); } static void find_bits(unsigned long mask, u8 *pos, u8 *size) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 1e3184f6072..5d6f6891b18 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -181,8 +181,9 @@ relocated: /* * Do the decompression, and jump to the new kernel.. */ - leal z_extract_offset_negative(%ebx), %ebp /* push arguments for decompress_kernel: */ + pushl $z_output_len /* decompressed length */ + leal z_extract_offset_negative(%ebx), %ebp pushl %ebp /* output address */ pushl $z_input_len /* input_len */ leal input_data(%ebx), %eax @@ -191,33 +192,7 @@ relocated: pushl %eax /* heap area */ pushl %esi /* real mode pointer */ call decompress_kernel - addl $20, %esp - -#if CONFIG_RELOCATABLE -/* - * Find the address of the relocations. - */ - leal z_output_len(%ebp), %edi - -/* - * Calculate the delta between where vmlinux was compiled to run - * and where it was actually loaded. - */ - movl %ebp, %ebx - subl $LOAD_PHYSICAL_ADDR, %ebx - jz 2f /* Nothing to be done if loaded at compiled addr. */ -/* - * Process relocations. - */ - -1: subl $4, %edi - movl (%edi), %ecx - testl %ecx, %ecx - jz 2f - addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) - jmp 1b -2: -#endif + addl $24, %esp /* * Jump to the decompressed kernel. diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 06e71c2c16b..c337422b575 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -338,6 +338,7 @@ relocated: leaq input_data(%rip), %rdx /* input_data */ movl $z_input_len, %ecx /* input_len */ movq %rbp, %r8 /* output target address */ + movq $z_output_len, %r9 /* decompressed length */ call decompress_kernel popq %rsi diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 0319c88290a..434f077d2c4 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -271,6 +271,79 @@ static void error(char *x) asm("hlt"); } +#if CONFIG_X86_NEED_RELOCS +static void handle_relocations(void *output, unsigned long output_len) +{ + int *reloc; + unsigned long delta, map, ptr; + unsigned long min_addr = (unsigned long)output; + unsigned long max_addr = min_addr + output_len; + + /* + * Calculate the delta between where vmlinux was linked to load + * and where it was actually loaded. + */ + delta = min_addr - LOAD_PHYSICAL_ADDR; + if (!delta) { + debug_putstr("No relocation needed... "); + return; + } + debug_putstr("Performing relocations... "); + + /* + * The kernel contains a table of relocation addresses. Those + * addresses have the final load address of the kernel in virtual + * memory. We are currently working in the self map. So we need to + * create an adjustment for kernel memory addresses to the self map. + * This will involve subtracting out the base address of the kernel. + */ + map = delta - __START_KERNEL_map; + + /* + * Process relocations: 32 bit relocations first then 64 bit after. + * Two sets of binary relocations are added to the end of the kernel + * before compression. Each relocation table entry is the kernel + * address of the location which needs to be updated stored as a + * 32-bit value which is sign extended to 64 bits. + * + * Format is: + * + * kernel bits... + * 0 - zero terminator for 64 bit relocations + * 64 bit relocation repeated + * 0 - zero terminator for 32 bit relocations + * 32 bit relocation repeated + * + * So we work backwards from the end of the decompressed image. + */ + for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) { + int extended = *reloc; + extended += map; + + ptr = (unsigned long)extended; + if (ptr < min_addr || ptr > max_addr) + error("32-bit relocation outside of kernel!\n"); + + *(uint32_t *)ptr += delta; + } +#ifdef CONFIG_X86_64 + for (reloc--; *reloc; reloc--) { + long extended = *reloc; + extended += map; + + ptr = (unsigned long)extended; + if (ptr < min_addr || ptr > max_addr) + error("64-bit relocation outside of kernel!\n"); + + *(uint64_t *)ptr += delta; + } +#endif +} +#else +static inline void handle_relocations(void *output, unsigned long output_len) +{ } +#endif + static void parse_elf(void *output) { #ifdef CONFIG_X86_64 @@ -325,7 +398,8 @@ static void parse_elf(void *output) asmlinkage void decompress_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, - unsigned char *output) + unsigned char *output, + unsigned long output_len) { real_mode = rmode; @@ -365,6 +439,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, debug_putstr("\nDecompressing Linux... "); decompress(input_data, input_len, NULL, NULL, output, NULL, error); parse_elf(output); + handle_relocations(output, output_len); debug_putstr("done.\nBooting the kernel.\n"); return; } diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index cdac91ca55d..565083c16e5 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c @@ -55,7 +55,7 @@ static char *number(char *str, long num, int base, int size, int precision, locase = (type & SMALL); if (type & LEFT) type &= ~ZEROPAD; - if (base < 2 || base > 36) + if (base < 2 || base > 16) return NULL; c = (type & ZEROPAD) ? '0' : ' '; sign = 0; diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 7d6ba9db1be..6c63c358a7e 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -27,7 +27,6 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o -obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) @@ -82,4 +81,3 @@ crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o -crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S deleted file mode 100644 index 35e97569d05..00000000000 --- a/arch/x86/crypto/crct10dif-pcl-asm_64.S +++ /dev/null @@ -1,643 +0,0 @@ -######################################################################## -# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions -# -# Copyright (c) 2013, Intel Corporation -# -# Authors: -# Erdinc Ozturk <erdinc.ozturk@intel.com> -# Vinodh Gopal <vinodh.gopal@intel.com> -# James Guilford <james.guilford@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# * Neither the name of the Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# -# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -######################################################################## -# Function API: -# UINT16 crc_t10dif_pcl( -# UINT16 init_crc, //initial CRC value, 16 bits -# const unsigned char *buf, //buffer pointer to calculate CRC on -# UINT64 len //buffer length in bytes (64-bit data) -# ); -# -# Reference paper titled "Fast CRC Computation for Generic -# Polynomials Using PCLMULQDQ Instruction" -# URL: http://www.intel.com/content/dam/www/public/us/en/documents -# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf -# -# - -#include <linux/linkage.h> - -.text - -#define arg1 %rdi -#define arg2 %rsi -#define arg3 %rdx - -#define arg1_low32 %edi - -ENTRY(crc_t10dif_pcl) -.align 16 - - # adjust the 16-bit initial_crc value, scale it to 32 bits - shl $16, arg1_low32 - - # Allocate Stack Space - mov %rsp, %rcx - sub $16*2, %rsp - # align stack to 16 byte boundary - and $~(0x10 - 1), %rsp - - # check if smaller than 256 - cmp $256, arg3 - - # for sizes less than 128, we can't fold 64B at a time... - jl _less_than_128 - - - # load the initial crc value - movd arg1_low32, %xmm10 # initial crc - - # crc value does not need to be byte-reflected, but it needs - # to be moved to the high part of the register. - # because data will be byte-reflected and will align with - # initial crc at correct place. - pslldq $12, %xmm10 - - movdqa SHUF_MASK(%rip), %xmm11 - # receive the initial 64B data, xor the initial crc value - movdqu 16*0(arg2), %xmm0 - movdqu 16*1(arg2), %xmm1 - movdqu 16*2(arg2), %xmm2 - movdqu 16*3(arg2), %xmm3 - movdqu 16*4(arg2), %xmm4 - movdqu 16*5(arg2), %xmm5 - movdqu 16*6(arg2), %xmm6 - movdqu 16*7(arg2), %xmm7 - - pshufb %xmm11, %xmm0 - # XOR the initial_crc value - pxor %xmm10, %xmm0 - pshufb %xmm11, %xmm1 - pshufb %xmm11, %xmm2 - pshufb %xmm11, %xmm3 - pshufb %xmm11, %xmm4 - pshufb %xmm11, %xmm5 - pshufb %xmm11, %xmm6 - pshufb %xmm11, %xmm7 - - movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4 - #imm value of pclmulqdq instruction - #will determine which constant to use - - ################################################################# - # we subtract 256 instead of 128 to save one instruction from the loop - sub $256, arg3 - - # at this section of the code, there is 64*x+y (0<=y<64) bytes of - # buffer. The _fold_64_B_loop will fold 64B at a time - # until we have 64+y Bytes of buffer - - - # fold 64B at a time. This section of the code folds 4 xmm - # registers in parallel -_fold_64_B_loop: - - # update the buffer pointer - add $128, arg2 # buf += 64# - - movdqu 16*0(arg2), %xmm9 - movdqu 16*1(arg2), %xmm12 - pshufb %xmm11, %xmm9 - pshufb %xmm11, %xmm12 - movdqa %xmm0, %xmm8 - movdqa %xmm1, %xmm13 - pclmulqdq $0x0 , %xmm10, %xmm0 - pclmulqdq $0x11, %xmm10, %xmm8 - pclmulqdq $0x0 , %xmm10, %xmm1 - pclmulqdq $0x11, %xmm10, %xmm13 - pxor %xmm9 , %xmm0 - xorps %xmm8 , %xmm0 - pxor %xmm12, %xmm1 - xorps %xmm13, %xmm1 - - movdqu 16*2(arg2), %xmm9 - movdqu 16*3(arg2), %xmm12 - pshufb %xmm11, %xmm9 - pshufb %xmm11, %xmm12 - movdqa %xmm2, %xmm8 - movdqa %xmm3, %xmm13 - pclmulqdq $0x0, %xmm10, %xmm2 - pclmulqdq $0x11, %xmm10, %xmm8 - pclmulqdq $0x0, %xmm10, %xmm3 - pclmulqdq $0x11, %xmm10, %xmm13 - pxor %xmm9 , %xmm2 - xorps %xmm8 , %xmm2 - pxor %xmm12, %xmm3 - xorps %xmm13, %xmm3 - - movdqu 16*4(arg2), %xmm9 - movdqu 16*5(arg2), %xmm12 - pshufb %xmm11, %xmm9 - pshufb %xmm11, %xmm12 - movdqa %xmm4, %xmm8 - movdqa %xmm5, %xmm13 - pclmulqdq $0x0, %xmm10, %xmm4 - pclmulqdq $0x11, %xmm10, %xmm8 - pclmulqdq $0x0, %xmm10, %xmm5 - pclmulqdq $0x11, %xmm10, %xmm13 - pxor %xmm9 , %xmm4 - xorps %xmm8 , %xmm4 - pxor %xmm12, %xmm5 - xorps %xmm13, %xmm5 - - movdqu 16*6(arg2), %xmm9 - movdqu 16*7(arg2), %xmm12 - pshufb %xmm11, %xmm9 - pshufb %xmm11, %xmm12 - movdqa %xmm6 , %xmm8 - movdqa %xmm7 , %xmm13 - pclmulqdq $0x0 , %xmm10, %xmm6 - pclmulqdq $0x11, %xmm10, %xmm8 - pclmulqdq $0x0 , %xmm10, %xmm7 - pclmulqdq $0x11, %xmm10, %xmm13 - pxor %xmm9 , %xmm6 - xorps %xmm8 , %xmm6 - pxor %xmm12, %xmm7 - xorps %xmm13, %xmm7 - - sub $128, arg3 - - # check if there is another 64B in the buffer to be able to fold - jge _fold_64_B_loop - ################################################################## - - - add $128, arg2 - # at this point, the buffer pointer is pointing at the last y Bytes - # of the buffer the 64B of folded data is in 4 of the xmm - # registers: xmm0, xmm1, xmm2, xmm3 - - - # fold the 8 xmm registers to 1 xmm register with different constants - - movdqa rk9(%rip), %xmm10 - movdqa %xmm0, %xmm8 - pclmulqdq $0x11, %xmm10, %xmm0 - pclmulqdq $0x0 , %xmm10, %xmm8 - pxor %xmm8, %xmm7 - xorps %xmm0, %xmm7 - - movdqa rk11(%rip), %xmm10 - movdqa %xmm1, %xmm8 - pclmulqdq $0x11, %xmm10, %xmm1 - pclmulqdq $0x0 , %xmm10, %xmm8 - pxor %xmm8, %xmm7 - xorps %xmm1, %xmm7 - - movdqa rk13(%rip), %xmm10 - movdqa %xmm2, %xmm8 - pclmulqdq $0x11, %xmm10, %xmm2 - pclmulqdq $0x0 , %xmm10, %xmm8 - pxor %xmm8, %xmm7 - pxor %xmm2, %xmm7 - - movdqa rk15(%rip), %xmm10 - movdqa %xmm3, %xmm8 - pclmulqdq $0x11, %xmm10, %xmm3 - pclmulqdq $0x0 , %xmm10, %xmm8 - pxor %xmm8, %xmm7 - xorps %xmm3, %xmm7 - - movdqa rk17(%rip), %xmm10 - movdqa %xmm4, %xmm8 - pclmulqdq $0x11, %xmm10, %xmm4 - pclmulqdq $0x0 , %xmm10, %xmm8 - pxor %xmm8, %xmm7 - pxor %xmm4, %xmm7 - - movdqa rk19(%rip), %xmm10 - movdqa %xmm5, %xmm8 - pclmulqdq $0x11, %xmm10, %xmm5 - pclmulqdq $0x0 , %xmm10, %xmm8 - pxor %xmm8, %xmm7 - xorps %xmm5, %xmm7 - - movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2 - #imm value of pclmulqdq instruction - #will determine which constant to use - movdqa %xmm6, %xmm8 - pclmulqdq $0x11, %xmm10, %xmm6 - pclmulqdq $0x0 , %xmm10, %xmm8 - pxor %xmm8, %xmm7 - pxor %xmm6, %xmm7 - - - # instead of 64, we add 48 to the loop counter to save 1 instruction - # from the loop instead of a cmp instruction, we use the negative - # flag with the jl instruction - add $128-16, arg3 - jl _final_reduction_for_128 - - # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 - # and the rest is in memory. We can fold 16 bytes at a time if y>=16 - # continue folding 16B at a time - -_16B_reduction_loop: - movdqa %xmm7, %xmm8 - pclmulqdq $0x11, %xmm10, %xmm7 - pclmulqdq $0x0 , %xmm10, %xmm8 - pxor %xmm8, %xmm7 - movdqu (arg2), %xmm0 - pshufb %xmm11, %xmm0 - pxor %xmm0 , %xmm7 - add $16, arg2 - sub $16, arg3 - # instead of a cmp instruction, we utilize the flags with the - # jge instruction equivalent of: cmp arg3, 16-16 - # check if there is any more 16B in the buffer to be able to fold - jge _16B_reduction_loop - - #now we have 16+z bytes left to reduce, where 0<= z < 16. - #first, we reduce the data in the xmm7 register - - -_final_reduction_for_128: - # check if any more data to fold. If not, compute the CRC of - # the final 128 bits - add $16, arg3 - je _128_done - - # here we are getting data that is less than 16 bytes. - # since we know that there was data before the pointer, we can - # offset the input pointer before the actual point, to receive - # exactly 16 bytes. after that the registers need to be adjusted. -_get_last_two_xmms: - movdqa %xmm7, %xmm2 - - movdqu -16(arg2, arg3), %xmm1 - pshufb %xmm11, %xmm1 - - # get rid of the extra data that was loaded before - # load the shift constant - lea pshufb_shf_table+16(%rip), %rax - sub arg3, %rax - movdqu (%rax), %xmm0 - - # shift xmm2 to the left by arg3 bytes - pshufb %xmm0, %xmm2 - - # shift xmm7 to the right by 16-arg3 bytes - pxor mask1(%rip), %xmm0 - pshufb %xmm0, %xmm7 - pblendvb %xmm2, %xmm1 #xmm0 is implicit - - # fold 16 Bytes - movdqa %xmm1, %xmm2 - movdqa %xmm7, %xmm8 - pclmulqdq $0x11, %xmm10, %xmm7 - pclmulqdq $0x0 , %xmm10, %xmm8 - pxor %xmm8, %xmm7 - pxor %xmm2, %xmm7 - -_128_done: - # compute crc of a 128-bit value - movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10 - movdqa %xmm7, %xmm0 - - #64b fold - pclmulqdq $0x1, %xmm10, %xmm7 - pslldq $8 , %xmm0 - pxor %xmm0, %xmm7 - - #32b fold - movdqa %xmm7, %xmm0 - - pand mask2(%rip), %xmm0 - - psrldq $12, %xmm7 - pclmulqdq $0x10, %xmm10, %xmm7 - pxor %xmm0, %xmm7 - - #barrett reduction -_barrett: - movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10 - movdqa %xmm7, %xmm0 - pclmulqdq $0x01, %xmm10, %xmm7 - pslldq $4, %xmm7 - pclmulqdq $0x11, %xmm10, %xmm7 - - pslldq $4, %xmm7 - pxor %xmm0, %xmm7 - pextrd $1, %xmm7, %eax - -_cleanup: - # scale the result back to 16 bits - shr $16, %eax - mov %rcx, %rsp - ret - -######################################################################## - -.align 16 -_less_than_128: - - # check if there is enough buffer to be able to fold 16B at a time - cmp $32, arg3 - jl _less_than_32 - movdqa SHUF_MASK(%rip), %xmm11 - - # now if there is, load the constants - movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 - - movd arg1_low32, %xmm0 # get the initial crc value - pslldq $12, %xmm0 # align it to its correct place - movdqu (arg2), %xmm7 # load the plaintext - pshufb %xmm11, %xmm7 # byte-reflect the plaintext - pxor %xmm0, %xmm7 - - - # update the buffer pointer - add $16, arg2 - - # update the counter. subtract 32 instead of 16 to save one - # instruction from the loop - sub $32, arg3 - - jmp _16B_reduction_loop - - -.align 16 -_less_than_32: - # mov initial crc to the return value. this is necessary for - # zero-length buffers. - mov arg1_low32, %eax - test arg3, arg3 - je _cleanup - - movdqa SHUF_MASK(%rip), %xmm11 - - movd arg1_low32, %xmm0 # get the initial crc value - pslldq $12, %xmm0 # align it to its correct place - - cmp $16, arg3 - je _exact_16_left - jl _less_than_16_left - - movdqu (arg2), %xmm7 # load the plaintext - pshufb %xmm11, %xmm7 # byte-reflect the plaintext - pxor %xmm0 , %xmm7 # xor the initial crc value - add $16, arg2 - sub $16, arg3 - movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 - jmp _get_last_two_xmms - - -.align 16 -_less_than_16_left: - # use stack space to load data less than 16 bytes, zero-out - # the 16B in memory first. - - pxor %xmm1, %xmm1 - mov %rsp, %r11 - movdqa %xmm1, (%r11) - - cmp $4, arg3 - jl _only_less_than_4 - - # backup the counter value - mov arg3, %r9 - cmp $8, arg3 - jl _less_than_8_left - - # load 8 Bytes - mov (arg2), %rax - mov %rax, (%r11) - add $8, %r11 - sub $8, arg3 - add $8, arg2 -_less_than_8_left: - - cmp $4, arg3 - jl _less_than_4_left - - # load 4 Bytes - mov (arg2), %eax - mov %eax, (%r11) - add $4, %r11 - sub $4, arg3 - add $4, arg2 -_less_than_4_left: - - cmp $2, arg3 - jl _less_than_2_left - - # load 2 Bytes - mov (arg2), %ax - mov %ax, (%r11) - add $2, %r11 - sub $2, arg3 - add $2, arg2 -_less_than_2_left: - cmp $1, arg3 - jl _zero_left - - # load 1 Byte - mov (arg2), %al - mov %al, (%r11) -_zero_left: - movdqa (%rsp), %xmm7 - pshufb %xmm11, %xmm7 - pxor %xmm0 , %xmm7 # xor the initial crc value - - # shl r9, 4 - lea pshufb_shf_table+16(%rip), %rax - sub %r9, %rax - movdqu (%rax), %xmm0 - pxor mask1(%rip), %xmm0 - - pshufb %xmm0, %xmm7 - jmp _128_done - -.align 16 -_exact_16_left: - movdqu (arg2), %xmm7 - pshufb %xmm11, %xmm7 - pxor %xmm0 , %xmm7 # xor the initial crc value - - jmp _128_done - -_only_less_than_4: - cmp $3, arg3 - jl _only_less_than_3 - - # load 3 Bytes - mov (arg2), %al - mov %al, (%r11) - - mov 1(arg2), %al - mov %al, 1(%r11) - - mov 2(arg2), %al - mov %al, 2(%r11) - - movdqa (%rsp), %xmm7 - pshufb %xmm11, %xmm7 - pxor %xmm0 , %xmm7 # xor the initial crc value - - psrldq $5, %xmm7 - - jmp _barrett -_only_less_than_3: - cmp $2, arg3 - jl _only_less_than_2 - - # load 2 Bytes - mov (arg2), %al - mov %al, (%r11) - - mov 1(arg2), %al - mov %al, 1(%r11) - - movdqa (%rsp), %xmm7 - pshufb %xmm11, %xmm7 - pxor %xmm0 , %xmm7 # xor the initial crc value - - psrldq $6, %xmm7 - - jmp _barrett -_only_less_than_2: - - # load 1 Byte - mov (arg2), %al - mov %al, (%r11) - - movdqa (%rsp), %xmm7 - pshufb %xmm11, %xmm7 - pxor %xmm0 , %xmm7 # xor the initial crc value - - psrldq $7, %xmm7 - - jmp _barrett - -ENDPROC(crc_t10dif_pcl) - -.data - -# precomputed constants -# these constants are precomputed from the poly: -# 0x8bb70000 (0x8bb7 scaled to 32 bits) -.align 16 -# Q = 0x18BB70000 -# rk1 = 2^(32*3) mod Q << 32 -# rk2 = 2^(32*5) mod Q << 32 -# rk3 = 2^(32*15) mod Q << 32 -# rk4 = 2^(32*17) mod Q << 32 -# rk5 = 2^(32*3) mod Q << 32 -# rk6 = 2^(32*2) mod Q << 32 -# rk7 = floor(2^64/Q) -# rk8 = Q -rk1: -.quad 0x2d56000000000000 -rk2: -.quad 0x06df000000000000 -rk3: -.quad 0x9d9d000000000000 -rk4: -.quad 0x7cf5000000000000 -rk5: -.quad 0x2d56000000000000 -rk6: -.quad 0x1368000000000000 -rk7: -.quad 0x00000001f65a57f8 -rk8: -.quad 0x000000018bb70000 - -rk9: -.quad 0xceae000000000000 -rk10: -.quad 0xbfd6000000000000 -rk11: -.quad 0x1e16000000000000 -rk12: -.quad 0x713c000000000000 -rk13: -.quad 0xf7f9000000000000 -rk14: -.quad 0x80a6000000000000 -rk15: -.quad 0x044c000000000000 -rk16: -.quad 0xe658000000000000 -rk17: -.quad 0xad18000000000000 -rk18: -.quad 0xa497000000000000 -rk19: -.quad 0x6ee3000000000000 -rk20: -.quad 0xe7b5000000000000 - - - -mask1: -.octa 0x80808080808080808080808080808080 -mask2: -.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF - -SHUF_MASK: -.octa 0x000102030405060708090A0B0C0D0E0F - -pshufb_shf_table: -# use these values for shift constants for the pshufb instruction -# different alignments result in values as shown: -# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1 -# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2 -# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3 -# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4 -# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5 -# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6 -# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7 -# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8 -# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9 -# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10 -# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11 -# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12 -# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13 -# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14 -# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15 -.octa 0x8f8e8d8c8b8a89888786858483828100 -.octa 0x000e0d0c0b0a09080706050403020100 diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c deleted file mode 100644 index 7845d7fd54c..00000000000 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Cryptographic API. - * - * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions - * - * Copyright (C) 2013 Intel Corporation - * Author: Tim Chen <tim.c.chen@linux.intel.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include <linux/types.h> -#include <linux/module.h> -#include <linux/crc-t10dif.h> -#include <crypto/internal/hash.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <asm/i387.h> -#include <asm/cpufeature.h> -#include <asm/cpu_device_id.h> - -asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, - size_t len); - -struct chksum_desc_ctx { - __u16 crc; -}; - -/* - * Steps through buffer one byte at at time, calculates reflected - * crc using table. - */ - -static int chksum_init(struct shash_desc *desc) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - ctx->crc = 0; - - return 0; -} - -static int chksum_update(struct shash_desc *desc, const u8 *data, - unsigned int length) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - if (irq_fpu_usable()) { - kernel_fpu_begin(); - ctx->crc = crc_t10dif_pcl(ctx->crc, data, length); - kernel_fpu_end(); - } else - ctx->crc = crc_t10dif_generic(ctx->crc, data, length); - return 0; -} - -static int chksum_final(struct shash_desc *desc, u8 *out) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - *(__u16 *)out = ctx->crc; - return 0; -} - -static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, - u8 *out) -{ - if (irq_fpu_usable()) { - kernel_fpu_begin(); - *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); - kernel_fpu_end(); - } else - *(__u16 *)out = crc_t10dif_generic(*crcp, data, len); - return 0; -} - -static int chksum_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - return __chksum_finup(&ctx->crc, data, len, out); -} - -static int chksum_digest(struct shash_desc *desc, const u8 *data, - unsigned int length, u8 *out) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - return __chksum_finup(&ctx->crc, data, length, out); -} - -static struct shash_alg alg = { - .digestsize = CRC_T10DIF_DIGEST_SIZE, - .init = chksum_init, - .update = chksum_update, - .final = chksum_final, - .finup = chksum_finup, - .digest = chksum_digest, - .descsize = sizeof(struct chksum_desc_ctx), - .base = { - .cra_name = "crct10dif", - .cra_driver_name = "crct10dif-pclmul", - .cra_priority = 200, - .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static const struct x86_cpu_id crct10dif_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id); - -static int __init crct10dif_intel_mod_init(void) -{ - if (!x86_match_cpu(crct10dif_cpu_id)) - return -ENODEV; - - return crypto_register_shash(&alg); -} - -static void __exit crct10dif_intel_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(crct10dif_intel_mod_init); -module_exit(crct10dif_intel_mod_fini); - -MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>"); -MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ."); -MODULE_LICENSE("GPL"); - -MODULE_ALIAS("crct10dif"); -MODULE_ALIAS("crct10dif-pclmul"); diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index bccfca68430..665a730307f 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -457,7 +457,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); + compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 474dc1b59f7..4299eb05023 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -452,7 +452,7 @@ ia32_badsys: CFI_ENDPROC - .macro PTREGSCALL label, func, arg + .macro PTREGSCALL label, func ALIGN GLOBAL(\label) leaq \func(%rip),%rax diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 2dfac58f3b1..b1977bad543 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -86,6 +86,7 @@ extern int acpi_pci_disabled; extern int acpi_skip_timer_override; extern int acpi_use_timer_override; extern int acpi_fix_pin2_polarity; +extern int acpi_disable_cmcff; extern u8 acpi_sci_flags; extern int acpi_sci_override_gsi; @@ -168,6 +169,7 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf) #define acpi_lapic 0 #define acpi_ioapic 0 +#define acpi_disable_cmcff 0 static inline void acpi_noirq_set(void) { } static inline void acpi_disable_pci(void) { } static inline void disable_acpi(void) { } diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 58ed6d96a6a..0a3f9c9f98d 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -5,6 +5,7 @@ #include <linux/stddef.h> #include <linux/stringify.h> #include <asm/asm.h> +#include <asm/ptrace.h> /* * Alternative inline assembly for SMP. @@ -220,20 +221,11 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); * no thread can be preempted in the instructions being modified (no iret to an * invalid instruction possible) or if the instructions are changed from a * consistent state to another consistent state atomically. - * More care must be taken when modifying code in the SMP case because of - * Intel's errata. text_poke_smp() takes care that errata, but still - * doesn't support NMI/MCE handler code modifying. * On the local CPU you need to be protected again NMI or MCE handlers seeing an * inconsistent instruction while you patch. */ -struct text_poke_param { - void *addr; - const void *opcode; - size_t len; -}; - extern void *text_poke(void *addr, const void *opcode, size_t len); -extern void *text_poke_smp(void *addr, const void *opcode, size_t len); -extern void text_poke_smp_batch(struct text_poke_param *params, int n); +extern int poke_int3_handler(struct pt_regs *regs); +extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f8119b582c3..1d2091a226b 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -715,4 +715,6 @@ static inline void exiting_ack_irq(void) ack_APIC_irq(); } +extern void ioapic_zap_locks(void); + #endif /* _ASM_X86_APIC_H */ diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 1c2d247f65c..4582e8e1cd1 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -3,21 +3,25 @@ #ifdef __ASSEMBLY__ # define __ASM_FORM(x) x +# define __ASM_FORM_RAW(x) x # define __ASM_FORM_COMMA(x) x, #else # define __ASM_FORM(x) " " #x " " +# define __ASM_FORM_RAW(x) #x # define __ASM_FORM_COMMA(x) " " #x "," #endif #ifdef CONFIG_X86_32 # define __ASM_SEL(a,b) __ASM_FORM(a) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) #else # define __ASM_SEL(a,b) __ASM_FORM(b) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) #endif #define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \ inst##q##__VA_ARGS__) -#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg) +#define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg) #define _ASM_PTR __ASM_SEL(.long, .quad) #define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 6dfd0195bb5..41639ce8fd6 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -15,6 +15,14 @@ #include <linux/compiler.h> #include <asm/alternative.h> +#if BITS_PER_LONG == 32 +# define _BITOPS_LONG_SHIFT 5 +#elif BITS_PER_LONG == 64 +# define _BITOPS_LONG_SHIFT 6 +#else +# error "Unexpected BITS_PER_LONG" +#endif + #define BIT_64(n) (U64_C(1) << (n)) /* @@ -59,7 +67,7 @@ * restricted to acting on a single-word quantity. */ static __always_inline void -set_bit(unsigned int nr, volatile unsigned long *addr) +set_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "orb %1,%0" @@ -81,7 +89,7 @@ set_bit(unsigned int nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __set_bit(int nr, volatile unsigned long *addr) +static inline void __set_bit(long nr, volatile unsigned long *addr) { asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); } @@ -97,7 +105,7 @@ static inline void __set_bit(int nr, volatile unsigned long *addr) * in order to ensure changes are visible on other processors. */ static __always_inline void -clear_bit(int nr, volatile unsigned long *addr) +clear_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "andb %1,%0" @@ -118,13 +126,13 @@ clear_bit(int nr, volatile unsigned long *addr) * clear_bit() is atomic and implies release semantics before the memory * operation. It can be used for an unlock. */ -static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr) +static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); clear_bit(nr, addr); } -static inline void __clear_bit(int nr, volatile unsigned long *addr) +static inline void __clear_bit(long nr, volatile unsigned long *addr) { asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); } @@ -141,7 +149,7 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr) * No memory barrier is required here, because x86 cannot reorder stores past * older loads. Same principle as spin_unlock. */ -static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr) +static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); __clear_bit(nr, addr); @@ -159,7 +167,7 @@ static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __change_bit(int nr, volatile unsigned long *addr) +static inline void __change_bit(long nr, volatile unsigned long *addr) { asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); } @@ -173,7 +181,7 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void change_bit(int nr, volatile unsigned long *addr) +static inline void change_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "xorb %1,%0" @@ -194,7 +202,7 @@ static inline void change_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_set_bit(int nr, volatile unsigned long *addr) +static inline int test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -212,7 +220,7 @@ static inline int test_and_set_bit(int nr, volatile unsigned long *addr) * This is the same as test_and_set_bit on x86. */ static __always_inline int -test_and_set_bit_lock(int nr, volatile unsigned long *addr) +test_and_set_bit_lock(long nr, volatile unsigned long *addr) { return test_and_set_bit(nr, addr); } @@ -226,7 +234,7 @@ test_and_set_bit_lock(int nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) +static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -245,7 +253,7 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) +static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -272,7 +280,7 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ -static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) +static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -284,7 +292,7 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) } /* WARNING: non atomic and it can be reordered! */ -static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) +static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -304,7 +312,7 @@ static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_change_bit(int nr, volatile unsigned long *addr) +static inline int test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -315,13 +323,13 @@ static inline int test_and_change_bit(int nr, volatile unsigned long *addr) return oldbit; } -static __always_inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr) +static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr) { - return ((1UL << (nr % BITS_PER_LONG)) & - (addr[nr / BITS_PER_LONG])) != 0; + return ((1UL << (nr & (BITS_PER_LONG-1))) & + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } -static inline int variable_test_bit(int nr, volatile const unsigned long *addr) +static inline int variable_test_bit(long nr, volatile const unsigned long *addr) { int oldbit; diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index 653668d140f..4a8cb8d7cbd 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -35,9 +35,9 @@ static void sanitize_boot_params(struct boot_params *boot_params) */ if (boot_params->sentinel) { /* fields in boot_params are left uninitialized, clear them */ - memset(&boot_params->olpc_ofw_header, 0, + memset(&boot_params->ext_ramdisk_image, 0, (char *)&boot_params->efi_info - - (char *)&boot_params->olpc_ofw_header); + (char *)&boot_params->ext_ramdisk_image); memset(&boot_params->kbd_status, 0, (char *)&boot_params->hdr - (char *)&boot_params->kbd_status); diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index 46fc474fd81..f50de695173 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -49,9 +49,15 @@ static inline __wsum csum_partial_copy_from_user(const void __user *src, int len, __wsum sum, int *err_ptr) { + __wsum ret; + might_sleep(); - return csum_partial_copy_generic((__force void *)src, dst, - len, sum, err_ptr, NULL); + stac(); + ret = csum_partial_copy_generic((__force void *)src, dst, + len, sum, err_ptr, NULL); + clac(); + + return ret; } /* @@ -176,10 +182,16 @@ static inline __wsum csum_and_copy_to_user(const void *src, int len, __wsum sum, int *err_ptr) { + __wsum ret; + might_sleep(); - if (access_ok(VERIFY_WRITE, dst, len)) - return csum_partial_copy_generic(src, (__force void *)dst, - len, sum, NULL, err_ptr); + if (access_ok(VERIFY_WRITE, dst, len)) { + stac(); + ret = csum_partial_copy_generic(src, (__force void *)dst, + len, sum, NULL, err_ptr); + clac(); + return ret; + } if (len) *err_ptr = -EFAULT; diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index 9bfdc41629e..e6fd8a026c7 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -133,7 +133,7 @@ extern __wsum csum_partial(const void *buff, int len, __wsum sum); /* Do not call this directly. Use the wrappers below */ -extern __wsum csum_partial_copy_generic(const void *src, const void *dst, +extern __visible __wsum csum_partial_copy_generic(const void *src, const void *dst, int len, __wsum sum, int *src_err_ptr, int *dst_err_ptr); diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 5f9a1243190..d2b12988d2e 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -28,7 +28,7 @@ struct x86_cpu { #ifdef CONFIG_HOTPLUG_CPU extern int arch_register_cpu(int num); extern void arch_unregister_cpu(int); -extern void __cpuinit start_cpu0(void); +extern void start_cpu0(void); #ifdef CONFIG_DEBUG_HOTPLUG_CPU0 extern int _debug_hotplug_cpu(int cpu, int action); #endif diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 47538a61c91..d3f5c63078d 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -366,9 +366,10 @@ extern bool __static_cpu_has_safe(u16 bit); */ static __always_inline __pure bool __static_cpu_has(u16 bit) { -#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5 +#ifdef CC_HAVE_ASM_GOTO #ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS + /* * Catch too early usage of this before alternatives * have run. @@ -384,6 +385,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (X86_FEATURE_ALWAYS) : : t_warn); + #endif asm goto("1: jmp %l[t_no]\n" @@ -406,7 +408,9 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) warn_pre_alternatives(); return false; #endif -#else /* GCC_VERSION >= 40500 */ + +#else /* CC_HAVE_ASM_GOTO */ + u8 flag; /* Open-coded due to __stringify() in ALTERNATIVE() */ asm volatile("1: movb $0,%0\n" @@ -427,7 +431,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) ".previous\n" : "=qm" (flag) : "i" (bit)); return flag; -#endif + +#endif /* CC_HAVE_ASM_GOTO */ } #define static_cpu_has(bit) \ @@ -441,7 +446,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) static __always_inline __pure bool _static_cpu_has_safe(u16 bit) { -#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5 +#ifdef CC_HAVE_ASM_GOTO /* * We need to spell the jumps to the compiler because, depending on the offset, * the replacement jump can be bigger than the original jump, and this we cannot @@ -475,7 +480,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) return false; t_dynamic: return __static_cpu_has_safe(bit); -#else /* GCC_VERSION >= 40500 */ +#else u8 flag; /* Open-coded due to __stringify() in ALTERNATIVE() */ asm volatile("1: movb $2,%0\n" @@ -511,7 +516,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) : "=qm" (flag) : "i" (bit), "i" (X86_FEATURE_ALWAYS)); return (flag == 2 ? __static_cpu_has_safe(bit) : flag); -#endif +#endif /* CC_HAVE_ASM_GOTO */ } #define static_cpu_has_safe(bit) \ diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index cccd07fa5e3..779c2efe2e9 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -29,7 +29,7 @@ extern void e820_setup_gap(void); extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, unsigned long start_addr, unsigned long long end_addr); struct setup_data; -extern void parse_e820_ext(struct setup_data *data); +extern void parse_e820_ext(u64 phys_addr, u32 data_len); #if defined(CONFIG_X86_64) || \ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e4ac559c4a2..92b3bae08b7 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -26,56 +26,56 @@ #include <asm/sections.h> /* Interrupt handlers registered during init_IRQ */ -extern void apic_timer_interrupt(void); -extern void x86_platform_ipi(void); -extern void kvm_posted_intr_ipi(void); -extern void error_interrupt(void); -extern void irq_work_interrupt(void); - -extern void spurious_interrupt(void); -extern void thermal_interrupt(void); -extern void reschedule_interrupt(void); - -extern void invalidate_interrupt(void); -extern void invalidate_interrupt0(void); -extern void invalidate_interrupt1(void); -extern void invalidate_interrupt2(void); -extern void invalidate_interrupt3(void); -extern void invalidate_interrupt4(void); -extern void invalidate_interrupt5(void); -extern void invalidate_interrupt6(void); -extern void invalidate_interrupt7(void); -extern void invalidate_interrupt8(void); -extern void invalidate_interrupt9(void); -extern void invalidate_interrupt10(void); -extern void invalidate_interrupt11(void); -extern void invalidate_interrupt12(void); -extern void invalidate_interrupt13(void); -extern void invalidate_interrupt14(void); -extern void invalidate_interrupt15(void); -extern void invalidate_interrupt16(void); -extern void invalidate_interrupt17(void); -extern void invalidate_interrupt18(void); -extern void invalidate_interrupt19(void); -extern void invalidate_interrupt20(void); -extern void invalidate_interrupt21(void); -extern void invalidate_interrupt22(void); -extern void invalidate_interrupt23(void); -extern void invalidate_interrupt24(void); -extern void invalidate_interrupt25(void); -extern void invalidate_interrupt26(void); -extern void invalidate_interrupt27(void); -extern void invalidate_interrupt28(void); -extern void invalidate_interrupt29(void); -extern void invalidate_interrupt30(void); -extern void invalidate_interrupt31(void); - -extern void irq_move_cleanup_interrupt(void); -extern void reboot_interrupt(void); -extern void threshold_interrupt(void); - -extern void call_function_interrupt(void); -extern void call_function_single_interrupt(void); +extern asmlinkage void apic_timer_interrupt(void); +extern asmlinkage void x86_platform_ipi(void); +extern asmlinkage void kvm_posted_intr_ipi(void); +extern asmlinkage void error_interrupt(void); +extern asmlinkage void irq_work_interrupt(void); + +extern asmlinkage void spurious_interrupt(void); +extern asmlinkage void thermal_interrupt(void); +extern asmlinkage void reschedule_interrupt(void); + +extern asmlinkage void invalidate_interrupt(void); +extern asmlinkage void invalidate_interrupt0(void); +extern asmlinkage void invalidate_interrupt1(void); +extern asmlinkage void invalidate_interrupt2(void); +extern asmlinkage void invalidate_interrupt3(void); +extern asmlinkage void invalidate_interrupt4(void); +extern asmlinkage void invalidate_interrupt5(void); +extern asmlinkage void invalidate_interrupt6(void); +extern asmlinkage void invalidate_interrupt7(void); +extern asmlinkage void invalidate_interrupt8(void); +extern asmlinkage void invalidate_interrupt9(void); +extern asmlinkage void invalidate_interrupt10(void); +extern asmlinkage void invalidate_interrupt11(void); +extern asmlinkage void invalidate_interrupt12(void); +extern asmlinkage void invalidate_interrupt13(void); +extern asmlinkage void invalidate_interrupt14(void); +extern asmlinkage void invalidate_interrupt15(void); +extern asmlinkage void invalidate_interrupt16(void); +extern asmlinkage void invalidate_interrupt17(void); +extern asmlinkage void invalidate_interrupt18(void); +extern asmlinkage void invalidate_interrupt19(void); +extern asmlinkage void invalidate_interrupt20(void); +extern asmlinkage void invalidate_interrupt21(void); +extern asmlinkage void invalidate_interrupt22(void); +extern asmlinkage void invalidate_interrupt23(void); +extern asmlinkage void invalidate_interrupt24(void); +extern asmlinkage void invalidate_interrupt25(void); +extern asmlinkage void invalidate_interrupt26(void); +extern asmlinkage void invalidate_interrupt27(void); +extern asmlinkage void invalidate_interrupt28(void); +extern asmlinkage void invalidate_interrupt29(void); +extern asmlinkage void invalidate_interrupt30(void); +extern asmlinkage void invalidate_interrupt31(void); + +extern asmlinkage void irq_move_cleanup_interrupt(void); +extern asmlinkage void reboot_interrupt(void); +extern asmlinkage void threshold_interrupt(void); + +extern asmlinkage void call_function_interrupt(void); +extern asmlinkage void call_function_single_interrupt(void); #ifdef CONFIG_TRACING /* Interrupt handlers registered during init_IRQ */ @@ -172,22 +172,18 @@ extern atomic_t irq_mis_count; extern void eisa_set_level_irq(unsigned int irq); /* SMP */ -extern void smp_apic_timer_interrupt(struct pt_regs *); -extern void smp_spurious_interrupt(struct pt_regs *); -extern void smp_x86_platform_ipi(struct pt_regs *); -extern void smp_error_interrupt(struct pt_regs *); +extern __visible void smp_apic_timer_interrupt(struct pt_regs *); +extern __visible void smp_spurious_interrupt(struct pt_regs *); +extern __visible void smp_x86_platform_ipi(struct pt_regs *); +extern __visible void smp_error_interrupt(struct pt_regs *); #ifdef CONFIG_X86_IO_APIC extern asmlinkage void smp_irq_move_cleanup_interrupt(void); #endif #ifdef CONFIG_SMP -extern void smp_reschedule_interrupt(struct pt_regs *); -extern void smp_call_function_interrupt(struct pt_regs *); -extern void smp_call_function_single_interrupt(struct pt_regs *); -#ifdef CONFIG_X86_32 -extern void smp_invalidate_interrupt(struct pt_regs *); -#else -extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); -#endif +extern __visible void smp_reschedule_interrupt(struct pt_regs *); +extern __visible void smp_call_function_interrupt(struct pt_regs *); +extern __visible void smp_call_function_single_interrupt(struct pt_regs *); +extern __visible void smp_invalidate_interrupt(struct pt_regs *); #endif extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 2d4b5e6107c..e42f758a0fb 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -33,7 +33,7 @@ struct hypervisor_x86 { const char *name; /* Detection routine */ - bool (*detect)(void); + uint32_t (*detect)(void); /* Adjust CPU feature bits (run once per CPU) */ void (*set_cpu_features)(struct cpuinfo_x86 *); diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 57873beb329..0ea10f27d61 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -33,7 +33,7 @@ extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); -extern unsigned int do_IRQ(struct pt_regs *regs); +extern __visible unsigned int do_IRQ(struct pt_regs *regs); /* Interrupt vector management */ extern DECLARE_BITMAP(used_vectors, NR_VECTORS); diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 5a6d2873f80..9454c167629 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -49,10 +49,10 @@ typedef u8 kprobe_opcode_t; #define flush_insn_slot(p) do { } while (0) /* optinsn template addresses */ -extern kprobe_opcode_t optprobe_template_entry; -extern kprobe_opcode_t optprobe_template_val; -extern kprobe_opcode_t optprobe_template_call; -extern kprobe_opcode_t optprobe_template_end; +extern __visible kprobe_opcode_t optprobe_template_entry; +extern __visible kprobe_opcode_t optprobe_template_val; +extern __visible kprobe_opcode_t optprobe_template_call; +extern __visible kprobe_opcode_t optprobe_template_end; #define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) #define MAX_OPTINSN_SIZE \ (((unsigned long)&optprobe_template_end - \ @@ -62,7 +62,7 @@ extern kprobe_opcode_t optprobe_template_end; extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); -void kretprobe_trampoline(void); +asmlinkage void kretprobe_trampoline(void); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f87f7fcefa0..c76ff74a98f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -286,6 +286,7 @@ struct kvm_mmu { u64 *pae_root; u64 *lm_root; u64 rsvd_bits_mask[2][4]; + u64 bad_mt_xwr; /* * Bitmap: bit set = last pte in walk @@ -323,6 +324,7 @@ struct kvm_pmu { u64 global_ovf_ctrl; u64 counter_bitmask[2]; u64 global_ctrl_mask; + u64 reserved_bits; u8 version; struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; @@ -511,6 +513,14 @@ struct kvm_vcpu_arch { * instruction. */ bool write_fault_to_shadow_pgtable; + + /* set at EPT violation at this point */ + unsigned long exit_qualification; + + /* pv related host specific info */ + struct { + bool pv_unhalted; + } pv; }; struct kvm_lpage_info { @@ -802,8 +812,8 @@ extern u32 kvm_min_guest_tsc_khz; extern u32 kvm_max_guest_tsc_khz; enum emulation_result { - EMULATE_DONE, /* no further processing */ - EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ + EMULATE_DONE, /* no further processing */ + EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */ EMULATE_FAIL, /* can't emulate this instruction */ }; diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 695399f2d5e..1df11590975 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -85,26 +85,20 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, return ret; } -static inline bool kvm_para_available(void) +static inline uint32_t kvm_cpuid_base(void) { - unsigned int eax, ebx, ecx, edx; - char signature[13]; - if (boot_cpu_data.cpuid_level < 0) - return false; /* So we don't blow up on old processors */ + return 0; /* So we don't blow up on old processors */ - if (cpu_has_hypervisor) { - cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); - memcpy(signature + 0, &ebx, 4); - memcpy(signature + 4, &ecx, 4); - memcpy(signature + 8, &edx, 4); - signature[12] = 0; + if (cpu_has_hypervisor) + return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); - if (strcmp(signature, "KVMKVMKVM") == 0) - return true; - } + return 0; +} - return false; +static inline bool kvm_para_available(void) +{ + return kvm_cpuid_base() != 0; } static inline unsigned int kvm_arch_para_features(void) @@ -118,10 +112,20 @@ void kvm_async_pf_task_wait(u32 token); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); extern void kvm_disable_steal_time(void); -#else -#define kvm_guest_init() do { } while (0) + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +void __init kvm_spinlock_init(void); +#else /* !CONFIG_PARAVIRT_SPINLOCKS */ +static inline void kvm_spinlock_init(void) +{ +} +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#else /* CONFIG_KVM_GUEST */ +#define kvm_guest_init() do {} while (0) #define kvm_async_pf_task_wait(T) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) + static inline u32 kvm_read_and_reset_pf_reason(void) { return 0; diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 29e3093bbd2..cbe6b9e404c 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -32,11 +32,20 @@ #define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ -#define MCACOD 0xffff /* MCA Error Code */ + +/* + * Note that the full MCACOD field of IA32_MCi_STATUS MSR is + * bits 15:0. But bit 12 is the 'F' bit, defined for corrected + * errors to indicate that errors are being filtered by hardware. + * We should mask out bit 12 when looking for specific signatures + * of uncorrected errors - so the F bit is deliberately skipped + * in this #define. + */ +#define MCACOD 0xefff /* MCA Error Code */ /* Architecturally defined codes from SDM Vol. 3B Chapter 15 */ #define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */ -#define MCACOD_SCRUBMSK 0xfff0 +#define MCACOD_SCRUBMSK 0xeff0 /* Skip bit 12 ('F' bit) */ #define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */ #define MCACOD_DATA 0x0134 /* Data Load */ #define MCACOD_INSTR 0x0150 /* Instruction Fetch */ @@ -188,6 +197,9 @@ extern void register_mce_write_callback(ssize_t (*)(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off)); +/* Disable CMCI/polling for MCA bank claimed by firmware */ +extern void mce_disable_bank(int bank); + /* * Exception handler */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 6bc3985ee47..f98bd662531 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -60,11 +60,11 @@ static inline void __exit exit_amd_microcode(void) {} #ifdef CONFIG_MICROCODE_EARLY #define MAX_UCODE_COUNT 128 extern void __init load_ucode_bsp(void); -extern void __cpuinit load_ucode_ap(void); +extern void load_ucode_ap(void); extern int __init save_microcode_in_initrd(void); #else static inline void __init load_ucode_bsp(void) {} -static inline void __cpuinit load_ucode_ap(void) {} +static inline void load_ucode_ap(void) {} static inline int __init save_microcode_in_initrd(void) { return 0; diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index c6b043f4027..4c019179a57 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -59,7 +59,7 @@ static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, extern int __apply_microcode_amd(struct microcode_amd *mc_amd); extern int apply_microcode_amd(int cpu); -extern enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size); +extern enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); #ifdef CONFIG_MICROCODE_AMD_EARLY #ifdef CONFIG_X86_32 @@ -67,11 +67,11 @@ extern enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size) extern u8 amd_bsp_mpb[MPB_MAX_SIZE]; #endif extern void __init load_ucode_amd_bsp(void); -extern void __cpuinit load_ucode_amd_ap(void); +extern void load_ucode_amd_ap(void); extern int __init save_microcode_in_initrd_amd(void); #else static inline void __init load_ucode_amd_bsp(void) {} -static inline void __cpuinit load_ucode_amd_ap(void) {} +static inline void load_ucode_amd_ap(void) {} static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } #endif diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 87a085333cb..9067166409b 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -65,12 +65,12 @@ update_match_revision(struct microcode_header_intel *mc_header, int rev); #ifdef CONFIG_MICROCODE_INTEL_EARLY extern void __init load_ucode_intel_bsp(void); -extern void __cpuinit load_ucode_intel_ap(void); +extern void load_ucode_intel_ap(void); extern void show_ucode_info_early(void); extern int __init save_microcode_in_initrd_intel(void); #else static inline __init void load_ucode_intel_bsp(void) {} -static inline __cpuinit void load_ucode_intel_ap(void) {} +static inline void load_ucode_intel_ap(void) {} static inline void show_ucode_info_early(void) {} static inline int __init save_microcode_in_initrd_intel(void) { return -EINVAL; } #endif diff --git a/arch/x86/include/asm/mmconfig.h b/arch/x86/include/asm/mmconfig.h index 9b119da1d10..04a3fed22cf 100644 --- a/arch/x86/include/asm/mmconfig.h +++ b/arch/x86/include/asm/mmconfig.h @@ -2,8 +2,8 @@ #define _ASM_X86_MMCONFIG_H #ifdef CONFIG_PCI_MMCONFIG -extern void __cpuinit fam10h_check_enable_mmcfg(void); -extern void __cpuinit check_enable_amd_mmconf_dmi(void); +extern void fam10h_check_enable_mmcfg(void); +extern void check_enable_amd_mmconf_dmi(void); #else static inline void fam10h_check_enable_mmcfg(void) { } static inline void check_enable_amd_mmconf_dmi(void) { } diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index cdbf3677610..be12c534fd5 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -45,22 +45,28 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Re-load page tables */ load_cr3(next->pgd); - /* stop flush ipis for the previous mm */ + /* Stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); - /* - * load the LDT, if the LDT is different: - */ + /* Load the LDT, if the LDT is different: */ if (unlikely(prev->context.ldt != next->context.ldt)) load_LDT_nolock(&next->context); } #ifdef CONFIG_SMP - else { + else { this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); - if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { - /* We were in lazy tlb mode and leave_mm disabled + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * On established mms, the mm_cpumask is only changed + * from irq context, from ptep_clear_flush() while in + * lazy tlb mode, and here. Irqs are blocked during + * schedule, protecting us from simultaneous changes. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + /* + * We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. */ diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 3e2f42a4b87..626cf70082d 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -94,7 +94,7 @@ static inline void early_reserve_e820_mpc_new(void) { } #define default_get_smp_config x86_init_uint_noop #endif -void __cpuinit generic_processor_info(int apicid, int version); +void generic_processor_info(int apicid, int version); #ifdef CONFIG_ACPI extern void mp_register_ioapic(int id, u32 address, u32 gsi_base); extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h index 2c543fff241..e7e6751648e 100644 --- a/arch/x86/include/asm/mutex_64.h +++ b/arch/x86/include/asm/mutex_64.h @@ -16,6 +16,20 @@ * * Atomically decrements @v and calls <fail_fn> if the result is negative. */ +#ifdef CC_HAVE_ASM_GOTO +static inline void __mutex_fastpath_lock(atomic_t *v, + void (*fail_fn)(atomic_t *)) +{ + asm volatile goto(LOCK_PREFIX " decl %0\n" + " jns %l[exit]\n" + : : "m" (v->counter) + : "memory", "cc" + : exit); + fail_fn(v); +exit: + return; +} +#else #define __mutex_fastpath_lock(v, fail_fn) \ do { \ unsigned long dummy; \ @@ -32,6 +46,7 @@ do { \ : "rax", "rsi", "rdx", "rcx", \ "r8", "r9", "r10", "r11", "memory"); \ } while (0) +#endif /** * __mutex_fastpath_lock_retval - try to take the lock by moving the count @@ -56,6 +71,20 @@ static inline int __mutex_fastpath_lock_retval(atomic_t *count) * * Atomically increments @v and calls <fail_fn> if the result is nonpositive. */ +#ifdef CC_HAVE_ASM_GOTO +static inline void __mutex_fastpath_unlock(atomic_t *v, + void (*fail_fn)(atomic_t *)) +{ + asm volatile goto(LOCK_PREFIX " incl %0\n" + " jg %l[exit]\n" + : : "m" (v->counter) + : "memory", "cc" + : exit); + fail_fn(v); +exit: + return; +} +#else #define __mutex_fastpath_unlock(v, fail_fn) \ do { \ unsigned long dummy; \ @@ -72,6 +101,7 @@ do { \ : "rax", "rsi", "rdx", "rcx", \ "r8", "r9", "r10", "r11", "memory"); \ } while (0) +#endif #define __mutex_slowpath_needs_to_unlock() 1 diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 1b99ee5c9f0..4064acae625 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -39,7 +39,7 @@ static inline void set_apicid_to_node(int apicid, s16 node) __apicid_to_node[apicid] = node; } -extern int __cpuinit numa_cpu_node(int cpu); +extern int numa_cpu_node(int cpu); #else /* CONFIG_NUMA */ static inline void set_apicid_to_node(int apicid, s16 node) @@ -60,8 +60,8 @@ static inline int numa_cpu_node(int cpu) extern void numa_set_node(int cpu, int node); extern void numa_clear_node(int cpu); extern void __init init_cpu_to_node(void); -extern void __cpuinit numa_add_cpu(int cpu); -extern void __cpuinit numa_remove_cpu(int cpu); +extern void numa_add_cpu(int cpu); +extern void numa_remove_cpu(int cpu); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index ef17af01347..f48b17df422 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -15,6 +15,8 @@ */ #define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) +#define __START_KERNEL_map __PAGE_OFFSET + #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 6c896fbe21d..43dcd804ebd 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -32,11 +32,6 @@ */ #define __PAGE_OFFSET _AC(0xffff880000000000, UL) -#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \ - (CONFIG_PHYSICAL_ALIGN - 1)) & \ - ~(CONFIG_PHYSICAL_ALIGN - 1)) - -#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) #define __START_KERNEL_map _AC(0xffffffff80000000, UL) /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 54c97879195..f97fbe3abb6 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -33,6 +33,11 @@ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define __PHYSICAL_START ALIGN(CONFIG_PHYSICAL_START, \ + CONFIG_PHYSICAL_ALIGN) + +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) + #ifdef CONFIG_X86_64 #include <asm/page_64_types.h> #else diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cfdc9ee4c90..401f350ef71 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -712,36 +712,16 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) -static inline int arch_spin_is_locked(struct arch_spinlock *lock) +static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, + __ticket_t ticket) { - return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); + PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket); } -static inline int arch_spin_is_contended(struct arch_spinlock *lock) +static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, + __ticket_t ticket) { - return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); -} -#define arch_spin_is_contended arch_spin_is_contended - -static __always_inline void arch_spin_lock(struct arch_spinlock *lock) -{ - PVOP_VCALL1(pv_lock_ops.spin_lock, lock); -} - -static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock, - unsigned long flags) -{ - PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags); -} - -static __always_inline int arch_spin_trylock(struct arch_spinlock *lock) -{ - return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock); -} - -static __always_inline void arch_spin_unlock(struct arch_spinlock *lock) -{ - PVOP_VCALL1(pv_lock_ops.spin_unlock, lock); + PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } #endif diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 0db1fcac668..aab8f671b52 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -327,13 +327,15 @@ struct pv_mmu_ops { }; struct arch_spinlock; +#ifdef CONFIG_SMP +#include <asm/spinlock_types.h> +#else +typedef u16 __ticket_t; +#endif + struct pv_lock_ops { - int (*spin_is_locked)(struct arch_spinlock *lock); - int (*spin_is_contended)(struct arch_spinlock *lock); - void (*spin_lock)(struct arch_spinlock *lock); - void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags); - int (*spin_trylock)(struct arch_spinlock *lock); - void (*spin_unlock)(struct arch_spinlock *lock); + struct paravirt_callee_save lock_spinning; + void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); }; /* This contains all the paravirt structures: we get a convenient @@ -387,7 +389,8 @@ extern struct pv_lock_ops pv_lock_ops; /* Simple instruction patching code. */ #define DEF_NATIVE(ops, name, code) \ - extern const char start_##ops##_##name[], end_##ops##_##name[]; \ + extern const char start_##ops##_##name[] __visible, \ + end_##ops##_##name[] __visible; \ asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") unsigned paravirt_patch_nop(void); diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index f2b489cf160..3bf2dd0cf61 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -55,9 +55,53 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifdef CONFIG_MEM_SOFT_DIRTY + +/* + * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and + * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset + * into this range. + */ +#define PTE_FILE_MAX_BITS 28 +#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) +#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) +#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) +#define PTE_FILE_SHIFT4 (_PAGE_BIT_SOFT_DIRTY + 1) +#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) +#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) +#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) + +#define pte_to_pgoff(pte) \ + ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ + & ((1U << PTE_FILE_BITS1) - 1))) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ + & ((1U << PTE_FILE_BITS2) - 1)) \ + << (PTE_FILE_BITS1)) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ + & ((1U << PTE_FILE_BITS3) - 1)) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) + +#define pgoff_to_pte(off) \ + ((pte_t) { .pte_low = \ + ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ + + ((((off) >> PTE_FILE_BITS1) \ + & ((1U << PTE_FILE_BITS2) - 1)) \ + << PTE_FILE_SHIFT2) \ + + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + & ((1U << PTE_FILE_BITS3) - 1)) \ + << PTE_FILE_SHIFT3) \ + + ((((off) >> \ + (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ + << PTE_FILE_SHIFT4) \ + + _PAGE_FILE }) + +#else /* CONFIG_MEM_SOFT_DIRTY */ + /* * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, - * split up the 29 bits of offset into this range: + * split up the 29 bits of offset into this range. */ #define PTE_FILE_MAX_BITS 29 #define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) @@ -88,6 +132,8 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) << PTE_FILE_SHIFT3) \ + _PAGE_FILE }) +#endif /* CONFIG_MEM_SOFT_DIRTY */ + /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 4cc9f2b7cdc..81bb91b49a8 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -179,6 +179,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) /* * Bits 0, 6 and 7 are taken in the low part of the pte, * put the 32 bits of offset into the high part. + * + * For soft-dirty tracking 11 bit is taken from + * the low part of pte as well. */ #define pte_to_pgoff(pte) ((pte).pte_high) #define pgoff_to_pte(off) \ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7dc305a4605..8d16befdec8 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -22,7 +22,8 @@ * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] + __visible; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) extern spinlock_t pgd_lock; @@ -314,6 +315,36 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline pte_t pte_file_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_file_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline int pte_file_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SOFT_DIRTY; +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index c98ac63aae4..f4843e03113 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -61,12 +61,27 @@ * they do not conflict with each other. */ +#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN + #ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif +/* + * Tracking soft dirty bit when a page goes to a swap is tricky. + * We need a bit which can be stored in pte _and_ not conflict + * with swap entry format. On x86 bits 6 and 7 are *not* involved + * into swap entry computation, but bit 6 is used for nonlinear + * file mapping, so we borrow bit 7 for soft dirty tracking. + */ +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE +#else +#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 29937c4f6ff..987c75ecc33 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -164,7 +164,7 @@ extern const struct seq_operations cpuinfo_op; #define cache_line_size() (boot_cpu_data.x86_cache_alignment) extern void cpu_detect(struct cpuinfo_x86 *c); -extern void __cpuinit fpu_detect(struct cpuinfo_x86 *c); +extern void fpu_detect(struct cpuinfo_x86 *c); extern void early_cpu_init(void); extern void identify_boot_cpu(void); @@ -412,7 +412,7 @@ union irq_stack_union { }; }; -DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union); +DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; DECLARE_INIT_PER_CPU(irq_stack_union); DECLARE_PER_CPU(char *, irq_stack_ptr); @@ -942,33 +942,19 @@ extern int set_tsc_mode(unsigned int val); extern u16 amd_get_nb_id(int cpu); -struct aperfmperf { - u64 aperf, mperf; -}; - -static inline void get_aperfmperf(struct aperfmperf *am) +static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) { - WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF)); - - rdmsrl(MSR_IA32_APERF, am->aperf); - rdmsrl(MSR_IA32_MPERF, am->mperf); -} + uint32_t base, eax, signature[3]; -#define APERFMPERF_SHIFT 10 + for (base = 0x40000000; base < 0x40010000; base += 0x100) { + cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); -static inline -unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, - struct aperfmperf *new) -{ - u64 aperf = new->aperf - old->aperf; - u64 mperf = new->mperf - old->mperf; - unsigned long ratio = aperf; - - mperf >>= APERFMPERF_SHIFT; - if (mperf) - ratio = div64_u64(aperf, mperf); + if (!memcmp(sig, signature, 12) && + (leaves == 0 || ((eax - base) >= leaves))) + return base; + } - return ratio; + return 0; } extern unsigned long arch_align_stack(unsigned long sp); diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index 60bef663609..bade6ac3b14 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -27,7 +27,7 @@ extern int of_ioapic; extern u64 initial_dtb; extern void add_dtb(u64 data); extern void x86_add_irq_domains(void); -void __cpuinit x86_of_pci_init(void); +void x86_of_pci_init(void); void x86_dtb_init(void); #else static inline void add_dtb(u64 data) { } diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 109a9dd5d45..be8269b00e2 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, struct pvclock_vsyscall_time_info { struct pvclock_vcpu_time_info pvti; - u32 migrate_count; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index b7bf3505e1e..347555492da 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -6,6 +6,8 @@ #define COMMAND_LINE_SIZE 2048 +#include <linux/linkage.h> + #ifdef __i386__ #include <linux/pfn.h> @@ -108,11 +110,11 @@ void *extend_brk(size_t size, size_t align); extern void probe_roms(void); #ifdef __i386__ -void __init i386_start_kernel(void); +asmlinkage void __init i386_start_kernel(void); #else -void __init x86_64_start_kernel(char *real_mode); -void __init x86_64_start_reservations(char *real_mode_data); +asmlinkage void __init x86_64_start_kernel(char *real_mode); +asmlinkage void __init x86_64_start_reservations(char *real_mode_data); #endif /* __i386__ */ #endif /* _SETUP */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index b073aaea747..4137890e88e 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -179,7 +179,7 @@ static inline int wbinvd_on_all_cpus(void) } #endif /* CONFIG_SMP */ -extern unsigned disabled_cpus __cpuinitdata; +extern unsigned disabled_cpus; #ifdef CONFIG_X86_32_SMP /* diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 2f4d924fe6c..645cad2c95f 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -101,7 +101,7 @@ static inline void native_wbinvd(void) asm volatile("wbinvd": : :"memory"); } -extern void native_load_gs_index(unsigned); +extern asmlinkage void native_load_gs_index(unsigned); #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 33692eaabab..bf156ded74b 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -1,11 +1,14 @@ #ifndef _ASM_X86_SPINLOCK_H #define _ASM_X86_SPINLOCK_H +#include <linux/jump_label.h> #include <linux/atomic.h> #include <asm/page.h> #include <asm/processor.h> #include <linux/compiler.h> #include <asm/paravirt.h> +#include <asm/bitops.h> + /* * Your basic SMP spinlocks, allowing only a single CPU anywhere * @@ -34,6 +37,36 @@ # define UNLOCK_LOCK_PREFIX #endif +/* How long a lock should spin before we consider blocking */ +#define SPIN_THRESHOLD (1 << 15) + +extern struct static_key paravirt_ticketlocks_enabled; +static __always_inline bool static_key_false(struct static_key *key); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + +static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) +{ + set_bit(0, (volatile unsigned long *)&lock->tickets.tail); +} + +#else /* !CONFIG_PARAVIRT_SPINLOCKS */ +static __always_inline void __ticket_lock_spinning(arch_spinlock_t *lock, + __ticket_t ticket) +{ +} +static inline void __ticket_unlock_kick(arch_spinlock_t *lock, + __ticket_t ticket) +{ +} + +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) +{ + return lock.tickets.head == lock.tickets.tail; +} + /* * Ticket locks are conceptually two parts, one indicating the current head of * the queue, and the other indicating the current tail. The lock is acquired @@ -47,81 +80,101 @@ * in the high part, because a wide xadd increment of the low part would carry * up and contaminate the high part. */ -static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) +static __always_inline void arch_spin_lock(arch_spinlock_t *lock) { - register struct __raw_tickets inc = { .tail = 1 }; + register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC }; inc = xadd(&lock->tickets, inc); + if (likely(inc.head == inc.tail)) + goto out; + inc.tail &= ~TICKET_SLOWPATH_FLAG; for (;;) { - if (inc.head == inc.tail) - break; - cpu_relax(); - inc.head = ACCESS_ONCE(lock->tickets.head); + unsigned count = SPIN_THRESHOLD; + + do { + if (ACCESS_ONCE(lock->tickets.head) == inc.tail) + goto out; + cpu_relax(); + } while (--count); + __ticket_lock_spinning(lock, inc.tail); } - barrier(); /* make sure nothing creeps before the lock is taken */ +out: barrier(); /* make sure nothing creeps before the lock is taken */ } -static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) +static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) { arch_spinlock_t old, new; old.tickets = ACCESS_ONCE(lock->tickets); - if (old.tickets.head != old.tickets.tail) + if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG)) return 0; - new.head_tail = old.head_tail + (1 << TICKET_SHIFT); + new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT); /* cmpxchg is a full barrier, so nothing can move before it */ return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; } -static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) +static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock, + arch_spinlock_t old) { - __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX); + arch_spinlock_t new; + + BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); + + /* Perform the unlock on the "before" copy */ + old.tickets.head += TICKET_LOCK_INC; + + /* Clear the slowpath flag */ + new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT); + + /* + * If the lock is uncontended, clear the flag - use cmpxchg in + * case it changes behind our back though. + */ + if (new.tickets.head != new.tickets.tail || + cmpxchg(&lock->head_tail, old.head_tail, + new.head_tail) != old.head_tail) { + /* + * Lock still has someone queued for it, so wake up an + * appropriate waiter. + */ + __ticket_unlock_kick(lock, old.tickets.head); + } } -static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) +static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + if (TICKET_SLOWPATH_FLAG && + static_key_false(¶virt_ticketlocks_enabled)) { + arch_spinlock_t prev; - return tmp.tail != tmp.head; -} + prev = *lock; + add_smp(&lock->tickets.head, TICKET_LOCK_INC); -static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) -{ - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + /* add_smp() is a full mb() */ - return (__ticket_t)(tmp.tail - tmp.head) > 1; + if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) + __ticket_unlock_slowpath(lock, prev); + } else + __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX); } -#ifndef CONFIG_PARAVIRT_SPINLOCKS - static inline int arch_spin_is_locked(arch_spinlock_t *lock) { - return __ticket_spin_is_locked(lock); -} - -static inline int arch_spin_is_contended(arch_spinlock_t *lock) -{ - return __ticket_spin_is_contended(lock); -} -#define arch_spin_is_contended arch_spin_is_contended + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); -static __always_inline void arch_spin_lock(arch_spinlock_t *lock) -{ - __ticket_spin_lock(lock); + return tmp.tail != tmp.head; } -static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) +static inline int arch_spin_is_contended(arch_spinlock_t *lock) { - return __ticket_spin_trylock(lock); -} + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); -static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) -{ - __ticket_spin_unlock(lock); + return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC; } +#define arch_spin_is_contended arch_spin_is_contended static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) @@ -129,8 +182,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, arch_spin_lock(lock); } -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ - static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { while (arch_spin_is_locked(lock)) @@ -233,8 +284,4 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #define arch_read_relax(lock) cpu_relax() #define arch_write_relax(lock) cpu_relax() -/* The {read|write|spin}_lock() on x86 are full memory barriers. */ -static inline void smp_mb__after_lock(void) { } -#define ARCH_HAS_SMP_MB_AFTER_LOCK - #endif /* _ASM_X86_SPINLOCK_H */ diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index ad0ad07fc00..4f1bea19945 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -1,13 +1,17 @@ #ifndef _ASM_X86_SPINLOCK_TYPES_H #define _ASM_X86_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - #include <linux/types.h> -#if (CONFIG_NR_CPUS < 256) +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#define __TICKET_LOCK_INC 2 +#define TICKET_SLOWPATH_FLAG ((__ticket_t)1) +#else +#define __TICKET_LOCK_INC 1 +#define TICKET_SLOWPATH_FLAG ((__ticket_t)0) +#endif + +#if (CONFIG_NR_CPUS < (256 / __TICKET_LOCK_INC)) typedef u8 __ticket_t; typedef u16 __ticketpair_t; #else @@ -15,6 +19,8 @@ typedef u16 __ticket_t; typedef u32 __ticketpair_t; #endif +#define TICKET_LOCK_INC ((__ticket_t)__TICKET_LOCK_INC) + #define TICKET_SHIFT (sizeof(__ticket_t) * 8) typedef struct arch_spinlock { diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 4ec45b3abba..d7f3b3b78ac 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -2,8 +2,8 @@ #define _ASM_X86_SWITCH_TO_H struct task_struct; /* one of the stranger aspects of C forward declarations */ -struct task_struct *__switch_to(struct task_struct *prev, - struct task_struct *next); +__visible struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); struct tss_struct; void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index 9d09b4073b6..05af3b31d52 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -26,9 +26,9 @@ * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void sync_set_bit(int nr, volatile unsigned long *addr) +static inline void sync_set_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btsl %1,%0" + asm volatile("lock; bts %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -44,9 +44,9 @@ static inline void sync_set_bit(int nr, volatile unsigned long *addr) * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() * in order to ensure changes are visible on other processors. */ -static inline void sync_clear_bit(int nr, volatile unsigned long *addr) +static inline void sync_clear_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btrl %1,%0" + asm volatile("lock; btr %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -61,9 +61,9 @@ static inline void sync_clear_bit(int nr, volatile unsigned long *addr) * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void sync_change_bit(int nr, volatile unsigned long *addr) +static inline void sync_change_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btcl %1,%0" + asm volatile("lock; btc %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -77,11 +77,11 @@ static inline void sync_change_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr) +static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; - asm volatile("lock; btsl %2,%1\n\tsbbl %0,%0" + asm volatile("lock; bts %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; @@ -95,11 +95,11 @@ static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr) +static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; - asm volatile("lock; btrl %2,%1\n\tsbbl %0,%0" + asm volatile("lock; btr %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; @@ -113,11 +113,11 @@ static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int sync_test_and_change_bit(int nr, volatile unsigned long *addr) +static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; - asm volatile("lock; btcl %2,%1\n\tsbbl %0,%0" + asm volatile("lock; btc %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 2e188d68397..aea284b4131 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -20,7 +20,8 @@ #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> -extern const unsigned long sys_call_table[]; +typedef void (*sys_call_ptr_t)(void); +extern const sys_call_ptr_t sys_call_table[]; /* * Only the low 32 bits of orig_ax are meaningful, so we return int. diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 2917a6452c4..592a6a672e0 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -24,7 +24,7 @@ asmlinkage long sys_iopl(unsigned int); asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); /* kernel/signal.c */ -long sys_rt_sigreturn(void); +asmlinkage long sys_rt_sigreturn(void); /* kernel/tls.c */ asmlinkage long sys_set_thread_area(struct user_desc __user *); @@ -34,7 +34,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *); #ifdef CONFIG_X86_32 /* kernel/signal.c */ -unsigned long sys_sigreturn(void); +asmlinkage unsigned long sys_sigreturn(void); /* kernel/vm86_32.c */ asmlinkage long sys_vm86old(struct vm86_struct __user *); @@ -44,7 +44,7 @@ asmlinkage long sys_vm86(unsigned long, unsigned long); /* X86_64 only */ /* kernel/process_64.c */ -long sys_arch_prctl(int, unsigned long); +asmlinkage long sys_arch_prctl(int, unsigned long); /* kernel/sys_x86_64.c */ asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, diff --git a/arch/x86/include/asm/sysfb.h b/arch/x86/include/asm/sysfb.h new file mode 100644 index 00000000000..2aeb3e25579 --- /dev/null +++ b/arch/x86/include/asm/sysfb.h @@ -0,0 +1,98 @@ +#ifndef _ARCH_X86_KERNEL_SYSFB_H +#define _ARCH_X86_KERNEL_SYSFB_H + +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <linux/kernel.h> +#include <linux/platform_data/simplefb.h> +#include <linux/screen_info.h> + +enum { + M_I17, /* 17-Inch iMac */ + M_I20, /* 20-Inch iMac */ + M_I20_SR, /* 20-Inch iMac (Santa Rosa) */ + M_I24, /* 24-Inch iMac */ + M_I24_8_1, /* 24-Inch iMac, 8,1th gen */ + M_I24_10_1, /* 24-Inch iMac, 10,1th gen */ + M_I27_11_1, /* 27-Inch iMac, 11,1th gen */ + M_MINI, /* Mac Mini */ + M_MINI_3_1, /* Mac Mini, 3,1th gen */ + M_MINI_4_1, /* Mac Mini, 4,1th gen */ + M_MB, /* MacBook */ + M_MB_2, /* MacBook, 2nd rev. */ + M_MB_3, /* MacBook, 3rd rev. */ + M_MB_5_1, /* MacBook, 5th rev. */ + M_MB_6_1, /* MacBook, 6th rev. */ + M_MB_7_1, /* MacBook, 7th rev. */ + M_MB_SR, /* MacBook, 2nd gen, (Santa Rosa) */ + M_MBA, /* MacBook Air */ + M_MBA_3, /* Macbook Air, 3rd rev */ + M_MBP, /* MacBook Pro */ + M_MBP_2, /* MacBook Pro 2nd gen */ + M_MBP_2_2, /* MacBook Pro 2,2nd gen */ + M_MBP_SR, /* MacBook Pro (Santa Rosa) */ + M_MBP_4, /* MacBook Pro, 4th gen */ + M_MBP_5_1, /* MacBook Pro, 5,1th gen */ + M_MBP_5_2, /* MacBook Pro, 5,2th gen */ + M_MBP_5_3, /* MacBook Pro, 5,3rd gen */ + M_MBP_6_1, /* MacBook Pro, 6,1th gen */ + M_MBP_6_2, /* MacBook Pro, 6,2th gen */ + M_MBP_7_1, /* MacBook Pro, 7,1th gen */ + M_MBP_8_2, /* MacBook Pro, 8,2nd gen */ + M_UNKNOWN /* placeholder */ +}; + +struct efifb_dmi_info { + char *optname; + unsigned long base; + int stride; + int width; + int height; + int flags; +}; + +#ifdef CONFIG_EFI + +extern struct efifb_dmi_info efifb_dmi_list[]; +void sysfb_apply_efi_quirks(void); + +#else /* CONFIG_EFI */ + +static inline void sysfb_apply_efi_quirks(void) +{ +} + +#endif /* CONFIG_EFI */ + +#ifdef CONFIG_X86_SYSFB + +bool parse_mode(const struct screen_info *si, + struct simplefb_platform_data *mode); +int create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode); + +#else /* CONFIG_X86_SYSFB */ + +static inline bool parse_mode(const struct screen_info *si, + struct simplefb_platform_data *mode) +{ + return false; +} + +static inline int create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode) +{ + return -EINVAL; +} + +#endif /* CONFIG_X86_SYSFB */ + +#endif /* _ARCH_X86_KERNEL_SYSFB_H */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 095b21507b6..d35f24e231c 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -124,9 +124,6 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) - -/* indicates that pointers to the topology cpumask_t maps are valid */ -#define arch_provides_topology_pointers yes #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 88eae2aec61..7036cb60cd8 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -6,11 +6,7 @@ #include <asm/debugreg.h> #include <asm/siginfo.h> /* TRAP_TRACE, ... */ -#ifdef CONFIG_X86_32 -#define dotraplinkage -#else -#define dotraplinkage asmlinkage -#endif +#define dotraplinkage __visible asmlinkage void divide_error(void); asmlinkage void debug(void); diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index c91e8b9d588..235be70d5bb 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -49,6 +49,7 @@ extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); +extern int check_tsc_disabled(void); extern unsigned long native_calibrate_tsc(void); extern int tsc_clocksource_reliable; diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 5ee26875bae..5838fa911aa 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -153,16 +153,19 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) * Careful: we have to cast the result to the type of the pointer * for sign reasons. * - * The use of %edx as the register specifier is a bit of a + * The use of _ASM_DX as the register specifier is a bit of a * simplification, as gcc only cares about it as the starting point * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits * (%ecx being the next register in gcc's x86 register sequence), and * %rdx on 64 bits. + * + * Clang/LLVM cares about the size of the register, but still wants + * the base register for something that ends up being a pair. */ #define get_user(x, ptr) \ ({ \ int __ret_gu; \ - register __inttype(*(ptr)) __val_gu asm("%edx"); \ + register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ __chk_user_ptr(ptr); \ might_fault(); \ asm volatile("call __get_user_%P3" \ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index f3e01a2cbaa..966502d4682 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -387,6 +387,7 @@ enum vmcs_field { #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_EPT_EXTENT_CONTEXT 1 #define VMX_EPT_EXTENT_GLOBAL 2 +#define VMX_EPT_EXTENT_SHIFT 24 #define VMX_EPT_EXECUTE_ONLY_BIT (1ull) #define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) @@ -394,6 +395,7 @@ enum vmcs_field { #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) +#define VMX_EPT_INVEPT_BIT (1ull << 20) #define VMX_EPT_AD_BIT (1ull << 21) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index de656ac2af4..d76ac40da20 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -35,7 +35,7 @@ #define DEFINE_VVAR(type, name) \ type name \ - __attribute__((section(".vvar_" #name), aligned(16))) + __attribute__((section(".vvar_" #name), aligned(16))) __visible #define VVAR(name) (*vvaraddr_ ## name) diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index ca842f2769e..608a79d5a46 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -7,6 +7,7 @@ enum ipi_vector { XEN_CALL_FUNCTION_SINGLE_VECTOR, XEN_SPIN_UNLOCK_VECTOR, XEN_IRQ_WORK_VECTOR, + XEN_NMI_VECTOR, XEN_NR_IPIS, }; diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 125f344f06a..d866959e568 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -40,21 +40,7 @@ extern struct start_info *xen_start_info; static inline uint32_t xen_cpuid_base(void) { - uint32_t base, eax, ebx, ecx, edx; - char signature[13]; - - for (base = 0x40000000; base < 0x40010000; base += 0x100) { - cpuid(base, &eax, &ebx, &ecx, &edx); - *(uint32_t *)(signature + 0) = ebx; - *(uint32_t *)(signature + 4) = ecx; - *(uint32_t *)(signature + 8) = edx; - signature[12] = 0; - - if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) - return base; - } - - return 0; + return hypervisor_cpuid_base("XenVMMXenVMM", 2); } #ifdef CONFIG_XEN diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 06fdbd987e9..94dc8ca434e 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -23,6 +23,7 @@ #define KVM_FEATURE_ASYNC_PF 4 #define KVM_FEATURE_STEAL_TIME 5 #define KVM_FEATURE_PV_EOI 6 +#define KVM_FEATURE_PV_UNHALT 7 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index d651082c7cf..0e79420376e 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -65,6 +65,7 @@ #define EXIT_REASON_EOI_INDUCED 45 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 #define EXIT_REASON_PREEMPTION_TIMER 52 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 @@ -106,12 +107,13 @@ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ + { EXIT_REASON_INVEPT, "INVEPT" }, \ + { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \ { EXIT_REASON_WBINVD, "WBINVD" }, \ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ { EXIT_REASON_INVD, "INVD" }, \ - { EXIT_REASON_INVPCID, "INVPCID" }, \ - { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" } + { EXIT_REASON_INVPCID, "INVPCID" } #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 88d99ea7772..a5408b965c9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -103,6 +103,9 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_OF) += devicetree.o obj-$(CONFIG_UPROBES) += uprobes.o +obj-y += sysfb.o +obj-$(CONFIG_X86_SYSFB) += sysfb_simplefb.o +obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index d81a972dd50..40c76604199 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -67,6 +67,7 @@ EXPORT_SYMBOL(acpi_pci_disabled); int acpi_lapic; int acpi_ioapic; int acpi_strict; +int acpi_disable_cmcff; u8 acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; @@ -141,16 +142,8 @@ static u32 irq_to_gsi(int irq) } /* - * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, - * to map the target physical address. The problem is that set_fixmap() - * provides a single page, and it is possible that the page is not - * sufficient. - * By using this area, we can map up to MAX_IO_APICS pages temporarily, - * i.e. until the next __va_range() call. - * - * Important Safety Note: The fixed I/O APIC page numbers are *subtracted* - * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and - * count idx down while incrementing the phys address. + * This is just a simple wrapper around early_ioremap(), + * with sanity checks for phys == 0 and size == 0. */ char *__init __acpi_map_table(unsigned long phys, unsigned long size) { @@ -160,6 +153,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) return early_ioremap(phys, size); } + void __init __acpi_unmap_table(char *map, unsigned long size) { if (!map || !size) @@ -195,11 +189,11 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) return 0; } -static void __cpuinit acpi_register_lapic(int id, u8 enabled) +static void acpi_register_lapic(int id, u8 enabled) { unsigned int ver = 0; - if (id >= (MAX_LOCAL_APIC-1)) { + if (id >= MAX_LOCAL_APIC) { printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); return; } @@ -607,7 +601,7 @@ void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include <acpi/processor.h> -static void __cpuinit acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; @@ -620,7 +614,7 @@ static void __cpuinit acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) #endif } -static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) +static int _acpi_map_lsapic(acpi_handle handle, int *pcpu) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *obj; @@ -1120,6 +1114,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) int ioapic; int ioapic_pin; struct io_apic_irq_attr irq_attr; + int ret; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -1149,7 +1144,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 0 : 1); - io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr); + ret = io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr); + if (ret < 0) + gsi = INT_MIN; return gsi; } @@ -1626,6 +1623,10 @@ static int __init parse_acpi(char *arg) /* "acpi=copy_dsdt" copys DSDT */ else if (strcmp(arg, "copy_dsdt") == 0) { acpi_gbl_copy_dsdt_locally = 1; + } + /* "acpi=nocmcff" disables FF mode for corrected errors */ + else if (strcmp(arg, "nocmcff") == 0) { + acpi_disable_cmcff = 1; } else { /* Core will printk when we return error. */ return -EINVAL; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 2a34aaf3c8f..33120100ff5 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -48,9 +48,20 @@ int x86_acpi_suspend_lowlevel(void) #ifndef CONFIG_64BIT native_store_gdt((struct desc_ptr *)&header->pmode_gdt); + /* + * We have to check that we can write back the value, and not + * just read it. At least on 90 nm Pentium M (Family 6, Model + * 13), reading an invalid MSR is not guaranteed to trap, see + * Erratum X4 in "Intel Pentium M Processor on 90 nm Process + * with 2-MB L2 Cache and Intel® Processor A100 and A110 on 90 + * nm process with 512-KB L2 Cache Specification Update". + */ if (!rdmsr_safe(MSR_EFER, &header->pmode_efer_low, - &header->pmode_efer_high)) + &header->pmode_efer_high) && + !wrmsr_safe(MSR_EFER, + header->pmode_efer_low, + header->pmode_efer_high)) header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER); #endif /* !CONFIG_64BIT */ @@ -61,7 +72,10 @@ int x86_acpi_suspend_lowlevel(void) } if (!rdmsr_safe(MSR_IA32_MISC_ENABLE, &header->pmode_misc_en_low, - &header->pmode_misc_en_high)) + &header->pmode_misc_en_high) && + !wrmsr_safe(MSR_IA32_MISC_ENABLE, + header->pmode_misc_en_low, + header->pmode_misc_en_high)) header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE); header->realmode_flags = acpi_realmode_flags; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c15cf9a25e2..15e8563e5c2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -11,6 +11,7 @@ #include <linux/memory.h> #include <linux/stop_machine.h> #include <linux/slab.h> +#include <linux/kdebug.h> #include <asm/alternative.h> #include <asm/sections.h> #include <asm/pgtable.h> @@ -596,97 +597,93 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) return addr; } -/* - * Cross-modifying kernel text with stop_machine(). - * This code originally comes from immediate value. - */ -static atomic_t stop_machine_first; -static int wrote_text; +static void do_sync_core(void *info) +{ + sync_core(); +} -struct text_poke_params { - struct text_poke_param *params; - int nparams; -}; +static bool bp_patching_in_progress; +static void *bp_int3_handler, *bp_int3_addr; -static int __kprobes stop_machine_text_poke(void *data) +int poke_int3_handler(struct pt_regs *regs) { - struct text_poke_params *tpp = data; - struct text_poke_param *p; - int i; + /* bp_patching_in_progress */ + smp_rmb(); - if (atomic_xchg(&stop_machine_first, 0)) { - for (i = 0; i < tpp->nparams; i++) { - p = &tpp->params[i]; - text_poke(p->addr, p->opcode, p->len); - } - smp_wmb(); /* Make sure other cpus see that this has run */ - wrote_text = 1; - } else { - while (!wrote_text) - cpu_relax(); - smp_mb(); /* Load wrote_text before following execution */ - } + if (likely(!bp_patching_in_progress)) + return 0; - for (i = 0; i < tpp->nparams; i++) { - p = &tpp->params[i]; - flush_icache_range((unsigned long)p->addr, - (unsigned long)p->addr + p->len); - } - /* - * Intel Archiecture Software Developer's Manual section 7.1.3 specifies - * that a core serializing instruction such as "cpuid" should be - * executed on _each_ core before the new instruction is made visible. - */ - sync_core(); - return 0; -} + if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) + return 0; + + /* set up the specified breakpoint handler */ + regs->ip = (unsigned long) bp_int3_handler; + + return 1; -/** - * text_poke_smp - Update instructions on a live kernel on SMP - * @addr: address to modify - * @opcode: source of the copy - * @len: length to copy - * - * Modify multi-byte instruction by using stop_machine() on SMP. This allows - * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying - * should be allowed, since stop_machine() does _not_ protect code against - * NMI and MCE. - * - * Note: Must be called under get_online_cpus() and text_mutex. - */ -void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) -{ - struct text_poke_params tpp; - struct text_poke_param p; - - p.addr = addr; - p.opcode = opcode; - p.len = len; - tpp.params = &p; - tpp.nparams = 1; - atomic_set(&stop_machine_first, 1); - wrote_text = 0; - /* Use __stop_machine() because the caller already got online_cpus. */ - __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); - return addr; } /** - * text_poke_smp_batch - Update instructions on a live kernel on SMP - * @params: an array of text_poke parameters - * @n: the number of elements in params. + * text_poke_bp() -- update instructions on live kernel on SMP + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @handler: address to jump to when the temporary breakpoint is hit * - * Modify multi-byte instruction by using stop_machine() on SMP. Since the - * stop_machine() is heavy task, it is better to aggregate text_poke requests - * and do it once if possible. + * Modify multi-byte instruction by using int3 breakpoint on SMP. + * We completely avoid stop_machine() here, and achieve the + * synchronization using int3 breakpoint. * - * Note: Must be called under get_online_cpus() and text_mutex. + * The way it is done: + * - add a int3 trap to the address that will be patched + * - sync cores + * - update all but the first byte of the patched range + * - sync cores + * - replace the first byte (int3) by the first byte of + * replacing opcode + * - sync cores + * + * Note: must be called under text_mutex. */ -void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) +void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) { - struct text_poke_params tpp = {.params = params, .nparams = n}; + unsigned char int3 = 0xcc; + + bp_int3_handler = handler; + bp_int3_addr = (u8 *)addr + sizeof(int3); + bp_patching_in_progress = true; + /* + * Corresponding read barrier in int3 notifier for + * making sure the in_progress flags is correctly ordered wrt. + * patching + */ + smp_wmb(); + + text_poke(addr, &int3, sizeof(int3)); - atomic_set(&stop_machine_first, 1); - wrote_text = 0; - __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); + on_each_cpu(do_sync_core, NULL, 1); + + if (len - sizeof(int3) > 0) { + /* patch all but the first byte */ + text_poke((char *)addr + sizeof(int3), + (const char *) opcode + sizeof(int3), + len - sizeof(int3)); + /* + * According to Intel, this core syncing is very likely + * not necessary and we'd be safe even without it. But + * better safe than sorry (plus there's not only Intel). + */ + on_each_cpu(do_sync_core, NULL, 1); + } + + /* patch the first byte */ + text_poke(addr, opcode, sizeof(int3)); + + on_each_cpu(do_sync_core, NULL, 1); + + bp_patching_in_progress = false; + smp_wmb(); + + return addr; } + diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 3048ded1b59..59554dca96e 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -20,6 +20,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, {} }; @@ -27,6 +28,7 @@ EXPORT_SYMBOL(amd_nb_misc_ids); static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, {} }; @@ -81,13 +83,20 @@ int amd_cache_northbridges(void) next_northbridge(misc, amd_nb_misc_ids); node_to_amd_nb(i)->link = link = next_northbridge(link, amd_nb_link_ids); - } + } + /* GART present only on Fam15h upto model 0fh */ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || - boot_cpu_data.x86 == 0x15) + (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) amd_northbridges.flags |= AMD_NB_GART; /* + * Check for L3 cache presence. + */ + if (!cpuid_edx(0x80000006)) + return 0; + + /* * Some CPU families support L3 Cache Index Disable. There are some * limitations because of E382 and E388 on family 0x10. */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 99663b59123..a7eb82d9b01 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -58,7 +58,7 @@ unsigned int num_processors; -unsigned disabled_cpus __cpuinitdata; +unsigned disabled_cpus; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; @@ -544,7 +544,7 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events); * Setup the local APIC timer for this CPU. Copy the initialized values * of the boot CPU and register the clock event in the framework. */ -static void __cpuinit setup_APIC_timer(void) +static void setup_APIC_timer(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); @@ -866,7 +866,7 @@ void __init setup_boot_APIC_clock(void) setup_APIC_timer(); } -void __cpuinit setup_secondary_APIC_clock(void) +void setup_secondary_APIC_clock(void) { setup_APIC_timer(); } @@ -913,7 +913,7 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ -void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -932,7 +932,7 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) set_irq_regs(old_regs); } -void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -1229,7 +1229,7 @@ void __init init_bsp_APIC(void) apic_write(APIC_LVT1, value); } -static void __cpuinit lapic_setup_esr(void) +static void lapic_setup_esr(void) { unsigned int oldvalue, value, maxlvt; @@ -1276,7 +1276,7 @@ static void __cpuinit lapic_setup_esr(void) * Used to setup local APIC while initializing BSP or bringin up APs. * Always called with preemption disabled. */ -void __cpuinit setup_local_APIC(void) +void setup_local_APIC(void) { int cpu = smp_processor_id(); unsigned int value, queued; @@ -1471,7 +1471,7 @@ void __cpuinit setup_local_APIC(void) #endif } -void __cpuinit end_local_APIC_setup(void) +void end_local_APIC_setup(void) { lapic_setup_esr(); @@ -1946,14 +1946,14 @@ static inline void __smp_spurious_interrupt(void) "should never happen.\n", smp_processor_id()); } -void smp_spurious_interrupt(struct pt_regs *regs) +__visible void smp_spurious_interrupt(struct pt_regs *regs) { entering_irq(); __smp_spurious_interrupt(); exiting_irq(); } -void smp_trace_spurious_interrupt(struct pt_regs *regs) +__visible void smp_trace_spurious_interrupt(struct pt_regs *regs) { entering_irq(); trace_spurious_apic_entry(SPURIOUS_APIC_VECTOR); @@ -2002,14 +2002,14 @@ static inline void __smp_error_interrupt(struct pt_regs *regs) } -void smp_error_interrupt(struct pt_regs *regs) +__visible void smp_error_interrupt(struct pt_regs *regs) { entering_irq(); __smp_error_interrupt(regs); exiting_irq(); } -void smp_trace_error_interrupt(struct pt_regs *regs) +__visible void smp_trace_error_interrupt(struct pt_regs *regs) { entering_irq(); trace_error_apic_entry(ERROR_APIC_VECTOR); @@ -2107,7 +2107,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) apic_write(APIC_LVT1, value); } -void __cpuinit generic_processor_info(int apicid, int version) +void generic_processor_info(int apicid, int version) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2377,7 +2377,7 @@ static struct syscore_ops lapic_syscore_ops = { .suspend = lapic_suspend, }; -static void __cpuinit apic_pm_activate(void) +static void apic_pm_activate(void) { apic_pm_state.active = 1; } @@ -2402,7 +2402,7 @@ static void apic_pm_activate(void) { } #ifdef CONFIG_X86_64 -static int __cpuinit apic_cluster_num(void) +static int apic_cluster_num(void) { int i, clusters, zeros; unsigned id; @@ -2447,10 +2447,10 @@ static int __cpuinit apic_cluster_num(void) return clusters; } -static int __cpuinitdata multi_checked; -static int __cpuinitdata multi; +static int multi_checked; +static int multi; -static int __cpuinit set_multi(const struct dmi_system_id *d) +static int set_multi(const struct dmi_system_id *d) { if (multi) return 0; @@ -2459,7 +2459,7 @@ static int __cpuinit set_multi(const struct dmi_system_id *d) return 0; } -static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = { +static const struct dmi_system_id multi_dmi_table[] = { { .callback = set_multi, .ident = "IBM System Summit2", @@ -2471,7 +2471,7 @@ static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = { {} }; -static void __cpuinit dmi_check_multi(void) +static void dmi_check_multi(void) { if (multi_checked) return; @@ -2488,7 +2488,7 @@ static void __cpuinit dmi_check_multi(void) * multi-chassis. * Use DMI to check them */ -__cpuinit int apic_is_clustered_box(void) +int apic_is_clustered_box(void) { dmi_check_multi(); if (multi) diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 9a9110918ca..3e67f9e3d7e 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -74,7 +74,7 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } -static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) { union numachip_csr_g3_ext_irq_gen int_gen; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 0874799a98c..c55224731b2 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -130,7 +130,7 @@ int es7000_plat; */ -static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { unsigned long vect = 0, psaival = 0; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9ed796ccc32..e63a5bd2a78 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1534,6 +1534,11 @@ void intel_ir_io_apic_print_entries(unsigned int apic, } } +void ioapic_zap_locks(void) +{ + raw_spin_lock_init(&ioapic_lock); +} + __apicdebuginit(void) print_IO_APIC(int ioapic_idx) { union IO_APIC_reg_00 reg_00; @@ -3375,12 +3380,15 @@ int io_apic_setup_irq_pin_once(unsigned int irq, int node, { unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin; int ret; + struct IO_APIC_route_entry orig_entry; /* Avoid redundant programming */ if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mpc_ioapic_id(ioapic_idx), pin); - return 0; + pr_debug("Pin %d-%d already programmed\n", mpc_ioapic_id(ioapic_idx), pin); + orig_entry = ioapic_read_entry(attr->ioapic, pin); + if (attr->trigger == orig_entry.trigger && attr->polarity == orig_entry.polarity) + return 0; + return -EBUSY; } ret = io_apic_setup_irq_pin(irq, node, attr); if (!ret) diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index d661ee95cab..1e42e8f305e 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -105,7 +105,7 @@ static void __init smp_dump_qct(void) } } -void __cpuinit numaq_tsc_disable(void) +void numaq_tsc_disable(void) { if (!found_numaq) return; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index c88baa4ff0e..140e29db478 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -148,7 +148,7 @@ static void init_x2apic_ldr(void) /* * At CPU state changes, update the x2apic cluster sibling info. */ -static int __cpuinit +static int update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int this_cpu = (unsigned long)hcpu; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 63092afb142..1191ac1c9d2 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -209,7 +209,7 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) { #ifdef CONFIG_SMP unsigned long val; @@ -416,7 +416,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; -static __cpuinit void set_x2apic_extra_bits(int pnode) +static void set_x2apic_extra_bits(int pnode) { __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift); } @@ -735,7 +735,7 @@ static void uv_heartbeat(unsigned long ignored) mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL); } -static void __cpuinit uv_heartbeat_enable(int cpu) +static void uv_heartbeat_enable(int cpu) { while (!uv_cpu_hub_info(cpu)->scir.enabled) { struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; @@ -752,7 +752,7 @@ static void __cpuinit uv_heartbeat_enable(int cpu) } #ifdef CONFIG_HOTPLUG_CPU -static void __cpuinit uv_heartbeat_disable(int cpu) +static void uv_heartbeat_disable(int cpu) { if (uv_cpu_hub_info(cpu)->scir.enabled) { uv_cpu_hub_info(cpu)->scir.enabled = 0; @@ -764,8 +764,8 @@ static void __cpuinit uv_heartbeat_disable(int cpu) /* * cpu hotplug notifier */ -static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int uv_scir_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) { long cpu = (long)hcpu; @@ -835,7 +835,7 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode, * Called on each cpu to initialize the per_cpu UV data area. * FIXME: hotplug not supported yet */ -void __cpuinit uv_cpu_init(void) +void uv_cpu_init(void) { /* CPU 0 initilization will be done via uv_system_init. */ if (!uv_blade_info) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 53a4e274484..3ab03430211 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -392,7 +392,7 @@ static struct cpuidle_device apm_cpuidle_device; /* * Local variables */ -static struct { +__visible struct { unsigned long offset; unsigned short segment; } apm_bios_entry; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c587a875722..903a264af98 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -66,10 +66,10 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) * performance at the same time.. */ -extern void vide(void); -__asm__(".align 4\nvide: ret"); +extern __visible void vide(void); +__asm__(".globl vide\n\t.align 4\nvide: ret"); -static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) +static void init_amd_k5(struct cpuinfo_x86 *c) { /* * General Systems BIOSen alias the cpu frequency registers @@ -87,7 +87,7 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) } -static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) +static void init_amd_k6(struct cpuinfo_x86 *c) { u32 l, h; int mbytes = get_num_physpages() >> (20-PAGE_SHIFT); @@ -179,7 +179,7 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) } } -static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) +static void amd_k7_smp_check(struct cpuinfo_x86 *c) { /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) @@ -222,7 +222,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE); } -static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) +static void init_amd_k7(struct cpuinfo_x86 *c) { u32 l, h; @@ -267,7 +267,7 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) * To workaround broken NUMA config. Read the comment in * srat_detect_node(). */ -static int __cpuinit nearby_node(int apicid) +static int nearby_node(int apicid) { int i, node; @@ -292,7 +292,7 @@ static int __cpuinit nearby_node(int apicid) * (2) AMD processors supporting compute units */ #ifdef CONFIG_X86_HT -static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) +static void amd_get_topology(struct cpuinfo_x86 *c) { u32 nodes, cores_per_cu = 1; u8 node_id; @@ -342,7 +342,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) * On a AMD dual core setup the lower bits of the APIC id distingush the cores. * Assumes number of cores is a power of two. */ -static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) +static void amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT unsigned bits; @@ -369,7 +369,7 @@ u16 amd_get_nb_id(int cpu) } EXPORT_SYMBOL_GPL(amd_get_nb_id); -static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) +static void srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA int cpu = smp_processor_id(); @@ -421,7 +421,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) #endif } -static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) +static void early_init_amd_mc(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT unsigned bits, ecx; @@ -447,7 +447,7 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) #endif } -static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c) +static void bsp_init_amd(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { @@ -475,7 +475,7 @@ static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c) } } -static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +static void early_init_amd(struct cpuinfo_x86 *c) { early_init_amd_mc(c); @@ -512,9 +512,9 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) static const int amd_erratum_383[]; static const int amd_erratum_400[]; -static bool cpu_has_amd_erratum(const int *erratum); +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); -static void __cpuinit init_amd(struct cpuinfo_x86 *c) +static void init_amd(struct cpuinfo_x86 *c) { u32 dummy; unsigned long long value; @@ -729,19 +729,18 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) value &= ~(1ULL << 24); wrmsrl_safe(MSR_AMD64_BU_CFG2, value); - if (cpu_has_amd_erratum(amd_erratum_383)) + if (cpu_has_amd_erratum(c, amd_erratum_383)) set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); } - if (cpu_has_amd_erratum(amd_erratum_400)) + if (cpu_has_amd_erratum(c, amd_erratum_400)) set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); } #ifdef CONFIG_X86_32 -static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, - unsigned int size) +static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) { /* AMD errata T13 (order #21922) */ if ((c->x86 == 6)) { @@ -757,7 +756,7 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, } #endif -static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) +static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) { tlb_flushall_shift = 5; @@ -765,7 +764,7 @@ static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) tlb_flushall_shift = 4; } -static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c) +static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) { u32 ebx, eax, ecx, edx; u16 mask = 0xfff; @@ -820,7 +819,7 @@ static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c) cpu_set_tlb_flushall_shift(c); } -static const struct cpu_dev __cpuinitconst amd_cpu_dev = { +static const struct cpu_dev amd_cpu_dev = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, #ifdef CONFIG_X86_32 @@ -879,23 +878,13 @@ static const int amd_erratum_400[] = static const int amd_erratum_383[] = AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); -static bool cpu_has_amd_erratum(const int *erratum) + +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { - struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info); int osvw_id = *erratum++; u32 range; u32 ms; - /* - * If called early enough that current_cpu_data hasn't been initialized - * yet, fall back to boot_cpu_data. - */ - if (cpu->x86 == 0) - cpu = &boot_cpu_data; - - if (cpu->x86_vendor != X86_VENDOR_AMD) - return false; - if (osvw_id >= 0 && osvw_id < 65536 && cpu_has(cpu, X86_FEATURE_OSVW)) { u64 osvw_len; diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 159103c0b1f..fbf6c3bc240 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -11,7 +11,7 @@ #ifdef CONFIG_X86_OOSTORE -static u32 __cpuinit power2(u32 x) +static u32 power2(u32 x) { u32 s = 1; @@ -25,7 +25,7 @@ static u32 __cpuinit power2(u32 x) /* * Set up an actual MCR */ -static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key) +static void centaur_mcr_insert(int reg, u32 base, u32 size, int key) { u32 lo, hi; @@ -42,7 +42,7 @@ static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key) * * Shortcut: We know you can't put 4Gig of RAM on a winchip */ -static u32 __cpuinit ramtop(void) +static u32 ramtop(void) { u32 clip = 0xFFFFFFFFUL; u32 top = 0; @@ -91,7 +91,7 @@ static u32 __cpuinit ramtop(void) /* * Compute a set of MCR's to give maximum coverage */ -static int __cpuinit centaur_mcr_compute(int nr, int key) +static int centaur_mcr_compute(int nr, int key) { u32 mem = ramtop(); u32 root = power2(mem); @@ -157,7 +157,7 @@ static int __cpuinit centaur_mcr_compute(int nr, int key) return ct; } -static void __cpuinit centaur_create_optimal_mcr(void) +static void centaur_create_optimal_mcr(void) { int used; int i; @@ -181,7 +181,7 @@ static void __cpuinit centaur_create_optimal_mcr(void) wrmsr(MSR_IDT_MCR0+i, 0, 0); } -static void __cpuinit winchip2_create_optimal_mcr(void) +static void winchip2_create_optimal_mcr(void) { u32 lo, hi; int used; @@ -217,7 +217,7 @@ static void __cpuinit winchip2_create_optimal_mcr(void) /* * Handle the MCR key on the Winchip 2. */ -static void __cpuinit winchip2_unprotect_mcr(void) +static void winchip2_unprotect_mcr(void) { u32 lo, hi; u32 key; @@ -229,7 +229,7 @@ static void __cpuinit winchip2_unprotect_mcr(void) wrmsr(MSR_IDT_MCR_CTRL, lo, hi); } -static void __cpuinit winchip2_protect_mcr(void) +static void winchip2_protect_mcr(void) { u32 lo, hi; @@ -247,7 +247,7 @@ static void __cpuinit winchip2_protect_mcr(void) #define RNG_ENABLED (1 << 3) #define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */ -static void __cpuinit init_c3(struct cpuinfo_x86 *c) +static void init_c3(struct cpuinfo_x86 *c) { u32 lo, hi; @@ -318,7 +318,7 @@ enum { EAMD3D = 1<<20, }; -static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +static void early_init_centaur(struct cpuinfo_x86 *c) { switch (c->x86) { #ifdef CONFIG_X86_32 @@ -337,7 +337,7 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) #endif } -static void __cpuinit init_centaur(struct cpuinfo_x86 *c) +static void init_centaur(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 char *name; @@ -468,7 +468,7 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c) #endif } -static unsigned int __cpuinit +static unsigned int centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) { #ifdef CONFIG_X86_32 @@ -488,7 +488,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) return size; } -static const struct cpu_dev __cpuinitconst centaur_cpu_dev = { +static const struct cpu_dev centaur_cpu_dev = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, .c_early_init = early_init_centaur, diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 548bd039784..2793d1f095a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -63,7 +63,7 @@ void __init setup_cpu_local_masks(void) alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); } -static void __cpuinit default_init(struct cpuinfo_x86 *c) +static void default_init(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 cpu_detect_cache_sizes(c); @@ -80,13 +80,13 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c) #endif } -static const struct cpu_dev __cpuinitconst default_cpu = { +static const struct cpu_dev default_cpu = { .c_init = default_init, .c_vendor = "Unknown", .c_x86_vendor = X86_VENDOR_UNKNOWN, }; -static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; +static const struct cpu_dev *this_cpu = &default_cpu; DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 @@ -160,8 +160,8 @@ static int __init x86_xsaveopt_setup(char *s) __setup("noxsaveopt", x86_xsaveopt_setup); #ifdef CONFIG_X86_32 -static int cachesize_override __cpuinitdata = -1; -static int disable_x86_serial_nr __cpuinitdata = 1; +static int cachesize_override = -1; +static int disable_x86_serial_nr = 1; static int __init cachesize_setup(char *str) { @@ -215,12 +215,12 @@ static inline int flag_is_changeable_p(u32 flag) } /* Probe for the CPUID instruction */ -int __cpuinit have_cpuid_p(void) +int have_cpuid_p(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } -static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { unsigned long lo, hi; @@ -298,7 +298,7 @@ struct cpuid_dependent_feature { u32 level; }; -static const struct cpuid_dependent_feature __cpuinitconst +static const struct cpuid_dependent_feature cpuid_dependent_features[] = { { X86_FEATURE_MWAIT, 0x00000005 }, { X86_FEATURE_DCA, 0x00000009 }, @@ -306,7 +306,7 @@ cpuid_dependent_features[] = { { 0, 0 } }; -static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) +static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) { const struct cpuid_dependent_feature *df; @@ -344,7 +344,7 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) */ /* Look up CPU names by table lookup. */ -static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) +static const char *table_lookup_model(struct cpuinfo_x86 *c) { const struct cpu_model_info *info; @@ -364,8 +364,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; -__u32 cpu_caps_set[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_cleared[NCAPINTS]; +__u32 cpu_caps_set[NCAPINTS]; void load_percpu_segment(int cpu) { @@ -394,9 +394,9 @@ void switch_to_new_gdt(int cpu) load_percpu_segment(cpu); } -static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; +static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; -static void __cpuinit get_model_name(struct cpuinfo_x86 *c) +static void get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; char *p, *q; @@ -425,7 +425,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c) } } -void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c) +void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; @@ -479,7 +479,7 @@ u16 __read_mostly tlb_lld_4m[NR_INFO]; */ s8 __read_mostly tlb_flushall_shift = -1; -void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c) +void cpu_detect_tlb(struct cpuinfo_x86 *c) { if (this_cpu->c_detect_tlb) this_cpu->c_detect_tlb(c); @@ -493,7 +493,7 @@ void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c) tlb_flushall_shift); } -void __cpuinit detect_ht(struct cpuinfo_x86 *c) +void detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT u32 eax, ebx, ecx, edx; @@ -544,7 +544,7 @@ out: #endif } -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) +static void get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; int i; @@ -571,7 +571,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) this_cpu = &default_cpu; } -void __cpuinit cpu_detect(struct cpuinfo_x86 *c) +void cpu_detect(struct cpuinfo_x86 *c) { /* Get vendor name */ cpuid(0x00000000, (unsigned int *)&c->cpuid_level, @@ -601,7 +601,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c) } } -void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) +void get_cpu_cap(struct cpuinfo_x86 *c) { u32 tfms, xlvl; u32 ebx; @@ -652,7 +652,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); } -static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 int i; @@ -769,7 +769,7 @@ void __init early_cpu_init(void) * unless we can find a reliable way to detect all the broken cases. * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). */ -static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +static void detect_nopl(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 clear_cpu_cap(c, X86_FEATURE_NOPL); @@ -778,7 +778,7 @@ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) #endif } -static void __cpuinit generic_identify(struct cpuinfo_x86 *c) +static void generic_identify(struct cpuinfo_x86 *c) { c->extended_cpuid_level = 0; @@ -815,7 +815,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) /* * This does the hard work of actually picking apart the CPU stuff... */ -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +static void identify_cpu(struct cpuinfo_x86 *c) { int i; @@ -960,7 +960,7 @@ void __init identify_boot_cpu(void) cpu_detect_tlb(&boot_cpu_data); } -void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) +void identify_secondary_cpu(struct cpuinfo_x86 *c) { BUG_ON(c == &boot_cpu_data); identify_cpu(c); @@ -975,14 +975,14 @@ struct msr_range { unsigned max; }; -static const struct msr_range msr_range_array[] __cpuinitconst = { +static const struct msr_range msr_range_array[] = { { 0x00000000, 0x00000418}, { 0xc0000000, 0xc000040b}, { 0xc0010000, 0xc0010142}, { 0xc0011000, 0xc001103b}, }; -static void __cpuinit __print_cpu_msr(void) +static void __print_cpu_msr(void) { unsigned index_min, index_max; unsigned index; @@ -1001,7 +1001,7 @@ static void __cpuinit __print_cpu_msr(void) } } -static int show_msr __cpuinitdata; +static int show_msr; static __init int setup_show_msr(char *arg) { @@ -1022,7 +1022,7 @@ static __init int setup_noclflush(char *arg) } __setup("noclflush", setup_noclflush); -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) +void print_cpu_info(struct cpuinfo_x86 *c) { const char *vendor = NULL; @@ -1051,7 +1051,7 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) print_cpu_msr(c); } -void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c) +void print_cpu_msr(struct cpuinfo_x86 *c) { if (c->cpu_index < show_msr) __print_cpu_msr(); @@ -1076,7 +1076,7 @@ struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) debug_idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, - irq_stack_union) __aligned(PAGE_SIZE); + irq_stack_union) __aligned(PAGE_SIZE) __visible; /* * The following four percpu variables are hot. Align current_task to @@ -1093,7 +1093,7 @@ EXPORT_PER_CPU_SYMBOL(kernel_stack); DEFINE_PER_CPU(char *, irq_stack_ptr) = init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; -DEFINE_PER_CPU(unsigned int, irq_count) = -1; +DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); @@ -1216,7 +1216,7 @@ static void dbg_restore_debug_regs(void) */ #ifdef CONFIG_X86_64 -void __cpuinit cpu_init(void) +void cpu_init(void) { struct orig_ist *oist; struct task_struct *me; @@ -1315,7 +1315,7 @@ void __cpuinit cpu_init(void) #else -void __cpuinit cpu_init(void) +void cpu_init(void) { int cpu = smp_processor_id(); struct task_struct *curr = current; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 7582f475b16..d0969c75ab5 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -15,7 +15,7 @@ /* * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU */ -static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned char ccr2, ccr3; @@ -44,7 +44,7 @@ static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) } } -static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned long flags; @@ -59,25 +59,25 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) * Actually since bugs.h doesn't even reference this perhaps someone should * fix the documentation ??? */ -static unsigned char Cx86_dir0_msb __cpuinitdata = 0; +static unsigned char Cx86_dir0_msb = 0; -static const char __cpuinitconst Cx86_model[][9] = { +static const char Cx86_model[][9] = { "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", "M II ", "Unknown" }; -static const char __cpuinitconst Cx486_name[][5] = { +static const char Cx486_name[][5] = { "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", "SRx2", "DRx2" }; -static const char __cpuinitconst Cx486S_name[][4] = { +static const char Cx486S_name[][4] = { "S", "S2", "Se", "S2e" }; -static const char __cpuinitconst Cx486D_name[][4] = { +static const char Cx486D_name[][4] = { "DX", "DX2", "?", "?", "?", "DX4" }; -static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock"; -static const char __cpuinitconst cyrix_model_mult1[] = "12??43"; -static const char __cpuinitconst cyrix_model_mult2[] = "12233445"; +static char Cx86_cb[] = "?.5x Core/Bus Clock"; +static const char cyrix_model_mult1[] = "12??43"; +static const char cyrix_model_mult2[] = "12233445"; /* * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old @@ -87,7 +87,7 @@ static const char __cpuinitconst cyrix_model_mult2[] = "12233445"; * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP */ -static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c) +static void check_cx686_slop(struct cpuinfo_x86 *c) { unsigned long flags; @@ -112,7 +112,7 @@ static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c) } -static void __cpuinit set_cx86_reorder(void) +static void set_cx86_reorder(void) { u8 ccr3; @@ -127,7 +127,7 @@ static void __cpuinit set_cx86_reorder(void) setCx86(CX86_CCR3, ccr3); } -static void __cpuinit set_cx86_memwb(void) +static void set_cx86_memwb(void) { printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); @@ -143,7 +143,7 @@ static void __cpuinit set_cx86_memwb(void) * Configure later MediaGX and/or Geode processor. */ -static void __cpuinit geode_configure(void) +static void geode_configure(void) { unsigned long flags; u8 ccr3; @@ -166,7 +166,7 @@ static void __cpuinit geode_configure(void) local_irq_restore(flags); } -static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c) +static void early_init_cyrix(struct cpuinfo_x86 *c) { unsigned char dir0, dir0_msn, dir1 = 0; @@ -185,7 +185,7 @@ static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c) } } -static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) +static void init_cyrix(struct cpuinfo_x86 *c) { unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; char *buf = c->x86_model_id; @@ -356,7 +356,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) /* * Handle National Semiconductor branded processors */ -static void __cpuinit init_nsc(struct cpuinfo_x86 *c) +static void init_nsc(struct cpuinfo_x86 *c) { /* * There may be GX1 processors in the wild that are branded @@ -405,7 +405,7 @@ static inline int test_cyrix_52div(void) return (unsigned char) (test >> 8) == 0x02; } -static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) +static void cyrix_identify(struct cpuinfo_x86 *c) { /* Detect Cyrix with disabled CPUID */ if (c->x86 == 4 && test_cyrix_52div()) { @@ -441,7 +441,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) } } -static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = { +static const struct cpu_dev cyrix_cpu_dev = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, .c_early_init = early_init_cyrix, @@ -452,7 +452,7 @@ static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = { cpu_dev_register(cyrix_cpu_dev); -static const struct cpu_dev __cpuinitconst nsc_cpu_dev = { +static const struct cpu_dev nsc_cpu_dev = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, .c_init = init_nsc, diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 1e7e84a02eb..36ce402a3fa 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -25,11 +25,6 @@ #include <asm/processor.h> #include <asm/hypervisor.h> -/* - * Hypervisor detect order. This is specified explicitly here because - * some hypervisors might implement compatibility modes for other - * hypervisors and therefore need to be detected in specific sequence. - */ static const __initconst struct hypervisor_x86 * const hypervisors[] = { #ifdef CONFIG_XEN_PVHVM @@ -49,18 +44,22 @@ static inline void __init detect_hypervisor_vendor(void) { const struct hypervisor_x86 *h, * const *p; + uint32_t pri, max_pri = 0; for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { h = *p; - if (h->detect()) { + pri = h->detect(); + if (pri != 0 && pri > max_pri) { + max_pri = pri; x86_hyper = h; - printk(KERN_INFO "Hypervisor detected: %s\n", h->name); - break; } } + + if (max_pri) + printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name); } -void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) +void init_hypervisor(struct cpuinfo_x86 *c) { if (x86_hyper && x86_hyper->set_cpu_features) x86_hyper->set_cpu_features(c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 9b0c441c03f..ec7299566f7 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -26,7 +26,7 @@ #include <asm/apic.h> #endif -static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) +static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -163,7 +163,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) * This is called before we do cpu ident work */ -int __cpuinit ppro_with_ram_bug(void) +int ppro_with_ram_bug(void) { /* Uses data from early_cpu_detect now */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && @@ -176,7 +176,7 @@ int __cpuinit ppro_with_ram_bug(void) return 0; } -static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) +static void intel_smp_check(struct cpuinfo_x86 *c) { /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) @@ -196,7 +196,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) } } -static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) +static void intel_workarounds(struct cpuinfo_x86 *c) { unsigned long lo, hi; @@ -275,12 +275,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) intel_smp_check(c); } #else -static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) +static void intel_workarounds(struct cpuinfo_x86 *c) { } #endif -static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) +static void srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA unsigned node; @@ -300,7 +300,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) /* * find out the number of processor cores on the die */ -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) +static int intel_num_cpu_cores(struct cpuinfo_x86 *c) { unsigned int eax, ebx, ecx, edx; @@ -315,7 +315,7 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) return 1; } -static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c) +static void detect_vmx_virtcap(struct cpuinfo_x86 *c) { /* Intel VMX MSR indicated features */ #define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 @@ -353,7 +353,7 @@ static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c) } } -static void __cpuinit init_intel(struct cpuinfo_x86 *c) +static void init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; @@ -472,7 +472,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_32 -static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) +static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) { /* * Intel PIII Tualatin. This comes in two flavours. @@ -506,7 +506,7 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i #define STLB_4K 0x41 -static const struct _tlb_table intel_tlb_table[] __cpuinitconst = { +static const struct _tlb_table intel_tlb_table[] = { { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" }, { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" }, @@ -536,7 +536,7 @@ static const struct _tlb_table intel_tlb_table[] __cpuinitconst = { { 0x00, 0, 0 } }; -static void __cpuinit intel_tlb_lookup(const unsigned char desc) +static void intel_tlb_lookup(const unsigned char desc) { unsigned char k; if (desc == 0) @@ -605,7 +605,7 @@ static void __cpuinit intel_tlb_lookup(const unsigned char desc) } } -static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) +static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) { switch ((c->x86 << 8) + c->x86_model) { case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ @@ -634,7 +634,7 @@ static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) } } -static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c) +static void intel_detect_tlb(struct cpuinfo_x86 *c) { int i, j, n; unsigned int regs[4]; @@ -661,7 +661,7 @@ static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c) intel_tlb_flushall_shift_set(c); } -static const struct cpu_dev __cpuinitconst intel_cpu_dev = { +static const struct cpu_dev intel_cpu_dev = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 8dc72dda66f..1414c90feab 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -37,7 +37,7 @@ struct _cache_table { /* All the cache descriptor types we care about (no TLB or trace cache entries) */ -static const struct _cache_table __cpuinitconst cache_table[] = +static const struct _cache_table cache_table[] = { { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ @@ -203,7 +203,7 @@ union l3_cache { unsigned val; }; -static const unsigned short __cpuinitconst assocs[] = { +static const unsigned short assocs[] = { [1] = 1, [2] = 2, [4] = 4, @@ -217,10 +217,10 @@ static const unsigned short __cpuinitconst assocs[] = { [0xf] = 0xffff /* fully associative - no way to show this currently */ }; -static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; -static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 }; +static const unsigned char levels[] = { 1, 1, 2, 3 }; +static const unsigned char types[] = { 1, 2, 3, 3 }; -static void __cpuinit +static void amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, union _cpuid4_leaf_ebx *ebx, union _cpuid4_leaf_ecx *ecx) @@ -302,7 +302,7 @@ struct _cache_attr { /* * L3 cache descriptors */ -static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb) +static void amd_calc_l3_indices(struct amd_northbridge *nb) { struct amd_l3_cache *l3 = &nb->l3_cache; unsigned int sc0, sc1, sc2, sc3; @@ -325,7 +325,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb) l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } -static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) +static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) { int node; @@ -528,8 +528,7 @@ static struct _cache_attr subcaches = #endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ static int -__cpuinit cpuid4_cache_lookup_regs(int index, - struct _cpuid4_info_regs *this_leaf) +cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; @@ -560,7 +559,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, return 0; } -static int __cpuinit find_num_cache_leaves(struct cpuinfo_x86 *c) +static int find_num_cache_leaves(struct cpuinfo_x86 *c) { unsigned int eax, ebx, ecx, edx, op; union _cpuid4_leaf_eax cache_eax; @@ -580,7 +579,7 @@ static int __cpuinit find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } -void __cpuinit init_amd_cacheinfo(struct cpuinfo_x86 *c) +void init_amd_cacheinfo(struct cpuinfo_x86 *c) { if (cpu_has_topoext) { @@ -593,7 +592,7 @@ void __cpuinit init_amd_cacheinfo(struct cpuinfo_x86 *c) } } -unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) +unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; @@ -744,7 +743,7 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); #ifdef CONFIG_SMP -static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) +static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf; int i, sibling; @@ -793,7 +792,7 @@ static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) return 1; } -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +static void cache_shared_cpu_map_setup(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf, *sibling_leaf; unsigned long num_threads_sharing; @@ -828,7 +827,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) } } } -static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) +static void cache_remove_shared_cpu_map(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf, *sibling_leaf; int sibling; @@ -841,16 +840,16 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) } } #else -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +static void cache_shared_cpu_map_setup(unsigned int cpu, int index) { } -static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) +static void cache_remove_shared_cpu_map(unsigned int cpu, int index) { } #endif -static void __cpuinit free_cache_attributes(unsigned int cpu) +static void free_cache_attributes(unsigned int cpu) { int i; @@ -861,7 +860,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) per_cpu(ici_cpuid4_info, cpu) = NULL; } -static void __cpuinit get_cpu_leaves(void *_retval) +static void get_cpu_leaves(void *_retval) { int j, *retval = _retval, cpu = smp_processor_id(); @@ -881,7 +880,7 @@ static void __cpuinit get_cpu_leaves(void *_retval) } } -static int __cpuinit detect_cache_attributes(unsigned int cpu) +static int detect_cache_attributes(unsigned int cpu) { int retval; @@ -1015,7 +1014,7 @@ static struct attribute *default_attrs[] = { }; #ifdef CONFIG_AMD_NB -static struct attribute ** __cpuinit amd_l3_attrs(void) +static struct attribute **amd_l3_attrs(void) { static struct attribute **attrs; int n; @@ -1091,7 +1090,7 @@ static struct kobj_type ktype_percpu_entry = { .sysfs_ops = &sysfs_ops, }; -static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) +static void cpuid4_cache_sysfs_exit(unsigned int cpu) { kfree(per_cpu(ici_cache_kobject, cpu)); kfree(per_cpu(ici_index_kobject, cpu)); @@ -1100,7 +1099,7 @@ static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) free_cache_attributes(cpu); } -static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) +static int cpuid4_cache_sysfs_init(unsigned int cpu) { int err; @@ -1132,7 +1131,7 @@ err_out: static DECLARE_BITMAP(cache_dev_map, NR_CPUS); /* Add/Remove cache interface for CPU device */ -static int __cpuinit cache_add_dev(struct device *dev) +static int cache_add_dev(struct device *dev) { unsigned int cpu = dev->id; unsigned long i, j; @@ -1183,7 +1182,7 @@ static int __cpuinit cache_add_dev(struct device *dev) return 0; } -static void __cpuinit cache_remove_dev(struct device *dev) +static void cache_remove_dev(struct device *dev) { unsigned int cpu = dev->id; unsigned long i; @@ -1200,8 +1199,8 @@ static void __cpuinit cache_remove_dev(struct device *dev) cpuid4_cache_sysfs_exit(cpu); } -static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int cacheinfo_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct device *dev; @@ -1220,7 +1219,7 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = { +static struct notifier_block cacheinfo_cpu_notifier = { .notifier_call = cacheinfo_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 5b7d4fa5d3b..09edd0b65fe 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -25,15 +25,18 @@ int mce_severity(struct mce *a, int tolerant, char **msg); struct dentry *mce_get_debugfs_dir(void); extern struct mce_bank *mce_banks; +extern mce_banks_t mce_banks_ce_disabled; #ifdef CONFIG_X86_MCE_INTEL unsigned long mce_intel_adjust_timer(unsigned long interval); void mce_intel_cmci_poll(void); void mce_intel_hcpu_update(unsigned long cpu); +void cmci_disable_bank(int bank); #else # define mce_intel_adjust_timer mce_adjust_timer_default static inline void mce_intel_cmci_poll(void) { } static inline void mce_intel_hcpu_update(unsigned long cpu) { } +static inline void cmci_disable_bank(int bank) { } #endif void mce_timer_kick(unsigned long interval); diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index e2703520d12..c370e1c4468 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -111,8 +111,8 @@ static struct severity { #ifdef CONFIG_MEMORY_FAILURE MCESEV( KEEP, "Action required but unaffected thread is continuable", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR), - MCGMASK(MCG_STATUS_RIPV, MCG_STATUS_RIPV) + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR), + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) ), MCESEV( AR, "Action required: data load error in a user process", diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index bf49cdbb010..b3218cdee95 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -97,6 +97,15 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; +/* + * MCA banks controlled through firmware first for corrected errors. + * This is a global list of banks for which we won't enable CMCI and we + * won't poll. Firmware controls these banks and is responsible for + * reporting corrected errors through GHES. Uncorrected/recoverable + * errors are still notified through a machine check. + */ +mce_banks_t mce_banks_ce_disabled; + static DEFINE_PER_CPU(struct work_struct, mce_work); static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); @@ -1363,7 +1372,7 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); -static int __cpuinit __mcheck_cpu_mce_banks_init(void) +static int __mcheck_cpu_mce_banks_init(void) { int i; u8 num_banks = mca_cfg.banks; @@ -1384,7 +1393,7 @@ static int __cpuinit __mcheck_cpu_mce_banks_init(void) /* * Initialize Machine Checks for a CPU. */ -static int __cpuinit __mcheck_cpu_cap_init(void) +static int __mcheck_cpu_cap_init(void) { unsigned b; u64 cap; @@ -1483,7 +1492,7 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) } /* Add per CPU specific workarounds here */ -static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { struct mca_config *cfg = &mca_cfg; @@ -1593,7 +1602,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) return 0; } -static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) +static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) return 0; @@ -1664,7 +1673,7 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = * Called for each booted CPU to set up machine checks. * Must be called with preempt off: */ -void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) +void mcheck_cpu_init(struct cpuinfo_x86 *c) { if (mca_cfg.disabled) return; @@ -1935,6 +1944,25 @@ static struct miscdevice mce_chrdev_device = { &mce_chrdev_ops, }; +static void __mce_disable_bank(void *arg) +{ + int bank = *((int *)arg); + __clear_bit(bank, __get_cpu_var(mce_poll_banks)); + cmci_disable_bank(bank); +} + +void mce_disable_bank(int bank) +{ + if (bank >= mca_cfg.banks) { + pr_warn(FW_BUG + "Ignoring request to disable invalid MCA bank %d.\n", + bank); + return; + } + set_bit(bank, mce_banks_ce_disabled); + on_each_cpu(__mce_disable_bank, &bank, 1); +} + /* * mce=off Disables machine check * mce=no_cmci Disables CMCI @@ -2082,7 +2110,6 @@ static struct bus_type mce_subsys = { DEFINE_PER_CPU(struct device *, mce_device); -__cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) @@ -2228,7 +2255,7 @@ static void mce_device_release(struct device *dev) } /* Per cpu device init. All of the cpus still share the same ctrl bank: */ -static __cpuinit int mce_device_create(unsigned int cpu) +static int mce_device_create(unsigned int cpu) { struct device *dev; int err; @@ -2274,7 +2301,7 @@ error: return err; } -static __cpuinit void mce_device_remove(unsigned int cpu) +static void mce_device_remove(unsigned int cpu) { struct device *dev = per_cpu(mce_device, cpu); int i; @@ -2294,7 +2321,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu) } /* Make sure there are no machine checks on offlined CPUs. */ -static void __cpuinit mce_disable_cpu(void *h) +static void mce_disable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; @@ -2312,7 +2339,7 @@ static void __cpuinit mce_disable_cpu(void *h) } } -static void __cpuinit mce_reenable_cpu(void *h) +static void mce_reenable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; @@ -2331,7 +2358,7 @@ static void __cpuinit mce_reenable_cpu(void *h) } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int __cpuinit +static int mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -2367,7 +2394,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block mce_cpu_notifier __cpuinitdata = { +static struct notifier_block mce_cpu_notifier = { .notifier_call = mce_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9cb52767999..603df4f7464 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -458,10 +458,8 @@ static struct kobj_type threshold_ktype = { .default_attrs = default_attrs, }; -static __cpuinit int allocate_threshold_blocks(unsigned int cpu, - unsigned int bank, - unsigned int block, - u32 address) +static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, + unsigned int block, u32 address) { struct threshold_block *b = NULL; u32 low, high; @@ -543,7 +541,7 @@ out_free: return err; } -static __cpuinit int __threshold_add_blocks(struct threshold_bank *b) +static int __threshold_add_blocks(struct threshold_bank *b) { struct list_head *head = &b->blocks->miscj; struct threshold_block *pos = NULL; @@ -567,7 +565,7 @@ static __cpuinit int __threshold_add_blocks(struct threshold_bank *b) return err; } -static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) +static int threshold_create_bank(unsigned int cpu, unsigned int bank) { struct device *dev = per_cpu(mce_device, cpu); struct amd_northbridge *nb = NULL; @@ -632,7 +630,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) } /* create dir/files for all valid threshold banks */ -static __cpuinit int threshold_create_device(unsigned int cpu) +static int threshold_create_device(unsigned int cpu) { unsigned int bank; struct threshold_bank **bp; @@ -736,7 +734,7 @@ static void threshold_remove_device(unsigned int cpu) } /* get notified when a cpu comes on/off */ -static void __cpuinit +static void amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) { switch (action) { diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index d56405309dc..4cfe0458ca6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -203,6 +203,10 @@ static void cmci_discover(int banks) if (test_bit(i, owned)) continue; + /* Skip banks in firmware first mode */ + if (test_bit(i, mce_banks_ce_disabled)) + continue; + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ @@ -271,6 +275,19 @@ void cmci_recheck(void) local_irq_restore(flags); } +/* Caller must hold the lock on cmci_discover_lock */ +static void __cmci_disable_bank(int bank) +{ + u64 val; + + if (!test_bit(bank, __get_cpu_var(mce_banks_owned))) + return; + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + __clear_bit(bank, __get_cpu_var(mce_banks_owned)); +} + /* * Disable CMCI on this CPU for all banks it owns when it goes down. * This allows other CPUs to claim the banks on rediscovery. @@ -280,20 +297,12 @@ void cmci_clear(void) unsigned long flags; int i; int banks; - u64 val; if (!cmci_supported(&banks)) return; raw_spin_lock_irqsave(&cmci_discover_lock, flags); - for (i = 0; i < banks; i++) { - if (!test_bit(i, __get_cpu_var(mce_banks_owned))) - continue; - /* Disable CMCI */ - rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(i), val); - __clear_bit(i, __get_cpu_var(mce_banks_owned)); - } + for (i = 0; i < banks; i++) + __cmci_disable_bank(i); raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } @@ -327,6 +336,19 @@ void cmci_reenable(void) cmci_discover(banks); } +void cmci_disable_bank(int bank) +{ + int banks; + unsigned long flags; + + if (!cmci_supported(&banks)) + return; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + __cmci_disable_bank(bank); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + static void intel_init_cmci(void) { int banks; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 41e8e00a663..3eec7de76ef 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -240,8 +240,7 @@ __setup("int_pln_enable", int_pln_enable_setup); #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ -static __cpuinit int thermal_throttle_add_dev(struct device *dev, - unsigned int cpu) +static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) { int err; struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -267,7 +266,7 @@ static __cpuinit int thermal_throttle_add_dev(struct device *dev, return err; } -static __cpuinit void thermal_throttle_remove_dev(struct device *dev) +static void thermal_throttle_remove_dev(struct device *dev) { sysfs_remove_group(&dev->kobj, &thermal_attr_group); } @@ -276,7 +275,7 @@ static __cpuinit void thermal_throttle_remove_dev(struct device *dev) static DEFINE_MUTEX(therm_cpu_lock); /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int +static int thermal_throttle_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -307,7 +306,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, return notifier_from_errno(err); } -static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = +static struct notifier_block thermal_throttle_cpu_notifier = { .notifier_call = thermal_throttle_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 8f4be53ea04..71a39f3621b 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -27,20 +27,23 @@ struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); -static bool __init ms_hyperv_platform(void) +static uint32_t __init ms_hyperv_platform(void) { u32 eax; u32 hyp_signature[3]; if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return false; + return 0; cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); - return eax >= HYPERV_CPUID_MIN && - eax <= HYPERV_CPUID_MAX && - !memcmp("Microsoft Hv", hyp_signature, 12); + if (eax >= HYPERV_CPUID_MIN && + eax <= HYPERV_CPUID_MAX && + !memcmp("Microsoft Hv", hyp_signature, 12)) + return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; + + return 0; } static cycle_t read_hv_clock(struct clocksource *arg) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9e581c5cf6d..8355c84b972 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1295,7 +1295,7 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) struct event_constraint emptyconstraint; struct event_constraint unconstrained; -static int __cpuinit +static int x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; @@ -1884,6 +1884,7 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { userpg->cap_usr_time = 0; + userpg->cap_usr_time_zero = 0; userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc; userpg->pmc_width = x86_pmu.cntval_bits; @@ -1897,6 +1898,11 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) userpg->time_mult = this_cpu_read(cyc2ns); userpg->time_shift = CYC2NS_SCALE_FACTOR; userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; + + if (sched_clock_stable && !check_tsc_disabled()) { + userpg->cap_usr_time_zero = 1; + userpg->time_zero = this_cpu_read(cyc2ns_offset); + } } /* diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 97e557bc4c9..cc16faae053 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -641,6 +641,8 @@ extern struct event_constraint intel_core2_pebs_event_constraints[]; extern struct event_constraint intel_atom_pebs_event_constraints[]; +extern struct event_constraint intel_slm_pebs_event_constraints[]; + extern struct event_constraint intel_nehalem_pebs_event_constraints[]; extern struct event_constraint intel_westmere_pebs_event_constraints[]; diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 4cbe03287b0..beeb7cc0704 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -347,8 +347,7 @@ static struct amd_nb *amd_alloc_nb(int cpu) struct amd_nb *nb; int i; - nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); + nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu)); if (!nb) return NULL; diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 5f0581e713c..e09f0bfb7b8 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -851,7 +851,7 @@ static void clear_APIC_ibs(void *dummy) setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); } -static int __cpuinit +static int perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index c0c661adf03..754291adec3 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -288,13 +288,13 @@ static struct pmu amd_l2_pmu = { .read = amd_uncore_read, }; -static struct amd_uncore * __cpuinit amd_uncore_alloc(unsigned int cpu) +static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) { return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL, cpu_to_node(cpu)); } -static void __cpuinit amd_uncore_cpu_up_prepare(unsigned int cpu) +static void amd_uncore_cpu_up_prepare(unsigned int cpu) { struct amd_uncore *uncore; @@ -322,8 +322,8 @@ static void __cpuinit amd_uncore_cpu_up_prepare(unsigned int cpu) } static struct amd_uncore * -__cpuinit amd_uncore_find_online_sibling(struct amd_uncore *this, - struct amd_uncore * __percpu *uncores) +amd_uncore_find_online_sibling(struct amd_uncore *this, + struct amd_uncore * __percpu *uncores) { unsigned int cpu; struct amd_uncore *that; @@ -348,7 +348,7 @@ __cpuinit amd_uncore_find_online_sibling(struct amd_uncore *this, return this; } -static void __cpuinit amd_uncore_cpu_starting(unsigned int cpu) +static void amd_uncore_cpu_starting(unsigned int cpu) { unsigned int eax, ebx, ecx, edx; struct amd_uncore *uncore; @@ -376,8 +376,8 @@ static void __cpuinit amd_uncore_cpu_starting(unsigned int cpu) } } -static void __cpuinit uncore_online(unsigned int cpu, - struct amd_uncore * __percpu *uncores) +static void uncore_online(unsigned int cpu, + struct amd_uncore * __percpu *uncores) { struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); @@ -388,7 +388,7 @@ static void __cpuinit uncore_online(unsigned int cpu, cpumask_set_cpu(cpu, uncore->active_mask); } -static void __cpuinit amd_uncore_cpu_online(unsigned int cpu) +static void amd_uncore_cpu_online(unsigned int cpu) { if (amd_uncore_nb) uncore_online(cpu, amd_uncore_nb); @@ -397,8 +397,8 @@ static void __cpuinit amd_uncore_cpu_online(unsigned int cpu) uncore_online(cpu, amd_uncore_l2); } -static void __cpuinit uncore_down_prepare(unsigned int cpu, - struct amd_uncore * __percpu *uncores) +static void uncore_down_prepare(unsigned int cpu, + struct amd_uncore * __percpu *uncores) { unsigned int i; struct amd_uncore *this = *per_cpu_ptr(uncores, cpu); @@ -423,7 +423,7 @@ static void __cpuinit uncore_down_prepare(unsigned int cpu, } } -static void __cpuinit amd_uncore_cpu_down_prepare(unsigned int cpu) +static void amd_uncore_cpu_down_prepare(unsigned int cpu) { if (amd_uncore_nb) uncore_down_prepare(cpu, amd_uncore_nb); @@ -432,8 +432,7 @@ static void __cpuinit amd_uncore_cpu_down_prepare(unsigned int cpu) uncore_down_prepare(cpu, amd_uncore_l2); } -static void __cpuinit uncore_dead(unsigned int cpu, - struct amd_uncore * __percpu *uncores) +static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) { struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); @@ -445,7 +444,7 @@ static void __cpuinit uncore_dead(unsigned int cpu, *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; } -static void __cpuinit amd_uncore_cpu_dead(unsigned int cpu) +static void amd_uncore_cpu_dead(unsigned int cpu) { if (amd_uncore_nb) uncore_dead(cpu, amd_uncore_nb); @@ -454,7 +453,7 @@ static void __cpuinit amd_uncore_cpu_dead(unsigned int cpu) uncore_dead(cpu, amd_uncore_l2); } -static int __cpuinit +static int amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -489,7 +488,7 @@ amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action, return NOTIFY_OK; } -static struct notifier_block amd_uncore_cpu_notifier_block __cpuinitdata = { +static struct notifier_block amd_uncore_cpu_notifier_block = { .notifier_call = amd_uncore_cpu_notifier, .priority = CPU_PRI_PERF + 1, }; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index fbc9210b45b..0abf6742a8b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -81,7 +81,8 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), EVENT_EXTRA_END }; @@ -143,8 +144,9 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = static struct extra_reg intel_westmere_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), EVENT_EXTRA_END }; @@ -162,16 +164,27 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +static struct event_constraint intel_slm_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + static struct extra_reg intel_snb_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), EVENT_EXTRA_END }; static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), EVENT_EXTRA_END }; @@ -882,6 +895,140 @@ static __initconst const u64 atom_hw_cache_event_ids }, }; +static struct extra_reg intel_slm_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffff, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffff, RSP_1), + EVENT_EXTRA_END +}; + +#define SLM_DMND_READ SNB_DMND_DATA_RD +#define SLM_DMND_WRITE SNB_DMND_RFO +#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM) +#define SLM_LLC_ACCESS SNB_RESP_ANY +#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM) + +static __initconst const u64 slm_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_READ|SLM_LLC_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS, + }, + }, +}; + +static __initconst const u64 slm_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */ + [ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) { /* user explicitly requested branch sampling */ @@ -1301,11 +1448,11 @@ static void intel_fixup_er(struct perf_event *event, int idx) if (idx == EXTRA_REG_RSP_0) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= 0x01b7; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; } else if (idx == EXTRA_REG_RSP_1) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= 0x01bb; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; } } @@ -2176,6 +2323,21 @@ __init int intel_pmu_init(void) pr_cont("Atom events, "); break; + case 55: /* Atom 22nm "Silvermont" */ + memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_atom(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; + x86_pmu.extra_regs = intel_slm_extra_regs; + x86_pmu.er_flags |= ERF_HAS_RSP_1; + pr_cont("Silvermont events, "); + break; + case 37: /* 32 nm nehalem, "Clarkdale" */ case 44: /* 32 nm nehalem, "Gulftown" */ case 47: /* 32 nm Xeon E7 */ @@ -2270,6 +2432,7 @@ __init int intel_pmu_init(void) case 70: case 71: case 63: + case 69: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 3065c57a63c..63438aad177 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -224,7 +224,7 @@ static int alloc_pebs_buffer(int cpu) if (!x86_pmu.pebs) return 0; - buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -262,7 +262,7 @@ static int alloc_bts_buffer(int cpu) if (!x86_pmu.bts) return 0; - buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -295,7 +295,7 @@ static int alloc_ds_buffer(int cpu) int node = cpu_to_node(cpu); struct debug_store *ds; - ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); + ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); if (unlikely(!ds)) return -ENOMEM; @@ -517,6 +517,32 @@ struct event_constraint intel_atom_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_slm_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x0103, 0x1), /* REHABQ.LD_BLOCK_ST_FORWARD_PS */ + INTEL_UEVENT_CONSTRAINT(0x0803, 0x1), /* REHABQ.LD_SPLITS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0204, 0x1), /* MEM_UOPS_RETIRED.L2_HIT_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0404, 0x1), /* MEM_UOPS_RETIRED.L2_MISS_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0804, 0x1), /* MEM_UOPS_RETIRED.DTLB_MISS_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x2004, 0x1), /* MEM_UOPS_RETIRED.HITM_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c4, 0x1), /* BR_INST_RETIRED.ALL_BRANCHES_PS */ + INTEL_UEVENT_CONSTRAINT(0x7ec4, 0x1), /* BR_INST_RETIRED.JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0xbfc4, 0x1), /* BR_INST_RETIRED.FAR_BRANCH_PS */ + INTEL_UEVENT_CONSTRAINT(0xebc4, 0x1), /* BR_INST_RETIRED.NON_RETURN_IND_PS */ + INTEL_UEVENT_CONSTRAINT(0xf7c4, 0x1), /* BR_INST_RETIRED.RETURN_PS */ + INTEL_UEVENT_CONSTRAINT(0xf9c4, 0x1), /* BR_INST_RETIRED.CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfbc4, 0x1), /* BR_INST_RETIRED.IND_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfdc4, 0x1), /* BR_INST_RETIRED.REL_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfec4, 0x1), /* BR_INST_RETIRED.TAKEN_JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_MISP_RETIRED.ALL_BRANCHES_PS */ + INTEL_UEVENT_CONSTRAINT(0x7ec5, 0x1), /* BR_INST_MISP_RETIRED.JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0xebc5, 0x1), /* BR_INST_MISP_RETIRED.NON_RETURN_IND_PS */ + INTEL_UEVENT_CONSTRAINT(0xf7c5, 0x1), /* BR_INST_MISP_RETIRED.RETURN_PS */ + INTEL_UEVENT_CONSTRAINT(0xfbc5, 0x1), /* BR_INST_MISP_RETIRED.IND_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfec5, 0x1), /* BR_INST_MISP_RETIRED.TAKEN_JCC_PS */ + EVENT_CONSTRAINT_END +}; + struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 9dd99751ccf..fd8011ed4dc 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -6,6 +6,8 @@ static struct intel_uncore_type **pci_uncores = empty_uncore; /* pci bus to socket mapping */ static int pcibus_to_physid[256] = { [0 ... 255] = -1, }; +static struct pci_dev *extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; + static DEFINE_RAW_SPINLOCK(uncore_box_lock); /* mask of cpus that collect uncore events */ @@ -45,6 +47,24 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31"); +DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31"); +DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) { @@ -281,7 +301,7 @@ static struct attribute *snbep_uncore_cbox_formats_attr[] = { }; static struct attribute *snbep_uncore_pcu_formats_attr[] = { - &format_attr_event.attr, + &format_attr_event_ext.attr, &format_attr_occ_sel.attr, &format_attr_edge.attr, &format_attr_inv.attr, @@ -301,6 +321,24 @@ static struct attribute *snbep_uncore_qpi_formats_attr[] = { &format_attr_edge.attr, &format_attr_inv.attr, &format_attr_thresh8.attr, + &format_attr_match_rds.attr, + &format_attr_match_rnid30.attr, + &format_attr_match_rnid4.attr, + &format_attr_match_dnid.attr, + &format_attr_match_mc.attr, + &format_attr_match_opc.attr, + &format_attr_match_vnw.attr, + &format_attr_match0.attr, + &format_attr_match1.attr, + &format_attr_mask_rds.attr, + &format_attr_mask_rnid30.attr, + &format_attr_mask_rnid4.attr, + &format_attr_mask_dnid.attr, + &format_attr_mask_mc.attr, + &format_attr_mask_opc.attr, + &format_attr_mask_vnw.attr, + &format_attr_mask0.attr, + &format_attr_mask1.attr, NULL, }; @@ -314,8 +352,8 @@ static struct uncore_event_desc snbep_uncore_imc_events[] = { static struct uncore_event_desc snbep_uncore_qpi_events[] = { INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), - INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"), - INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"), + INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"), { /* end: all zeroes */ }, }; @@ -356,13 +394,16 @@ static struct intel_uncore_ops snbep_uncore_msr_ops = { SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), }; +#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \ + .init_box = snbep_uncore_pci_init_box, \ + .disable_box = snbep_uncore_pci_disable_box, \ + .enable_box = snbep_uncore_pci_enable_box, \ + .disable_event = snbep_uncore_pci_disable_event, \ + .read_counter = snbep_uncore_pci_read_counter + static struct intel_uncore_ops snbep_uncore_pci_ops = { - .init_box = snbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = snbep_uncore_pci_disable_event, - .enable_event = snbep_uncore_pci_enable_event, - .read_counter = snbep_uncore_pci_read_counter, + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_uncore_pci_enable_event, \ }; static struct event_constraint snbep_uncore_cbox_constraints[] = { @@ -726,6 +767,61 @@ static struct intel_uncore_type *snbep_msr_uncores[] = { NULL, }; +enum { + SNBEP_PCI_QPI_PORT0_FILTER, + SNBEP_PCI_QPI_PORT1_FILTER, +}; + +static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) { + reg1->idx = 0; + reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0; + reg1->config = event->attr.config1; + reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0; + reg2->config = event->attr.config2; + } + return 0; +} + +static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; + struct pci_dev *filter_pdev = extra_pci_dev[box->phys_id][idx]; + WARN_ON_ONCE(!filter_pdev); + if (filter_pdev) { + pci_write_config_dword(filter_pdev, reg1->reg, + (u32)reg1->config); + pci_write_config_dword(filter_pdev, reg1->reg + 4, + (u32)(reg1->config >> 32)); + pci_write_config_dword(filter_pdev, reg2->reg, + (u32)reg2->config); + pci_write_config_dword(filter_pdev, reg2->reg + 4, + (u32)(reg2->config >> 32)); + } + } + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops snbep_uncore_qpi_ops = { + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_qpi_enable_event, + .hw_config = snbep_qpi_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + #define SNBEP_UNCORE_PCI_COMMON_INIT() \ .perf_ctr = SNBEP_PCI_PMON_CTR0, \ .event_ctl = SNBEP_PCI_PMON_CTL0, \ @@ -755,17 +851,18 @@ static struct intel_uncore_type snbep_uncore_imc = { }; static struct intel_uncore_type snbep_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .ops = &snbep_uncore_pci_ops, - .event_descs = snbep_uncore_qpi_events, - .format_group = &snbep_uncore_qpi_format_group, + .name = "qpi", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .event_descs = snbep_uncore_qpi_events, + .format_group = &snbep_uncore_qpi_format_group, }; @@ -807,43 +904,53 @@ static struct intel_uncore_type *snbep_pci_uncores[] = { static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { { /* Home Agent */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), - .driver_data = SNBEP_PCI_UNCORE_HA, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0), }, { /* MC Channel 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0), }, { /* MC Channel 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1), }, { /* MC Channel 2 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2), }, { /* MC Channel 3 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3), }, { /* QPI Port 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), - .driver_data = SNBEP_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0), }, { /* QPI Port 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), - .driver_data = SNBEP_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1), }, { /* R2PCIe */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), - .driver_data = SNBEP_PCI_UNCORE_R2PCIE, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0), }, { /* R3QPI Link 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), - .driver_data = SNBEP_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0), }, { /* R3QPI Link 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), - .driver_data = SNBEP_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), }, { /* end: all zeroes */ } }; @@ -1256,71 +1363,71 @@ static struct intel_uncore_type *ivt_pci_uncores[] = { static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = { { /* Home Agent 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30), - .driver_data = IVT_PCI_UNCORE_HA, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 0), }, { /* Home Agent 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38), - .driver_data = IVT_PCI_UNCORE_HA, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 1), }, { /* MC0 Channel 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 0), }, { /* MC0 Channel 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 1), }, { /* MC0 Channel 3 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 2), }, { /* MC0 Channel 4 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 3), }, { /* MC1 Channel 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 4), }, { /* MC1 Channel 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 5), }, { /* MC1 Channel 3 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 6), }, { /* MC1 Channel 4 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7), }, { /* QPI0 Port 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32), - .driver_data = IVT_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0), }, { /* QPI0 Port 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33), - .driver_data = IVT_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 1), }, { /* QPI1 Port 2 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a), - .driver_data = IVT_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 2), }, { /* R2PCIe */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34), - .driver_data = IVT_PCI_UNCORE_R2PCIE, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R2PCIE, 0), }, { /* R3QPI0 Link 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36), - .driver_data = IVT_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 0), }, { /* R3QPI0 Link 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37), - .driver_data = IVT_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 1), }, { /* R3QPI1 Link 2 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e), - .driver_data = IVT_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2), }, { /* end: all zeroes */ } }; @@ -2606,7 +2713,7 @@ struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cp size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); - box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); + box = kzalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (!box) return NULL; @@ -3167,16 +3274,24 @@ static bool pcidrv_registered; /* * add a pci uncore device */ -static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) +static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - int i, phys_id; + struct intel_uncore_type *type; + int phys_id; phys_id = pcibus_to_physid[pdev->bus->number]; if (phys_id < 0) return -ENODEV; + if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { + extra_pci_dev[phys_id][UNCORE_PCI_DEV_IDX(id->driver_data)] = pdev; + pci_set_drvdata(pdev, NULL); + return 0; + } + + type = pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; box = uncore_alloc_box(type, 0); if (!box) return -ENOMEM; @@ -3185,21 +3300,11 @@ static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) * for performance monitoring unit with multiple boxes, * each box has a different function id. */ - for (i = 0; i < type->num_boxes; i++) { - pmu = &type->pmus[i]; - if (pmu->func_id == pdev->devfn) - break; - if (pmu->func_id < 0) { - pmu->func_id = pdev->devfn; - break; - } - pmu = NULL; - } - - if (!pmu) { - kfree(box); - return -EINVAL; - } + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + if (pmu->func_id < 0) + pmu->func_id = pdev->devfn; + else + WARN_ON_ONCE(pmu->func_id != pdev->devfn); box->phys_id = phys_id; box->pci_dev = pdev; @@ -3217,9 +3322,22 @@ static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box = pci_get_drvdata(pdev); - struct intel_uncore_pmu *pmu = box->pmu; - int cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + struct intel_uncore_pmu *pmu; + int i, cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + box = pci_get_drvdata(pdev); + if (!box) { + for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { + if (extra_pci_dev[phys_id][i] == pdev) { + extra_pci_dev[phys_id][i] = NULL; + break; + } + } + WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX); + return; + } + + pmu = box->pmu; if (WARN_ON_ONCE(phys_id != box->phys_id)) return; @@ -3240,12 +3358,6 @@ static void uncore_pci_remove(struct pci_dev *pdev) kfree(box); } -static int uncore_pci_probe(struct pci_dev *pdev, - const struct pci_device_id *id) -{ - return uncore_pci_add(pci_uncores[id->driver_data], pdev); -} - static int __init uncore_pci_init(void) { int ret; @@ -3297,7 +3409,7 @@ static void __init uncore_pci_exit(void) /* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */ static LIST_HEAD(boxes_to_free); -static void __cpuinit uncore_kfree_boxes(void) +static void uncore_kfree_boxes(void) { struct intel_uncore_box *box; @@ -3309,7 +3421,7 @@ static void __cpuinit uncore_kfree_boxes(void) } } -static void __cpuinit uncore_cpu_dying(int cpu) +static void uncore_cpu_dying(int cpu) { struct intel_uncore_type *type; struct intel_uncore_pmu *pmu; @@ -3328,7 +3440,7 @@ static void __cpuinit uncore_cpu_dying(int cpu) } } -static int __cpuinit uncore_cpu_starting(int cpu) +static int uncore_cpu_starting(int cpu) { struct intel_uncore_type *type; struct intel_uncore_pmu *pmu; @@ -3371,7 +3483,7 @@ static int __cpuinit uncore_cpu_starting(int cpu) return 0; } -static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id) +static int uncore_cpu_prepare(int cpu, int phys_id) { struct intel_uncore_type *type; struct intel_uncore_pmu *pmu; @@ -3397,7 +3509,7 @@ static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id) return 0; } -static void __cpuinit +static void uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu) { struct intel_uncore_type *type; @@ -3435,7 +3547,7 @@ uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_c } } -static void __cpuinit uncore_event_exit_cpu(int cpu) +static void uncore_event_exit_cpu(int cpu) { int i, phys_id, target; @@ -3463,7 +3575,7 @@ static void __cpuinit uncore_event_exit_cpu(int cpu) uncore_change_context(pci_uncores, cpu, target); } -static void __cpuinit uncore_event_init_cpu(int cpu) +static void uncore_event_init_cpu(int cpu) { int i, phys_id; @@ -3479,8 +3591,8 @@ static void __cpuinit uncore_event_init_cpu(int cpu) uncore_change_context(pci_uncores, -1, cpu); } -static int - __cpuinit uncore_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +static int uncore_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; @@ -3520,7 +3632,7 @@ static int return NOTIFY_OK; } -static struct notifier_block uncore_cpu_nb __cpuinitdata = { +static struct notifier_block uncore_cpu_nb = { .notifier_call = uncore_cpu_notifier, /* * to migrate uncore events, our notifier should be executed diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 47b3d00c9d8..a80ab71a883 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -12,6 +12,15 @@ #define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC #define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) +#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx) +#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) +#define UNCORE_PCI_DEV_IDX(data) (data & 0xff) +#define UNCORE_EXTRA_PCI_DEV 0xff +#define UNCORE_EXTRA_PCI_DEV_MAX 2 + +/* support up to 8 sockets */ +#define UNCORE_SOCKET_MAX 8 + #define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) /* SNB event control */ @@ -108,6 +117,7 @@ (SNBEP_PMON_CTL_EV_SEL_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ SNBEP_PMON_CTL_INVERT | \ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index feca286c2bb..88db010845c 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -52,7 +52,7 @@ static inline int rdrand_long(unsigned long *v) */ #define RESEED_LOOP ((512*128)/sizeof(unsigned long)) -void __cpuinit x86_init_rdrand(struct cpuinfo_x86 *c) +void x86_init_rdrand(struct cpuinfo_x86 *c) { #ifdef CONFIG_ARCH_RANDOM unsigned long tmp; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index d92b5dad15d..f2cc63e9cf0 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -24,13 +24,13 @@ enum cpuid_regs { CR_EBX }; -void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) +void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { u32 max_level; u32 regs[4]; const struct cpuid_bit *cb; - static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { + static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 }, { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 4397e987a1c..4c60eaf0571 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -26,7 +26,7 @@ * exists, use it for populating initial_apicid and cpu topology * detection. */ -void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) +void detect_extended_topology(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP unsigned int eax, ebx, ecx, edx, sub_index; diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 28000743bbb..aa0430d69b9 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -5,7 +5,7 @@ #include <asm/msr.h> #include "cpu.h" -static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c) +static void early_init_transmeta(struct cpuinfo_x86 *c) { u32 xlvl; @@ -17,7 +17,7 @@ static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c) } } -static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) +static void init_transmeta(struct cpuinfo_x86 *c) { unsigned int cap_mask, uk, max, dummy; unsigned int cms_rev1, cms_rev2; @@ -98,7 +98,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) #endif } -static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = { +static const struct cpu_dev transmeta_cpu_dev = { .c_vendor = "Transmeta", .c_ident = { "GenuineTMx86", "TransmetaCPU" }, .c_early_init = early_init_transmeta, diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index fd2c37bf7ac..202759a1412 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c @@ -8,7 +8,7 @@ * so no special init takes place. */ -static const struct cpu_dev __cpuinitconst umc_cpu_dev = { +static const struct cpu_dev umc_cpu_dev = { .c_vendor = "UMC", .c_ident = { "UMC UMC UMC" }, .c_models = { diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 03a36321ec5..628a059a9a0 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -93,7 +93,7 @@ static void __init vmware_platform_setup(void) * serial key should be enough, as this will always have a VMware * specific string when running under VMware hypervisor. */ -static bool __init vmware_platform(void) +static uint32_t __init vmware_platform(void) { if (cpu_has_hypervisor) { unsigned int eax; @@ -102,12 +102,12 @@ static bool __init vmware_platform(void) cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0], &hyper_vendor_id[1], &hyper_vendor_id[2]); if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) - return true; + return CPUID_VMWARE_INFO_LEAF; } else if (dmi_available && dmi_name_in_serial("VMware") && __vmware_platform()) - return true; + return 1; - return false; + return 0; } /* @@ -122,7 +122,7 @@ static bool __init vmware_platform(void) * so that the kernel could just trust the hypervisor with providing a * reliable virtual TSC that is suitable for timekeeping. */ -static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c) +static void vmware_set_cpu_features(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 1e4dbcfe6d3..7d9481c743f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -137,7 +137,7 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; -static __cpuinit int cpuid_device_create(int cpu) +static int cpuid_device_create(int cpu) { struct device *dev; @@ -151,9 +151,8 @@ static void cpuid_device_destroy(int cpu) device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); } -static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int cpuid_class_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int err = 0; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 74467feb4dc..e0e0841eef4 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -128,7 +128,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs) cpu_emergency_svm_disable(); lapic_shutdown(); -#if defined(CONFIG_X86_IO_APIC) +#ifdef CONFIG_X86_IO_APIC + /* Prevent crash_kexec() from deadlocking on ioapic_lock. */ + ioapic_zap_locks(); disable_IO_APIC(); #endif #ifdef CONFIG_HPET_TIMER diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 4934890e4db..69eb2fa2549 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -133,7 +133,7 @@ static void x86_of_pci_irq_disable(struct pci_dev *dev) { } -void __cpuinit x86_of_pci_init(void) +void x86_of_pci_init(void) { pcibios_enable_irq = x86_of_pci_irq_enable; pcibios_disable_irq = x86_of_pci_irq_disable; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d32abeabbda..174da5fc5a7 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -658,15 +658,18 @@ __init void e820_setup_gap(void) * boot_params.e820_map, others are passed via SETUP_E820_EXT node of * linked list of struct setup_data, which is parsed here. */ -void __init parse_e820_ext(struct setup_data *sdata) +void __init parse_e820_ext(u64 phys_addr, u32 data_len) { int entries; struct e820entry *extmap; + struct setup_data *sdata; + sdata = early_memremap(phys_addr, data_len); entries = sdata->len / sizeof(struct e820entry); extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + early_iounmap(sdata, data_len); printk(KERN_INFO "e820: extended physical RAM map:\n"); e820_print_map("extended"); } diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 94ab6b90dd3..63bdb29b254 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -196,15 +196,23 @@ static void __init ati_bugs_contd(int num, int slot, int func) static void __init intel_remapping_check(int num, int slot, int func) { u8 revision; + u16 device; + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID); /* - * Revision 0x13 of this chipset supports irq remapping - * but has an erratum that breaks its behavior, flag it as such + * Revision 13 of all triggering devices id in this quirk have + * a problem draining interrupts when irq remapping is enabled, + * and should be flagged as broken. Additionally revisions 0x12 + * and 0x22 of device id 0x3405 has this problem. */ if (revision == 0x13) set_irq_remapping_broken(); + else if ((device == 0x3405) && + ((revision == 0x12) || + (revision == 0x22))) + set_irq_remapping_broken(); } @@ -239,6 +247,8 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, { PCI_VENDOR_ID_INTEL, 0x3403, PCI_CLASS_BRIDGE_HOST, PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, + { PCI_VENDOR_ID_INTEL, 0x3405, PCI_CLASS_BRIDGE_HOST, + PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, {} diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 138463a2487..06f87bece92 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,7 +29,7 @@ static void __init i386_default_early_setup(void) reserve_ebda_region(); } -void __init i386_start_kernel(void) +asmlinkage void __init i386_start_kernel(void) { sanitize_boot_params(&boot_params); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 55b67614ed9..1be8e43b669 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -137,7 +137,7 @@ static void __init copy_bootdata(char *real_mode_data) } } -void __init x86_64_start_kernel(char * real_mode_data) +asmlinkage void __init x86_64_start_kernel(char * real_mode_data) { int i; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index e65ddc62e11..81ba27679f1 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -292,7 +292,6 @@ ENDPROC(start_cpu0) * If cpu hotplug is not supported then this code can go in init section * which will be freed later */ -__CPUINIT ENTRY(startup_32_smp) cld movl $(__BOOT_DS),%eax @@ -410,6 +409,7 @@ enable_paging: /* * Check if it is 486 */ + movb $4,X86 # at least 486 cmpl $-1,X86_CPUID je is486 @@ -437,7 +437,6 @@ enable_paging: movl %edx,X86_CAPABILITY is486: - movb $4,X86 movl $0x50022,%ecx # set AM, WP, NE and MP movl %cr0,%eax andl $0x80000011,%eax # Save PG,PE,ET diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 5e4d8a8a5c4..e1aabdb314c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -512,21 +512,6 @@ ENTRY(phys_base) #include "../../x86/xen/xen-head.S" - .section .bss, "aw", @nobits - .align L1_CACHE_BYTES -ENTRY(idt_table) - .skip IDT_ENTRIES * 16 - - .align L1_CACHE_BYTES -ENTRY(debug_idt_table) - .skip IDT_ENTRIES * 16 - -#ifdef CONFIG_TRACING - .align L1_CACHE_BYTES -ENTRY(trace_idt_table) - .skip IDT_ENTRIES * 16 -#endif - __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index b627746f6b1..5d576ab3440 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -108,15 +108,15 @@ EXPORT_SYMBOL(unlazy_fpu); unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; unsigned int xstate_size; EXPORT_SYMBOL_GPL(xstate_size); -static struct i387_fxsave_struct fx_scratch __cpuinitdata; +static struct i387_fxsave_struct fx_scratch; -static void __cpuinit mxcsr_feature_mask_init(void) +static void mxcsr_feature_mask_init(void) { unsigned long mask = 0; if (cpu_has_fxsr) { memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : : "m" (fx_scratch)); + asm volatile("fxsave %0" : "+m" (fx_scratch)); mask = fx_scratch.mxcsr_mask; if (mask == 0) mask = 0x0000ffbf; @@ -124,7 +124,7 @@ static void __cpuinit mxcsr_feature_mask_init(void) mxcsr_feature_mask &= mask; } -static void __cpuinit init_thread_xstate(void) +static void init_thread_xstate(void) { /* * Note that xstate_size might be overwriten later during @@ -153,7 +153,7 @@ static void __cpuinit init_thread_xstate(void) * into all processes. */ -void __cpuinit fpu_init(void) +void fpu_init(void) { unsigned long cr0; unsigned long cr4_mask = 0; @@ -608,7 +608,7 @@ static int __init no_387(char *s) __setup("no387", no_387); -void __cpuinit fpu_detect(struct cpuinfo_x86 *c) +void fpu_detect(struct cpuinfo_x86 *c) { unsigned long cr0; u16 fsw, fcw; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3a8185c042a..22d0687e7fd 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -177,7 +177,7 @@ u64 arch_irq_stat(void) * SMP cross-CPU interrupts have their own specific * handlers). */ -unsigned int __irq_entry do_IRQ(struct pt_regs *regs) +__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -215,7 +215,7 @@ void __smp_x86_platform_ipi(void) x86_platform_ipi_callback(); } -void smp_x86_platform_ipi(struct pt_regs *regs) +__visible void smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -229,7 +229,7 @@ void smp_x86_platform_ipi(struct pt_regs *regs) /* * Handler for POSTED_INTERRUPT_VECTOR. */ -void smp_kvm_posted_intr_ipi(struct pt_regs *regs) +__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -247,7 +247,7 @@ void smp_kvm_posted_intr_ipi(struct pt_regs *regs) } #endif -void smp_trace_x86_platform_ipi(struct pt_regs *regs) +__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 344faf8d0d6..4186755f1d7 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -119,7 +119,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) /* * allocate per-cpu stacks for hardirq and for softirq processing */ -void __cpuinit irq_ctx_init(int cpu) +void irq_ctx_init(int cpu) { union irq_ctx *irqctx; diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 636a55e4a13..1de84e3ab4e 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -22,14 +22,14 @@ static inline void __smp_irq_work_interrupt(void) irq_work_run(); } -void smp_irq_work_interrupt(struct pt_regs *regs) +__visible void smp_irq_work_interrupt(struct pt_regs *regs) { irq_work_entering_irq(); __smp_irq_work_interrupt(); exiting_irq(); } -void smp_trace_irq_work_interrupt(struct pt_regs *regs) +__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) { irq_work_entering_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 2889b3d4388..460f5d9ceeb 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -37,7 +37,19 @@ static void __jump_label_transform(struct jump_entry *entry, } else memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); - (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); + /* + * Make text_poke_bp() a default fallback poker. + * + * At the time the change is being done, just ignore whether we + * are doing nop -> jump or jump -> nop transition, and assume + * always nop being the 'currently valid' instruction + * + */ + if (poker) + (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); + else + text_poke_bp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE, + (void *)entry->code + JUMP_LABEL_NOP_SIZE); } void arch_jump_label_transform(struct jump_entry *entry, @@ -45,7 +57,7 @@ void arch_jump_label_transform(struct jump_entry *entry, { get_online_cpus(); mutex_lock(&text_mutex); - __jump_label_transform(entry, type, text_poke_smp); + __jump_label_transform(entry, type, NULL); mutex_unlock(&text_mutex); put_online_cpus(); } diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index 2e9d4b5af03..c6ee63f927a 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -82,14 +82,9 @@ extern void synthesize_reljump(void *from, void *to); extern void synthesize_relcall(void *from, void *to); #ifdef CONFIG_OPTPROBES -extern int arch_init_optprobes(void); extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr); #else /* !CONFIG_OPTPROBES */ -static inline int arch_init_optprobes(void) -{ - return 0; -} static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) { return 0; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 211bce44552..79a3f968287 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -661,7 +661,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void) /* * Called from kretprobe_trampoline */ -static __used __kprobes void *trampoline_handler(struct pt_regs *regs) +__visible __used __kprobes void *trampoline_handler(struct pt_regs *regs) { struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; @@ -1068,7 +1068,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) int __init arch_init_kprobes(void) { - return arch_init_optprobes(); + return 0; } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 76dc6f09572..898160b42e4 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -88,9 +88,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long v *(unsigned long *)addr = val; } -static void __used __kprobes kprobes_optinsn_template_holder(void) -{ - asm volatile ( +asm ( ".global optprobe_template_entry\n" "optprobe_template_entry:\n" #ifdef CONFIG_X86_64 @@ -129,7 +127,6 @@ static void __used __kprobes kprobes_optinsn_template_holder(void) #endif ".global optprobe_template_end\n" "optprobe_template_end:\n"); -} #define TMPL_MOVE_IDX \ ((long)&optprobe_template_val - (long)&optprobe_template_entry) @@ -371,31 +368,6 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) return 0; } -#define MAX_OPTIMIZE_PROBES 256 -static struct text_poke_param *jump_poke_params; -static struct jump_poke_buffer { - u8 buf[RELATIVEJUMP_SIZE]; -} *jump_poke_bufs; - -static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, - u8 *insn_buf, - struct optimized_kprobe *op) -{ - s32 rel = (s32)((long)op->optinsn.insn - - ((long)op->kp.addr + RELATIVEJUMP_SIZE)); - - /* Backup instructions which will be replaced by jump address */ - memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, - RELATIVE_ADDR_SIZE); - - insn_buf[0] = RELATIVEJUMP_OPCODE; - *(s32 *)(&insn_buf[1]) = rel; - - tprm->addr = op->kp.addr; - tprm->opcode = insn_buf; - tprm->len = RELATIVEJUMP_SIZE; -} - /* * Replace breakpoints (int3) with relative jumps. * Caller must call with locking kprobe_mutex and text_mutex. @@ -403,37 +375,38 @@ static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, void __kprobes arch_optimize_kprobes(struct list_head *oplist) { struct optimized_kprobe *op, *tmp; - int c = 0; + u8 insn_buf[RELATIVEJUMP_SIZE]; list_for_each_entry_safe(op, tmp, oplist, list) { + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + WARN_ON(kprobe_disabled(&op->kp)); - /* Setup param */ - setup_optimize_kprobe(&jump_poke_params[c], - jump_poke_bufs[c].buf, op); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, + RELATIVE_ADDR_SIZE); + + insn_buf[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&insn_buf[1]) = rel; + + text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE, + op->optinsn.insn); + list_del_init(&op->list); - if (++c >= MAX_OPTIMIZE_PROBES) - break; } - - /* - * text_poke_smp doesn't support NMI/MCE code modifying. - * However, since kprobes itself also doesn't support NMI/MCE - * code probing, it's not a problem. - */ - text_poke_smp_batch(jump_poke_params, c); } -static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, - u8 *insn_buf, - struct optimized_kprobe *op) +/* Replace a relative jump with a breakpoint (int3). */ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) { + u8 insn_buf[RELATIVEJUMP_SIZE]; + /* Set int3 to first byte for kprobes */ insn_buf[0] = BREAKPOINT_INSTRUCTION; memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - - tprm->addr = op->kp.addr; - tprm->opcode = insn_buf; - tprm->len = RELATIVEJUMP_SIZE; + text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE, + op->optinsn.insn); } /* @@ -444,34 +417,11 @@ extern void arch_unoptimize_kprobes(struct list_head *oplist, struct list_head *done_list) { struct optimized_kprobe *op, *tmp; - int c = 0; list_for_each_entry_safe(op, tmp, oplist, list) { - /* Setup param */ - setup_unoptimize_kprobe(&jump_poke_params[c], - jump_poke_bufs[c].buf, op); + arch_unoptimize_kprobe(op); list_move(&op->list, done_list); - if (++c >= MAX_OPTIMIZE_PROBES) - break; } - - /* - * text_poke_smp doesn't support NMI/MCE code modifying. - * However, since kprobes itself also doesn't support NMI/MCE - * code probing, it's not a problem. - */ - text_poke_smp_batch(jump_poke_params, c); -} - -/* Replace a relative jump with a breakpoint (int3). */ -void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) -{ - u8 buf[RELATIVEJUMP_SIZE]; - - /* Set int3 to first byte for kprobes */ - buf[0] = BREAKPOINT_INSTRUCTION; - memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); } int __kprobes @@ -491,22 +441,3 @@ setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) } return 0; } - -int __kprobes arch_init_optprobes(void) -{ - /* Allocate code buffer and parameter array */ - jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * - MAX_OPTIMIZE_PROBES, GFP_KERNEL); - if (!jump_poke_bufs) - return -ENOMEM; - - jump_poke_params = kmalloc(sizeof(struct text_poke_param) * - MAX_OPTIMIZE_PROBES, GFP_KERNEL); - if (!jump_poke_params) { - kfree(jump_poke_bufs); - jump_poke_bufs = NULL; - return -ENOMEM; - } - - return 0; -} diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index cd6d9a5a42f..697b93af02d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -34,6 +34,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/kprobes.h> +#include <linux/debugfs.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -320,7 +321,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val) apic_write(APIC_EOI, APIC_EOI_ACK); } -void __cpuinit kvm_guest_cpu_init(void) +void kvm_guest_cpu_init(void) { if (!kvm_para_available()) return; @@ -419,9 +420,10 @@ static void __init kvm_smp_prepare_boot_cpu(void) WARN_ON(kvm_register_clock("primary cpu clock")); kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); + kvm_spinlock_init(); } -static void __cpuinit kvm_guest_cpu_online(void *dummy) +static void kvm_guest_cpu_online(void *dummy) { kvm_guest_cpu_init(); } @@ -435,8 +437,8 @@ static void kvm_guest_cpu_offline(void *dummy) apf_task_wake_all(); } -static int __cpuinit kvm_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int kvm_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) { int cpu = (unsigned long)hcpu; switch (action) { @@ -455,7 +457,7 @@ static int __cpuinit kvm_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata kvm_cpu_notifier = { +static struct notifier_block kvm_cpu_notifier = { .notifier_call = kvm_cpu_notify, }; #endif @@ -498,11 +500,9 @@ void __init kvm_guest_init(void) #endif } -static bool __init kvm_detect(void) +static uint32_t __init kvm_detect(void) { - if (!kvm_para_available()) - return false; - return true; + return kvm_cpuid_base(); } const struct hypervisor_x86 x86_hyper_kvm __refconst = { @@ -523,3 +523,263 @@ static __init int activate_jump_labels(void) return 0; } arch_initcall(activate_jump_labels); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + +/* Kick a cpu by its apicid. Used to wake up a halted vcpu */ +static void kvm_kick_cpu(int cpu) +{ + int apicid; + unsigned long flags = 0; + + apicid = per_cpu(x86_cpu_to_apicid, cpu); + kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); +} + +enum kvm_contention_stat { + TAKEN_SLOW, + TAKEN_SLOW_PICKUP, + RELEASED_SLOW, + RELEASED_SLOW_KICKED, + NR_CONTENTION_STATS +}; + +#ifdef CONFIG_KVM_DEBUG_FS +#define HISTO_BUCKETS 30 + +static struct kvm_spinlock_stats +{ + u32 contention_stats[NR_CONTENTION_STATS]; + u32 histo_spin_blocked[HISTO_BUCKETS+1]; + u64 time_blocked; +} spinlock_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + u8 ret; + u8 old; + + old = ACCESS_ONCE(zero_stats); + if (unlikely(old)) { + ret = cmpxchg(&zero_stats, old, 0); + /* This ensures only one fellow resets the stat */ + if (ret == old) + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); + } +} + +static inline void add_stats(enum kvm_contention_stat var, u32 val) +{ + check_zero(); + spinlock_stats.contention_stats[var] += val; +} + + +static inline u64 spin_time_start(void) +{ + return sched_clock(); +} + +static void __spin_time_accum(u64 delta, u32 *array) +{ + unsigned index; + + index = ilog2(delta); + check_zero(); + + if (index < HISTO_BUCKETS) + array[index]++; + else + array[HISTO_BUCKETS]++; +} + +static inline void spin_time_accum_blocked(u64 start) +{ + u32 delta; + + delta = sched_clock() - start; + __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); + spinlock_stats.time_blocked += delta; +} + +static struct dentry *d_spin_debug; +static struct dentry *d_kvm_debug; + +struct dentry *kvm_init_debugfs(void) +{ + d_kvm_debug = debugfs_create_dir("kvm", NULL); + if (!d_kvm_debug) + printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n"); + + return d_kvm_debug; +} + +static int __init kvm_spinlock_debugfs(void) +{ + struct dentry *d_kvm; + + d_kvm = kvm_init_debugfs(); + if (d_kvm == NULL) + return -ENOMEM; + + d_spin_debug = debugfs_create_dir("spinlocks", d_kvm); + + debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); + + debugfs_create_u32("taken_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW]); + debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); + + debugfs_create_u32("released_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW]); + debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); + + debugfs_create_u64("time_blocked", 0444, d_spin_debug, + &spinlock_stats.time_blocked); + + debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); + + return 0; +} +fs_initcall(kvm_spinlock_debugfs); +#else /* !CONFIG_KVM_DEBUG_FS */ +static inline void add_stats(enum kvm_contention_stat var, u32 val) +{ +} + +static inline u64 spin_time_start(void) +{ + return 0; +} + +static inline void spin_time_accum_blocked(u64 start) +{ +} +#endif /* CONFIG_KVM_DEBUG_FS */ + +struct kvm_lock_waiting { + struct arch_spinlock *lock; + __ticket_t want; +}; + +/* cpus 'waiting' on a spinlock to become available */ +static cpumask_t waiting_cpus; + +/* Track spinlock on which a cpu is waiting */ +static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting); + +static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) +{ + struct kvm_lock_waiting *w; + int cpu; + u64 start; + unsigned long flags; + + if (in_nmi()) + return; + + w = &__get_cpu_var(klock_waiting); + cpu = smp_processor_id(); + start = spin_time_start(); + + /* + * Make sure an interrupt handler can't upset things in a + * partially setup state. + */ + local_irq_save(flags); + + /* + * The ordering protocol on this is that the "lock" pointer + * may only be set non-NULL if the "want" ticket is correct. + * If we're updating "want", we must first clear "lock". + */ + w->lock = NULL; + smp_wmb(); + w->want = want; + smp_wmb(); + w->lock = lock; + + add_stats(TAKEN_SLOW, 1); + + /* + * This uses set_bit, which is atomic but we should not rely on its + * reordering gurantees. So barrier is needed after this call. + */ + cpumask_set_cpu(cpu, &waiting_cpus); + + barrier(); + + /* + * Mark entry to slowpath before doing the pickup test to make + * sure we don't deadlock with an unlocker. + */ + __ticket_enter_slowpath(lock); + + /* + * check again make sure it didn't become free while + * we weren't looking. + */ + if (ACCESS_ONCE(lock->tickets.head) == want) { + add_stats(TAKEN_SLOW_PICKUP, 1); + goto out; + } + + /* + * halt until it's our turn and kicked. Note that we do safe halt + * for irq enabled case to avoid hang when lock info is overwritten + * in irq spinlock slowpath and no spurious interrupt occur to save us. + */ + if (arch_irqs_disabled_flags(flags)) + halt(); + else + safe_halt(); + +out: + cpumask_clear_cpu(cpu, &waiting_cpus); + w->lock = NULL; + local_irq_restore(flags); + spin_time_accum_blocked(start); +} +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning); + +/* Kick vcpu waiting on @lock->head to reach value @ticket */ +static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) +{ + int cpu; + + add_stats(RELEASED_SLOW, 1); + for_each_cpu(cpu, &waiting_cpus) { + const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); + if (ACCESS_ONCE(w->lock) == lock && + ACCESS_ONCE(w->want) == ticket) { + add_stats(RELEASED_SLOW_KICKED, 1); + kvm_kick_cpu(cpu); + break; + } + } +} + +/* + * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. + */ +void __init kvm_spinlock_init(void) +{ + if (!kvm_para_available()) + return; + /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + return; + + printk(KERN_INFO "KVM setup paravirtual spinlock\n"); + + static_key_slow_inc(¶virt_ticketlocks_enabled); + + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); + pv_lock_ops.unlock_kick = kvm_unlock_kick; +} +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1f354f4b602..1570e074134 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -182,7 +182,7 @@ static void kvm_restore_sched_clock_state(void) } #ifdef CONFIG_X86_LOCAL_APIC -static void __cpuinit kvm_setup_secondary_clock(void) +static void kvm_setup_secondary_clock(void) { /* * Now that the first cpu already had this clocksource initialized, diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 47ebb1dbfbc..7123b5df479 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -145,10 +145,9 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) return 0; } -static unsigned int verify_patch_size(int cpu, u32 patch_size, +static unsigned int verify_patch_size(u8 family, u32 patch_size, unsigned int size) { - struct cpuinfo_x86 *c = &cpu_data(cpu); u32 max_size; #define F1XH_MPB_MAX_SIZE 2048 @@ -156,7 +155,7 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, #define F15H_MPB_MAX_SIZE 4096 #define F16H_MPB_MAX_SIZE 3458 - switch (c->x86) { + switch (family) { case 0x14: max_size = F14H_MPB_MAX_SIZE; break; @@ -220,12 +219,13 @@ int apply_microcode_amd(int cpu) return 0; } - if (__apply_microcode_amd(mc_amd)) + if (__apply_microcode_amd(mc_amd)) { pr_err("CPU%d: update failed for patch_level=0x%08x\n", cpu, mc_amd->hdr.patch_id); - else - pr_info("CPU%d: new patch_level=0x%08x\n", cpu, - mc_amd->hdr.patch_id); + return -1; + } + pr_info("CPU%d: new patch_level=0x%08x\n", cpu, + mc_amd->hdr.patch_id); uci->cpu_sig.rev = mc_amd->hdr.patch_id; c->microcode = mc_amd->hdr.patch_id; @@ -276,9 +276,8 @@ static void cleanup(void) * driver cannot continue functioning normally. In such cases, we tear * down everything we've used up so far and exit. */ -static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) +static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) { - struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_header_amd *mc_hdr; struct ucode_patch *patch; unsigned int patch_size, crnt_size, ret; @@ -298,7 +297,7 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) /* check if patch is for the current family */ proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff); - if (proc_fam != c->x86) + if (proc_fam != family) return crnt_size; if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { @@ -307,7 +306,7 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) return crnt_size; } - ret = verify_patch_size(cpu, patch_size, leftover); + ret = verify_patch_size(family, patch_size, leftover); if (!ret) { pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id); return crnt_size; @@ -338,7 +337,8 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) return crnt_size; } -static enum ucode_state __load_microcode_amd(int cpu, const u8 *data, size_t size) +static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, + size_t size) { enum ucode_state ret = UCODE_ERROR; unsigned int leftover; @@ -361,7 +361,7 @@ static enum ucode_state __load_microcode_amd(int cpu, const u8 *data, size_t siz } while (leftover) { - crnt_size = verify_and_add_patch(cpu, fw, leftover); + crnt_size = verify_and_add_patch(family, fw, leftover); if (crnt_size < 0) return ret; @@ -372,22 +372,22 @@ static enum ucode_state __load_microcode_amd(int cpu, const u8 *data, size_t siz return UCODE_OK; } -enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size) +enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) { enum ucode_state ret; /* free old equiv table */ free_equiv_cpu_table(); - ret = __load_microcode_amd(cpu, data, size); + ret = __load_microcode_amd(family, data, size); if (ret != UCODE_OK) cleanup(); #if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) /* save BSP's matching patch for early load */ - if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) { - struct ucode_patch *p = find_patch(cpu); + if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { + struct ucode_patch *p = find_patch(smp_processor_id()); if (p) { memset(amd_bsp_mpb, 0, MPB_MAX_SIZE); memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data), @@ -440,7 +440,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, goto fw_release; } - ret = load_microcode_amd(cpu, fw->data, fw->size); + ret = load_microcode_amd(c->x86, fw->data, fw->size); fw_release: release_firmware(fw); diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/microcode_amd_early.c index 1ac6e9aee76..6073104ccaa 100644 --- a/arch/x86/kernel/microcode_amd_early.c +++ b/arch/x86/kernel/microcode_amd_early.c @@ -82,7 +82,7 @@ static struct cpio_data __init find_ucode_in_initrd(void) * load_microcode_amd() to save equivalent cpu table and microcode patches in * kernel heap memory. */ -static void __cpuinit apply_ucode_in_initrd(void *ucode, size_t size) +static void apply_ucode_in_initrd(void *ucode, size_t size) { struct equiv_cpu_entry *eq; u32 *header; @@ -206,7 +206,7 @@ u8 amd_bsp_mpb[MPB_MAX_SIZE]; * save_microcode_in_initrd_amd() BSP's patch is copied to amd_bsp_mpb, which * is used upon resume from suspend. */ -void __cpuinit load_ucode_amd_ap(void) +void load_ucode_amd_ap(void) { struct microcode_amd *mc; unsigned long *initrd; @@ -238,25 +238,17 @@ static void __init collect_cpu_sig_on_bsp(void *arg) uci->cpu_sig.sig = cpuid_eax(0x00000001); } #else -static void __cpuinit collect_cpu_info_amd_early(struct cpuinfo_x86 *c, - struct ucode_cpu_info *uci) +void load_ucode_amd_ap(void) { + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; u32 rev, eax; rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); eax = cpuid_eax(0x00000001); - uci->cpu_sig.sig = eax; uci->cpu_sig.rev = rev; - c->microcode = rev; - c->x86 = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); -} - -void __cpuinit load_ucode_amd_ap(void) -{ - unsigned int cpu = smp_processor_id(); - - collect_cpu_info_amd_early(&cpu_data(cpu), ucode_cpu_info + cpu); + uci->cpu_sig.sig = eax; if (cpu && !ucode_loaded) { void *ucode; @@ -265,8 +257,10 @@ void __cpuinit load_ucode_amd_ap(void) return; ucode = (void *)(initrd_start + ucode_offset); - if (load_microcode_amd(0, ucode, ucode_size) != UCODE_OK) + eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + if (load_microcode_amd(eax, ucode, ucode_size) != UCODE_OK) return; + ucode_loaded = true; } @@ -278,6 +272,8 @@ int __init save_microcode_in_initrd_amd(void) { enum ucode_state ret; void *ucode; + u32 eax; + #ifdef CONFIG_X86_32 unsigned int bsp = boot_cpu_data.cpu_index; struct ucode_cpu_info *uci = ucode_cpu_info + bsp; @@ -293,7 +289,10 @@ int __init save_microcode_in_initrd_amd(void) return 0; ucode = (void *)(initrd_start + ucode_offset); - ret = load_microcode_amd(0, ucode, ucode_size); + eax = cpuid_eax(0x00000001); + eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + + ret = load_microcode_amd(eax, ucode, ucode_size); if (ret != UCODE_OK) return -EINVAL; diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 22db92bbdf1..15c987698b0 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -468,7 +468,7 @@ static struct syscore_ops mc_syscore_ops = { .resume = mc_bp_resume, }; -static __cpuinit int +static int mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/microcode_core_early.c index 86119f63db0..be7f8514f57 100644 --- a/arch/x86/kernel/microcode_core_early.c +++ b/arch/x86/kernel/microcode_core_early.c @@ -41,7 +41,7 @@ * * x86_vendor() gets vendor information directly through cpuid. */ -static int __cpuinit x86_vendor(void) +static int x86_vendor(void) { u32 eax = 0x00000000; u32 ebx, ecx = 0, edx; @@ -57,7 +57,7 @@ static int __cpuinit x86_vendor(void) return X86_VENDOR_UNKNOWN; } -static int __cpuinit x86_family(void) +static int x86_family(void) { u32 eax = 0x00000001; u32 ebx, ecx = 0, edx; @@ -96,7 +96,7 @@ void __init load_ucode_bsp(void) } } -void __cpuinit load_ucode_ap(void) +void load_ucode_ap(void) { int vendor, x86; diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/microcode_intel_early.c index dabef95506f..1575deb2e63 100644 --- a/arch/x86/kernel/microcode_intel_early.c +++ b/arch/x86/kernel/microcode_intel_early.c @@ -34,7 +34,7 @@ struct mc_saved_data { struct microcode_intel **mc_saved; } mc_saved_data; -static enum ucode_state __cpuinit +static enum ucode_state generic_load_microcode_early(struct microcode_intel **mc_saved_p, unsigned int mc_saved_count, struct ucode_cpu_info *uci) @@ -69,7 +69,7 @@ out: return state; } -static void __cpuinit +static void microcode_pointer(struct microcode_intel **mc_saved, unsigned long *mc_saved_in_initrd, unsigned long initrd_start, int mc_saved_count) @@ -82,7 +82,7 @@ microcode_pointer(struct microcode_intel **mc_saved, } #ifdef CONFIG_X86_32 -static void __cpuinit +static void microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mc_saved_data) { @@ -101,7 +101,7 @@ microcode_phys(struct microcode_intel **mc_saved_tmp, } #endif -static enum ucode_state __cpuinit +static enum ucode_state load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *mc_saved_in_initrd, unsigned long initrd_start, @@ -375,7 +375,7 @@ do { \ #define native_wrmsr(msr, low, high) \ native_write_msr(msr, low, high); -static int __cpuinit collect_cpu_info_early(struct ucode_cpu_info *uci) +static int collect_cpu_info_early(struct ucode_cpu_info *uci) { unsigned int val[2]; u8 x86, x86_model; @@ -584,7 +584,7 @@ scan_microcode(unsigned long start, unsigned long end, /* * Print ucode update info. */ -static void __cpuinit +static void print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) { int cpu = smp_processor_id(); @@ -605,7 +605,7 @@ static int current_mc_date; /* * Print early updated ucode info after printk works. This is delayed info dump. */ -void __cpuinit show_ucode_info_early(void) +void show_ucode_info_early(void) { struct ucode_cpu_info uci; @@ -621,7 +621,7 @@ void __cpuinit show_ucode_info_early(void) * mc_saved_data.mc_saved and delay printing microcode info in * show_ucode_info_early() until printk() works. */ -static void __cpuinit print_ucode(struct ucode_cpu_info *uci) +static void print_ucode(struct ucode_cpu_info *uci) { struct microcode_intel *mc_intel; int *delay_ucode_info_p; @@ -643,12 +643,12 @@ static void __cpuinit print_ucode(struct ucode_cpu_info *uci) * Flush global tlb. We only do this in x86_64 where paging has been enabled * already and PGE should be enabled as well. */ -static inline void __cpuinit flush_tlb_early(void) +static inline void flush_tlb_early(void) { __native_flush_tlb_global_irq_disabled(); } -static inline void __cpuinit print_ucode(struct ucode_cpu_info *uci) +static inline void print_ucode(struct ucode_cpu_info *uci) { struct microcode_intel *mc_intel; @@ -660,8 +660,8 @@ static inline void __cpuinit print_ucode(struct ucode_cpu_info *uci) } #endif -static int __cpuinit apply_microcode_early(struct mc_saved_data *mc_saved_data, - struct ucode_cpu_info *uci) +static int apply_microcode_early(struct mc_saved_data *mc_saved_data, + struct ucode_cpu_info *uci) { struct microcode_intel *mc_intel; unsigned int val[2]; @@ -763,7 +763,7 @@ load_ucode_intel_bsp(void) #endif } -void __cpuinit load_ucode_intel_ap(void) +void load_ucode_intel_ap(void) { struct mc_saved_data *mc_saved_data_p; struct ucode_cpu_info uci; diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index ac861b8348e..f4c886d9165 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -24,14 +24,14 @@ struct pci_hostbridge_probe { u32 device; }; -static u64 __cpuinitdata fam10h_pci_mmconf_base; +static u64 fam10h_pci_mmconf_base; -static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { +static struct pci_hostbridge_probe pci_probes[] = { { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, }; -static int __cpuinit cmp_range(const void *x1, const void *x2) +static int cmp_range(const void *x1, const void *x2) { const struct range *r1 = x1; const struct range *r2 = x2; @@ -49,7 +49,7 @@ static int __cpuinit cmp_range(const void *x1, const void *x2) /* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */ #define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) #define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40)) -static void __cpuinit get_fam10h_pci_mmconf_base(void) +static void get_fam10h_pci_mmconf_base(void) { int i; unsigned bus; @@ -166,7 +166,7 @@ out: fam10h_pci_mmconf_base = base; } -void __cpuinit fam10h_check_enable_mmcfg(void) +void fam10h_check_enable_mmcfg(void) { u64 val; u32 address; @@ -230,7 +230,7 @@ static const struct dmi_system_id __initconst mmconf_dmi_table[] = { {} }; -/* Called from a __cpuinit function, but only on the BSP. */ +/* Called from a non __init function, but only on the BSP. */ void __ref check_enable_amd_mmconf_dmi(void) { dmi_check_system(mmconf_dmi_table); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index ce130493b80..88458faea2f 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -200,7 +200,7 @@ static const struct file_operations msr_fops = { .compat_ioctl = msr_ioctl, }; -static int __cpuinit msr_device_create(int cpu) +static int msr_device_create(int cpu) { struct device *dev; @@ -214,8 +214,8 @@ static void msr_device_destroy(int cpu) device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); } -static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int msr_class_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int err = 0; diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 676b8c77a97..bbb6c731634 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -4,25 +4,17 @@ */ #include <linux/spinlock.h> #include <linux/module.h> +#include <linux/jump_label.h> #include <asm/paravirt.h> -static inline void -default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) -{ - arch_spin_lock(lock); -} - struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP - .spin_is_locked = __ticket_spin_is_locked, - .spin_is_contended = __ticket_spin_is_contended, - - .spin_lock = __ticket_spin_lock, - .spin_lock_flags = default_spin_lock_flags, - .spin_trylock = __ticket_spin_trylock, - .spin_unlock = __ticket_spin_unlock, + .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), + .unlock_kick = paravirt_nop, #endif }; EXPORT_SYMBOL(pv_lock_ops); +struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL(paravirt_ticketlocks_enabled); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index cd6de64cc48..1b10af835c3 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -62,11 +62,6 @@ void __init default_banner(void) pv_info.name); } -/* Simple instruction patching code. */ -#define DEF_NATIVE(ops, name, code) \ - extern const char start_##ops##_##name[], end_##ops##_##name[]; \ - asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") - /* Undefined instruction for dealing with missing ops pointers. */ static const unsigned char ud2a[] = { 0x0f, 0x0b }; @@ -324,7 +319,7 @@ struct pv_time_ops pv_time_ops = { .steal_clock = native_steal_clock, }; -struct pv_irq_ops pv_irq_ops = { +__visible struct pv_irq_ops pv_irq_ops = { .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), @@ -336,7 +331,7 @@ struct pv_irq_ops pv_irq_ops = { #endif }; -struct pv_cpu_ops pv_cpu_ops = { +__visible struct pv_cpu_ops pv_cpu_ops = { .cpuid = native_cpuid, .get_debugreg = native_get_debugreg, .set_debugreg = native_set_debugreg, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 81a5f5e8f14..c83516be105 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -36,7 +36,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); @@ -398,7 +398,7 @@ static void amd_e400_idle(void) default_idle(); } -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) +void select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f8adefca71d..884f98f6935 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -247,7 +247,7 @@ EXPORT_SYMBOL_GPL(start_thread); * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -__notrace_funcgraph struct task_struct * +__visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 05646bab4ca..bb1dc51bab0 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ asmlinkage extern void ret_from_fork(void); -DEFINE_PER_CPU(unsigned long, old_rsp); +asmlinkage DEFINE_PER_CPU(unsigned long, old_rsp); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) @@ -274,7 +274,7 @@ void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) * Kprobes not supported here. Set the probe on schedule instead. * Function graph tracer not supported too. */ -__notrace_funcgraph struct task_struct * +__visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2cb9470ea85..a16bae3f83b 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } -static struct pvclock_vsyscall_time_info *pvclock_vdso_info; - -static struct pvclock_vsyscall_time_info * -pvclock_get_vsyscall_user_time_info(int cpu) -{ - if (!pvclock_vdso_info) { - BUG(); - return NULL; - } - - return &pvclock_vdso_info[cpu]; -} - -struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) -{ - return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; -} - #ifdef CONFIG_X86_64 -static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, - void *v) -{ - struct task_migration_notifier *mn = v; - struct pvclock_vsyscall_time_info *pvti; - - pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); - - /* this is NULL when pvclock vsyscall is not initialized */ - if (unlikely(pvti == NULL)) - return NOTIFY_DONE; - - pvti->migrate_count++; - - return NOTIFY_DONE; -} - -static struct notifier_block pvclock_migrate = { - .notifier_call = pvclock_task_migrate, -}; - /* * Initialize the generic pvclock vsyscall state. This will allocate * a/some page(s) for the per-vcpu pvclock information, set up a @@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); - pvclock_vdso_info = i; - for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, __pa(i) + (idx*PAGE_SIZE), PAGE_KERNEL_VVAR); } - - register_task_migration_notifier(&pvclock_migrate); - return 0; } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e68709da825..f0de6294b95 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -170,7 +170,7 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 /* cpu data as detected by the assembly code in head.S */ -struct cpuinfo_x86 new_cpu_data __cpuinitdata = { +struct cpuinfo_x86 new_cpu_data = { .wp_works_ok = -1, }; /* common cpu data for all cpus */ @@ -206,9 +206,9 @@ EXPORT_SYMBOL(boot_cpu_data); #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -unsigned long mmu_cr4_features; +__visible unsigned long mmu_cr4_features; #else -unsigned long mmu_cr4_features = X86_CR4_PAE; +__visible unsigned long mmu_cr4_features = X86_CR4_PAE; #endif /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ @@ -426,25 +426,23 @@ static void __init reserve_initrd(void) static void __init parse_setup_data(void) { struct setup_data *data; - u64 pa_data; + u64 pa_data, pa_next; pa_data = boot_params.hdr.setup_data; while (pa_data) { - u32 data_len, map_len; + u32 data_len, map_len, data_type; map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK), (u64)sizeof(struct setup_data)); data = early_memremap(pa_data, map_len); data_len = data->len + sizeof(struct setup_data); - if (data_len > map_len) { - early_iounmap(data, map_len); - data = early_memremap(pa_data, data_len); - map_len = data_len; - } + data_type = data->type; + pa_next = data->next; + early_iounmap(data, map_len); - switch (data->type) { + switch (data_type) { case SETUP_E820_EXT: - parse_e820_ext(data); + parse_e820_ext(pa_data, data_len); break; case SETUP_DTB: add_dtb(pa_data); @@ -452,8 +450,7 @@ static void __init parse_setup_data(void) default: break; } - pa_data = data->next; - early_iounmap(data, map_len); + pa_data = pa_next; } } @@ -1070,7 +1067,7 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); - memblock.current_limit = ISA_END_ADDRESS; + memblock_set_current_limit(ISA_END_ADDRESS); memblock_x86_fill(); /* @@ -1103,7 +1100,7 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); - memblock.current_limit = get_max_mapped(); + memblock_set_current_limit(get_max_mapped()); dma_contiguous_reserve(0); /* diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cf913587d4d..9e5de6813e1 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -358,7 +358,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __save_altstack(&frame->uc.uc_stack, regs->sp); + save_altstack_ex(&frame->uc.uc_stack, regs->sp); /* Set up to return from userspace. */ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); @@ -423,7 +423,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __save_altstack(&frame->uc.uc_stack, regs->sp); + save_altstack_ex(&frame->uc.uc_stack, regs->sp); /* Set up to return from userspace. If provided, use a stub already in userspace. */ @@ -490,7 +490,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); + compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); if (ksig->ka.sa.sa_flags & SA_RESTORER) { @@ -533,7 +533,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, * Do a signal return; undo the signal stack. */ #ifdef CONFIG_X86_32 -unsigned long sys_sigreturn(void) +asmlinkage unsigned long sys_sigreturn(void) { struct pt_regs *regs = current_pt_regs(); struct sigframe __user *frame; @@ -562,7 +562,7 @@ badframe: } #endif /* CONFIG_X86_32 */ -long sys_rt_sigreturn(void) +asmlinkage long sys_rt_sigreturn(void) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; @@ -728,7 +728,7 @@ static void do_signal(struct pt_regs *regs) * notification of userspace execution resumption * - triggered by the TIF_WORK_MASK flags */ -void +__visible void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { user_exit(); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index cdaa347dfca..7c3a5a61f2e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -256,7 +256,7 @@ static inline void __smp_reschedule_interrupt(void) scheduler_ipi(); } -void smp_reschedule_interrupt(struct pt_regs *regs) +__visible void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); __smp_reschedule_interrupt(); @@ -271,7 +271,7 @@ static inline void smp_entering_irq(void) irq_enter(); } -void smp_trace_reschedule_interrupt(struct pt_regs *regs) +__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) { /* * Need to call irq_enter() before calling the trace point. @@ -295,14 +295,14 @@ static inline void __smp_call_function_interrupt(void) inc_irq_stat(irq_call_count); } -void smp_call_function_interrupt(struct pt_regs *regs) +__visible void smp_call_function_interrupt(struct pt_regs *regs) { smp_entering_irq(); __smp_call_function_interrupt(); exiting_irq(); } -void smp_trace_call_function_interrupt(struct pt_regs *regs) +__visible void smp_trace_call_function_interrupt(struct pt_regs *regs) { smp_entering_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); @@ -317,14 +317,14 @@ static inline void __smp_call_function_single_interrupt(void) inc_irq_stat(irq_call_count); } -void smp_call_function_single_interrupt(struct pt_regs *regs) +__visible void smp_call_function_single_interrupt(struct pt_regs *regs) { smp_entering_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } -void smp_trace_call_function_single_interrupt(struct pt_regs *regs) +__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) { smp_entering_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bfd348e9936..aecc98a93d1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -130,7 +130,7 @@ atomic_t init_deasserted; * Report back to the Boot Processor during boot time or to the caller processor * during CPU online. */ -static void __cpuinit smp_callin(void) +static void smp_callin(void) { int cpuid, phys_id; unsigned long timeout; @@ -237,7 +237,7 @@ static int enable_start_cpu0; /* * Activate a secondary processor. */ -notrace static void __cpuinit start_secondary(void *unused) +static void notrace start_secondary(void *unused) { /* * Don't put *anything* before cpu_init(), SMP booting is too @@ -300,7 +300,7 @@ void __init smp_store_boot_cpu_info(void) * The bootstrap kernel entry code has set these up. Save them for * a given CPU */ -void __cpuinit smp_store_cpu_info(int id) +void smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = &cpu_data(id); @@ -313,7 +313,7 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } -static bool __cpuinit +static bool topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; @@ -330,7 +330,7 @@ do { \ cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \ } while (0) -static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { if (cpu_has_topoext) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; @@ -348,7 +348,7 @@ static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } -static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; @@ -359,7 +359,7 @@ static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } -static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +static bool match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { if (c->phys_proc_id == o->phys_proc_id) { if (cpu_has(c, X86_FEATURE_AMD_DCM)) @@ -370,7 +370,7 @@ static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } -void __cpuinit set_cpu_sibling_map(int cpu) +void set_cpu_sibling_map(int cpu) { bool has_smt = smp_num_siblings > 1; bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1; @@ -499,7 +499,7 @@ void __inquire_remote_apic(int apicid) * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. */ -int __cpuinit +int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; @@ -533,7 +533,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) return (send_status | accept_status); } -static int __cpuinit +static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; @@ -649,7 +649,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) } /* reduce the number of lines printed when booting a large cpu count system */ -static void __cpuinit announce_cpu(int cpu, int apicid) +static void announce_cpu(int cpu, int apicid) { static int current_node = -1; int node = early_cpu_to_node(cpu); @@ -691,7 +691,7 @@ static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs) * We'll change this code in the future to wake up hard offlined CPU0 if * real platform and request are available. */ -static int __cpuinit +static int wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, int *cpu0_nmi_registered) { @@ -731,7 +731,7 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, * Returns zero if CPU booted OK, else error code from * ->wakeup_secondary_cpu. */ -static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) +static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) { volatile u32 *trampoline_status = (volatile u32 *) __va(real_mode_header->trampoline_status); @@ -872,7 +872,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) return boot_error; } -int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) +int native_cpu_up(unsigned int cpu, struct task_struct *tidle) { int apicid = apic->cpu_present_to_apicid(cpu); unsigned long flags; diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index dbded5aedb8..30277e27431 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -101,7 +101,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin, *begin = new_begin; } } else { - *begin = TASK_UNMAPPED_BASE; + *begin = current->mm->mmap_legacy_base; *end = TASK_SIZE; } } diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c index 147fcd4941c..e9bcd57d8a9 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/kernel/syscall_32.c @@ -15,7 +15,7 @@ typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { +__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 5c7f8c20da7..4ac730b37f0 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -4,6 +4,7 @@ #include <linux/sys.h> #include <linux/cache.h> #include <asm/asm-offsets.h> +#include <asm/syscall.h> #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) @@ -19,11 +20,9 @@ #define __SYSCALL_64(nr, sym, compat) [nr] = sym, -typedef void (*sys_call_ptr_t)(void); - extern void sys_ni_syscall(void); -const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { +asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c new file mode 100644 index 00000000000..193ec2ce46c --- /dev/null +++ b/arch/x86/kernel/sysfb.c @@ -0,0 +1,74 @@ +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +/* + * Simple-Framebuffer support for x86 systems + * Create a platform-device for any available boot framebuffer. The + * simple-framebuffer platform device is already available on DT systems, so + * this module parses the global "screen_info" object and creates a suitable + * platform device compatible with the "simple-framebuffer" DT object. If + * the framebuffer is incompatible, we instead create a legacy + * "vesa-framebuffer", "efi-framebuffer" or "platform-framebuffer" device and + * pass the screen_info as platform_data. This allows legacy drivers + * to pick these devices up without messing with simple-framebuffer drivers. + * The global "screen_info" is still valid at all times. + * + * If CONFIG_X86_SYSFB is not selected, we never register "simple-framebuffer" + * platform devices, but only use legacy framebuffer devices for + * backwards compatibility. + * + * TODO: We set the dev_id field of all platform-devices to 0. This allows + * other x86 OF/DT parsers to create such devices, too. However, they must + * start at offset 1 for this to work. + */ + +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/platform_data/simplefb.h> +#include <linux/platform_device.h> +#include <linux/screen_info.h> +#include <asm/sysfb.h> + +static __init int sysfb_init(void) +{ + struct screen_info *si = &screen_info; + struct simplefb_platform_data mode; + struct platform_device *pd; + const char *name; + bool compatible; + int ret; + + sysfb_apply_efi_quirks(); + + /* try to create a simple-framebuffer device */ + compatible = parse_mode(si, &mode); + if (compatible) { + ret = create_simplefb(si, &mode); + if (!ret) + return 0; + } + + /* if the FB is incompatible, create a legacy framebuffer device */ + if (si->orig_video_isVGA == VIDEO_TYPE_EFI) + name = "efi-framebuffer"; + else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB) + name = "vesa-framebuffer"; + else + name = "platform-framebuffer"; + + pd = platform_device_register_resndata(NULL, name, 0, + NULL, 0, si, sizeof(*si)); + return IS_ERR(pd) ? PTR_ERR(pd) : 0; +} + +/* must execute after PCI subsystem for EFI quirks */ +device_initcall(sysfb_init); diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c new file mode 100644 index 00000000000..b285d4e8c68 --- /dev/null +++ b/arch/x86/kernel/sysfb_efi.c @@ -0,0 +1,214 @@ +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * EFI Quirks Copyright (c) 2006 Edgar Hucek <gimli@dark-green.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +/* + * EFI Quirks + * Several EFI systems do not correctly advertise their boot framebuffers. + * Hence, we use this static table of known broken machines and fix up the + * information so framebuffer drivers can load corectly. + */ + +#include <linux/dmi.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/pci.h> +#include <linux/screen_info.h> +#include <video/vga.h> +#include <asm/sysfb.h> + +enum { + OVERRIDE_NONE = 0x0, + OVERRIDE_BASE = 0x1, + OVERRIDE_STRIDE = 0x2, + OVERRIDE_HEIGHT = 0x4, + OVERRIDE_WIDTH = 0x8, +}; + +struct efifb_dmi_info efifb_dmi_list[] = { + [M_I17] = { "i17", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_I20] = { "i20", 0x80010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, /* guess */ + [M_I20_SR] = { "imac7", 0x40010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, + [M_I24] = { "i24", 0x80010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, /* guess */ + [M_I24_8_1] = { "imac8", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_I24_10_1] = { "imac10", 0xc0010000, 2048 * 4, 1920, 1080, OVERRIDE_NONE }, + [M_I27_11_1] = { "imac11", 0xc0010000, 2560 * 4, 2560, 1440, OVERRIDE_NONE }, + [M_MINI]= { "mini", 0x80000000, 2048 * 4, 1024, 768, OVERRIDE_NONE }, + [M_MINI_3_1] = { "mini31", 0x40010000, 1024 * 4, 1024, 768, OVERRIDE_NONE }, + [M_MINI_4_1] = { "mini41", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MB] = { "macbook", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MB_5_1] = { "macbook51", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MB_6_1] = { "macbook61", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MB_7_1] = { "macbook71", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MBA] = { "mba", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + /* 11" Macbook Air 3,1 passes the wrong stride */ + [M_MBA_3] = { "mba3", 0, 2048 * 4, 0, 0, OVERRIDE_STRIDE }, + [M_MBP] = { "mbp", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_2] = { "mbp2", 0, 0, 0, 0, OVERRIDE_NONE }, /* placeholder */ + [M_MBP_2_2] = { "mbp22", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_SR] = { "mbp3", 0x80030000, 2048 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_4] = { "mbp4", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MBP_5_1] = { "mbp51", 0xc0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_5_2] = { "mbp52", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MBP_5_3] = { "mbp53", 0xd0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_6_1] = { "mbp61", 0x90030000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MBP_6_2] = { "mbp62", 0x90030000, 2048 * 4, 1680, 1050, OVERRIDE_NONE }, + [M_MBP_7_1] = { "mbp71", 0xc0010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MBP_8_2] = { "mbp82", 0x90010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE } +}; + +#define choose_value(dmivalue, fwvalue, field, flags) ({ \ + typeof(fwvalue) _ret_ = fwvalue; \ + if ((flags) & (field)) \ + _ret_ = dmivalue; \ + else if ((fwvalue) == 0) \ + _ret_ = dmivalue; \ + _ret_; \ + }) + +static int __init efifb_set_system(const struct dmi_system_id *id) +{ + struct efifb_dmi_info *info = id->driver_data; + + if (info->base == 0 && info->height == 0 && info->width == 0 && + info->stride == 0) + return 0; + + /* Trust the bootloader over the DMI tables */ + if (screen_info.lfb_base == 0) { +#if defined(CONFIG_PCI) + struct pci_dev *dev = NULL; + int found_bar = 0; +#endif + if (info->base) { + screen_info.lfb_base = choose_value(info->base, + screen_info.lfb_base, OVERRIDE_BASE, + info->flags); + +#if defined(CONFIG_PCI) + /* make sure that the address in the table is actually + * on a VGA device's PCI BAR */ + + for_each_pci_dev(dev) { + int i; + if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) + continue; + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + resource_size_t start, end; + + start = pci_resource_start(dev, i); + if (start == 0) + break; + end = pci_resource_end(dev, i); + if (screen_info.lfb_base >= start && + screen_info.lfb_base < end) { + found_bar = 1; + } + } + } + if (!found_bar) + screen_info.lfb_base = 0; +#endif + } + } + if (screen_info.lfb_base) { + screen_info.lfb_linelength = choose_value(info->stride, + screen_info.lfb_linelength, OVERRIDE_STRIDE, + info->flags); + screen_info.lfb_width = choose_value(info->width, + screen_info.lfb_width, OVERRIDE_WIDTH, + info->flags); + screen_info.lfb_height = choose_value(info->height, + screen_info.lfb_height, OVERRIDE_HEIGHT, + info->flags); + if (screen_info.orig_video_isVGA == 0) + screen_info.orig_video_isVGA = VIDEO_TYPE_EFI; + } else { + screen_info.lfb_linelength = 0; + screen_info.lfb_width = 0; + screen_info.lfb_height = 0; + screen_info.orig_video_isVGA = 0; + return 0; + } + + printk(KERN_INFO "efifb: dmi detected %s - framebuffer at 0x%08x " + "(%dx%d, stride %d)\n", id->ident, + screen_info.lfb_base, screen_info.lfb_width, + screen_info.lfb_height, screen_info.lfb_linelength); + + return 1; +} + +#define EFIFB_DMI_SYSTEM_ID(vendor, name, enumid) \ + { \ + efifb_set_system, \ + name, \ + { \ + DMI_MATCH(DMI_BIOS_VENDOR, vendor), \ + DMI_MATCH(DMI_PRODUCT_NAME, name) \ + }, \ + &efifb_dmi_list[enumid] \ + } + +static const struct dmi_system_id efifb_dmi_system_table[] __initconst = { + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac4,1", M_I17), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac5,1", M_I20), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac5,1", M_I20), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac6,1", M_I24), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac6,1", M_I24), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac7,1", M_I20_SR), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac8,1", M_I24_8_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac10,1", M_I24_10_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac11,1", M_I27_11_1), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "Macmini1,1", M_MINI), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini3,1", M_MINI_3_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini4,1", M_MINI_4_1), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook1,1", M_MB), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook2,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook2,1", M_MB), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook3,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook3,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook4,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook5,1", M_MB_5_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook6,1", M_MB_6_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook7,1", M_MB_7_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir1,1", M_MBA), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir3,1", M_MBA_3), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro1,1", M_MBP), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,1", M_MBP_2), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,2", M_MBP_2_2), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro2,1", M_MBP_2), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro3,1", M_MBP_SR), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro3,1", M_MBP_SR), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro4,1", M_MBP_4), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,1", M_MBP_5_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,2", M_MBP_5_2), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,3", M_MBP_5_3), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,1", M_MBP_6_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,2", M_MBP_6_2), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro7,1", M_MBP_7_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro8,2", M_MBP_8_2), + {}, +}; + +__init void sysfb_apply_efi_quirks(void) +{ + if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI || + !(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS)) + dmi_check_system(efifb_dmi_system_table); +} diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c new file mode 100644 index 00000000000..22513e96b01 --- /dev/null +++ b/arch/x86/kernel/sysfb_simplefb.c @@ -0,0 +1,95 @@ +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +/* + * simple-framebuffer probing + * Try to convert "screen_info" into a "simple-framebuffer" compatible mode. + * If the mode is incompatible, we return "false" and let the caller create + * legacy nodes instead. + */ + +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/platform_data/simplefb.h> +#include <linux/platform_device.h> +#include <linux/screen_info.h> +#include <asm/sysfb.h> + +static const char simplefb_resname[] = "BOOTFB"; +static const struct simplefb_format formats[] = SIMPLEFB_FORMATS; + +/* try parsing x86 screen_info into a simple-framebuffer mode struct */ +__init bool parse_mode(const struct screen_info *si, + struct simplefb_platform_data *mode) +{ + const struct simplefb_format *f; + __u8 type; + unsigned int i; + + type = si->orig_video_isVGA; + if (type != VIDEO_TYPE_VLFB && type != VIDEO_TYPE_EFI) + return false; + + for (i = 0; i < ARRAY_SIZE(formats); ++i) { + f = &formats[i]; + if (si->lfb_depth == f->bits_per_pixel && + si->red_size == f->red.length && + si->red_pos == f->red.offset && + si->green_size == f->green.length && + si->green_pos == f->green.offset && + si->blue_size == f->blue.length && + si->blue_pos == f->blue.offset && + si->rsvd_size == f->transp.length && + si->rsvd_pos == f->transp.offset) { + mode->format = f->name; + mode->width = si->lfb_width; + mode->height = si->lfb_height; + mode->stride = si->lfb_linelength; + return true; + } + } + + return false; +} + +__init int create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode) +{ + struct platform_device *pd; + struct resource res; + unsigned long len; + + /* don't use lfb_size as it may contain the whole VMEM instead of only + * the part that is occupied by the framebuffer */ + len = mode->height * mode->stride; + len = PAGE_ALIGN(len); + if (len > si->lfb_size << 16) { + printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n"); + return -EINVAL; + } + + /* setup IORESOURCE_MEM as framebuffer memory */ + memset(&res, 0, sizeof(res)); + res.flags = IORESOURCE_MEM; + res.name = simplefb_resname; + res.start = si->lfb_base; + res.end = si->lfb_base + len - 1; + if (res.end <= res.start) + return -EINVAL; + + pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0, + &res, 1, mode, sizeof(*mode)); + if (IS_ERR(pd)) + return PTR_ERR(pd); + + return 0; +} diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 3ff42d2f046..91a4496db43 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -301,6 +301,15 @@ static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) return 0; } +static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b) +{ + if (!tboot_enabled()) + return 0; + + pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)"); + return -ENODEV; +} + static atomic_t ap_wfs_count; static int tboot_wait_for_aps(int num_aps) @@ -320,8 +329,8 @@ static int tboot_wait_for_aps(int num_aps) return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); } -static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int tboot_cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) { switch (action) { case CPU_DYING: @@ -334,7 +343,7 @@ static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block tboot_cpu_notifier __cpuinitdata = +static struct notifier_block tboot_cpu_notifier = { .notifier_call = tboot_cpu_callback, }; @@ -422,6 +431,7 @@ static __init int tboot_late_init(void) #endif acpi_os_set_prepare_sleep(&tboot_sleep); + acpi_os_set_prepare_extended_sleep(&tboot_extended_sleep); return 0; } diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 4e584a8d6ed..1c113db9ed5 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -12,10 +12,8 @@ atomic_t trace_idt_ctr = ATOMIC_INIT(0); struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) trace_idt_table }; -#ifndef CONFIG_X86_64 -gate_desc trace_idt_table[NR_VECTORS] __page_aligned_data - = { { { { 0, 0 } } }, }; -#endif +/* No need to be aligned, but done to keep all IDTs defined the same way. */ +gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; static int trace_irq_vector_refcount; static DEFINE_MUTEX(irq_vector_mutex); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b0865e88d3c..8c8093b146c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -58,24 +58,25 @@ #include <asm/mce.h> #include <asm/fixmap.h> #include <asm/mach_traps.h> +#include <asm/alternative.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> #include <asm/pgalloc.h> #include <asm/proto.h> + +/* No need to be aligned, but done to keep all IDTs defined the same way. */ +gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; #else #include <asm/processor-flags.h> #include <asm/setup.h> asmlinkage int system_call(void); - -/* - * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround. - */ -gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; #endif +/* Must be page-aligned because the real IDT is used in a fixmap. */ +gate_desc idt_table[NR_VECTORS] __page_aligned_bss; + DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); @@ -327,6 +328,9 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co ftrace_int3_handler(regs)) return; #endif + if (poke_int3_handler(regs)) + return; + prev_state = exception_enter(); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 098b3cfda72..930e5d48f56 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -89,6 +89,12 @@ int check_tsc_unstable(void) } EXPORT_SYMBOL_GPL(check_tsc_unstable); +int check_tsc_disabled(void) +{ + return tsc_disabled; +} +EXPORT_SYMBOL_GPL(check_tsc_disabled); + #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { @@ -824,7 +830,7 @@ static void __init check_system_tsc_reliable(void) * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. */ -__cpuinit int unsynchronized_tsc(void) +int unsynchronized_tsc(void) { if (!cpu_has_tsc || tsc_unstable) return 1; @@ -1020,7 +1026,7 @@ void __init tsc_init(void) * been calibrated. This assumes that CONSTANT_TSC applies to all * cpus in the socket - this should be a safe assumption. */ -unsigned long __cpuinit calibrate_delay_is_known(void) +unsigned long calibrate_delay_is_known(void) { int i, cpu = smp_processor_id(); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index fc25e60a588..adfdf56a371 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -25,24 +25,24 @@ * Entry/exit counters that make sure that both CPUs * run the measurement code at once: */ -static __cpuinitdata atomic_t start_count; -static __cpuinitdata atomic_t stop_count; +static atomic_t start_count; +static atomic_t stop_count; /* * We use a raw spinlock in this exceptional case, because * we want to have the fastest, inlined, non-debug version * of a critical section, to be able to prove TSC time-warps: */ -static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; -static __cpuinitdata cycles_t last_tsc; -static __cpuinitdata cycles_t max_warp; -static __cpuinitdata int nr_warps; +static cycles_t last_tsc; +static cycles_t max_warp; +static int nr_warps; /* * TSC-warp measurement loop running on both CPUs: */ -static __cpuinit void check_tsc_warp(unsigned int timeout) +static void check_tsc_warp(unsigned int timeout) { cycles_t start, now, prev, end; int i; @@ -121,7 +121,7 @@ static inline unsigned int loop_timeout(int cpu) * Source CPU calls into this - it waits for the freshly booted * target CPU to arrive and then starts the measurement: */ -void __cpuinit check_tsc_sync_source(int cpu) +void check_tsc_sync_source(int cpu) { int cpus = 2; @@ -187,7 +187,7 @@ void __cpuinit check_tsc_sync_source(int cpu) /* * Freshly booted CPUs call into this: */ -void __cpuinit check_tsc_sync_target(void) +void check_tsc_sync_target(void) { int cpus = 2; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 9a907a67be8..1f96f9347ed 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -331,7 +331,7 @@ sigsegv: * Assume __initcall executes before all user space. Hopefully kmod * doesn't violate that. We'll find out if it does. */ -static void __cpuinit vsyscall_set_cpu(int cpu) +static void vsyscall_set_cpu(int cpu) { unsigned long d; unsigned long node = 0; @@ -353,13 +353,13 @@ static void __cpuinit vsyscall_set_cpu(int cpu) write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); } -static void __cpuinit cpu_vsyscall_init(void *arg) +static void cpu_vsyscall_init(void *arg) { /* preemption should be already off */ vsyscall_set_cpu(raw_smp_processor_id()); } -static int __cpuinit +static int cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) { long cpu = (long)arg; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 45a14dbbdda..5f24c71acca 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -25,7 +25,7 @@ #include <asm/iommu.h> #include <asm/mach_traps.h> -void __cpuinit x86_init_noop(void) { } +void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } @@ -85,7 +85,7 @@ struct x86_init_ops x86_init __initdata = { }, }; -struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { +struct x86_cpuinit_ops x86_cpuinit = { .early_percpu_clock_init = x86_init_noop, .setup_percpu_clockev = setup_secondary_APIC_clock, }; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index d6c28acdf99..422fd822347 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -573,7 +573,7 @@ static void __init xstate_enable_boot_cpu(void) * This is somewhat obfuscated due to the lack of powerful enough * overrides for the section checks. */ -void __cpuinit xsave_init(void) +void xsave_init(void) { static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; void (*this_func)(void); @@ -594,7 +594,7 @@ static inline void __init eager_fpu_init_bp(void) setup_init_fpu_buf(); } -void __cpuinit eager_fpu_init(void) +void eager_fpu_init(void) { static __refdata void (*boot_func)(void) = eager_fpu_init_bp; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a20ecb5b6cb..b110fe6c03d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -413,7 +413,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_PV_EOI) | - (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | + (1 << KVM_FEATURE_PV_UNHALT); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index afc11245827..5439117d5c4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) *((u32 *) (apic->regs + reg_off)) = val; } -static inline int apic_test_and_set_vector(int vec, void *bitmap) -{ - return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - -static inline int apic_test_and_clear_vector(int vec, void *bitmap) -{ - return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - static inline int apic_test_vector(int vec, void *bitmap) { return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); -static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) +static inline void apic_set_irr(int vec, struct kvm_lapic *apic) { apic->irr_pending = true; - return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); + apic_set_vector(vec, apic->regs + APIC_IRR); } static inline int apic_search_irr(struct kvm_lapic *apic) @@ -681,32 +671,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (unlikely(!apic_enabled(apic))) break; + result = 1; + if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); - if (kvm_x86_ops->deliver_posted_interrupt) { - result = 1; + if (kvm_x86_ops->deliver_posted_interrupt) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); - } else { - result = !apic_test_and_set_irr(vector, apic); - - if (!result) { - if (trig_mode) - apic_debug("level trig mode repeatedly " - "for vector %d", vector); - goto out; - } + else { + apic_set_irr(vector, apic); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } -out: trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector, !result); + trig_mode, vector, false); break; case APIC_DM_REMRD: - apic_debug("Ignoring delivery mode 3\n"); + result = 1; + vcpu->arch.pv.pv_unhalted = 1; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); break; case APIC_DM_SMI: diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0d094da4954..6e2d2c8f230 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -132,8 +132,8 @@ module_param(dbg, bool, 0644); (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ - | PT64_NX_MASK) +#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ + | shadow_x_mask | shadow_nx_mask) #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK @@ -331,11 +331,6 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_dirty_gpte(unsigned long pte) -{ - return pte & PT_DIRTY_MASK; -} - static int is_rmap_spte(u64 pte) { return is_shadow_present_pte(pte); @@ -2052,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) return __shadow_walk_next(iterator, *iterator->sptep); } -static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) +static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed) { u64 spte; + BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK || + VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_accessed_mask; + shadow_user_mask | shadow_x_mask; + + if (accessed) + spte |= shadow_accessed_mask; mmu_spte_set(sptep, spte); } @@ -2574,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7; - - bit7 = (gpte >> 7) & 1; - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; -} - static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { @@ -2594,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, return gfn_to_pfn_memslot_atomic(slot, gfn); } -static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - u64 gpte) -{ - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; - - if (!is_present_gpte(gpte)) - goto no_present; - - if (!(gpte & PT_ACCESSED_MASK)) - goto no_present; - - return false; - -no_present: - drop_spte(vcpu->kvm, spte); - return true; -} - static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -2710,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, iterator.level - 1, 1, ACC_ALL, iterator.sptep); - link_shadow_page(iterator.sptep, sp); + link_shadow_page(iterator.sptep, sp, true); } } return emulate; @@ -2808,9 +2781,16 @@ exit: return ret; } -static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) +static bool page_fault_can_be_fast(u32 error_code) { /* + * Do not fix the mmio spte with invalid generation number which + * need to be updated by slow page fault path. + */ + if (unlikely(error_code & PFERR_RSVD_MASK)) + return false; + + /* * #PF can be fast only if the shadow page table is present and it * is caused by write-protect, that means we just need change the * W bit of the spte which can be done out of mmu-lock. @@ -2854,7 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, bool ret = false; u64 spte = 0ull; - if (!page_fault_can_be_fast(vcpu, error_code)) + if (!page_fault_can_be_fast(error_code)) return false; walk_shadow_page_lockless_begin(vcpu); @@ -3202,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); } +EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -3471,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) ++vcpu->stat.tlb_flush; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } +EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb); static void paging_new_cr3(struct kvm_vcpu *vcpu) { @@ -3494,18 +3476,6 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static inline void protect_clean_gpte(unsigned *access, unsigned gpte) -{ - unsigned mask; - - BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); - - mask = (unsigned)~ACC_WRITE_MASK; - /* Allow write access to dirty gptes */ - mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK; - *access &= mask; -} - static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { @@ -3523,16 +3493,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, return false; } -static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte) -{ - unsigned access; - - access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - access &= ~(gpte >> PT64_NX_SHIFT); - - return access; -} - static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) { unsigned index; @@ -3542,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp return mmu->last_pte_bitmap & (1 << index); } +#define PTTYPE_EPT 18 /* arbitrary */ +#define PTTYPE PTTYPE_EPT +#include "paging_tmpl.h" +#undef PTTYPE + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE @@ -3556,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; + context->bad_mt_xwr = 0; + if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); switch (context->root_level) { @@ -3611,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, } } -static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + int maxphyaddr = cpuid_maxphyaddr(vcpu); + int pte; + + context->rsvd_bits_mask[0][3] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); + context->rsvd_bits_mask[0][2] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); + context->rsvd_bits_mask[0][1] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); + context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); + + /* large page */ + context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; + context->rsvd_bits_mask[1][2] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); + context->rsvd_bits_mask[1][1] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + + for (pte = 0; pte < 64; pte++) { + int rwx_bits = pte & 7; + int mt = pte >> 3; + if (mt == 0x2 || mt == 0x3 || mt == 0x7 || + rwx_bits == 0x2 || rwx_bits == 0x6 || + (rwx_bits == 0x4 && !execonly)) + context->bad_mt_xwr |= (1ull << pte); + } +} + +static void update_permission_bitmask(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, bool ept) { unsigned bit, byte, pfec; u8 map; @@ -3629,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu w = bit & ACC_WRITE_MASK; u = bit & ACC_USER_MASK; - /* Not really needed: !nx will cause pte.nx to fault */ - x |= !mmu->nx; - /* Allow supervisor writes if !cr0.wp */ - w |= !is_write_protection(vcpu) && !uf; - /* Disallow supervisor fetches of user code if cr4.smep */ - x &= !(smep && u && !uf); + if (!ept) { + /* Not really needed: !nx will cause pte.nx to fault */ + x |= !mmu->nx; + /* Allow supervisor writes if !cr0.wp */ + w |= !is_write_protection(vcpu) && !uf; + /* Disallow supervisor fetches of user code if cr4.smep */ + x &= !(smep && u && !uf); + } else + /* Not really needed: no U/S accesses on ept */ + u = 1; fault = (ff && !x) || (uf && !u) || (wf && !w); map |= fault << bit; @@ -3669,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->root_level = level; reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context); + update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); ASSERT(is_pae(vcpu)); @@ -3699,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->root_level = PT32_ROOT_LEVEL; reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context); + update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); context->new_cr3 = paging_new_cr3; @@ -3761,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; } - update_permission_bitmask(vcpu, context); + update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); return 0; @@ -3793,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); +int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + bool execonly) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + context->shadow_root_level = kvm_x86_ops->get_tdp_level(); + + context->nx = true; + context->new_cr3 = paging_new_cr3; + context->page_fault = ept_page_fault; + context->gva_to_gpa = ept_gva_to_gpa; + context->sync_page = ept_sync_page; + context->invlpg = ept_invlpg; + context->update_pte = ept_update_pte; + context->free = paging_free; + context->root_level = context->shadow_root_level; + context->root_hpa = INVALID_PAGE; + context->direct_map = false; + + update_permission_bitmask(vcpu, context, true); + reset_rsvds_bits_mask_ept(vcpu, context, execonly); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); + static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); @@ -3840,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->gva_to_gpa = paging32_gva_to_gpa_nested; } - update_permission_bitmask(vcpu, g_context); + update_permission_bitmask(vcpu, g_context, false); update_last_pte_bitmap(vcpu, g_context); return 0; @@ -3916,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new) return true; if ((old ^ new) & PT64_BASE_ADDR_MASK) return true; - old ^= PT64_NX_MASK; - new ^= PT64_NX_MASK; + old ^= shadow_nx_mask; + new ^= shadow_nx_mask; return (old & ~new & PT64_PERM_MASK) != 0; } @@ -4175,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, switch (er) { case EMULATE_DONE: return 1; - case EMULATE_DO_MMIO: + case EMULATE_USER_EXIT: ++vcpu->stat.mmio_exits; /* fall through */ case EMULATE_FAIL: @@ -4383,11 +4414,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) /* * The very rare case: if the generation-number is round, * zap all shadow pages. - * - * The max value is MMIO_MAX_GEN - 1 since it is not called - * when mark memslot invalid. */ - if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) { + if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) { printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 5b59c573aba..77e044a0f5f 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -71,6 +71,8 @@ enum { int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); +int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + bool execonly); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 7769699d48a..04333015917 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -23,6 +23,13 @@ * so the code in this file is compiled twice, once per pte size. */ +/* + * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro + * uses for EPT without A/D paging type. + */ +extern u64 __pure __using_nonexistent_pte_bit(void) + __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT"); + #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 @@ -32,6 +39,10 @@ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 #define CMPXCHG cmpxchg @@ -49,7 +60,26 @@ #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define CMPXCHG cmpxchg +#elif PTTYPE == PTTYPE_EPT + #define pt_element_t u64 + #define guest_walker guest_walkerEPT + #define FNAME(name) ept_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_GUEST_ACCESSED_MASK 0 + #define PT_GUEST_DIRTY_MASK 0 + #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit() + #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() + #define CMPXCHG cmpxchg64 + #define PT_MAX_FULL_LEVELS 4 #else #error Invalid PTTYPE value #endif @@ -80,6 +110,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } +static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) +{ + unsigned mask; + + /* dirty bit is not supported, so no need to track it */ + if (!PT_GUEST_DIRTY_MASK) + return; + + BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); + + mask = (unsigned)~ACC_WRITE_MASK; + /* Allow write access to dirty gptes */ + mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & + PT_WRITABLE_MASK; + *access &= mask; +} + +static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) +{ + int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f; + + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) | + ((mmu->bad_mt_xwr & (1ull << low6)) != 0); +} + +static inline int FNAME(is_present_gpte)(unsigned long pte) +{ +#if PTTYPE != PTTYPE_EPT + return is_present_gpte(pte); +#else + return pte & 7; +#endif +} + static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t __user *ptep_user, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) @@ -103,6 +167,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } +static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + u64 gpte) +{ + if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + + if (!FNAME(is_present_gpte)(gpte)) + goto no_present; + + /* if accessed bit is not supported prefetch non accessed gpte */ + if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK)) + goto no_present; + + return false; + +no_present: + drop_spte(vcpu->kvm, spte); + return true; +} + +static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned access; +#if PTTYPE == PTTYPE_EPT + access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | + ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | + ACC_USER_MASK; +#else + access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; + access &= ~(gpte >> PT64_NX_SHIFT); +#endif + + return access; +} + static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, @@ -114,18 +214,23 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, gfn_t table_gfn; int ret; + /* dirty/accessed bits are not supported, so no need to update them */ + if (!PT_GUEST_DIRTY_MASK) + return 0; + for (level = walker->max_level; level >= walker->level; --level) { pte = orig_pte = walker->ptes[level - 1]; table_gfn = walker->table_gfn[level - 1]; ptep_user = walker->ptep_user[level - 1]; index = offset_in_page(ptep_user) / sizeof(pt_element_t); - if (!(pte & PT_ACCESSED_MASK)) { + if (!(pte & PT_GUEST_ACCESSED_MASK)) { trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); - pte |= PT_ACCESSED_MASK; + pte |= PT_GUEST_ACCESSED_MASK; } - if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { + if (level == walker->level && write_fault && + !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - pte |= PT_DIRTY_MASK; + pte |= PT_GUEST_DIRTY_MASK; } if (pte == orig_pte) continue; @@ -170,7 +275,7 @@ retry_walk: if (walker->level == PT32E_ROOT_LEVEL) { pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) + if (!FNAME(is_present_gpte)(pte)) goto error; --walker->level; } @@ -179,7 +284,7 @@ retry_walk: ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); - accessed_dirty = PT_ACCESSED_MASK; + accessed_dirty = PT_GUEST_ACCESSED_MASK; pt_access = pte_access = ACC_ALL; ++walker->level; @@ -215,17 +320,17 @@ retry_walk: trace_kvm_mmu_paging_element(pte, walker->level); - if (unlikely(!is_present_gpte(pte))) + if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; - if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, - walker->level))) { + if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, + walker->level))) { errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } accessed_dirty &= pte; - pte_access = pt_access & gpte_access(vcpu, pte); + pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); @@ -248,13 +353,15 @@ retry_walk: walker->gfn = real_gpa >> PAGE_SHIFT; if (!write_fault) - protect_clean_gpte(&pte_access, pte); + FNAME(protect_clean_gpte)(&pte_access, pte); else /* - * On a write fault, fold the dirty bit into accessed_dirty by - * shifting it one place right. + * On a write fault, fold the dirty bit into accessed_dirty. + * For modes without A/D bits support accessed_dirty will be + * always clear. */ - accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); + accessed_dirty &= pte >> + (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT); if (unlikely(!accessed_dirty)) { ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); @@ -279,6 +386,25 @@ error: walker->fault.vector = PF_VECTOR; walker->fault.error_code_valid = true; walker->fault.error_code = errcode; + +#if PTTYPE == PTTYPE_EPT + /* + * Use PFERR_RSVD_MASK in error_code to to tell if EPT + * misconfiguration requires to be injected. The detection is + * done by is_rsvd_bits_set() above. + * + * We set up the value of exit_qualification to inject: + * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation + * [5:3] - Calculated by the page walk of the guest EPT page tables + * [7:8] - Derived from [7:8] of real exit_qualification + * + * The other bits are set to 0. + */ + if (!(errcode & PFERR_RSVD_MASK)) { + vcpu->arch.exit_qualification &= 0x187; + vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3; + } +#endif walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; @@ -293,6 +419,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, access); } +#if PTTYPE != PTTYPE_EPT static int FNAME(walk_addr_nested)(struct guest_walker *walker, struct kvm_vcpu *vcpu, gva_t addr, u32 access) @@ -300,6 +427,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, addr, access); } +#endif static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -309,14 +437,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn_t gfn; pfn_t pfn; - if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access & gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + FNAME(protect_clean_gpte)(&pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); if (is_error_pfn(pfn)) @@ -446,7 +574,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, goto out_gpte_changed; if (sp) - link_shadow_page(it.sptep, sp); + link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); } for (; @@ -466,7 +594,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, true, direct_access, it.sptep); - link_shadow_page(it.sptep, sp); + link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); } clear_sp_write_flooding_count(it.sptep); @@ -727,6 +855,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, return gpa; } +#if PTTYPE != PTTYPE_EPT static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -745,6 +874,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, return gpa; } +#endif /* * Using the cached information from sp->gfns is safe because: @@ -785,15 +915,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { vcpu->kvm->tlbs_dirty++; continue; } gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); + pte_access &= FNAME(gpte_access)(vcpu, gpte); + FNAME(protect_clean_gpte)(&pte_access, gpte); if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, &nr_present)) @@ -830,3 +960,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef gpte_to_gfn #undef gpte_to_gfn_lvl #undef CMPXCHG +#undef PT_GUEST_ACCESSED_MASK +#undef PT_GUEST_DIRTY_MASK +#undef PT_GUEST_DIRTY_SHIFT +#undef PT_GUEST_ACCESSED_SHIFT diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index c53e797e736..5c4f63151b4 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -160,7 +160,7 @@ static void stop_counter(struct kvm_pmc *pmc) static void reprogram_counter(struct kvm_pmc *pmc, u32 type, unsigned config, bool exclude_user, bool exclude_kernel, - bool intr) + bool intr, bool in_tx, bool in_tx_cp) { struct perf_event *event; struct perf_event_attr attr = { @@ -173,6 +173,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; + if (in_tx) + attr.config |= HSW_IN_TX; + if (in_tx_cp) + attr.config |= HSW_IN_TX_CHECKPOINTED; attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); @@ -226,7 +230,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK))) { + ARCH_PERFMON_EVENTSEL_CMASK | + HSW_IN_TX | + HSW_IN_TX_CHECKPOINTED))) { config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, unit_mask); if (config != PERF_COUNT_HW_MAX) @@ -239,7 +245,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) reprogram_counter(pmc, type, config, !(eventsel & ARCH_PERFMON_EVENTSEL_USR), !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); + eventsel & ARCH_PERFMON_EVENTSEL_INT, + (eventsel & HSW_IN_TX), + (eventsel & HSW_IN_TX_CHECKPOINTED)); } static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) @@ -256,7 +264,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) arch_events[fixed_pmc_events[idx]].event_type, !(en & 0x2), /* exclude user */ !(en & 0x1), /* exclude kernel */ - pmi); + pmi, false, false); } static inline u8 fixed_en_pmi(u64 ctrl, int idx) @@ -408,7 +416,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) return 0; - if (!(data & 0xffffffff00200000ull)) { + if (!(data & pmu->reserved_bits)) { reprogram_gp_counter(pmc, data); return 0; } @@ -450,6 +458,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_GP] = 0; pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->version = 0; + pmu->reserved_bits = 0xffffffff00200000ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry) @@ -478,6 +487,12 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; + + entry = kvm_find_cpuid_entry(vcpu, 7, 0); + if (entry && + (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) + pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; } void kvm_pmu_init(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 064d0be67ec..1f1da43ff2a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -373,6 +373,7 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + u64 msr_ia32_feature_control; }; #define POSTED_INTR_ON 0 @@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page) kvm_release_page_clean(page); } +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) (vmcs12->secondary_vm_exec_control & bit); } -static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, - struct kvm_vcpu *vcpu) +static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) { return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } +static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -2155,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; static u32 nested_vmx_misc_low, nested_vmx_misc_high; +static u32 nested_vmx_ept_caps; static __init void nested_vmx_setup_ctls_msrs(void) { /* @@ -2190,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void) * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and * 17 must be 1. */ + rdmsr(MSR_IA32_VMX_EXIT_CTLS, + nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ + nested_vmx_exit_ctls_high &= #ifdef CONFIG_X86_64 - nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; -#else - nested_vmx_exit_ctls_high = 0; + VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER); /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2205,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_entry_ctls_high &= - VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; - nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; +#ifdef CONFIG_X86_64 + VM_ENTRY_IA32E_MODE | +#endif + VM_ENTRY_LOAD_IA32_PAT; + nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | + VM_ENTRY_LOAD_IA32_EFER); /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -2241,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_WBINVD_EXITING; + if (enable_ept) { + /* nested EPT: emulate EPT also to L1 */ + nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; + nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; + nested_vmx_ept_caps &= vmx_capability.ept; + /* + * Since invept is completely emulated we support both global + * and context invalidation independent of what host cpu + * supports + */ + nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | + VMX_EPT_EXTENT_CONTEXT_BIT; + } else + nested_vmx_ept_caps = 0; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | @@ -2282,8 +2311,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_FEATURE_CONTROL: - *pdata = 0; - break; + if (nested_vmx_allowed(vcpu)) { + *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control; + break; + } + return 0; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2346,8 +2378,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) nested_vmx_secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: - /* Currently, no nested ept or nested vpid */ - *pdata = 0; + /* Currently, no nested vpid support */ + *pdata = nested_vmx_ept_caps; break; default: return 0; @@ -2356,14 +2388,24 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 1; } -static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + u32 msr_index = msr_info->index; + u64 data = msr_info->data; + bool host_initialized = msr_info->host_initiated; + if (!nested_vmx_allowed(vcpu)) return 0; - if (msr_index == MSR_IA32_FEATURE_CONTROL) - /* TODO: the right thing. */ + if (msr_index == MSR_IA32_FEATURE_CONTROL) { + if (!host_initialized && + to_vmx(vcpu)->nested.msr_ia32_feature_control + & FEATURE_CONTROL_LOCKED) + return 0; + to_vmx(vcpu)->nested.msr_ia32_feature_control = data; return 1; + } + /* * No need to treat VMX capability MSRs specially: If we don't handle * them, handle_wrmsr will #GP(0), which is correct (they are readonly) @@ -2494,7 +2536,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; /* Otherwise falls through */ default: - if (vmx_set_vmx_msr(vcpu, msr_index, data)) + if (vmx_set_vmx_msr(vcpu, msr_info)) break; msr = find_msr_entry(vmx, msr_index); if (msr) { @@ -5302,9 +5344,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) /* It is a write fault? */ error_code = exit_qualification & (1U << 1); + /* It is a fetch fault? */ + error_code |= (exit_qualification & (1U << 2)) << 2; /* ept page table is present? */ error_code |= (exit_qualification >> 3) & 0x1; + vcpu->arch.exit_qualification = exit_qualification; + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } @@ -5438,7 +5484,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); - if (err == EMULATE_DO_MMIO) { + if (err == EMULATE_USER_EXIT) { + ++vcpu->stat.mmio_exits; ret = 0; goto out; } @@ -5567,8 +5614,47 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) free_loaded_vmcs(&vmx->vmcs01); } +/* + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), + * set the success or error code of an emulated VMX instruction, as specified + * by Vol 2B, VMX Instruction Reference, "Conventions". + */ +static void nested_vmx_succeed(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); +} + +static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_CF); +} + static void nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error); + u32 vm_instruction_error) +{ + if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { + /* + * failValid writes the error number to the current VMCS, which + * can't be done there isn't a current VMCS. + */ + nested_vmx_failInvalid(vcpu); + return; + } + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_ZF); + get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; + /* + * We don't need to force a shadow sync because + * VM_INSTRUCTION_ERROR is not shadowed + */ +} /* * Emulate the VMXON instruction. @@ -5583,6 +5669,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu) struct kvm_segment cs; struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs *shadow_vmcs; + const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED + | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; /* The Intel VMX Instruction Reference lists a bunch of bits that * are prerequisite to running VMXON, most notably cr4.VMXE must be @@ -5611,6 +5699,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } + + if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) @@ -5628,6 +5723,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) vmx->nested.vmxon = true; skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); return 1; } @@ -5712,6 +5808,7 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) return 1; free_nested(to_vmx(vcpu)); skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); return 1; } @@ -5768,48 +5865,6 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, return 0; } -/* - * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), - * set the success or error code of an emulated VMX instruction, as specified - * by Vol 2B, VMX Instruction Reference, "Conventions". - */ -static void nested_vmx_succeed(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); -} - -static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_CF); -} - -static void nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error) -{ - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { - /* - * failValid writes the error number to the current VMCS, which - * can't be done there isn't a current VMCS. - */ - nested_vmx_failInvalid(vcpu); - return; - } - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_ZF); - get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; - /* - * We don't need to force a shadow sync because - * VM_INSTRUCTION_ERROR is not shadowed - */ -} - /* Emulate the VMCLEAR instruction */ static int handle_vmclear(struct kvm_vcpu *vcpu) { @@ -5972,8 +6027,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) unsigned long field; u64 field_value; struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; - unsigned long *fields = (unsigned long *)shadow_read_write_fields; - int num_fields = max_shadow_read_write_fields; + const unsigned long *fields = shadow_read_write_fields; + const int num_fields = max_shadow_read_write_fields; vmcs_load(shadow_vmcs); @@ -6002,12 +6057,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) { - unsigned long *fields[] = { - (unsigned long *)shadow_read_write_fields, - (unsigned long *)shadow_read_only_fields + const unsigned long *fields[] = { + shadow_read_write_fields, + shadow_read_only_fields }; - int num_lists = ARRAY_SIZE(fields); - int max_fields[] = { + const int max_fields[] = { max_shadow_read_write_fields, max_shadow_read_only_fields }; @@ -6018,7 +6072,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) vmcs_load(shadow_vmcs); - for (q = 0; q < num_lists; q++) { + for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; vmcs12_read_any(&vmx->vcpu, field, &field_value); @@ -6248,6 +6302,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; } +/* Emulate the INVEPT instruction */ +static int handle_invept(struct kvm_vcpu *vcpu) +{ + u32 vmx_instruction_info, types; + unsigned long type; + gva_t gva; + struct x86_exception e; + struct { + u64 eptp, gpa; + } operand; + u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK; + + if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || + !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + + if (!(types & (1UL << type))) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + + /* According to the Intel VMX instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, &gva)) + return 1; + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, + sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (type) { + case VMX_EPT_EXTENT_CONTEXT: + if ((operand.eptp & eptp_mask) != + (nested_ept_get_cr3(vcpu) & eptp_mask)) + break; + case VMX_EPT_EXTENT_GLOBAL: + kvm_mmu_sync_roots(vcpu); + kvm_mmu_flush_tlb(vcpu); + nested_vmx_succeed(vcpu); + break; + default: + BUG_ON(1); + break; + } + + skip_emulated_instruction(vcpu); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -6292,6 +6414,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_INVEPT] = handle_invept, }; static const int kvm_vmx_max_exit_handlers = @@ -6518,6 +6641,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: + case EXIT_REASON_INVEPT: /* * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting! @@ -6550,7 +6674,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); case EXIT_REASON_EPT_VIOLATION: + /* + * L0 always deals with the EPT violation. If nested EPT is + * used, and the nested mmu code discovers that the address is + * missing in the guest EPT table (EPT12), the EPT violation + * will be injected with nested_ept_inject_page_fault() + */ + return 0; case EXIT_REASON_EPT_MISCONFIG: + /* + * L2 never uses directly L1's EPT, but rather L0's own EPT + * table (shadow on EPT) or a merged EPT table that L0 built + * (EPT on EPT). So any problems with the structure of the + * table is L0's fault. + */ return 0; case EXIT_REASON_PREEMPTION_TIMER: return vmcs12->pin_based_vm_exec_control & @@ -6638,7 +6775,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( - get_vmcs12(vcpu), vcpu)))) { + get_vmcs12(vcpu))))) { if (vmx_interrupt_allowed(vcpu)) { vmx->soft_vnmi_blocked = 0; } else if (vmx->vnmi_blocked_time > 1000000000LL && @@ -7326,6 +7463,48 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) entry->ecx |= bit(X86_FEATURE_VMX); } +static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12; + nested_vmx_vmexit(vcpu); + vmcs12 = get_vmcs12(vcpu); + + if (fault->error_code & PFERR_RSVD_MASK) + vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; + else + vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + vmcs12->exit_qualification = vcpu->arch.exit_qualification; + vmcs12->guest_physical_address = fault->address; +} + +/* Callbacks for nested_ept_init_mmu_context: */ + +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) +{ + /* return the page table to be shadowed - in our case, EPT12 */ + return get_vmcs12(vcpu)->ept_pointer; +} + +static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +{ + int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, + nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); + + vcpu->arch.mmu.set_cr3 = vmx_set_cr3; + vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; + vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; + + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + + return r; +} + +static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.walk_mmu = &vcpu->arch.mmu; +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -7388,7 +7567,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_interruptibility_info); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); - vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); + vmx_set_rflags(vcpu, vmcs12->guest_rflags); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vmcs12->guest_pending_dbg_exceptions); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); @@ -7508,15 +7687,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ - vmcs_write32(VM_EXIT_CONTROLS, - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | + /* L2->L1 exit controls are emulated - the hardware exit is to L0 so + * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER + * bits are further modified by vmx_set_efer() below. + */ + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are + * emulated by vmx_set_efer(), below. + */ + vmcs_write32(VM_ENTRY_CONTROLS, + (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & + ~VM_ENTRY_IA32E_MODE) | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); - else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + vcpu->arch.pat = vmcs12->guest_ia32_pat; + } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); @@ -7538,6 +7726,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_flush_tlb(vcpu); } + if (nested_cpu_has_ept(vmcs12)) { + kvm_mmu_unload(vcpu); + nested_ept_init_mmu_context(vcpu); + } + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->guest_ia32_efer; else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) @@ -7565,6 +7758,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_set_cr3(vcpu, vmcs12->guest_cr3); kvm_mmu_reset_context(vcpu); + /* + * L1 may access the L2's PDPTR, so save them to construct vmcs12 + */ + if (enable_ept) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } + kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); } @@ -7887,6 +8090,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + /* + * In some cases (usually, nested EPT), L2 is allowed to change its + * own CR3 without exiting. If it has changed it, we must keep it. + * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined + * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. + * + * Additionally, restore L2's PDPTR to vmcs12. + */ + if (enable_ept) { + vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3); + vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); + vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); + vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); + vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + } + vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); @@ -7948,6 +8167,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + struct kvm_segment seg; + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) @@ -7982,7 +8203,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); kvm_set_cr4(vcpu, vmcs12->host_cr4); - /* shadow page tables on either EPT or shadow page tables */ + if (nested_cpu_has_ept(vmcs12)) + nested_ept_uninit_mmu_context(vcpu); + kvm_set_cr3(vcpu, vmcs12->host_cr3); kvm_mmu_reset_context(vcpu); @@ -8001,23 +8224,61 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); - vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); - vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); - vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base); - vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector); - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector); - vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector); - vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector); - vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector); - vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector); - vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector); - - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) + + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); + vcpu->arch.pat = vmcs12->host_ia32_pat; + } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl); + /* Set L1 segment info according to Intel SDM + 27.5.2 Loading Host Segment and Descriptor-Table Registers */ + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .selector = vmcs12->host_cs_selector, + .type = 11, + .present = 1, + .s = 1, + .g = 1 + }; + if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + seg.l = 1; + else + seg.db = 1; + vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .type = 3, + .present = 1, + .s = 1, + .db = 1, + .g = 1 + }; + seg.selector = vmcs12->host_ds_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + seg.selector = vmcs12->host_es_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + seg.selector = vmcs12->host_ss_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + seg.selector = vmcs12->host_fs_selector; + seg.base = vmcs12->host_fs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + seg.selector = vmcs12->host_gs_selector; + seg.base = vmcs12->host_gs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + seg = (struct kvm_segment) { + .base = vmcs12->host_tr_base, + .limit = 0x67, + .selector = vmcs12->host_tr_selector, + .type = 11, + .present = 1 + }; + vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d21bce50531..e5ca72a5cdb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -682,17 +682,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) */ } - /* - * Does the new cr3 value map to physical memory? (Note, we - * catch an invalid cr3 even in real-mode, because it would - * cause trouble later on when we turn on paging anyway.) - * - * A real CPU would silently accept an invalid cr3 and would - * attempt to use it - with largely undefined (and often hard - * to debug) behavior on the guest side. - */ - if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) - return 1; vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); vcpu->arch.mmu.new_cr3(vcpu); @@ -850,7 +839,8 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL }; static unsigned num_msrs_to_save; @@ -1457,6 +1447,29 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) #endif } +static void kvm_gen_update_masterclock(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + int i; + struct kvm_vcpu *vcpu; + struct kvm_arch *ka = &kvm->arch; + + spin_lock(&ka->pvclock_gtod_sync_lock); + kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ + pvclock_update_vm_gtod_copy(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + + /* guest entries allowed */ + kvm_for_each_vcpu(i, vcpu, kvm) + clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + + spin_unlock(&ka->pvclock_gtod_sync_lock); +#endif +} + static int kvm_guest_time_update(struct kvm_vcpu *v) { unsigned long flags, this_tsc_khz; @@ -3806,6 +3819,7 @@ long kvm_arch_vm_ioctl(struct file *filp, delta = user_ns.clock - now_ns; local_irq_enable(); kvm->arch.kvmclock_offset = delta; + kvm_gen_update_masterclock(kvm); break; } case KVM_GET_CLOCK: { @@ -4955,6 +4969,97 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); +static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, + unsigned long *db) +{ + u32 dr6 = 0; + int i; + u32 enable, rwlen; + + enable = dr7; + rwlen = dr7 >> 16; + for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4) + if ((enable & 3) && (rwlen & 15) == type && db[i] == addr) + dr6 |= (1 << i); + return dr6; +} + +static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r) +{ + struct kvm_run *kvm_run = vcpu->run; + + /* + * Use the "raw" value to see if TF was passed to the processor. + * Note that the new value of the flags has not been saved yet. + * + * This is correct even for TF set by the guest, because "the + * processor will not generate this exception after the instruction + * that sets the TF flag". + */ + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + + if (unlikely(rflags & X86_EFLAGS_TF)) { + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1; + kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + } else { + vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; + /* + * "Certain debug exceptions may clear bit 0-3. The + * remaining contents of the DR6 register are never + * cleared by the processor". + */ + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= DR6_BS; + kvm_queue_exception(vcpu, DB_VECTOR); + } + } +} + +static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) +{ + struct kvm_run *kvm_run = vcpu->run; + unsigned long eip = vcpu->arch.emulate_ctxt.eip; + u32 dr6 = 0; + + if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && + (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { + dr6 = kvm_vcpu_check_hw_bp(eip, 0, + vcpu->arch.guest_debug_dr7, + vcpu->arch.eff_db); + + if (dr6 != 0) { + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.pc = kvm_rip_read(vcpu) + + get_segment_base(vcpu, VCPU_SREG_CS); + + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + return true; + } + } + + if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) { + dr6 = kvm_vcpu_check_hw_bp(eip, 0, + vcpu->arch.dr7, + vcpu->arch.db); + + if (dr6 != 0) { + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= dr6; + kvm_queue_exception(vcpu, DB_VECTOR); + *r = EMULATE_DONE; + return true; + } + } + + return false; +} + int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, @@ -4975,6 +5080,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (!(emulation_type & EMULTYPE_NO_DECODE)) { init_emulate_ctxt(vcpu); + + /* + * We will reenter on the same instruction since + * we do not set complete_userspace_io. This does not + * handle watchpoints yet, those would be handled in + * the emulate_ops. + */ + if (kvm_vcpu_check_breakpoint(vcpu, &r)) + return r; + ctxt->interruptibility = 0; ctxt->have_exception = false; ctxt->perm_ok = false; @@ -5031,17 +5146,18 @@ restart: inject_emulated_exception(vcpu); r = EMULATE_DONE; } else if (vcpu->arch.pio.count) { - if (!vcpu->arch.pio.in) + if (!vcpu->arch.pio.in) { + /* FIXME: return into emulator if single-stepping. */ vcpu->arch.pio.count = 0; - else { + } else { writeback = false; vcpu->arch.complete_userspace_io = complete_emulated_pio; } - r = EMULATE_DO_MMIO; + r = EMULATE_USER_EXIT; } else if (vcpu->mmio_needed) { if (!vcpu->mmio_is_write) writeback = false; - r = EMULATE_DO_MMIO; + r = EMULATE_USER_EXIT; vcpu->arch.complete_userspace_io = complete_emulated_mmio; } else if (r == EMULATION_RESTART) goto restart; @@ -5050,10 +5166,12 @@ restart: if (writeback) { toggle_interruptibility(vcpu, ctxt->interruptibility); - kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); + if (r == EMULATE_DONE) + kvm_vcpu_check_singlestep(vcpu, &r); + kvm_set_rflags(vcpu, ctxt->eflags); } else vcpu->arch.emulate_regs_need_sync_to_vcpu = true; @@ -5347,7 +5465,7 @@ static struct notifier_block pvclock_gtod_notifier = { int kvm_arch_init(void *opaque) { int r; - struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + struct kvm_x86_ops *ops = opaque; if (kvm_x86_ops) { printk(KERN_ERR "kvm: already loaded the other module\n"); @@ -5495,6 +5613,23 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) return 1; } +/* + * kvm_pv_kick_cpu_op: Kick a vcpu. + * + * @apicid - apicid of vcpu to be kicked. + */ +static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) +{ + struct kvm_lapic_irq lapic_irq; + + lapic_irq.shorthand = 0; + lapic_irq.dest_mode = 0; + lapic_irq.dest_id = apicid; + + lapic_irq.delivery_mode = APIC_DM_REMRD; + kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL); +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; @@ -5528,6 +5663,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; + case KVM_HC_KICK_CPU: + kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); + ret = 0; + break; default: ret = -KVM_ENOSYS; break; @@ -5689,29 +5828,6 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } -static void kvm_gen_update_masterclock(struct kvm *kvm) -{ -#ifdef CONFIG_X86_64 - int i; - struct kvm_vcpu *vcpu; - struct kvm_arch *ka = &kvm->arch; - - spin_lock(&ka->pvclock_gtod_sync_lock); - kvm_make_mclock_inprogress_request(kvm); - /* no guest entries from this point */ - pvclock_update_vm_gtod_copy(kvm); - - kvm_for_each_vcpu(i, vcpu, kvm) - set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); - - /* guest entries allowed */ - kvm_for_each_vcpu(i, vcpu, kvm) - clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); - - spin_unlock(&ka->pvclock_gtod_sync_lock); -#endif -} - static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; @@ -5950,6 +6066,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) kvm_apic_accept_events(vcpu); switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: + vcpu->arch.pv.pv_unhalted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; case KVM_MP_STATE_RUNNABLE: @@ -6061,6 +6178,8 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { vcpu->mmio_needed = 0; + + /* FIXME: return into emulator if single-stepping. */ if (vcpu->mmio_is_write) return 1; vcpu->mmio_read_completed = 1; @@ -6249,7 +6368,12 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { kvm_apic_accept_events(vcpu); - mp_state->mp_state = vcpu->arch.mp_state; + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && + vcpu->arch.pv.pv_unhalted) + mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + else + mp_state->mp_state = vcpu->arch.mp_state; + return 0; } @@ -6770,6 +6894,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; + vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -7019,6 +7144,15 @@ out_free: return -ENOMEM; } +void kvm_arch_memslots_updated(struct kvm *kvm) +{ + /* + * memslots->generation has been incremented. + * mmio generation may have reached its maximum value. + */ + kvm_mmu_invalidate_mmio_sptes(kvm); +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_userspace_memory_region *mem, @@ -7079,11 +7213,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, */ if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_slot_remove_write_access(kvm, mem->slot); - /* - * If memory slot is created, or moved, we need to clear all - * mmio sptes. - */ - kvm_mmu_invalidate_mmio_sptes(kvm); } void kvm_arch_flush_shadow_all(struct kvm *kvm) @@ -7103,6 +7232,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) || kvm_apic_has_events(vcpu) + || vcpu->arch.pv.pv_unhalted || atomic_read(&vcpu->arch.nmi_queued) || (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)); diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 25b7ae8d058..7609e0e421e 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -6,6 +6,7 @@ */ #include <asm/checksum.h> #include <linux/module.h> +#include <asm/smap.h> /** * csum_partial_copy_from_user - Copy and checksum from user space. @@ -52,8 +53,10 @@ csum_partial_copy_from_user(const void __user *src, void *dst, len -= 2; } } + stac(); isum = csum_partial_copy_generic((__force const void *)src, dst, len, isum, errp, NULL); + clac(); if (unlikely(*errp)) goto out_err; @@ -82,6 +85,8 @@ __wsum csum_partial_copy_to_user(const void *src, void __user *dst, int len, __wsum isum, int *errp) { + __wsum ret; + might_sleep(); if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { @@ -105,8 +110,11 @@ csum_partial_copy_to_user(const void *src, void __user *dst, } *errp = 0; - return csum_partial_copy_generic(src, (void __force *)dst, - len, isum, NULL, errp); + stac(); + ret = csum_partial_copy_generic(src, (void __force *)dst, + len, isum, NULL, errp); + clac(); + return ret; } EXPORT_SYMBOL(csum_partial_copy_to_user); diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 906fea31579..c905e89e19f 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -68,7 +68,7 @@ EXPORT_SYMBOL(copy_in_user); * Since protection fault in copy_from/to_user is not a normal situation, * it is not necessary to optimize tail handling. */ -unsigned long +__visible unsigned long copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest) { char c; diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 5d7e51f3fd2..533a85e3a07 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -1,10 +1,8 @@ # x86 Opcode Maps # # This is (mostly) based on following documentations. -# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2 -# (#325383-040US, October 2011) -# - Intel(R) Advanced Vector Extensions Programming Reference -# (#319433-011,JUNE 2011). +# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2C +# (#326018-047US, June 2013) # #<Opcode maps> # Table: table-name @@ -29,6 +27,7 @@ # - (F3): the last prefix is 0xF3 # - (F2): the last prefix is 0xF2 # - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) +# - (66&F2): Both 0x66 and 0xF2 prefixes are specified. Table: one byte opcode Referrer: @@ -246,8 +245,8 @@ c2: RETN Iw (f64) c3: RETN c4: LES Gz,Mp (i64) | VEX+2byte (Prefix) c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix) -c6: Grp11 Eb,Ib (1A) -c7: Grp11 Ev,Iz (1A) +c6: Grp11A Eb,Ib (1A) +c7: Grp11B Ev,Iz (1A) c8: ENTER Iw,Ib c9: LEAVE (d64) ca: RETF Iw @@ -293,8 +292,8 @@ ef: OUT DX,eAX # 0xf0 - 0xff f0: LOCK (Prefix) f1: -f2: REPNE (Prefix) -f3: REP/REPE (Prefix) +f2: REPNE (Prefix) | XACQUIRE (Prefix) +f3: REP/REPE (Prefix) | XRELEASE (Prefix) f4: HLT f5: CMC f6: Grp3_1 Eb (1A) @@ -326,7 +325,8 @@ AVXcode: 1 0a: 0b: UD2 (1B) 0c: -0d: NOP Ev | GrpP +# AMD's prefetch group. Intel supports prefetchw(/1) only. +0d: GrpP 0e: FEMMS # 3DNow! uses the last imm byte as opcode extension. 0f: 3DNow! Pq,Qq,Ib @@ -729,12 +729,12 @@ dc: VAESENC Vdq,Hdq,Wdq (66),(v1) dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) de: VAESDEC Vdq,Hdq,Wdq (66),(v1) df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1) -f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) -f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) +f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2) +f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2) f2: ANDN Gy,By,Ey (v) f3: Grp17 (1A) f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) -f6: MULX By,Gy,rDX,Ey (F2),(v) +f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) EndTable @@ -861,8 +861,8 @@ EndTable GrpTable: Grp7 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) -1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) -2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) 3: LIDT Ms 4: SMSW Mw/Rv 5: @@ -880,15 +880,21 @@ EndTable GrpTable: Grp9 1: CMPXCHG8B/16B Mq/Mdq 6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) -7: VMPTRST Mq | VMPTRST Mq (F3) +7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B) EndTable GrpTable: Grp10 EndTable -GrpTable: Grp11 -# Note: the operands are given by group opcode -0: MOV +# Grp11A and Grp11B are expressed as Grp11 in Intel SDM +GrpTable: Grp11A +0: MOV Eb,Ib +7: XABORT Ib (000),(11B) +EndTable + +GrpTable: Grp11B +0: MOV Eb,Iz +7: XBEGIN Jz (000),(11B) EndTable GrpTable: Grp12 diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 2ec29ac78ae..04664cdb7fd 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -78,8 +78,8 @@ __ref void *alloc_low_pages(unsigned int num) return __va(pfn << PAGE_SHIFT); } -/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ -#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE) +/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */ +#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); void __init early_alloc_pgt_buf(void) { diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0215e2c563e..799580cabc7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -487,7 +487,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) unsigned long offset; resource_size_t last_addr; unsigned int nrpages; - enum fixed_addresses idx0, idx; + enum fixed_addresses idx; int i, slot; WARN_ON(system_state != SYSTEM_BOOTING); @@ -540,8 +540,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) /* * Ok, go for it.. */ - idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; - idx = idx0; + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; while (nrpages > 0) { early_set_fixmap(idx, phys_addr, prot); phys_addr += PAGE_SIZE; diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 62c29a5bfe2..25e7e1372bb 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -112,11 +112,13 @@ static unsigned long mmap_legacy_base(void) */ void arch_pick_mmap_layout(struct mm_struct *mm) { + mm->mmap_legacy_base = mmap_legacy_base(); + mm->mmap_base = mmap_base(); + if (mmap_is_legacy()) { - mm->mmap_base = mmap_legacy_base(); + mm->mmap_base = mm->mmap_legacy_base; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index dc0b727742f..0057a7accfb 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -410,9 +410,7 @@ out: pr_warning("multiple CPUs still online, may miss events.\n"); } -/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, - but this whole function is ifdefed CONFIG_HOTPLUG_CPU */ -static void __ref leave_uniprocessor(void) +static void leave_uniprocessor(void) { int cpu; int err; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index a71c4e20767..8bf93bae1f1 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -60,7 +60,7 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -int __cpuinit numa_cpu_node(int cpu) +int numa_cpu_node(int cpu) { int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); @@ -691,12 +691,12 @@ void __init init_cpu_to_node(void) #ifndef CONFIG_DEBUG_PER_CPU_MAPS # ifndef CONFIG_NUMA_EMU -void __cpuinit numa_add_cpu(int cpu) +void numa_add_cpu(int cpu) { cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } -void __cpuinit numa_remove_cpu(int cpu) +void numa_remove_cpu(int cpu) { cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } @@ -763,17 +763,17 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable) } # ifndef CONFIG_NUMA_EMU -static void __cpuinit numa_set_cpumask(int cpu, bool enable) +static void numa_set_cpumask(int cpu, bool enable) { debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); } -void __cpuinit numa_add_cpu(int cpu) +void numa_add_cpu(int cpu) { numa_set_cpumask(cpu, true); } -void __cpuinit numa_remove_cpu(int cpu) +void numa_remove_cpu(int cpu) { numa_set_cpumask(cpu, false); } diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index dbbbb47260c..a8f90ce3ded 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -10,7 +10,7 @@ #include "numa_internal.h" -static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; +static int emu_nid_to_phys[MAX_NUMNODES]; static char *emu_cmdline __initdata; void __init numa_emu_cmdline(char *str) @@ -444,7 +444,7 @@ no_emu: } #ifndef CONFIG_DEBUG_PER_CPU_MAPS -void __cpuinit numa_add_cpu(int cpu) +void numa_add_cpu(int cpu) { int physnid, nid; @@ -462,7 +462,7 @@ void __cpuinit numa_add_cpu(int cpu) cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); } -void __cpuinit numa_remove_cpu(int cpu) +void numa_remove_cpu(int cpu) { int i; @@ -470,7 +470,7 @@ void __cpuinit numa_remove_cpu(int cpu) cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); } #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ -static void __cpuinit numa_set_cpumask(int cpu, bool enable) +static void numa_set_cpumask(int cpu, bool enable) { int nid, physnid; @@ -490,12 +490,12 @@ static void __cpuinit numa_set_cpumask(int cpu, bool enable) } } -void __cpuinit numa_add_cpu(int cpu) +void numa_add_cpu(int cpu) { numa_set_cpumask(cpu, true); } -void __cpuinit numa_remove_cpu(int cpu) +void numa_remove_cpu(int cpu) { numa_set_cpumask(cpu, false); } diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 410531d3c29..90555bf60aa 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -5,7 +5,7 @@ #include <asm/pgtable.h> #include <asm/proto.h> -static int disable_nx __cpuinitdata; +static int disable_nx; /* * noexec = on|off @@ -29,7 +29,7 @@ static int __init noexec_setup(char *str) } early_param("noexec", noexec_setup); -void __cpuinit x86_configure_nx(void) +void x86_configure_nx(void) { if (cpu_has_nx && !disable_nx) __supported_pte_mask |= _PAGE_NX; diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index cdd0da9dd53..266ca912f62 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -146,6 +146,7 @@ int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { u64 start, end; + u32 hotpluggable; int node, pxm; if (srat_disabled()) @@ -154,7 +155,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) goto out_err_bad_srat; if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) goto out_err; - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) + hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; + if (hotpluggable && !save_add_info()) goto out_err; start = ma->base_address; @@ -174,9 +176,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", - node, pxm, - (unsigned long long) start, (unsigned long long) end - 1); + pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n", + node, pxm, + (unsigned long long) start, (unsigned long long) end - 1, + hotpluggable ? " hotplug" : ""); return 0; out_err_bad_srat: diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 48768df2471..6890d8498e0 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -403,7 +403,7 @@ static void nmi_cpu_down(void *dummy) nmi_cpu_shutdown(dummy); } -static int nmi_create_files(struct super_block *sb, struct dentry *root) +static int nmi_create_files(struct dentry *root) { unsigned int i; @@ -420,14 +420,14 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) continue; snprintf(buf, sizeof(buf), "%d", i); - dir = oprofilefs_mkdir(sb, root, buf); - oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); - oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); - oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); - oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); - oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); - oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); - oprofilefs_create_ulong(sb, dir, "extra", &counter_config[i].extra); + dir = oprofilefs_mkdir(root, buf); + oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled); + oprofilefs_create_ulong(dir, "event", &counter_config[i].event); + oprofilefs_create_ulong(dir, "count", &counter_config[i].count); + oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask); + oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel); + oprofilefs_create_ulong(dir, "user", &counter_config[i].user); + oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra); } return 0; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index b2b94438ff0..50d86c0e9ba 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -454,16 +454,16 @@ static void init_ibs(void) printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); } -static int (*create_arch_files)(struct super_block *sb, struct dentry *root); +static int (*create_arch_files)(struct dentry *root); -static int setup_ibs_files(struct super_block *sb, struct dentry *root) +static int setup_ibs_files(struct dentry *root) { struct dentry *dir; int ret = 0; /* architecture specific files */ if (create_arch_files) - ret = create_arch_files(sb, root); + ret = create_arch_files(root); if (ret) return ret; @@ -479,26 +479,26 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) ibs_config.max_cnt_op = 250000; if (ibs_caps & IBS_CAPS_FETCHSAM) { - dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); - oprofilefs_create_ulong(sb, dir, "enable", + dir = oprofilefs_mkdir(root, "ibs_fetch"); + oprofilefs_create_ulong(dir, "enable", &ibs_config.fetch_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", + oprofilefs_create_ulong(dir, "max_count", &ibs_config.max_cnt_fetch); - oprofilefs_create_ulong(sb, dir, "rand_enable", + oprofilefs_create_ulong(dir, "rand_enable", &ibs_config.rand_en); } if (ibs_caps & IBS_CAPS_OPSAM) { - dir = oprofilefs_mkdir(sb, root, "ibs_op"); - oprofilefs_create_ulong(sb, dir, "enable", + dir = oprofilefs_mkdir(root, "ibs_op"); + oprofilefs_create_ulong(dir, "enable", &ibs_config.op_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", + oprofilefs_create_ulong(dir, "max_count", &ibs_config.max_cnt_op); if (ibs_caps & IBS_CAPS_OPCNT) - oprofilefs_create_ulong(sb, dir, "dispatched_ops", + oprofilefs_create_ulong(dir, "dispatched_ops", &ibs_config.dispatched_ops); if (ibs_caps & IBS_CAPS_BRNTRGT) - oprofilefs_create_ulong(sb, dir, "branch_target", + oprofilefs_create_ulong(dir, "branch_target", &ibs_config.branch_target); } diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index d641897a1f4..b30e937689d 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -568,13 +568,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) */ if (bus) { struct pci_bus *child; - list_for_each_entry(child, &bus->children, node) { - struct pci_dev *self = child->self; - if (!self) - continue; - - pcie_bus_configure_settings(child, self->pcie_mpss); - } + list_for_each_entry(child, &bus->children, node) + pcie_bus_configure_settings(child); } if (bus && node != -1) { diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index e9e6ed5cdf9..a48be98e9de 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -312,7 +312,7 @@ static int __init early_fill_mp_bus_info(void) #define ENABLE_CF8_EXT_CFG (1ULL << 46) -static void __cpuinit enable_pci_io_ecs(void *unused) +static void enable_pci_io_ecs(void *unused) { u64 reg; rdmsrl(MSR_AMD64_NB_CFG, reg); @@ -322,8 +322,8 @@ static void __cpuinit enable_pci_io_ecs(void *unused) } } -static int __cpuinit amd_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int amd_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) { int cpu = (long)hcpu; switch (action) { @@ -337,7 +337,7 @@ static int __cpuinit amd_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata amd_cpu_notifier = { +static struct notifier_block amd_cpu_notifier = { .notifier_call = amd_cpu_notify, }; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 94919e307f8..db6b1ab4325 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -210,6 +210,8 @@ static void pcibios_allocate_bridge_resources(struct pci_dev *dev) r = &dev->resource[idx]; if (!r->flags) continue; + if (r->parent) /* Already allocated */ + continue; if (!r->start || pci_claim_resource(dev, idx) < 0) { /* * Something is wrong with the region. @@ -318,6 +320,8 @@ static void pcibios_allocate_dev_rom_resource(struct pci_dev *dev) r = &dev->resource[PCI_ROM_RESOURCE]; if (!r->flags || !r->start) return; + if (r->parent) /* Already allocated */ + return; if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) { r->end -= r->start; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 082e8812971..5596c7bdd32 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -700,7 +700,7 @@ int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end, if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed) return -ENODEV; - if (start > end) + if (start > end || !addr) return -EINVAL; mutex_lock(&pci_mmcfg_lock); @@ -716,11 +716,6 @@ int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end, return -EEXIST; } - if (!addr) { - mutex_unlock(&pci_mmcfg_lock); - return -EINVAL; - } - rc = -EBUSY; cfg = pci_mmconfig_alloc(seg, start, end, addr); if (cfg == NULL) { diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c index 6eb18c42a28..903fded5078 100644 --- a/arch/x86/pci/mrst.c +++ b/arch/x86/pci/mrst.c @@ -23,11 +23,11 @@ #include <linux/ioport.h> #include <linux/init.h> #include <linux/dmi.h> +#include <linux/acpi.h> +#include <linux/io.h> +#include <linux/smp.h> -#include <asm/acpi.h> #include <asm/segment.h> -#include <asm/io.h> -#include <asm/smp.h> #include <asm/pci_x86.h> #include <asm/hw_irq.h> #include <asm/io_apic.h> @@ -43,7 +43,7 @@ #define PCI_FIXED_BAR_4_SIZE 0x14 #define PCI_FIXED_BAR_5_SIZE 0x1c -static int pci_soc_mode = 0; +static int pci_soc_mode; /** * fixed_bar_cap - return the offset of the fixed BAR cap if found @@ -141,7 +141,8 @@ static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn, */ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) { - /* This is a workaround for A0 LNC bug where PCI status register does + /* + * This is a workaround for A0 LNC bug where PCI status register does * not have new CAP bit set. can not be written by SW either. * * PCI header type in real LNC indicates a single function device, this @@ -154,7 +155,7 @@ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) || devfn == PCI_DEVFN(0, 0) || devfn == PCI_DEVFN(3, 0))) return 1; - return 0; /* langwell on others */ + return 0; /* Langwell on others */ } static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, @@ -172,7 +173,8 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, { int offset; - /* On MRST, there is no PCI ROM BAR, this will cause a subsequent read + /* + * On MRST, there is no PCI ROM BAR, this will cause a subsequent read * to ROM BAR return 0 then being ignored. */ if (where == PCI_ROM_ADDRESS) @@ -210,7 +212,8 @@ static int mrst_pci_irq_enable(struct pci_dev *dev) pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); - /* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to + /* + * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to * IOAPIC RTE entries, so we just enable RTE for the device. */ irq_attr.ioapic = mp_find_ioapic(dev->irq); @@ -235,7 +238,7 @@ struct pci_ops pci_mrst_ops = { */ int __init pci_mrst_init(void) { - printk(KERN_INFO "Intel MID platform detected, using MID PCI ops\n"); + pr_info("Intel MID platform detected, using MID PCI ops\n"); pci_mmcfg_late_init(); pcibios_enable_irq = mrst_pci_irq_enable; pci_root_ops = pci_mrst_ops; @@ -244,17 +247,21 @@ int __init pci_mrst_init(void) return 1; } -/* Langwell devices are not true pci devices, they are not subject to 10 ms - * d3 to d0 delay required by pci spec. +/* + * Langwell devices are not true PCI devices; they are not subject to 10 ms + * d3 to d0 delay required by PCI spec. */ static void pci_d3delay_fixup(struct pci_dev *dev) { - /* PCI fixups are effectively decided compile time. If we have a dual - SoC/non-SoC kernel we don't want to mangle d3 on non SoC devices */ - if (!pci_soc_mode) - return; - /* true pci devices in lincroft should allow type 1 access, the rest - * are langwell fake pci devices. + /* + * PCI fixups are effectively decided compile time. If we have a dual + * SoC/non-SoC kernel we don't want to mangle d3 on non-SoC devices. + */ + if (!pci_soc_mode) + return; + /* + * True PCI devices in Lincroft should allow type 1 access, the rest + * are Langwell fake PCI devices. */ if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID)) return; diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index f8ab4945892..8244f5ec2f4 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -12,8 +12,10 @@ #include <linux/kernel.h> #include <linux/irq.h> #include <linux/module.h> +#include <linux/reboot.h> #include <linux/serial_reg.h> #include <linux/serial_8250.h> +#include <linux/reboot.h> #include <asm/ce4100.h> #include <asm/prom.h> @@ -134,7 +136,7 @@ static void __init sdv_arch_setup(void) } #ifdef CONFIG_X86_IO_APIC -static void __cpuinit sdv_pci_init(void) +static void sdv_pci_init(void) { x86_of_pci_init(); /* We can't set this earlier, because we need to calibrate the timer */ diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index c8d5577044b..90f6ed12709 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -931,13 +931,6 @@ void __init efi_enter_virtual_mode(void) va = efi_ioremap(md->phys_addr, size, md->type, md->attribute); - if (!(md->attribute & EFI_MEMORY_RUNTIME)) { - if (!va) - pr_err("ioremap of 0x%llX failed!\n", - (unsigned long long)md->phys_addr); - continue; - } - md->virt_addr = (u64) (unsigned long) va; if (!va) { diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index a0a0a4389bb..47fe66fe61f 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -65,7 +65,7 @@ * lapic (always-on,ARAT) ------ 150 */ -__cpuinitdata enum mrst_timer_options mrst_timer_options; +enum mrst_timer_options mrst_timer_options; static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; @@ -248,7 +248,7 @@ static void __init mrst_time_init(void) apbt_time_init(); } -static void __cpuinit mrst_arch_setup(void) +static void mrst_arch_setup(void) { if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 1cf5b300305..424f4c97a44 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -25,10 +25,10 @@ #include <asm/cpu.h> #ifdef CONFIG_X86_32 -unsigned long saved_context_ebx; -unsigned long saved_context_esp, saved_context_ebp; -unsigned long saved_context_esi, saved_context_edi; -unsigned long saved_context_eflags; +__visible unsigned long saved_context_ebx; +__visible unsigned long saved_context_esp, saved_context_ebp; +__visible unsigned long saved_context_esi, saved_context_edi; +__visible unsigned long saved_context_eflags; #endif struct saved_context saved_context; diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index a0fde91c16c..304fca20d96 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -20,26 +20,26 @@ #include <asm/suspend.h> /* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; +extern __visible const void __nosave_begin, __nosave_end; /* Defined in hibernate_asm_64.S */ -extern int restore_image(void); +extern asmlinkage int restore_image(void); /* * Address to jump to in the last phase of restore in order to get to the image * kernel's text (this value is passed in the image header). */ -unsigned long restore_jump_address; +unsigned long restore_jump_address __visible; /* * Value of the cr3 register from before the hibernation (this value is passed * in the image header). */ -unsigned long restore_cr3; +unsigned long restore_cr3 __visible; -pgd_t *temp_level4_pgt; +pgd_t *temp_level4_pgt __visible; -void *relocated_restore_code; +void *relocated_restore_code __visible; static void *alloc_pgt_page(void *context) { diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index e6773dc8ac4..093a892026f 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -68,7 +68,7 @@ BEGIN { lprefix1_expr = "\\((66|!F3)\\)" lprefix2_expr = "\\(F3\\)" - lprefix3_expr = "\\((F2|!F3)\\)" + lprefix3_expr = "\\((F2|!F3|66\\&F2)\\)" lprefix_expr = "\\((66|F2|F3)\\)" max_lprefix = 4 @@ -83,6 +83,8 @@ BEGIN { prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" prefix_num["REPNE"] = "INAT_PFX_REPNE" prefix_num["REP/REPE"] = "INAT_PFX_REPE" + prefix_num["XACQUIRE"] = "INAT_PFX_REPNE" + prefix_num["XRELEASE"] = "INAT_PFX_REPE" prefix_num["LOCK"] = "INAT_PFX_LOCK" prefix_num["SEG=CS"] = "INAT_PFX_CS" prefix_num["SEG=DS"] = "INAT_PFX_DS" diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index ae7319db18e..5e04a1c899f 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -508,7 +508,6 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, { struct rt_sigframe __user *frame; int err = 0; - struct task_struct *me = current; frame = (struct rt_sigframe __user *) round_down(stack_top - sizeof(struct rt_sigframe), 16); diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index c74436e687b..72074d52840 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode) cycle_t ret; u64 last; u32 version; - u32 migrate_count; u8 flags; unsigned cpu, cpu1; /* - * When looping to get a consistent (time-info, tsc) pair, we - * also need to deal with the possibility we can switch vcpus, - * so make sure we always re-fetch time-info for the current vcpu. + * Note: hypervisor must guarantee that: + * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. + * 2. that per-CPU pvclock time info is updated if the + * underlying CPU changes. + * 3. that version is increased whenever underlying CPU + * changes. + * */ do { cpu = __getcpu() & VGETCPU_CPU_MASK; @@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode) pvti = get_pvti(cpu); - migrate_count = pvti->migrate_count; - version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); /* @@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode) cpu1 = __getcpu() & VGETCPU_CPU_MASK; } while (unlikely(cpu != cpu1 || (pvti->pvti.version & 1) || - pvti->pvti.version != version || - pvti->migrate_count != migrate_count)); + pvti->pvti.version != version)); if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) *mode = VCLOCK_NONE; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2fa02bc5003..2fc216dfbd9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -427,8 +427,7 @@ static void __init xen_init_cpuid_mask(void) if (!xen_initial_domain()) cpuid_leaf1_edx_mask &= - ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ - (1 << X86_FEATURE_ACPI)); /* disable ACPI */ + ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */ cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32)); @@ -735,8 +734,7 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, addr = (unsigned long)xen_int3; else if (addr == (unsigned long)stack_segment) addr = (unsigned long)xen_stack_segment; - else if (addr == (unsigned long)double_fault || - addr == (unsigned long)nmi) { + else if (addr == (unsigned long)double_fault) { /* Don't need to handle these */ return 0; #ifdef CONFIG_X86_MCE @@ -747,7 +745,12 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, */ ; #endif - } else { + } else if (addr == (unsigned long)nmi) + /* + * Use the native version as well. + */ + ; + else { /* Some other trap using IST? */ if (WARN_ON(val->ist != 0)) return 0; @@ -1681,8 +1684,8 @@ static void __init init_hvm_pv_info(void) xen_domain_type = XEN_HVM_DOMAIN; } -static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) { int cpu = (long)hcpu; switch (action) { @@ -1700,7 +1703,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = { +static struct notifier_block xen_hvm_cpu_notifier = { .notifier_call = xen_hvm_cpu_notify, }; @@ -1710,6 +1713,8 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_shared_info(); + xen_panic_handler_init(); + if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; xen_hvm_smp_init(); @@ -1720,15 +1725,12 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_mmu_ops(); } -static bool __init xen_hvm_platform(void) +static uint32_t __init xen_hvm_platform(void) { if (xen_pv_domain()) - return false; - - if (!xen_cpuid_base()) - return false; + return 0; - return true; + return xen_cpuid_base(); } bool xen_hvm_need_lapic(void) diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 01a4dc015ae..0da7f863056 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -47,23 +47,18 @@ static void xen_restore_fl(unsigned long flags) /* convert from IF type flag */ flags = !(flags & X86_EFLAGS_IF); - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ + /* See xen_irq_enable() for why preemption must be disabled. */ preempt_disable(); vcpu = this_cpu_read(xen_vcpu); vcpu->evtchn_upcall_mask = flags; - preempt_enable_no_resched(); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ if (flags == 0) { - preempt_check_resched(); barrier(); /* unmask then check (avoid races) */ if (unlikely(vcpu->evtchn_upcall_pending)) xen_force_evtchn_callback(); - } + preempt_enable(); + } else + preempt_enable_no_resched(); } PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); @@ -82,10 +77,12 @@ static void xen_irq_enable(void) { struct vcpu_info *vcpu; - /* We don't need to worry about being preempted here, since - either a) interrupts are disabled, so no preemption, or b) - the caller is confused and is trying to re-enable interrupts - on an indeterminate processor. */ + /* + * We may be preempted as soon as vcpu->evtchn_upcall_mask is + * cleared, so disable preemption to ensure we check for + * events on the VCPU we are still running on. + */ + preempt_disable(); vcpu = this_cpu_read(xen_vcpu); vcpu->evtchn_upcall_mask = 0; @@ -96,6 +93,8 @@ static void xen_irq_enable(void) barrier(); /* unmask then check (avoid races) */ if (unlikely(vcpu->evtchn_upcall_pending)) xen_force_evtchn_callback(); + + preempt_enable(); } PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 95fb2aa5927..0d4ec35895d 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -161,6 +161,7 @@ #include <asm/xen/page.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> +#include <xen/balloon.h> #include <xen/grant_table.h> #include "multicalls.h" @@ -967,7 +968,10 @@ int m2p_remove_override(struct page *page, if (kmap_op != NULL) { if (!PageHighMem(page)) { struct multicall_space mcs; - struct gnttab_unmap_grant_ref *unmap_op; + struct gnttab_unmap_and_replace *unmap_op; + struct page *scratch_page = get_balloon_scratch_page(); + unsigned long scratch_page_address = (unsigned long) + __va(page_to_pfn(scratch_page) << PAGE_SHIFT); /* * It might be that we queued all the m2p grant table @@ -990,21 +994,25 @@ int m2p_remove_override(struct page *page, } mcs = xen_mc_entry( - sizeof(struct gnttab_unmap_grant_ref)); + sizeof(struct gnttab_unmap_and_replace)); unmap_op = mcs.args; unmap_op->host_addr = kmap_op->host_addr; + unmap_op->new_addr = scratch_page_address; unmap_op->handle = kmap_op->handle; - unmap_op->dev_bus_addr = 0; MULTI_grant_table_op(mcs.mc, - GNTTABOP_unmap_grant_ref, unmap_op, 1); + GNTTABOP_unmap_and_replace, unmap_op, 1); xen_mc_issue(PARAVIRT_LAZY_MMU); - set_pte_at(&init_mm, address, ptep, - pfn_pte(pfn, PAGE_KERNEL)); - __flush_tlb_single(address); + mcs = __xen_mc_entry(0); + MULTI_update_va_mapping(mcs.mc, scratch_page_address, + pfn_pte(page_to_pfn(get_balloon_scratch_page()), + PAGE_KERNEL_RO), 0); + xen_mc_issue(PARAVIRT_LAZY_MMU); + kmap_op->host_addr = 0; + put_balloon_scratch_page(); } } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 94eac5c85cd..09f3059cb00 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -33,6 +33,9 @@ /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +#ifdef CONFIG_X86_64 +extern const char nmi[]; +#endif extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); @@ -215,13 +218,19 @@ static void __init xen_set_identity_and_release_chunk( unsigned long pfn; /* - * If the PFNs are currently mapped, the VA mapping also needs - * to be updated to be 1:1. + * If the PFNs are currently mapped, clear the mappings + * (except for the ISA region which must be 1:1 mapped) to + * release the refcounts (in Xen) on the original frames. */ - for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) + for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { + pte_t pte = __pte_ma(0); + + if (pfn < PFN_UP(ISA_END_ADDRESS)) + pte = mfn_pte(pfn, PAGE_KERNEL_IO); + (void)HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - mfn_pte(pfn, PAGE_KERNEL_IO), 0); + (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0); + } if (start_pfn < nr_pages) *released += xen_release_chunk( @@ -313,6 +322,17 @@ static void xen_align_and_add_e820_region(u64 start, u64 size, int type) e820_add_region(start, end - start, type); } +void xen_ignore_unusable(struct e820entry *list, size_t map_size) +{ + struct e820entry *entry; + unsigned int i; + + for (i = 0, entry = list; i < map_size; i++, entry++) { + if (entry->type == E820_UNUSABLE) + entry->type = E820_RAM; + } +} + /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ @@ -353,6 +373,17 @@ char * __init xen_memory_setup(void) } BUG_ON(rc); + /* + * Xen won't allow a 1:1 mapping to be created to UNUSABLE + * regions, so if we're using the machine memory map leave the + * region as RAM as it is in the pseudo-physical map. + * + * UNUSABLE regions in domUs are not handled and will need + * a patch in the future. + */ + if (xen_initial_domain()) + xen_ignore_unusable(map, memmap.nr_entries); + /* Make sure the Xen-supplied memory map is well-ordered. */ sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); @@ -475,7 +506,7 @@ static void __init fiddle_vdso(void) #endif } -static int __cpuinit register_callback(unsigned type, const void *func) +static int register_callback(unsigned type, const void *func) { struct callback_register callback = { .type = type, @@ -486,7 +517,7 @@ static int __cpuinit register_callback(unsigned type, const void *func) return HYPERVISOR_callback_op(CALLBACKOP_register, &callback); } -void __cpuinit xen_enable_sysenter(void) +void xen_enable_sysenter(void) { int ret; unsigned sysenter_feature; @@ -505,7 +536,7 @@ void __cpuinit xen_enable_sysenter(void) setup_clear_cpu_cap(sysenter_feature); } -void __cpuinit xen_enable_syscall(void) +void xen_enable_syscall(void) { #ifdef CONFIG_X86_64 int ret; @@ -525,7 +556,13 @@ void __cpuinit xen_enable_syscall(void) } #endif /* CONFIG_X86_64 */ } - +void __cpuinit xen_enable_nmi(void) +{ +#ifdef CONFIG_X86_64 + if (register_callback(CALLBACKTYPE_nmi, nmi)) + BUG(); +#endif +} void __init xen_arch_setup(void) { xen_panic_handler_init(); @@ -543,7 +580,7 @@ void __init xen_arch_setup(void) xen_enable_sysenter(); xen_enable_syscall(); - + xen_enable_nmi(); #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index c1367b29c3b..9235842cd76 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -65,7 +65,7 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static void __cpuinit cpu_bringup(void) +static void cpu_bringup(void) { int cpu; @@ -97,7 +97,7 @@ static void __cpuinit cpu_bringup(void) wmb(); /* make sure everything is out */ } -static void __cpuinit cpu_bringup_and_idle(void) +static void cpu_bringup_and_idle(void) { cpu_bringup(); cpu_startup_entry(CPUHP_ONLINE); @@ -279,6 +279,7 @@ static void __init xen_smp_prepare_boot_cpu(void) xen_filter_cpu_maps(); xen_setup_vcpu_info_placement(); + xen_init_spinlocks(); } static void __init xen_smp_prepare_cpus(unsigned int max_cpus) @@ -326,7 +327,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) set_cpu_present(cpu, true); } -static int __cpuinit +static int cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; @@ -397,7 +398,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) return 0; } -static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *idle) +static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) { int rc; @@ -470,7 +471,7 @@ static void xen_cpu_die(unsigned int cpu) xen_teardown_timer(cpu); } -static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */ +static void xen_play_dead(void) /* used only with HOTPLUG_CPU */ { play_dead_common(); HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); @@ -572,6 +573,12 @@ static inline int xen_map_vector(int vector) case IRQ_WORK_VECTOR: xen_vector = XEN_IRQ_WORK_VECTOR; break; +#ifdef CONFIG_X86_64 + case NMI_VECTOR: + case APIC_DM_NMI: /* Some use that instead of NMI_VECTOR */ + xen_vector = XEN_NMI_VECTOR; + break; +#endif default: xen_vector = -1; printk(KERN_ERR "xen: vector 0x%x is not implemented\n", @@ -680,7 +687,6 @@ void __init xen_smp_init(void) { smp_ops = xen_smp_ops; xen_fill_possible_map(); - xen_init_spinlocks(); } static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) @@ -691,11 +697,18 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) xen_init_lock_cpu(0); } -static int __cpuinit xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) +static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) { int rc; - rc = native_cpu_up(cpu, tidle); - WARN_ON (xen_smp_intr_init(cpu)); + /* + * xen_smp_intr_init() needs to run before native_cpu_up() + * so that IPI vectors are set up on the booting CPU before + * it is marked online in native_cpu_up(). + */ + rc = xen_smp_intr_init(cpu); + WARN_ON(rc); + if (!rc) + rc = native_cpu_up(cpu, tidle); return rc; } diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index a40f8508e76..0438b9324a7 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -17,45 +17,44 @@ #include "xen-ops.h" #include "debugfs.h" -#ifdef CONFIG_XEN_DEBUG_FS -static struct xen_spinlock_stats -{ - u64 taken; - u32 taken_slow; - u32 taken_slow_nested; - u32 taken_slow_pickup; - u32 taken_slow_spurious; - u32 taken_slow_irqenable; +enum xen_contention_stat { + TAKEN_SLOW, + TAKEN_SLOW_PICKUP, + TAKEN_SLOW_SPURIOUS, + RELEASED_SLOW, + RELEASED_SLOW_KICKED, + NR_CONTENTION_STATS +}; - u64 released; - u32 released_slow; - u32 released_slow_kicked; +#ifdef CONFIG_XEN_DEBUG_FS #define HISTO_BUCKETS 30 - u32 histo_spin_total[HISTO_BUCKETS+1]; - u32 histo_spin_spinning[HISTO_BUCKETS+1]; +static struct xen_spinlock_stats +{ + u32 contention_stats[NR_CONTENTION_STATS]; u32 histo_spin_blocked[HISTO_BUCKETS+1]; - - u64 time_total; - u64 time_spinning; u64 time_blocked; } spinlock_stats; static u8 zero_stats; -static unsigned lock_timeout = 1 << 10; -#define TIMEOUT lock_timeout - static inline void check_zero(void) { - if (unlikely(zero_stats)) { - memset(&spinlock_stats, 0, sizeof(spinlock_stats)); - zero_stats = 0; + u8 ret; + u8 old = ACCESS_ONCE(zero_stats); + if (unlikely(old)) { + ret = cmpxchg(&zero_stats, old, 0); + /* This ensures only one fellow resets the stat */ + if (ret == old) + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); } } -#define ADD_STATS(elem, val) \ - do { check_zero(); spinlock_stats.elem += (val); } while(0) +static inline void add_stats(enum xen_contention_stat var, u32 val) +{ + check_zero(); + spinlock_stats.contention_stats[var] += val; +} static inline u64 spin_time_start(void) { @@ -74,22 +73,6 @@ static void __spin_time_accum(u64 delta, u32 *array) array[HISTO_BUCKETS]++; } -static inline void spin_time_accum_spinning(u64 start) -{ - u32 delta = xen_clocksource_read() - start; - - __spin_time_accum(delta, spinlock_stats.histo_spin_spinning); - spinlock_stats.time_spinning += delta; -} - -static inline void spin_time_accum_total(u64 start) -{ - u32 delta = xen_clocksource_read() - start; - - __spin_time_accum(delta, spinlock_stats.histo_spin_total); - spinlock_stats.time_total += delta; -} - static inline void spin_time_accum_blocked(u64 start) { u32 delta = xen_clocksource_read() - start; @@ -99,19 +82,15 @@ static inline void spin_time_accum_blocked(u64 start) } #else /* !CONFIG_XEN_DEBUG_FS */ #define TIMEOUT (1 << 10) -#define ADD_STATS(elem, val) do { (void)(val); } while(0) +static inline void add_stats(enum xen_contention_stat var, u32 val) +{ +} static inline u64 spin_time_start(void) { return 0; } -static inline void spin_time_accum_total(u64 start) -{ -} -static inline void spin_time_accum_spinning(u64 start) -{ -} static inline void spin_time_accum_blocked(u64 start) { } @@ -134,234 +113,130 @@ typedef u16 xen_spinners_t; asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory"); #endif -struct xen_spinlock { - unsigned char lock; /* 0 -> free; 1 -> locked */ - xen_spinners_t spinners; /* count of waiting cpus */ +struct xen_lock_waiting { + struct arch_spinlock *lock; + __ticket_t want; }; -static int xen_spin_is_locked(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - return xl->lock != 0; -} - -static int xen_spin_is_contended(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - /* Not strictly true; this is only the count of contended - lock-takers entering the slow path. */ - return xl->spinners != 0; -} - -static int xen_spin_trylock(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - u8 old = 1; - - asm("xchgb %b0,%1" - : "+q" (old), "+m" (xl->lock) : : "memory"); - - return old == 0; -} - -static DEFINE_PER_CPU(char *, irq_name); static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); - -/* - * Mark a cpu as interested in a lock. Returns the CPU's previous - * lock of interest, in case we got preempted by an interrupt. - */ -static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) -{ - struct xen_spinlock *prev; - - prev = __this_cpu_read(lock_spinners); - __this_cpu_write(lock_spinners, xl); - - wmb(); /* set lock of interest before count */ - - inc_spinners(xl); - - return prev; -} - -/* - * Mark a cpu as no longer interested in a lock. Restores previous - * lock of interest (NULL for none). - */ -static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) -{ - dec_spinners(xl); - wmb(); /* decrement count before restoring lock */ - __this_cpu_write(lock_spinners, prev); -} +static DEFINE_PER_CPU(char *, irq_name); +static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); +static cpumask_t waiting_cpus; -static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) +static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - struct xen_spinlock *prev; int irq = __this_cpu_read(lock_kicker_irq); - int ret; + struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting); + int cpu = smp_processor_id(); u64 start; + unsigned long flags; /* If kicker interrupts not initialized yet, just spin */ if (irq == -1) - return 0; + return; start = spin_time_start(); - /* announce we're spinning */ - prev = spinning_lock(xl); + /* + * Make sure an interrupt handler can't upset things in a + * partially setup state. + */ + local_irq_save(flags); + /* + * We don't really care if we're overwriting some other + * (lock,want) pair, as that would mean that we're currently + * in an interrupt context, and the outer context had + * interrupts enabled. That has already kicked the VCPU out + * of xen_poll_irq(), so it will just return spuriously and + * retry with newly setup (lock,want). + * + * The ordering protocol on this is that the "lock" pointer + * may only be set non-NULL if the "want" ticket is correct. + * If we're updating "want", we must first clear "lock". + */ + w->lock = NULL; + smp_wmb(); + w->want = want; + smp_wmb(); + w->lock = lock; - ADD_STATS(taken_slow, 1); - ADD_STATS(taken_slow_nested, prev != NULL); + /* This uses set_bit, which atomic and therefore a barrier */ + cpumask_set_cpu(cpu, &waiting_cpus); + add_stats(TAKEN_SLOW, 1); - do { - unsigned long flags; + /* clear pending */ + xen_clear_irq_pending(irq); - /* clear pending */ - xen_clear_irq_pending(irq); + /* Only check lock once pending cleared */ + barrier(); - /* check again make sure it didn't become free while - we weren't looking */ - ret = xen_spin_trylock(lock); - if (ret) { - ADD_STATS(taken_slow_pickup, 1); + /* + * Mark entry to slowpath before doing the pickup test to make + * sure we don't deadlock with an unlocker. + */ + __ticket_enter_slowpath(lock); - /* - * If we interrupted another spinlock while it - * was blocking, make sure it doesn't block - * without rechecking the lock. - */ - if (prev != NULL) - xen_set_irq_pending(irq); - goto out; - } + /* + * check again make sure it didn't become free while + * we weren't looking + */ + if (ACCESS_ONCE(lock->tickets.head) == want) { + add_stats(TAKEN_SLOW_PICKUP, 1); + goto out; + } - flags = arch_local_save_flags(); - if (irq_enable) { - ADD_STATS(taken_slow_irqenable, 1); - raw_local_irq_enable(); - } + /* Allow interrupts while blocked */ + local_irq_restore(flags); - /* - * Block until irq becomes pending. If we're - * interrupted at this point (after the trylock but - * before entering the block), then the nested lock - * handler guarantees that the irq will be left - * pending if there's any chance the lock became free; - * xen_poll_irq() returns immediately if the irq is - * pending. - */ - xen_poll_irq(irq); + /* + * If an interrupt happens here, it will leave the wakeup irq + * pending, which will cause xen_poll_irq() to return + * immediately. + */ - raw_local_irq_restore(flags); + /* Block until irq becomes pending (or perhaps a spurious wakeup) */ + xen_poll_irq(irq); + add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq)); - ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); - } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ + local_irq_save(flags); kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); - out: - unspinning_lock(xl, prev); - spin_time_accum_blocked(start); - - return ret; -} - -static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - unsigned timeout; - u8 oldval; - u64 start_spin; - - ADD_STATS(taken, 1); - - start_spin = spin_time_start(); - - do { - u64 start_spin_fast = spin_time_start(); - - timeout = TIMEOUT; - - asm("1: xchgb %1,%0\n" - " testb %1,%1\n" - " jz 3f\n" - "2: rep;nop\n" - " cmpb $0,%0\n" - " je 1b\n" - " dec %2\n" - " jnz 2b\n" - "3:\n" - : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) - : "1" (1) - : "memory"); + cpumask_clear_cpu(cpu, &waiting_cpus); + w->lock = NULL; - spin_time_accum_spinning(start_spin_fast); + local_irq_restore(flags); - } while (unlikely(oldval != 0 && - (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); - - spin_time_accum_total(start_spin); -} - -static void xen_spin_lock(struct arch_spinlock *lock) -{ - __xen_spin_lock(lock, false); -} - -static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags) -{ - __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); + spin_time_accum_blocked(start); } +PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning); -static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) +static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) { int cpu; - ADD_STATS(released_slow, 1); + add_stats(RELEASED_SLOW, 1); + + for_each_cpu(cpu, &waiting_cpus) { + const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu); - for_each_online_cpu(cpu) { - /* XXX should mix up next cpu selection */ - if (per_cpu(lock_spinners, cpu) == xl) { - ADD_STATS(released_slow_kicked, 1); + /* Make sure we read lock before want */ + if (ACCESS_ONCE(w->lock) == lock && + ACCESS_ONCE(w->want) == next) { + add_stats(RELEASED_SLOW_KICKED, 1); xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); + break; } } } -static void xen_spin_unlock(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - ADD_STATS(released, 1); - - smp_wmb(); /* make sure no writes get moved after unlock */ - xl->lock = 0; /* release lock */ - - /* - * Make sure unlock happens before checking for waiting - * spinners. We need a strong barrier to enforce the - * write-read ordering to different memory locations, as the - * CPU makes no implied guarantees about their ordering. - */ - mb(); - - if (unlikely(xl->spinners)) - xen_spin_unlock_slow(xl); -} - static irqreturn_t dummy_handler(int irq, void *dev_id) { BUG(); return IRQ_HANDLED; } -void __cpuinit xen_init_lock_cpu(int cpu) +void xen_init_lock_cpu(int cpu) { int irq; char *name; @@ -408,6 +283,8 @@ void xen_uninit_lock_cpu(int cpu) per_cpu(irq_name, cpu) = NULL; } +static bool xen_pvspin __initdata = true; + void __init xen_init_spinlocks(void) { /* @@ -417,15 +294,23 @@ void __init xen_init_spinlocks(void) if (xen_hvm_domain()) return; - BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t)); + if (!xen_pvspin) { + printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); + return; + } - pv_lock_ops.spin_is_locked = xen_spin_is_locked; - pv_lock_ops.spin_is_contended = xen_spin_is_contended; - pv_lock_ops.spin_lock = xen_spin_lock; - pv_lock_ops.spin_lock_flags = xen_spin_lock_flags; - pv_lock_ops.spin_trylock = xen_spin_trylock; - pv_lock_ops.spin_unlock = xen_spin_unlock; + static_key_slow_inc(¶virt_ticketlocks_enabled); + + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); + pv_lock_ops.unlock_kick = xen_unlock_kick; +} + +static __init int xen_parse_nopvspin(char *arg) +{ + xen_pvspin = false; + return 0; } +early_param("xen_nopvspin", xen_parse_nopvspin); #ifdef CONFIG_XEN_DEBUG_FS @@ -442,37 +327,21 @@ static int __init xen_spinlock_debugfs(void) debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); - debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout); - - debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken); debugfs_create_u32("taken_slow", 0444, d_spin_debug, - &spinlock_stats.taken_slow); - debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug, - &spinlock_stats.taken_slow_nested); + &spinlock_stats.contention_stats[TAKEN_SLOW]); debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, - &spinlock_stats.taken_slow_pickup); + &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, - &spinlock_stats.taken_slow_spurious); - debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug, - &spinlock_stats.taken_slow_irqenable); + &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]); - debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released); debugfs_create_u32("released_slow", 0444, d_spin_debug, - &spinlock_stats.released_slow); + &spinlock_stats.contention_stats[RELEASED_SLOW]); debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, - &spinlock_stats.released_slow_kicked); + &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); - debugfs_create_u64("time_spinning", 0444, d_spin_debug, - &spinlock_stats.time_spinning); debugfs_create_u64("time_blocked", 0444, d_spin_debug, &spinlock_stats.time_blocked); - debugfs_create_u64("time_total", 0444, d_spin_debug, - &spinlock_stats.time_total); - debugfs_create_u32_array("histo_total", 0444, d_spin_debug, - spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); - debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, - spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index a95b41744ad..95f8c614232 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -73,7 +73,7 @@ static inline void xen_hvm_smp_init(void) {} #ifdef CONFIG_PARAVIRT_SPINLOCKS void __init xen_init_spinlocks(void); -void __cpuinit xen_init_lock_cpu(int cpu); +void xen_init_lock_cpu(int cpu); void xen_uninit_lock_cpu(int cpu); #else static inline void xen_init_spinlocks(void) @@ -105,9 +105,9 @@ static inline void __init xen_init_apic(void) /* Declare an asm function, along with symbols needed to make it inlineable */ #define DECL_ASM(ret, name, ...) \ - ret name(__VA_ARGS__); \ - extern char name##_end[]; \ - extern char name##_reloc[] \ + __visible ret name(__VA_ARGS__); \ + extern char name##_end[] __visible; \ + extern char name##_reloc[] __visible DECL_ASM(void, xen_irq_enable_direct, void); DECL_ASM(void, xen_irq_disable_direct, void); @@ -115,11 +115,11 @@ DECL_ASM(unsigned long, xen_save_fl_direct, void); DECL_ASM(void, xen_restore_fl_direct, unsigned long); /* These are not functions, and cannot be called normally */ -void xen_iret(void); -void xen_sysexit(void); -void xen_sysret32(void); -void xen_sysret64(void); -void xen_adjust_exception_frame(void); +__visible void xen_iret(void); +__visible void xen_sysexit(void); +__visible void xen_sysret32(void); +__visible void xen_sysret64(void); +__visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); |