diff options
Diffstat (limited to 'arch/x86')
146 files changed, 3325 insertions, 1519 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b32ebf92b0c..5c0ed72c02a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -16,6 +16,7 @@ config X86_64 def_bool y depends on 64BIT select X86_DEV_DMA_OPS + select ARCH_USE_CMPXCHG_LOCKREF ### Arch settings config X86 @@ -81,7 +82,6 @@ config X86 select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_ARCH_JUMP_LABEL - select HAVE_TEXT_POKE_SMP select HAVE_GENERIC_HARDIRQS select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select SPARSE_IRQ @@ -632,6 +632,7 @@ config PARAVIRT_DEBUG config PARAVIRT_SPINLOCKS bool "Paravirtualization layer for spinlocks" depends on PARAVIRT && SMP + select UNINLINE_SPIN_UNLOCK ---help--- Paravirtualized spinlocks allow a pvops backend to replace the spinlock implementation with something virtualization-friendly @@ -656,6 +657,15 @@ config KVM_GUEST underlying device model, the host provides the guest with timing infrastructure such as time of day, and system time +config KVM_DEBUG_FS + bool "Enable debug information for KVM Guests in debugfs" + depends on KVM_GUEST && DEBUG_FS + default n + ---help--- + This option enables collection of various statistics for KVM guest. + Statistics are displayed in debugfs filesystem. Enabling this option + may incur significant overhead. + source "arch/x86/lguest/Kconfig" config PARAVIRT_TIME_ACCOUNTING @@ -1344,8 +1354,12 @@ config ARCH_SELECT_MEMORY_MODEL depends on ARCH_SPARSEMEM_ENABLE config ARCH_MEMORY_PROBE - def_bool y + bool "Enable sysfs memory/probe interface" depends on X86_64 && MEMORY_HOTPLUG + help + This option enables a sysfs memory/probe interface for testing. + See Documentation/memory-hotplug.txt for more information. + If you are unsure how to answer this question, answer N. config ARCH_PROC_KCORE_TEXT def_bool y @@ -1627,9 +1641,9 @@ config KEXEC It is an ongoing process to be certain the hardware in a machine is properly shutdown, so do not be surprised if this code does not - initially work for you. It may help to enable device hotplugging - support. As of this writing the exact hardware interface is - strongly in flux, so no good recommendation can be made. + initially work for you. As of this writing the exact hardware + interface is strongly in flux, so no good recommendation can be + made. config CRASH_DUMP bool "kernel crash dumps" @@ -1716,9 +1730,10 @@ config X86_NEED_RELOCS depends on X86_32 && RELOCATABLE config PHYSICAL_ALIGN - hex "Alignment value to which kernel should be aligned" if X86_32 + hex "Alignment value to which kernel should be aligned" default "0x1000000" - range 0x2000 0x1000000 + range 0x2000 0x1000000 if X86_32 + range 0x200000 0x1000000 if X86_64 ---help--- This value puts the alignment restrictions on physical address where kernel is loaded and run from. Kernel is compiled for an @@ -1736,6 +1751,9 @@ config PHYSICAL_ALIGN end result is that kernel runs from a physical address meeting above alignment restrictions. + On 32-bit this value must be a multiple of 0x2000. On 64-bit + this value must be a multiple of 0x200000. + Don't change this unless you know what you are doing. config HOTPLUG_CPU @@ -2270,6 +2288,32 @@ config RAPIDIO source "drivers/rapidio/Kconfig" +config X86_SYSFB + bool "Mark VGA/VBE/EFI FB as generic system framebuffer" + help + Firmwares often provide initial graphics framebuffers so the BIOS, + bootloader or kernel can show basic video-output during boot for + user-guidance and debugging. Historically, x86 used the VESA BIOS + Extensions and EFI-framebuffers for this, which are mostly limited + to x86. + This option, if enabled, marks VGA/VBE/EFI framebuffers as generic + framebuffers so the new generic system-framebuffer drivers can be + used on x86. If the framebuffer is not compatible with the generic + modes, it is adverticed as fallback platform framebuffer so legacy + drivers like efifb, vesafb and uvesafb can pick it up. + If this option is not selected, all system framebuffers are always + marked as fallback platform framebuffers as usual. + + Note: Legacy fbdev drivers, including vesafb, efifb, uvesafb, will + not be able to pick up generic system framebuffers if this option + is selected. You are highly encouraged to enable simplefb as + replacement if you select this option. simplefb can correctly deal + with generic system framebuffers. But you should still keep vesafb + and others enabled as fallback if a system framebuffer is + incompatible with simplefb. + + If unsure, say Y. + endmenu @@ -2332,10 +2376,6 @@ config HAVE_ATOMIC_IOMAP def_bool y depends on X86_32 -config HAVE_TEXT_POKE_SMP - bool - select STOP_MACHINE if SMP - config X86_DEV_DMA_OPS bool depends on X86_64 || STA2X11 diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 07639c656fc..41250fb3398 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -16,6 +16,10 @@ endif # e.g.: obj-y += foo_$(BITS).o export BITS +ifdef CONFIG_X86_NEED_RELOCS + LDFLAGS_vmlinux := --emit-relocs +endif + ifeq ($(CONFIG_X86_32),y) BITS := 32 UTS_MACHINE := i386 @@ -25,10 +29,6 @@ ifeq ($(CONFIG_X86_32),y) KBUILD_AFLAGS += $(biarch) KBUILD_CFLAGS += $(biarch) - ifdef CONFIG_RELOCATABLE - LDFLAGS_vmlinux := --emit-relocs - endif - KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return # Never want PIC in a 32-bit kernel, prevent breakage with GCC built diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 5b7531966b8..ef72baeff48 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -355,6 +355,7 @@ int strncmp(const char *cs, const char *ct, size_t count); size_t strnlen(const char *s, size_t maxlen); unsigned int atou(const char *s); unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); +size_t strlen(const char *s); /* tty.c */ void puts(const char *); diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 1e3184f6072..5d6f6891b18 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -181,8 +181,9 @@ relocated: /* * Do the decompression, and jump to the new kernel.. */ - leal z_extract_offset_negative(%ebx), %ebp /* push arguments for decompress_kernel: */ + pushl $z_output_len /* decompressed length */ + leal z_extract_offset_negative(%ebx), %ebp pushl %ebp /* output address */ pushl $z_input_len /* input_len */ leal input_data(%ebx), %eax @@ -191,33 +192,7 @@ relocated: pushl %eax /* heap area */ pushl %esi /* real mode pointer */ call decompress_kernel - addl $20, %esp - -#if CONFIG_RELOCATABLE -/* - * Find the address of the relocations. - */ - leal z_output_len(%ebp), %edi - -/* - * Calculate the delta between where vmlinux was compiled to run - * and where it was actually loaded. - */ - movl %ebp, %ebx - subl $LOAD_PHYSICAL_ADDR, %ebx - jz 2f /* Nothing to be done if loaded at compiled addr. */ -/* - * Process relocations. - */ - -1: subl $4, %edi - movl (%edi), %ecx - testl %ecx, %ecx - jz 2f - addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) - jmp 1b -2: -#endif + addl $24, %esp /* * Jump to the decompressed kernel. diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 06e71c2c16b..c337422b575 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -338,6 +338,7 @@ relocated: leaq input_data(%rip), %rdx /* input_data */ movl $z_input_len, %ecx /* input_len */ movq %rbp, %r8 /* output target address */ + movq $z_output_len, %r9 /* decompressed length */ call decompress_kernel popq %rsi diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 0319c88290a..434f077d2c4 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -271,6 +271,79 @@ static void error(char *x) asm("hlt"); } +#if CONFIG_X86_NEED_RELOCS +static void handle_relocations(void *output, unsigned long output_len) +{ + int *reloc; + unsigned long delta, map, ptr; + unsigned long min_addr = (unsigned long)output; + unsigned long max_addr = min_addr + output_len; + + /* + * Calculate the delta between where vmlinux was linked to load + * and where it was actually loaded. + */ + delta = min_addr - LOAD_PHYSICAL_ADDR; + if (!delta) { + debug_putstr("No relocation needed... "); + return; + } + debug_putstr("Performing relocations... "); + + /* + * The kernel contains a table of relocation addresses. Those + * addresses have the final load address of the kernel in virtual + * memory. We are currently working in the self map. So we need to + * create an adjustment for kernel memory addresses to the self map. + * This will involve subtracting out the base address of the kernel. + */ + map = delta - __START_KERNEL_map; + + /* + * Process relocations: 32 bit relocations first then 64 bit after. + * Two sets of binary relocations are added to the end of the kernel + * before compression. Each relocation table entry is the kernel + * address of the location which needs to be updated stored as a + * 32-bit value which is sign extended to 64 bits. + * + * Format is: + * + * kernel bits... + * 0 - zero terminator for 64 bit relocations + * 64 bit relocation repeated + * 0 - zero terminator for 32 bit relocations + * 32 bit relocation repeated + * + * So we work backwards from the end of the decompressed image. + */ + for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) { + int extended = *reloc; + extended += map; + + ptr = (unsigned long)extended; + if (ptr < min_addr || ptr > max_addr) + error("32-bit relocation outside of kernel!\n"); + + *(uint32_t *)ptr += delta; + } +#ifdef CONFIG_X86_64 + for (reloc--; *reloc; reloc--) { + long extended = *reloc; + extended += map; + + ptr = (unsigned long)extended; + if (ptr < min_addr || ptr > max_addr) + error("64-bit relocation outside of kernel!\n"); + + *(uint64_t *)ptr += delta; + } +#endif +} +#else +static inline void handle_relocations(void *output, unsigned long output_len) +{ } +#endif + static void parse_elf(void *output) { #ifdef CONFIG_X86_64 @@ -325,7 +398,8 @@ static void parse_elf(void *output) asmlinkage void decompress_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, - unsigned char *output) + unsigned char *output, + unsigned long output_len) { real_mode = rmode; @@ -365,6 +439,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, debug_putstr("\nDecompressing Linux... "); decompress(input_data, input_len, NULL, NULL, output, NULL, error); parse_elf(output); + handle_relocations(output, output_len); debug_putstr("done.\nBooting the kernel.\n"); return; } diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index cdac91ca55d..565083c16e5 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c @@ -55,7 +55,7 @@ static char *number(char *str, long num, int base, int size, int precision, locase = (type & SMALL); if (type & LEFT) type &= ~ZEROPAD; - if (base < 2 || base > 36) + if (base < 2 || base > 16) return NULL; c = (type & ZEROPAD) ? '0' : ' '; sign = 0; diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index bccfca68430..665a730307f 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -457,7 +457,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); + compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 474dc1b59f7..4299eb05023 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -452,7 +452,7 @@ ia32_badsys: CFI_ENDPROC - .macro PTREGSCALL label, func, arg + .macro PTREGSCALL label, func ALIGN GLOBAL(\label) leaq \func(%rip),%rax diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 2dfac58f3b1..b1977bad543 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -86,6 +86,7 @@ extern int acpi_pci_disabled; extern int acpi_skip_timer_override; extern int acpi_use_timer_override; extern int acpi_fix_pin2_polarity; +extern int acpi_disable_cmcff; extern u8 acpi_sci_flags; extern int acpi_sci_override_gsi; @@ -168,6 +169,7 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf) #define acpi_lapic 0 #define acpi_ioapic 0 +#define acpi_disable_cmcff 0 static inline void acpi_noirq_set(void) { } static inline void acpi_disable_pci(void) { } static inline void disable_acpi(void) { } diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 58ed6d96a6a..0a3f9c9f98d 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -5,6 +5,7 @@ #include <linux/stddef.h> #include <linux/stringify.h> #include <asm/asm.h> +#include <asm/ptrace.h> /* * Alternative inline assembly for SMP. @@ -220,20 +221,11 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); * no thread can be preempted in the instructions being modified (no iret to an * invalid instruction possible) or if the instructions are changed from a * consistent state to another consistent state atomically. - * More care must be taken when modifying code in the SMP case because of - * Intel's errata. text_poke_smp() takes care that errata, but still - * doesn't support NMI/MCE handler code modifying. * On the local CPU you need to be protected again NMI or MCE handlers seeing an * inconsistent instruction while you patch. */ -struct text_poke_param { - void *addr; - const void *opcode; - size_t len; -}; - extern void *text_poke(void *addr, const void *opcode, size_t len); -extern void *text_poke_smp(void *addr, const void *opcode, size_t len); -extern void text_poke_smp_batch(struct text_poke_param *params, int n); +extern int poke_int3_handler(struct pt_regs *regs); +extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f8119b582c3..1d2091a226b 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -715,4 +715,6 @@ static inline void exiting_ack_irq(void) ack_APIC_irq(); } +extern void ioapic_zap_locks(void); + #endif /* _ASM_X86_APIC_H */ diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 1c2d247f65c..4582e8e1cd1 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -3,21 +3,25 @@ #ifdef __ASSEMBLY__ # define __ASM_FORM(x) x +# define __ASM_FORM_RAW(x) x # define __ASM_FORM_COMMA(x) x, #else # define __ASM_FORM(x) " " #x " " +# define __ASM_FORM_RAW(x) #x # define __ASM_FORM_COMMA(x) " " #x "," #endif #ifdef CONFIG_X86_32 # define __ASM_SEL(a,b) __ASM_FORM(a) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) #else # define __ASM_SEL(a,b) __ASM_FORM(b) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) #endif #define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \ inst##q##__VA_ARGS__) -#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg) +#define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg) #define _ASM_PTR __ASM_SEL(.long, .quad) #define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 6dfd0195bb5..41639ce8fd6 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -15,6 +15,14 @@ #include <linux/compiler.h> #include <asm/alternative.h> +#if BITS_PER_LONG == 32 +# define _BITOPS_LONG_SHIFT 5 +#elif BITS_PER_LONG == 64 +# define _BITOPS_LONG_SHIFT 6 +#else +# error "Unexpected BITS_PER_LONG" +#endif + #define BIT_64(n) (U64_C(1) << (n)) /* @@ -59,7 +67,7 @@ * restricted to acting on a single-word quantity. */ static __always_inline void -set_bit(unsigned int nr, volatile unsigned long *addr) +set_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "orb %1,%0" @@ -81,7 +89,7 @@ set_bit(unsigned int nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __set_bit(int nr, volatile unsigned long *addr) +static inline void __set_bit(long nr, volatile unsigned long *addr) { asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); } @@ -97,7 +105,7 @@ static inline void __set_bit(int nr, volatile unsigned long *addr) * in order to ensure changes are visible on other processors. */ static __always_inline void -clear_bit(int nr, volatile unsigned long *addr) +clear_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "andb %1,%0" @@ -118,13 +126,13 @@ clear_bit(int nr, volatile unsigned long *addr) * clear_bit() is atomic and implies release semantics before the memory * operation. It can be used for an unlock. */ -static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr) +static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); clear_bit(nr, addr); } -static inline void __clear_bit(int nr, volatile unsigned long *addr) +static inline void __clear_bit(long nr, volatile unsigned long *addr) { asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); } @@ -141,7 +149,7 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr) * No memory barrier is required here, because x86 cannot reorder stores past * older loads. Same principle as spin_unlock. */ -static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr) +static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); __clear_bit(nr, addr); @@ -159,7 +167,7 @@ static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __change_bit(int nr, volatile unsigned long *addr) +static inline void __change_bit(long nr, volatile unsigned long *addr) { asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); } @@ -173,7 +181,7 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void change_bit(int nr, volatile unsigned long *addr) +static inline void change_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "xorb %1,%0" @@ -194,7 +202,7 @@ static inline void change_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_set_bit(int nr, volatile unsigned long *addr) +static inline int test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -212,7 +220,7 @@ static inline int test_and_set_bit(int nr, volatile unsigned long *addr) * This is the same as test_and_set_bit on x86. */ static __always_inline int -test_and_set_bit_lock(int nr, volatile unsigned long *addr) +test_and_set_bit_lock(long nr, volatile unsigned long *addr) { return test_and_set_bit(nr, addr); } @@ -226,7 +234,7 @@ test_and_set_bit_lock(int nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) +static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -245,7 +253,7 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) +static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -272,7 +280,7 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ -static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) +static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -284,7 +292,7 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) } /* WARNING: non atomic and it can be reordered! */ -static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) +static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -304,7 +312,7 @@ static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_change_bit(int nr, volatile unsigned long *addr) +static inline int test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -315,13 +323,13 @@ static inline int test_and_change_bit(int nr, volatile unsigned long *addr) return oldbit; } -static __always_inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr) +static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr) { - return ((1UL << (nr % BITS_PER_LONG)) & - (addr[nr / BITS_PER_LONG])) != 0; + return ((1UL << (nr & (BITS_PER_LONG-1))) & + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } -static inline int variable_test_bit(int nr, volatile const unsigned long *addr) +static inline int variable_test_bit(long nr, volatile const unsigned long *addr) { int oldbit; diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index 653668d140f..4a8cb8d7cbd 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -35,9 +35,9 @@ static void sanitize_boot_params(struct boot_params *boot_params) */ if (boot_params->sentinel) { /* fields in boot_params are left uninitialized, clear them */ - memset(&boot_params->olpc_ofw_header, 0, + memset(&boot_params->ext_ramdisk_image, 0, (char *)&boot_params->efi_info - - (char *)&boot_params->olpc_ofw_header); + (char *)&boot_params->ext_ramdisk_image); memset(&boot_params->kbd_status, 0, (char *)&boot_params->hdr - (char *)&boot_params->kbd_status); diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index 46fc474fd81..f50de695173 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -49,9 +49,15 @@ static inline __wsum csum_partial_copy_from_user(const void __user *src, int len, __wsum sum, int *err_ptr) { + __wsum ret; + might_sleep(); - return csum_partial_copy_generic((__force void *)src, dst, - len, sum, err_ptr, NULL); + stac(); + ret = csum_partial_copy_generic((__force void *)src, dst, + len, sum, err_ptr, NULL); + clac(); + + return ret; } /* @@ -176,10 +182,16 @@ static inline __wsum csum_and_copy_to_user(const void *src, int len, __wsum sum, int *err_ptr) { + __wsum ret; + might_sleep(); - if (access_ok(VERIFY_WRITE, dst, len)) - return csum_partial_copy_generic(src, (__force void *)dst, - len, sum, NULL, err_ptr); + if (access_ok(VERIFY_WRITE, dst, len)) { + stac(); + ret = csum_partial_copy_generic(src, (__force void *)dst, + len, sum, NULL, err_ptr); + clac(); + return ret; + } if (len) *err_ptr = -EFAULT; diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index 9bfdc41629e..e6fd8a026c7 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -133,7 +133,7 @@ extern __wsum csum_partial(const void *buff, int len, __wsum sum); /* Do not call this directly. Use the wrappers below */ -extern __wsum csum_partial_copy_generic(const void *src, const void *dst, +extern __visible __wsum csum_partial_copy_generic(const void *src, const void *dst, int len, __wsum sum, int *src_err_ptr, int *dst_err_ptr); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 47538a61c91..d3f5c63078d 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -366,9 +366,10 @@ extern bool __static_cpu_has_safe(u16 bit); */ static __always_inline __pure bool __static_cpu_has(u16 bit) { -#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5 +#ifdef CC_HAVE_ASM_GOTO #ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS + /* * Catch too early usage of this before alternatives * have run. @@ -384,6 +385,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (X86_FEATURE_ALWAYS) : : t_warn); + #endif asm goto("1: jmp %l[t_no]\n" @@ -406,7 +408,9 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) warn_pre_alternatives(); return false; #endif -#else /* GCC_VERSION >= 40500 */ + +#else /* CC_HAVE_ASM_GOTO */ + u8 flag; /* Open-coded due to __stringify() in ALTERNATIVE() */ asm volatile("1: movb $0,%0\n" @@ -427,7 +431,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) ".previous\n" : "=qm" (flag) : "i" (bit)); return flag; -#endif + +#endif /* CC_HAVE_ASM_GOTO */ } #define static_cpu_has(bit) \ @@ -441,7 +446,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) static __always_inline __pure bool _static_cpu_has_safe(u16 bit) { -#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5 +#ifdef CC_HAVE_ASM_GOTO /* * We need to spell the jumps to the compiler because, depending on the offset, * the replacement jump can be bigger than the original jump, and this we cannot @@ -475,7 +480,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) return false; t_dynamic: return __static_cpu_has_safe(bit); -#else /* GCC_VERSION >= 40500 */ +#else u8 flag; /* Open-coded due to __stringify() in ALTERNATIVE() */ asm volatile("1: movb $2,%0\n" @@ -511,7 +516,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) : "=qm" (flag) : "i" (bit), "i" (X86_FEATURE_ALWAYS)); return (flag == 2 ? __static_cpu_has_safe(bit) : flag); -#endif +#endif /* CC_HAVE_ASM_GOTO */ } #define static_cpu_has_safe(bit) \ diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index cccd07fa5e3..779c2efe2e9 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -29,7 +29,7 @@ extern void e820_setup_gap(void); extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, unsigned long start_addr, unsigned long long end_addr); struct setup_data; -extern void parse_e820_ext(struct setup_data *data); +extern void parse_e820_ext(u64 phys_addr, u32 data_len); #if defined(CONFIG_X86_64) || \ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e4ac559c4a2..92b3bae08b7 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -26,56 +26,56 @@ #include <asm/sections.h> /* Interrupt handlers registered during init_IRQ */ -extern void apic_timer_interrupt(void); -extern void x86_platform_ipi(void); -extern void kvm_posted_intr_ipi(void); -extern void error_interrupt(void); -extern void irq_work_interrupt(void); - -extern void spurious_interrupt(void); -extern void thermal_interrupt(void); -extern void reschedule_interrupt(void); - -extern void invalidate_interrupt(void); -extern void invalidate_interrupt0(void); -extern void invalidate_interrupt1(void); -extern void invalidate_interrupt2(void); -extern void invalidate_interrupt3(void); -extern void invalidate_interrupt4(void); -extern void invalidate_interrupt5(void); -extern void invalidate_interrupt6(void); -extern void invalidate_interrupt7(void); -extern void invalidate_interrupt8(void); -extern void invalidate_interrupt9(void); -extern void invalidate_interrupt10(void); -extern void invalidate_interrupt11(void); -extern void invalidate_interrupt12(void); -extern void invalidate_interrupt13(void); -extern void invalidate_interrupt14(void); -extern void invalidate_interrupt15(void); -extern void invalidate_interrupt16(void); -extern void invalidate_interrupt17(void); -extern void invalidate_interrupt18(void); -extern void invalidate_interrupt19(void); -extern void invalidate_interrupt20(void); -extern void invalidate_interrupt21(void); -extern void invalidate_interrupt22(void); -extern void invalidate_interrupt23(void); -extern void invalidate_interrupt24(void); -extern void invalidate_interrupt25(void); -extern void invalidate_interrupt26(void); -extern void invalidate_interrupt27(void); -extern void invalidate_interrupt28(void); -extern void invalidate_interrupt29(void); -extern void invalidate_interrupt30(void); -extern void invalidate_interrupt31(void); - -extern void irq_move_cleanup_interrupt(void); -extern void reboot_interrupt(void); -extern void threshold_interrupt(void); - -extern void call_function_interrupt(void); -extern void call_function_single_interrupt(void); +extern asmlinkage void apic_timer_interrupt(void); +extern asmlinkage void x86_platform_ipi(void); +extern asmlinkage void kvm_posted_intr_ipi(void); +extern asmlinkage void error_interrupt(void); +extern asmlinkage void irq_work_interrupt(void); + +extern asmlinkage void spurious_interrupt(void); +extern asmlinkage void thermal_interrupt(void); +extern asmlinkage void reschedule_interrupt(void); + +extern asmlinkage void invalidate_interrupt(void); +extern asmlinkage void invalidate_interrupt0(void); +extern asmlinkage void invalidate_interrupt1(void); +extern asmlinkage void invalidate_interrupt2(void); +extern asmlinkage void invalidate_interrupt3(void); +extern asmlinkage void invalidate_interrupt4(void); +extern asmlinkage void invalidate_interrupt5(void); +extern asmlinkage void invalidate_interrupt6(void); +extern asmlinkage void invalidate_interrupt7(void); +extern asmlinkage void invalidate_interrupt8(void); +extern asmlinkage void invalidate_interrupt9(void); +extern asmlinkage void invalidate_interrupt10(void); +extern asmlinkage void invalidate_interrupt11(void); +extern asmlinkage void invalidate_interrupt12(void); +extern asmlinkage void invalidate_interrupt13(void); +extern asmlinkage void invalidate_interrupt14(void); +extern asmlinkage void invalidate_interrupt15(void); +extern asmlinkage void invalidate_interrupt16(void); +extern asmlinkage void invalidate_interrupt17(void); +extern asmlinkage void invalidate_interrupt18(void); +extern asmlinkage void invalidate_interrupt19(void); +extern asmlinkage void invalidate_interrupt20(void); +extern asmlinkage void invalidate_interrupt21(void); +extern asmlinkage void invalidate_interrupt22(void); +extern asmlinkage void invalidate_interrupt23(void); +extern asmlinkage void invalidate_interrupt24(void); +extern asmlinkage void invalidate_interrupt25(void); +extern asmlinkage void invalidate_interrupt26(void); +extern asmlinkage void invalidate_interrupt27(void); +extern asmlinkage void invalidate_interrupt28(void); +extern asmlinkage void invalidate_interrupt29(void); +extern asmlinkage void invalidate_interrupt30(void); +extern asmlinkage void invalidate_interrupt31(void); + +extern asmlinkage void irq_move_cleanup_interrupt(void); +extern asmlinkage void reboot_interrupt(void); +extern asmlinkage void threshold_interrupt(void); + +extern asmlinkage void call_function_interrupt(void); +extern asmlinkage void call_function_single_interrupt(void); #ifdef CONFIG_TRACING /* Interrupt handlers registered during init_IRQ */ @@ -172,22 +172,18 @@ extern atomic_t irq_mis_count; extern void eisa_set_level_irq(unsigned int irq); /* SMP */ -extern void smp_apic_timer_interrupt(struct pt_regs *); -extern void smp_spurious_interrupt(struct pt_regs *); -extern void smp_x86_platform_ipi(struct pt_regs *); -extern void smp_error_interrupt(struct pt_regs *); +extern __visible void smp_apic_timer_interrupt(struct pt_regs *); +extern __visible void smp_spurious_interrupt(struct pt_regs *); +extern __visible void smp_x86_platform_ipi(struct pt_regs *); +extern __visible void smp_error_interrupt(struct pt_regs *); #ifdef CONFIG_X86_IO_APIC extern asmlinkage void smp_irq_move_cleanup_interrupt(void); #endif #ifdef CONFIG_SMP -extern void smp_reschedule_interrupt(struct pt_regs *); -extern void smp_call_function_interrupt(struct pt_regs *); -extern void smp_call_function_single_interrupt(struct pt_regs *); -#ifdef CONFIG_X86_32 -extern void smp_invalidate_interrupt(struct pt_regs *); -#else -extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); -#endif +extern __visible void smp_reschedule_interrupt(struct pt_regs *); +extern __visible void smp_call_function_interrupt(struct pt_regs *); +extern __visible void smp_call_function_single_interrupt(struct pt_regs *); +extern __visible void smp_invalidate_interrupt(struct pt_regs *); #endif extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 2d4b5e6107c..e42f758a0fb 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -33,7 +33,7 @@ struct hypervisor_x86 { const char *name; /* Detection routine */ - bool (*detect)(void); + uint32_t (*detect)(void); /* Adjust CPU feature bits (run once per CPU) */ void (*set_cpu_features)(struct cpuinfo_x86 *); diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 57873beb329..0ea10f27d61 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -33,7 +33,7 @@ extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); -extern unsigned int do_IRQ(struct pt_regs *regs); +extern __visible unsigned int do_IRQ(struct pt_regs *regs); /* Interrupt vector management */ extern DECLARE_BITMAP(used_vectors, NR_VECTORS); diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 5a6d2873f80..9454c167629 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -49,10 +49,10 @@ typedef u8 kprobe_opcode_t; #define flush_insn_slot(p) do { } while (0) /* optinsn template addresses */ -extern kprobe_opcode_t optprobe_template_entry; -extern kprobe_opcode_t optprobe_template_val; -extern kprobe_opcode_t optprobe_template_call; -extern kprobe_opcode_t optprobe_template_end; +extern __visible kprobe_opcode_t optprobe_template_entry; +extern __visible kprobe_opcode_t optprobe_template_val; +extern __visible kprobe_opcode_t optprobe_template_call; +extern __visible kprobe_opcode_t optprobe_template_end; #define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) #define MAX_OPTINSN_SIZE \ (((unsigned long)&optprobe_template_end - \ @@ -62,7 +62,7 @@ extern kprobe_opcode_t optprobe_template_end; extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); -void kretprobe_trampoline(void); +asmlinkage void kretprobe_trampoline(void); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f87f7fcefa0..c76ff74a98f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -286,6 +286,7 @@ struct kvm_mmu { u64 *pae_root; u64 *lm_root; u64 rsvd_bits_mask[2][4]; + u64 bad_mt_xwr; /* * Bitmap: bit set = last pte in walk @@ -323,6 +324,7 @@ struct kvm_pmu { u64 global_ovf_ctrl; u64 counter_bitmask[2]; u64 global_ctrl_mask; + u64 reserved_bits; u8 version; struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; @@ -511,6 +513,14 @@ struct kvm_vcpu_arch { * instruction. */ bool write_fault_to_shadow_pgtable; + + /* set at EPT violation at this point */ + unsigned long exit_qualification; + + /* pv related host specific info */ + struct { + bool pv_unhalted; + } pv; }; struct kvm_lpage_info { @@ -802,8 +812,8 @@ extern u32 kvm_min_guest_tsc_khz; extern u32 kvm_max_guest_tsc_khz; enum emulation_result { - EMULATE_DONE, /* no further processing */ - EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ + EMULATE_DONE, /* no further processing */ + EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */ EMULATE_FAIL, /* can't emulate this instruction */ }; diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 695399f2d5e..1df11590975 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -85,26 +85,20 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, return ret; } -static inline bool kvm_para_available(void) +static inline uint32_t kvm_cpuid_base(void) { - unsigned int eax, ebx, ecx, edx; - char signature[13]; - if (boot_cpu_data.cpuid_level < 0) - return false; /* So we don't blow up on old processors */ + return 0; /* So we don't blow up on old processors */ - if (cpu_has_hypervisor) { - cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); - memcpy(signature + 0, &ebx, 4); - memcpy(signature + 4, &ecx, 4); - memcpy(signature + 8, &edx, 4); - signature[12] = 0; + if (cpu_has_hypervisor) + return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); - if (strcmp(signature, "KVMKVMKVM") == 0) - return true; - } + return 0; +} - return false; +static inline bool kvm_para_available(void) +{ + return kvm_cpuid_base() != 0; } static inline unsigned int kvm_arch_para_features(void) @@ -118,10 +112,20 @@ void kvm_async_pf_task_wait(u32 token); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); extern void kvm_disable_steal_time(void); -#else -#define kvm_guest_init() do { } while (0) + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +void __init kvm_spinlock_init(void); +#else /* !CONFIG_PARAVIRT_SPINLOCKS */ +static inline void kvm_spinlock_init(void) +{ +} +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#else /* CONFIG_KVM_GUEST */ +#define kvm_guest_init() do {} while (0) #define kvm_async_pf_task_wait(T) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) + static inline u32 kvm_read_and_reset_pf_reason(void) { return 0; diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 29e3093bbd2..cbe6b9e404c 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -32,11 +32,20 @@ #define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ -#define MCACOD 0xffff /* MCA Error Code */ + +/* + * Note that the full MCACOD field of IA32_MCi_STATUS MSR is + * bits 15:0. But bit 12 is the 'F' bit, defined for corrected + * errors to indicate that errors are being filtered by hardware. + * We should mask out bit 12 when looking for specific signatures + * of uncorrected errors - so the F bit is deliberately skipped + * in this #define. + */ +#define MCACOD 0xefff /* MCA Error Code */ /* Architecturally defined codes from SDM Vol. 3B Chapter 15 */ #define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */ -#define MCACOD_SCRUBMSK 0xfff0 +#define MCACOD_SCRUBMSK 0xeff0 /* Skip bit 12 ('F' bit) */ #define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */ #define MCACOD_DATA 0x0134 /* Data Load */ #define MCACOD_INSTR 0x0150 /* Instruction Fetch */ @@ -188,6 +197,9 @@ extern void register_mce_write_callback(ssize_t (*)(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off)); +/* Disable CMCI/polling for MCA bank claimed by firmware */ +extern void mce_disable_bank(int bank); + /* * Exception handler */ diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index 50e5c58ced2..4c019179a57 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -59,7 +59,7 @@ static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, extern int __apply_microcode_amd(struct microcode_amd *mc_amd); extern int apply_microcode_amd(int cpu); -extern enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size); +extern enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); #ifdef CONFIG_MICROCODE_AMD_EARLY #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index cdbf3677610..be12c534fd5 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -45,22 +45,28 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Re-load page tables */ load_cr3(next->pgd); - /* stop flush ipis for the previous mm */ + /* Stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); - /* - * load the LDT, if the LDT is different: - */ + /* Load the LDT, if the LDT is different: */ if (unlikely(prev->context.ldt != next->context.ldt)) load_LDT_nolock(&next->context); } #ifdef CONFIG_SMP - else { + else { this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); - if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { - /* We were in lazy tlb mode and leave_mm disabled + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * On established mms, the mm_cpumask is only changed + * from irq context, from ptep_clear_flush() while in + * lazy tlb mode, and here. Irqs are blocked during + * schedule, protecting us from simultaneous changes. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + /* + * We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. */ diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h index 2c543fff241..e7e6751648e 100644 --- a/arch/x86/include/asm/mutex_64.h +++ b/arch/x86/include/asm/mutex_64.h @@ -16,6 +16,20 @@ * * Atomically decrements @v and calls <fail_fn> if the result is negative. */ +#ifdef CC_HAVE_ASM_GOTO +static inline void __mutex_fastpath_lock(atomic_t *v, + void (*fail_fn)(atomic_t *)) +{ + asm volatile goto(LOCK_PREFIX " decl %0\n" + " jns %l[exit]\n" + : : "m" (v->counter) + : "memory", "cc" + : exit); + fail_fn(v); +exit: + return; +} +#else #define __mutex_fastpath_lock(v, fail_fn) \ do { \ unsigned long dummy; \ @@ -32,6 +46,7 @@ do { \ : "rax", "rsi", "rdx", "rcx", \ "r8", "r9", "r10", "r11", "memory"); \ } while (0) +#endif /** * __mutex_fastpath_lock_retval - try to take the lock by moving the count @@ -56,6 +71,20 @@ static inline int __mutex_fastpath_lock_retval(atomic_t *count) * * Atomically increments @v and calls <fail_fn> if the result is nonpositive. */ +#ifdef CC_HAVE_ASM_GOTO +static inline void __mutex_fastpath_unlock(atomic_t *v, + void (*fail_fn)(atomic_t *)) +{ + asm volatile goto(LOCK_PREFIX " incl %0\n" + " jg %l[exit]\n" + : : "m" (v->counter) + : "memory", "cc" + : exit); + fail_fn(v); +exit: + return; +} +#else #define __mutex_fastpath_unlock(v, fail_fn) \ do { \ unsigned long dummy; \ @@ -72,6 +101,7 @@ do { \ : "rax", "rsi", "rdx", "rcx", \ "r8", "r9", "r10", "r11", "memory"); \ } while (0) +#endif #define __mutex_slowpath_needs_to_unlock() 1 diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index ef17af01347..f48b17df422 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -15,6 +15,8 @@ */ #define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) +#define __START_KERNEL_map __PAGE_OFFSET + #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 6c896fbe21d..43dcd804ebd 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -32,11 +32,6 @@ */ #define __PAGE_OFFSET _AC(0xffff880000000000, UL) -#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \ - (CONFIG_PHYSICAL_ALIGN - 1)) & \ - ~(CONFIG_PHYSICAL_ALIGN - 1)) - -#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) #define __START_KERNEL_map _AC(0xffffffff80000000, UL) /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 54c97879195..f97fbe3abb6 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -33,6 +33,11 @@ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define __PHYSICAL_START ALIGN(CONFIG_PHYSICAL_START, \ + CONFIG_PHYSICAL_ALIGN) + +#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) + #ifdef CONFIG_X86_64 #include <asm/page_64_types.h> #else diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cfdc9ee4c90..401f350ef71 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -712,36 +712,16 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) -static inline int arch_spin_is_locked(struct arch_spinlock *lock) +static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, + __ticket_t ticket) { - return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); + PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket); } -static inline int arch_spin_is_contended(struct arch_spinlock *lock) +static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, + __ticket_t ticket) { - return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); -} -#define arch_spin_is_contended arch_spin_is_contended - -static __always_inline void arch_spin_lock(struct arch_spinlock *lock) -{ - PVOP_VCALL1(pv_lock_ops.spin_lock, lock); -} - -static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock, - unsigned long flags) -{ - PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags); -} - -static __always_inline int arch_spin_trylock(struct arch_spinlock *lock) -{ - return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock); -} - -static __always_inline void arch_spin_unlock(struct arch_spinlock *lock) -{ - PVOP_VCALL1(pv_lock_ops.spin_unlock, lock); + PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } #endif diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 0db1fcac668..aab8f671b52 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -327,13 +327,15 @@ struct pv_mmu_ops { }; struct arch_spinlock; +#ifdef CONFIG_SMP +#include <asm/spinlock_types.h> +#else +typedef u16 __ticket_t; +#endif + struct pv_lock_ops { - int (*spin_is_locked)(struct arch_spinlock *lock); - int (*spin_is_contended)(struct arch_spinlock *lock); - void (*spin_lock)(struct arch_spinlock *lock); - void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags); - int (*spin_trylock)(struct arch_spinlock *lock); - void (*spin_unlock)(struct arch_spinlock *lock); + struct paravirt_callee_save lock_spinning; + void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); }; /* This contains all the paravirt structures: we get a convenient @@ -387,7 +389,8 @@ extern struct pv_lock_ops pv_lock_ops; /* Simple instruction patching code. */ #define DEF_NATIVE(ops, name, code) \ - extern const char start_##ops##_##name[], end_##ops##_##name[]; \ + extern const char start_##ops##_##name[] __visible, \ + end_##ops##_##name[] __visible; \ asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") unsigned paravirt_patch_nop(void); diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index f2b489cf160..3bf2dd0cf61 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -55,9 +55,53 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifdef CONFIG_MEM_SOFT_DIRTY + +/* + * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and + * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset + * into this range. + */ +#define PTE_FILE_MAX_BITS 28 +#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) +#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) +#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) +#define PTE_FILE_SHIFT4 (_PAGE_BIT_SOFT_DIRTY + 1) +#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) +#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) +#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) + +#define pte_to_pgoff(pte) \ + ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ + & ((1U << PTE_FILE_BITS1) - 1))) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ + & ((1U << PTE_FILE_BITS2) - 1)) \ + << (PTE_FILE_BITS1)) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ + & ((1U << PTE_FILE_BITS3) - 1)) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) + +#define pgoff_to_pte(off) \ + ((pte_t) { .pte_low = \ + ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ + + ((((off) >> PTE_FILE_BITS1) \ + & ((1U << PTE_FILE_BITS2) - 1)) \ + << PTE_FILE_SHIFT2) \ + + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + & ((1U << PTE_FILE_BITS3) - 1)) \ + << PTE_FILE_SHIFT3) \ + + ((((off) >> \ + (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ + << PTE_FILE_SHIFT4) \ + + _PAGE_FILE }) + +#else /* CONFIG_MEM_SOFT_DIRTY */ + /* * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, - * split up the 29 bits of offset into this range: + * split up the 29 bits of offset into this range. */ #define PTE_FILE_MAX_BITS 29 #define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) @@ -88,6 +132,8 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) << PTE_FILE_SHIFT3) \ + _PAGE_FILE }) +#endif /* CONFIG_MEM_SOFT_DIRTY */ + /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 4cc9f2b7cdc..81bb91b49a8 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -179,6 +179,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) /* * Bits 0, 6 and 7 are taken in the low part of the pte, * put the 32 bits of offset into the high part. + * + * For soft-dirty tracking 11 bit is taken from + * the low part of pte as well. */ #define pte_to_pgoff(pte) ((pte).pte_high) #define pgoff_to_pte(off) \ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7dc305a4605..8d16befdec8 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -22,7 +22,8 @@ * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] + __visible; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) extern spinlock_t pgd_lock; @@ -314,6 +315,36 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline pte_t pte_file_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_file_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline int pte_file_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SOFT_DIRTY; +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index c98ac63aae4..f4843e03113 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -61,12 +61,27 @@ * they do not conflict with each other. */ +#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN + #ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif +/* + * Tracking soft dirty bit when a page goes to a swap is tricky. + * We need a bit which can be stored in pte _and_ not conflict + * with swap entry format. On x86 bits 6 and 7 are *not* involved + * into swap entry computation, but bit 6 is used for nonlinear + * file mapping, so we borrow bit 7 for soft dirty tracking. + */ +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE +#else +#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 24cf5aefb70..987c75ecc33 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -412,7 +412,7 @@ union irq_stack_union { }; }; -DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union); +DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; DECLARE_INIT_PER_CPU(irq_stack_union); DECLARE_PER_CPU(char *, irq_stack_ptr); @@ -942,33 +942,19 @@ extern int set_tsc_mode(unsigned int val); extern u16 amd_get_nb_id(int cpu); -struct aperfmperf { - u64 aperf, mperf; -}; - -static inline void get_aperfmperf(struct aperfmperf *am) +static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) { - WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF)); - - rdmsrl(MSR_IA32_APERF, am->aperf); - rdmsrl(MSR_IA32_MPERF, am->mperf); -} + uint32_t base, eax, signature[3]; -#define APERFMPERF_SHIFT 10 + for (base = 0x40000000; base < 0x40010000; base += 0x100) { + cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); -static inline -unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, - struct aperfmperf *new) -{ - u64 aperf = new->aperf - old->aperf; - u64 mperf = new->mperf - old->mperf; - unsigned long ratio = aperf; - - mperf >>= APERFMPERF_SHIFT; - if (mperf) - ratio = div64_u64(aperf, mperf); + if (!memcmp(sig, signature, 12) && + (leaves == 0 || ((eax - base) >= leaves))) + return base; + } - return ratio; + return 0; } extern unsigned long arch_align_stack(unsigned long sp); diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 109a9dd5d45..be8269b00e2 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, struct pvclock_vsyscall_time_info { struct pvclock_vcpu_time_info pvti; - u32 migrate_count; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index b7bf3505e1e..347555492da 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -6,6 +6,8 @@ #define COMMAND_LINE_SIZE 2048 +#include <linux/linkage.h> + #ifdef __i386__ #include <linux/pfn.h> @@ -108,11 +110,11 @@ void *extend_brk(size_t size, size_t align); extern void probe_roms(void); #ifdef __i386__ -void __init i386_start_kernel(void); +asmlinkage void __init i386_start_kernel(void); #else -void __init x86_64_start_kernel(char *real_mode); -void __init x86_64_start_reservations(char *real_mode_data); +asmlinkage void __init x86_64_start_kernel(char *real_mode); +asmlinkage void __init x86_64_start_reservations(char *real_mode_data); #endif /* __i386__ */ #endif /* _SETUP */ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 2f4d924fe6c..645cad2c95f 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -101,7 +101,7 @@ static inline void native_wbinvd(void) asm volatile("wbinvd": : :"memory"); } -extern void native_load_gs_index(unsigned); +extern asmlinkage void native_load_gs_index(unsigned); #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 33692eaabab..bf156ded74b 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -1,11 +1,14 @@ #ifndef _ASM_X86_SPINLOCK_H #define _ASM_X86_SPINLOCK_H +#include <linux/jump_label.h> #include <linux/atomic.h> #include <asm/page.h> #include <asm/processor.h> #include <linux/compiler.h> #include <asm/paravirt.h> +#include <asm/bitops.h> + /* * Your basic SMP spinlocks, allowing only a single CPU anywhere * @@ -34,6 +37,36 @@ # define UNLOCK_LOCK_PREFIX #endif +/* How long a lock should spin before we consider blocking */ +#define SPIN_THRESHOLD (1 << 15) + +extern struct static_key paravirt_ticketlocks_enabled; +static __always_inline bool static_key_false(struct static_key *key); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + +static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) +{ + set_bit(0, (volatile unsigned long *)&lock->tickets.tail); +} + +#else /* !CONFIG_PARAVIRT_SPINLOCKS */ +static __always_inline void __ticket_lock_spinning(arch_spinlock_t *lock, + __ticket_t ticket) +{ +} +static inline void __ticket_unlock_kick(arch_spinlock_t *lock, + __ticket_t ticket) +{ +} + +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) +{ + return lock.tickets.head == lock.tickets.tail; +} + /* * Ticket locks are conceptually two parts, one indicating the current head of * the queue, and the other indicating the current tail. The lock is acquired @@ -47,81 +80,101 @@ * in the high part, because a wide xadd increment of the low part would carry * up and contaminate the high part. */ -static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) +static __always_inline void arch_spin_lock(arch_spinlock_t *lock) { - register struct __raw_tickets inc = { .tail = 1 }; + register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC }; inc = xadd(&lock->tickets, inc); + if (likely(inc.head == inc.tail)) + goto out; + inc.tail &= ~TICKET_SLOWPATH_FLAG; for (;;) { - if (inc.head == inc.tail) - break; - cpu_relax(); - inc.head = ACCESS_ONCE(lock->tickets.head); + unsigned count = SPIN_THRESHOLD; + + do { + if (ACCESS_ONCE(lock->tickets.head) == inc.tail) + goto out; + cpu_relax(); + } while (--count); + __ticket_lock_spinning(lock, inc.tail); } - barrier(); /* make sure nothing creeps before the lock is taken */ +out: barrier(); /* make sure nothing creeps before the lock is taken */ } -static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) +static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) { arch_spinlock_t old, new; old.tickets = ACCESS_ONCE(lock->tickets); - if (old.tickets.head != old.tickets.tail) + if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG)) return 0; - new.head_tail = old.head_tail + (1 << TICKET_SHIFT); + new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT); /* cmpxchg is a full barrier, so nothing can move before it */ return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; } -static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) +static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock, + arch_spinlock_t old) { - __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX); + arch_spinlock_t new; + + BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); + + /* Perform the unlock on the "before" copy */ + old.tickets.head += TICKET_LOCK_INC; + + /* Clear the slowpath flag */ + new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT); + + /* + * If the lock is uncontended, clear the flag - use cmpxchg in + * case it changes behind our back though. + */ + if (new.tickets.head != new.tickets.tail || + cmpxchg(&lock->head_tail, old.head_tail, + new.head_tail) != old.head_tail) { + /* + * Lock still has someone queued for it, so wake up an + * appropriate waiter. + */ + __ticket_unlock_kick(lock, old.tickets.head); + } } -static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) +static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + if (TICKET_SLOWPATH_FLAG && + static_key_false(¶virt_ticketlocks_enabled)) { + arch_spinlock_t prev; - return tmp.tail != tmp.head; -} + prev = *lock; + add_smp(&lock->tickets.head, TICKET_LOCK_INC); -static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) -{ - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + /* add_smp() is a full mb() */ - return (__ticket_t)(tmp.tail - tmp.head) > 1; + if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) + __ticket_unlock_slowpath(lock, prev); + } else + __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX); } -#ifndef CONFIG_PARAVIRT_SPINLOCKS - static inline int arch_spin_is_locked(arch_spinlock_t *lock) { - return __ticket_spin_is_locked(lock); -} - -static inline int arch_spin_is_contended(arch_spinlock_t *lock) -{ - return __ticket_spin_is_contended(lock); -} -#define arch_spin_is_contended arch_spin_is_contended + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); -static __always_inline void arch_spin_lock(arch_spinlock_t *lock) -{ - __ticket_spin_lock(lock); + return tmp.tail != tmp.head; } -static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) +static inline int arch_spin_is_contended(arch_spinlock_t *lock) { - return __ticket_spin_trylock(lock); -} + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); -static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) -{ - __ticket_spin_unlock(lock); + return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC; } +#define arch_spin_is_contended arch_spin_is_contended static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) @@ -129,8 +182,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, arch_spin_lock(lock); } -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ - static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { while (arch_spin_is_locked(lock)) @@ -233,8 +284,4 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #define arch_read_relax(lock) cpu_relax() #define arch_write_relax(lock) cpu_relax() -/* The {read|write|spin}_lock() on x86 are full memory barriers. */ -static inline void smp_mb__after_lock(void) { } -#define ARCH_HAS_SMP_MB_AFTER_LOCK - #endif /* _ASM_X86_SPINLOCK_H */ diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index ad0ad07fc00..4f1bea19945 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -1,13 +1,17 @@ #ifndef _ASM_X86_SPINLOCK_TYPES_H #define _ASM_X86_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - #include <linux/types.h> -#if (CONFIG_NR_CPUS < 256) +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#define __TICKET_LOCK_INC 2 +#define TICKET_SLOWPATH_FLAG ((__ticket_t)1) +#else +#define __TICKET_LOCK_INC 1 +#define TICKET_SLOWPATH_FLAG ((__ticket_t)0) +#endif + +#if (CONFIG_NR_CPUS < (256 / __TICKET_LOCK_INC)) typedef u8 __ticket_t; typedef u16 __ticketpair_t; #else @@ -15,6 +19,8 @@ typedef u16 __ticket_t; typedef u32 __ticketpair_t; #endif +#define TICKET_LOCK_INC ((__ticket_t)__TICKET_LOCK_INC) + #define TICKET_SHIFT (sizeof(__ticket_t) * 8) typedef struct arch_spinlock { diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 4ec45b3abba..d7f3b3b78ac 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -2,8 +2,8 @@ #define _ASM_X86_SWITCH_TO_H struct task_struct; /* one of the stranger aspects of C forward declarations */ -struct task_struct *__switch_to(struct task_struct *prev, - struct task_struct *next); +__visible struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); struct tss_struct; void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index 9d09b4073b6..05af3b31d52 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -26,9 +26,9 @@ * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void sync_set_bit(int nr, volatile unsigned long *addr) +static inline void sync_set_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btsl %1,%0" + asm volatile("lock; bts %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -44,9 +44,9 @@ static inline void sync_set_bit(int nr, volatile unsigned long *addr) * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() * in order to ensure changes are visible on other processors. */ -static inline void sync_clear_bit(int nr, volatile unsigned long *addr) +static inline void sync_clear_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btrl %1,%0" + asm volatile("lock; btr %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -61,9 +61,9 @@ static inline void sync_clear_bit(int nr, volatile unsigned long *addr) * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void sync_change_bit(int nr, volatile unsigned long *addr) +static inline void sync_change_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btcl %1,%0" + asm volatile("lock; btc %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -77,11 +77,11 @@ static inline void sync_change_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr) +static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; - asm volatile("lock; btsl %2,%1\n\tsbbl %0,%0" + asm volatile("lock; bts %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; @@ -95,11 +95,11 @@ static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr) +static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; - asm volatile("lock; btrl %2,%1\n\tsbbl %0,%0" + asm volatile("lock; btr %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; @@ -113,11 +113,11 @@ static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int sync_test_and_change_bit(int nr, volatile unsigned long *addr) +static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; - asm volatile("lock; btcl %2,%1\n\tsbbl %0,%0" + asm volatile("lock; btc %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 2e188d68397..aea284b4131 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -20,7 +20,8 @@ #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> -extern const unsigned long sys_call_table[]; +typedef void (*sys_call_ptr_t)(void); +extern const sys_call_ptr_t sys_call_table[]; /* * Only the low 32 bits of orig_ax are meaningful, so we return int. diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 2917a6452c4..592a6a672e0 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -24,7 +24,7 @@ asmlinkage long sys_iopl(unsigned int); asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); /* kernel/signal.c */ -long sys_rt_sigreturn(void); +asmlinkage long sys_rt_sigreturn(void); /* kernel/tls.c */ asmlinkage long sys_set_thread_area(struct user_desc __user *); @@ -34,7 +34,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *); #ifdef CONFIG_X86_32 /* kernel/signal.c */ -unsigned long sys_sigreturn(void); +asmlinkage unsigned long sys_sigreturn(void); /* kernel/vm86_32.c */ asmlinkage long sys_vm86old(struct vm86_struct __user *); @@ -44,7 +44,7 @@ asmlinkage long sys_vm86(unsigned long, unsigned long); /* X86_64 only */ /* kernel/process_64.c */ -long sys_arch_prctl(int, unsigned long); +asmlinkage long sys_arch_prctl(int, unsigned long); /* kernel/sys_x86_64.c */ asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, diff --git a/arch/x86/include/asm/sysfb.h b/arch/x86/include/asm/sysfb.h new file mode 100644 index 00000000000..2aeb3e25579 --- /dev/null +++ b/arch/x86/include/asm/sysfb.h @@ -0,0 +1,98 @@ +#ifndef _ARCH_X86_KERNEL_SYSFB_H +#define _ARCH_X86_KERNEL_SYSFB_H + +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <linux/kernel.h> +#include <linux/platform_data/simplefb.h> +#include <linux/screen_info.h> + +enum { + M_I17, /* 17-Inch iMac */ + M_I20, /* 20-Inch iMac */ + M_I20_SR, /* 20-Inch iMac (Santa Rosa) */ + M_I24, /* 24-Inch iMac */ + M_I24_8_1, /* 24-Inch iMac, 8,1th gen */ + M_I24_10_1, /* 24-Inch iMac, 10,1th gen */ + M_I27_11_1, /* 27-Inch iMac, 11,1th gen */ + M_MINI, /* Mac Mini */ + M_MINI_3_1, /* Mac Mini, 3,1th gen */ + M_MINI_4_1, /* Mac Mini, 4,1th gen */ + M_MB, /* MacBook */ + M_MB_2, /* MacBook, 2nd rev. */ + M_MB_3, /* MacBook, 3rd rev. */ + M_MB_5_1, /* MacBook, 5th rev. */ + M_MB_6_1, /* MacBook, 6th rev. */ + M_MB_7_1, /* MacBook, 7th rev. */ + M_MB_SR, /* MacBook, 2nd gen, (Santa Rosa) */ + M_MBA, /* MacBook Air */ + M_MBA_3, /* Macbook Air, 3rd rev */ + M_MBP, /* MacBook Pro */ + M_MBP_2, /* MacBook Pro 2nd gen */ + M_MBP_2_2, /* MacBook Pro 2,2nd gen */ + M_MBP_SR, /* MacBook Pro (Santa Rosa) */ + M_MBP_4, /* MacBook Pro, 4th gen */ + M_MBP_5_1, /* MacBook Pro, 5,1th gen */ + M_MBP_5_2, /* MacBook Pro, 5,2th gen */ + M_MBP_5_3, /* MacBook Pro, 5,3rd gen */ + M_MBP_6_1, /* MacBook Pro, 6,1th gen */ + M_MBP_6_2, /* MacBook Pro, 6,2th gen */ + M_MBP_7_1, /* MacBook Pro, 7,1th gen */ + M_MBP_8_2, /* MacBook Pro, 8,2nd gen */ + M_UNKNOWN /* placeholder */ +}; + +struct efifb_dmi_info { + char *optname; + unsigned long base; + int stride; + int width; + int height; + int flags; +}; + +#ifdef CONFIG_EFI + +extern struct efifb_dmi_info efifb_dmi_list[]; +void sysfb_apply_efi_quirks(void); + +#else /* CONFIG_EFI */ + +static inline void sysfb_apply_efi_quirks(void) +{ +} + +#endif /* CONFIG_EFI */ + +#ifdef CONFIG_X86_SYSFB + +bool parse_mode(const struct screen_info *si, + struct simplefb_platform_data *mode); +int create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode); + +#else /* CONFIG_X86_SYSFB */ + +static inline bool parse_mode(const struct screen_info *si, + struct simplefb_platform_data *mode) +{ + return false; +} + +static inline int create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode) +{ + return -EINVAL; +} + +#endif /* CONFIG_X86_SYSFB */ + +#endif /* _ARCH_X86_KERNEL_SYSFB_H */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 095b21507b6..d35f24e231c 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -124,9 +124,6 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) - -/* indicates that pointers to the topology cpumask_t maps are valid */ -#define arch_provides_topology_pointers yes #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 88eae2aec61..7036cb60cd8 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -6,11 +6,7 @@ #include <asm/debugreg.h> #include <asm/siginfo.h> /* TRAP_TRACE, ... */ -#ifdef CONFIG_X86_32 -#define dotraplinkage -#else -#define dotraplinkage asmlinkage -#endif +#define dotraplinkage __visible asmlinkage void divide_error(void); asmlinkage void debug(void); diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index c91e8b9d588..235be70d5bb 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -49,6 +49,7 @@ extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); +extern int check_tsc_disabled(void); extern unsigned long native_calibrate_tsc(void); extern int tsc_clocksource_reliable; diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 5ee26875bae..5838fa911aa 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -153,16 +153,19 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) * Careful: we have to cast the result to the type of the pointer * for sign reasons. * - * The use of %edx as the register specifier is a bit of a + * The use of _ASM_DX as the register specifier is a bit of a * simplification, as gcc only cares about it as the starting point * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits * (%ecx being the next register in gcc's x86 register sequence), and * %rdx on 64 bits. + * + * Clang/LLVM cares about the size of the register, but still wants + * the base register for something that ends up being a pair. */ #define get_user(x, ptr) \ ({ \ int __ret_gu; \ - register __inttype(*(ptr)) __val_gu asm("%edx"); \ + register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ __chk_user_ptr(ptr); \ might_fault(); \ asm volatile("call __get_user_%P3" \ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index f3e01a2cbaa..966502d4682 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -387,6 +387,7 @@ enum vmcs_field { #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_EPT_EXTENT_CONTEXT 1 #define VMX_EPT_EXTENT_GLOBAL 2 +#define VMX_EPT_EXTENT_SHIFT 24 #define VMX_EPT_EXECUTE_ONLY_BIT (1ull) #define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) @@ -394,6 +395,7 @@ enum vmcs_field { #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) +#define VMX_EPT_INVEPT_BIT (1ull << 20) #define VMX_EPT_AD_BIT (1ull << 21) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index de656ac2af4..d76ac40da20 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -35,7 +35,7 @@ #define DEFINE_VVAR(type, name) \ type name \ - __attribute__((section(".vvar_" #name), aligned(16))) + __attribute__((section(".vvar_" #name), aligned(16))) __visible #define VVAR(name) (*vvaraddr_ ## name) diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index ca842f2769e..608a79d5a46 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -7,6 +7,7 @@ enum ipi_vector { XEN_CALL_FUNCTION_SINGLE_VECTOR, XEN_SPIN_UNLOCK_VECTOR, XEN_IRQ_WORK_VECTOR, + XEN_NMI_VECTOR, XEN_NR_IPIS, }; diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 125f344f06a..d866959e568 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -40,21 +40,7 @@ extern struct start_info *xen_start_info; static inline uint32_t xen_cpuid_base(void) { - uint32_t base, eax, ebx, ecx, edx; - char signature[13]; - - for (base = 0x40000000; base < 0x40010000; base += 0x100) { - cpuid(base, &eax, &ebx, &ecx, &edx); - *(uint32_t *)(signature + 0) = ebx; - *(uint32_t *)(signature + 4) = ecx; - *(uint32_t *)(signature + 8) = edx; - signature[12] = 0; - - if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) - return base; - } - - return 0; + return hypervisor_cpuid_base("XenVMMXenVMM", 2); } #ifdef CONFIG_XEN diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 06fdbd987e9..94dc8ca434e 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -23,6 +23,7 @@ #define KVM_FEATURE_ASYNC_PF 4 #define KVM_FEATURE_STEAL_TIME 5 #define KVM_FEATURE_PV_EOI 6 +#define KVM_FEATURE_PV_UNHALT 7 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index d651082c7cf..0e79420376e 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -65,6 +65,7 @@ #define EXIT_REASON_EOI_INDUCED 45 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 #define EXIT_REASON_PREEMPTION_TIMER 52 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 @@ -106,12 +107,13 @@ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ + { EXIT_REASON_INVEPT, "INVEPT" }, \ + { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \ { EXIT_REASON_WBINVD, "WBINVD" }, \ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ { EXIT_REASON_INVD, "INVD" }, \ - { EXIT_REASON_INVPCID, "INVPCID" }, \ - { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" } + { EXIT_REASON_INVPCID, "INVPCID" } #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 88d99ea7772..a5408b965c9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -103,6 +103,9 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_OF) += devicetree.o obj-$(CONFIG_UPROBES) += uprobes.o +obj-y += sysfb.o +obj-$(CONFIG_X86_SYSFB) += sysfb_simplefb.o +obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 2627a81253e..40c76604199 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -67,6 +67,7 @@ EXPORT_SYMBOL(acpi_pci_disabled); int acpi_lapic; int acpi_ioapic; int acpi_strict; +int acpi_disable_cmcff; u8 acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; @@ -141,16 +142,8 @@ static u32 irq_to_gsi(int irq) } /* - * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, - * to map the target physical address. The problem is that set_fixmap() - * provides a single page, and it is possible that the page is not - * sufficient. - * By using this area, we can map up to MAX_IO_APICS pages temporarily, - * i.e. until the next __va_range() call. - * - * Important Safety Note: The fixed I/O APIC page numbers are *subtracted* - * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and - * count idx down while incrementing the phys address. + * This is just a simple wrapper around early_ioremap(), + * with sanity checks for phys == 0 and size == 0. */ char *__init __acpi_map_table(unsigned long phys, unsigned long size) { @@ -160,6 +153,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) return early_ioremap(phys, size); } + void __init __acpi_unmap_table(char *map, unsigned long size) { if (!map || !size) @@ -199,7 +193,7 @@ static void acpi_register_lapic(int id, u8 enabled) { unsigned int ver = 0; - if (id >= (MAX_LOCAL_APIC-1)) { + if (id >= MAX_LOCAL_APIC) { printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); return; } @@ -1120,6 +1114,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) int ioapic; int ioapic_pin; struct io_apic_irq_attr irq_attr; + int ret; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -1149,7 +1144,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 0 : 1); - io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr); + ret = io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr); + if (ret < 0) + gsi = INT_MIN; return gsi; } @@ -1626,6 +1623,10 @@ static int __init parse_acpi(char *arg) /* "acpi=copy_dsdt" copys DSDT */ else if (strcmp(arg, "copy_dsdt") == 0) { acpi_gbl_copy_dsdt_locally = 1; + } + /* "acpi=nocmcff" disables FF mode for corrected errors */ + else if (strcmp(arg, "nocmcff") == 0) { + acpi_disable_cmcff = 1; } else { /* Core will printk when we return error. */ return -EINVAL; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c15cf9a25e2..15e8563e5c2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -11,6 +11,7 @@ #include <linux/memory.h> #include <linux/stop_machine.h> #include <linux/slab.h> +#include <linux/kdebug.h> #include <asm/alternative.h> #include <asm/sections.h> #include <asm/pgtable.h> @@ -596,97 +597,93 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) return addr; } -/* - * Cross-modifying kernel text with stop_machine(). - * This code originally comes from immediate value. - */ -static atomic_t stop_machine_first; -static int wrote_text; +static void do_sync_core(void *info) +{ + sync_core(); +} -struct text_poke_params { - struct text_poke_param *params; - int nparams; -}; +static bool bp_patching_in_progress; +static void *bp_int3_handler, *bp_int3_addr; -static int __kprobes stop_machine_text_poke(void *data) +int poke_int3_handler(struct pt_regs *regs) { - struct text_poke_params *tpp = data; - struct text_poke_param *p; - int i; + /* bp_patching_in_progress */ + smp_rmb(); - if (atomic_xchg(&stop_machine_first, 0)) { - for (i = 0; i < tpp->nparams; i++) { - p = &tpp->params[i]; - text_poke(p->addr, p->opcode, p->len); - } - smp_wmb(); /* Make sure other cpus see that this has run */ - wrote_text = 1; - } else { - while (!wrote_text) - cpu_relax(); - smp_mb(); /* Load wrote_text before following execution */ - } + if (likely(!bp_patching_in_progress)) + return 0; - for (i = 0; i < tpp->nparams; i++) { - p = &tpp->params[i]; - flush_icache_range((unsigned long)p->addr, - (unsigned long)p->addr + p->len); - } - /* - * Intel Archiecture Software Developer's Manual section 7.1.3 specifies - * that a core serializing instruction such as "cpuid" should be - * executed on _each_ core before the new instruction is made visible. - */ - sync_core(); - return 0; -} + if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) + return 0; + + /* set up the specified breakpoint handler */ + regs->ip = (unsigned long) bp_int3_handler; + + return 1; -/** - * text_poke_smp - Update instructions on a live kernel on SMP - * @addr: address to modify - * @opcode: source of the copy - * @len: length to copy - * - * Modify multi-byte instruction by using stop_machine() on SMP. This allows - * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying - * should be allowed, since stop_machine() does _not_ protect code against - * NMI and MCE. - * - * Note: Must be called under get_online_cpus() and text_mutex. - */ -void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) -{ - struct text_poke_params tpp; - struct text_poke_param p; - - p.addr = addr; - p.opcode = opcode; - p.len = len; - tpp.params = &p; - tpp.nparams = 1; - atomic_set(&stop_machine_first, 1); - wrote_text = 0; - /* Use __stop_machine() because the caller already got online_cpus. */ - __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); - return addr; } /** - * text_poke_smp_batch - Update instructions on a live kernel on SMP - * @params: an array of text_poke parameters - * @n: the number of elements in params. + * text_poke_bp() -- update instructions on live kernel on SMP + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @handler: address to jump to when the temporary breakpoint is hit * - * Modify multi-byte instruction by using stop_machine() on SMP. Since the - * stop_machine() is heavy task, it is better to aggregate text_poke requests - * and do it once if possible. + * Modify multi-byte instruction by using int3 breakpoint on SMP. + * We completely avoid stop_machine() here, and achieve the + * synchronization using int3 breakpoint. * - * Note: Must be called under get_online_cpus() and text_mutex. + * The way it is done: + * - add a int3 trap to the address that will be patched + * - sync cores + * - update all but the first byte of the patched range + * - sync cores + * - replace the first byte (int3) by the first byte of + * replacing opcode + * - sync cores + * + * Note: must be called under text_mutex. */ -void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) +void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) { - struct text_poke_params tpp = {.params = params, .nparams = n}; + unsigned char int3 = 0xcc; + + bp_int3_handler = handler; + bp_int3_addr = (u8 *)addr + sizeof(int3); + bp_patching_in_progress = true; + /* + * Corresponding read barrier in int3 notifier for + * making sure the in_progress flags is correctly ordered wrt. + * patching + */ + smp_wmb(); + + text_poke(addr, &int3, sizeof(int3)); - atomic_set(&stop_machine_first, 1); - wrote_text = 0; - __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); + on_each_cpu(do_sync_core, NULL, 1); + + if (len - sizeof(int3) > 0) { + /* patch all but the first byte */ + text_poke((char *)addr + sizeof(int3), + (const char *) opcode + sizeof(int3), + len - sizeof(int3)); + /* + * According to Intel, this core syncing is very likely + * not necessary and we'd be safe even without it. But + * better safe than sorry (plus there's not only Intel). + */ + on_each_cpu(do_sync_core, NULL, 1); + } + + /* patch the first byte */ + text_poke(addr, opcode, sizeof(int3)); + + on_each_cpu(do_sync_core, NULL, 1); + + bp_patching_in_progress = false; + smp_wmb(); + + return addr; } + diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 3048ded1b59..59554dca96e 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -20,6 +20,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, {} }; @@ -27,6 +28,7 @@ EXPORT_SYMBOL(amd_nb_misc_ids); static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, {} }; @@ -81,13 +83,20 @@ int amd_cache_northbridges(void) next_northbridge(misc, amd_nb_misc_ids); node_to_amd_nb(i)->link = link = next_northbridge(link, amd_nb_link_ids); - } + } + /* GART present only on Fam15h upto model 0fh */ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || - boot_cpu_data.x86 == 0x15) + (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) amd_northbridges.flags |= AMD_NB_GART; /* + * Check for L3 cache presence. + */ + if (!cpuid_edx(0x80000006)) + return 0; + + /* * Some CPU families support L3 Cache Index Disable. There are some * limitations because of E382 and E388 on family 0x10. */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index eca89c53a7f..a7eb82d9b01 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -913,7 +913,7 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ -void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -932,7 +932,7 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) set_irq_regs(old_regs); } -void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -1946,14 +1946,14 @@ static inline void __smp_spurious_interrupt(void) "should never happen.\n", smp_processor_id()); } -void smp_spurious_interrupt(struct pt_regs *regs) +__visible void smp_spurious_interrupt(struct pt_regs *regs) { entering_irq(); __smp_spurious_interrupt(); exiting_irq(); } -void smp_trace_spurious_interrupt(struct pt_regs *regs) +__visible void smp_trace_spurious_interrupt(struct pt_regs *regs) { entering_irq(); trace_spurious_apic_entry(SPURIOUS_APIC_VECTOR); @@ -2002,14 +2002,14 @@ static inline void __smp_error_interrupt(struct pt_regs *regs) } -void smp_error_interrupt(struct pt_regs *regs) +__visible void smp_error_interrupt(struct pt_regs *regs) { entering_irq(); __smp_error_interrupt(regs); exiting_irq(); } -void smp_trace_error_interrupt(struct pt_regs *regs) +__visible void smp_trace_error_interrupt(struct pt_regs *regs) { entering_irq(); trace_error_apic_entry(ERROR_APIC_VECTOR); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9ed796ccc32..e63a5bd2a78 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1534,6 +1534,11 @@ void intel_ir_io_apic_print_entries(unsigned int apic, } } +void ioapic_zap_locks(void) +{ + raw_spin_lock_init(&ioapic_lock); +} + __apicdebuginit(void) print_IO_APIC(int ioapic_idx) { union IO_APIC_reg_00 reg_00; @@ -3375,12 +3380,15 @@ int io_apic_setup_irq_pin_once(unsigned int irq, int node, { unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin; int ret; + struct IO_APIC_route_entry orig_entry; /* Avoid redundant programming */ if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mpc_ioapic_id(ioapic_idx), pin); - return 0; + pr_debug("Pin %d-%d already programmed\n", mpc_ioapic_id(ioapic_idx), pin); + orig_entry = ioapic_read_entry(attr->ioapic, pin); + if (attr->trigger == orig_entry.trigger && attr->polarity == orig_entry.polarity) + return 0; + return -EBUSY; } ret = io_apic_setup_irq_pin(irq, node, attr); if (!ret) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 53a4e274484..3ab03430211 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -392,7 +392,7 @@ static struct cpuidle_device apm_cpuidle_device; /* * Local variables */ -static struct { +__visible struct { unsigned long offset; unsigned short segment; } apm_bios_entry; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f654ecefea5..903a264af98 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -66,8 +66,8 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) * performance at the same time.. */ -extern void vide(void); -__asm__(".align 4\nvide: ret"); +extern __visible void vide(void); +__asm__(".globl vide\n\t.align 4\nvide: ret"); static void init_amd_k5(struct cpuinfo_x86 *c) { @@ -512,7 +512,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) static const int amd_erratum_383[]; static const int amd_erratum_400[]; -static bool cpu_has_amd_erratum(const int *erratum); +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); static void init_amd(struct cpuinfo_x86 *c) { @@ -729,11 +729,11 @@ static void init_amd(struct cpuinfo_x86 *c) value &= ~(1ULL << 24); wrmsrl_safe(MSR_AMD64_BU_CFG2, value); - if (cpu_has_amd_erratum(amd_erratum_383)) + if (cpu_has_amd_erratum(c, amd_erratum_383)) set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); } - if (cpu_has_amd_erratum(amd_erratum_400)) + if (cpu_has_amd_erratum(c, amd_erratum_400)) set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); @@ -878,23 +878,13 @@ static const int amd_erratum_400[] = static const int amd_erratum_383[] = AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); -static bool cpu_has_amd_erratum(const int *erratum) + +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { - struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info); int osvw_id = *erratum++; u32 range; u32 ms; - /* - * If called early enough that current_cpu_data hasn't been initialized - * yet, fall back to boot_cpu_data. - */ - if (cpu->x86 == 0) - cpu = &boot_cpu_data; - - if (cpu->x86_vendor != X86_VENDOR_AMD) - return false; - if (osvw_id >= 0 && osvw_id < 65536 && cpu_has(cpu, X86_FEATURE_OSVW)) { u64 osvw_len; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 25eb2747b06..2793d1f095a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1076,7 +1076,7 @@ struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) debug_idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, - irq_stack_union) __aligned(PAGE_SIZE); + irq_stack_union) __aligned(PAGE_SIZE) __visible; /* * The following four percpu variables are hot. Align current_task to @@ -1093,7 +1093,7 @@ EXPORT_PER_CPU_SYMBOL(kernel_stack); DEFINE_PER_CPU(char *, irq_stack_ptr) = init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; -DEFINE_PER_CPU(unsigned int, irq_count) = -1; +DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 87279212d31..36ce402a3fa 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -25,11 +25,6 @@ #include <asm/processor.h> #include <asm/hypervisor.h> -/* - * Hypervisor detect order. This is specified explicitly here because - * some hypervisors might implement compatibility modes for other - * hypervisors and therefore need to be detected in specific sequence. - */ static const __initconst struct hypervisor_x86 * const hypervisors[] = { #ifdef CONFIG_XEN_PVHVM @@ -49,15 +44,19 @@ static inline void __init detect_hypervisor_vendor(void) { const struct hypervisor_x86 *h, * const *p; + uint32_t pri, max_pri = 0; for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { h = *p; - if (h->detect()) { + pri = h->detect(); + if (pri != 0 && pri > max_pri) { + max_pri = pri; x86_hyper = h; - printk(KERN_INFO "Hypervisor detected: %s\n", h->name); - break; } } + + if (max_pri) + printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name); } void init_hypervisor(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 5b7d4fa5d3b..09edd0b65fe 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -25,15 +25,18 @@ int mce_severity(struct mce *a, int tolerant, char **msg); struct dentry *mce_get_debugfs_dir(void); extern struct mce_bank *mce_banks; +extern mce_banks_t mce_banks_ce_disabled; #ifdef CONFIG_X86_MCE_INTEL unsigned long mce_intel_adjust_timer(unsigned long interval); void mce_intel_cmci_poll(void); void mce_intel_hcpu_update(unsigned long cpu); +void cmci_disable_bank(int bank); #else # define mce_intel_adjust_timer mce_adjust_timer_default static inline void mce_intel_cmci_poll(void) { } static inline void mce_intel_hcpu_update(unsigned long cpu) { } +static inline void cmci_disable_bank(int bank) { } #endif void mce_timer_kick(unsigned long interval); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 87a65c939bc..b3218cdee95 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -97,6 +97,15 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; +/* + * MCA banks controlled through firmware first for corrected errors. + * This is a global list of banks for which we won't enable CMCI and we + * won't poll. Firmware controls these banks and is responsible for + * reporting corrected errors through GHES. Uncorrected/recoverable + * errors are still notified through a machine check. + */ +mce_banks_t mce_banks_ce_disabled; + static DEFINE_PER_CPU(struct work_struct, mce_work); static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); @@ -1935,6 +1944,25 @@ static struct miscdevice mce_chrdev_device = { &mce_chrdev_ops, }; +static void __mce_disable_bank(void *arg) +{ + int bank = *((int *)arg); + __clear_bit(bank, __get_cpu_var(mce_poll_banks)); + cmci_disable_bank(bank); +} + +void mce_disable_bank(int bank) +{ + if (bank >= mca_cfg.banks) { + pr_warn(FW_BUG + "Ignoring request to disable invalid MCA bank %d.\n", + bank); + return; + } + set_bit(bank, mce_banks_ce_disabled); + on_each_cpu(__mce_disable_bank, &bank, 1); +} + /* * mce=off Disables machine check * mce=no_cmci Disables CMCI diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index d56405309dc..4cfe0458ca6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -203,6 +203,10 @@ static void cmci_discover(int banks) if (test_bit(i, owned)) continue; + /* Skip banks in firmware first mode */ + if (test_bit(i, mce_banks_ce_disabled)) + continue; + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ @@ -271,6 +275,19 @@ void cmci_recheck(void) local_irq_restore(flags); } +/* Caller must hold the lock on cmci_discover_lock */ +static void __cmci_disable_bank(int bank) +{ + u64 val; + + if (!test_bit(bank, __get_cpu_var(mce_banks_owned))) + return; + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + __clear_bit(bank, __get_cpu_var(mce_banks_owned)); +} + /* * Disable CMCI on this CPU for all banks it owns when it goes down. * This allows other CPUs to claim the banks on rediscovery. @@ -280,20 +297,12 @@ void cmci_clear(void) unsigned long flags; int i; int banks; - u64 val; if (!cmci_supported(&banks)) return; raw_spin_lock_irqsave(&cmci_discover_lock, flags); - for (i = 0; i < banks; i++) { - if (!test_bit(i, __get_cpu_var(mce_banks_owned))) - continue; - /* Disable CMCI */ - rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(i), val); - __clear_bit(i, __get_cpu_var(mce_banks_owned)); - } + for (i = 0; i < banks; i++) + __cmci_disable_bank(i); raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } @@ -327,6 +336,19 @@ void cmci_reenable(void) cmci_discover(banks); } +void cmci_disable_bank(int bank) +{ + int banks; + unsigned long flags; + + if (!cmci_supported(&banks)) + return; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + __cmci_disable_bank(bank); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + static void intel_init_cmci(void) { int banks; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 8f4be53ea04..71a39f3621b 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -27,20 +27,23 @@ struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); -static bool __init ms_hyperv_platform(void) +static uint32_t __init ms_hyperv_platform(void) { u32 eax; u32 hyp_signature[3]; if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return false; + return 0; cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); - return eax >= HYPERV_CPUID_MIN && - eax <= HYPERV_CPUID_MAX && - !memcmp("Microsoft Hv", hyp_signature, 12); + if (eax >= HYPERV_CPUID_MIN && + eax <= HYPERV_CPUID_MAX && + !memcmp("Microsoft Hv", hyp_signature, 12)) + return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; + + return 0; } static cycle_t read_hv_clock(struct clocksource *arg) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index a7c7305030c..8355c84b972 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1884,6 +1884,7 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { userpg->cap_usr_time = 0; + userpg->cap_usr_time_zero = 0; userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc; userpg->pmc_width = x86_pmu.cntval_bits; @@ -1897,6 +1898,11 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) userpg->time_mult = this_cpu_read(cyc2ns); userpg->time_shift = CYC2NS_SCALE_FACTOR; userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; + + if (sched_clock_stable && !check_tsc_disabled()) { + userpg->cap_usr_time_zero = 1; + userpg->time_zero = this_cpu_read(cyc2ns_offset); + } } /* diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 97e557bc4c9..cc16faae053 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -641,6 +641,8 @@ extern struct event_constraint intel_core2_pebs_event_constraints[]; extern struct event_constraint intel_atom_pebs_event_constraints[]; +extern struct event_constraint intel_slm_pebs_event_constraints[]; + extern struct event_constraint intel_nehalem_pebs_event_constraints[]; extern struct event_constraint intel_westmere_pebs_event_constraints[]; diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 4cbe03287b0..beeb7cc0704 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -347,8 +347,7 @@ static struct amd_nb *amd_alloc_nb(int cpu) struct amd_nb *nb; int i; - nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); + nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu)); if (!nb) return NULL; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index fbc9210b45b..0abf6742a8b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -81,7 +81,8 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), EVENT_EXTRA_END }; @@ -143,8 +144,9 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = static struct extra_reg intel_westmere_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), EVENT_EXTRA_END }; @@ -162,16 +164,27 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +static struct event_constraint intel_slm_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + static struct extra_reg intel_snb_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), EVENT_EXTRA_END }; static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), EVENT_EXTRA_END }; @@ -882,6 +895,140 @@ static __initconst const u64 atom_hw_cache_event_ids }, }; +static struct extra_reg intel_slm_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffff, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffff, RSP_1), + EVENT_EXTRA_END +}; + +#define SLM_DMND_READ SNB_DMND_DATA_RD +#define SLM_DMND_WRITE SNB_DMND_RFO +#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM) +#define SLM_LLC_ACCESS SNB_RESP_ANY +#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM) + +static __initconst const u64 slm_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_READ|SLM_LLC_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS, + }, + }, +}; + +static __initconst const u64 slm_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */ + [ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) { /* user explicitly requested branch sampling */ @@ -1301,11 +1448,11 @@ static void intel_fixup_er(struct perf_event *event, int idx) if (idx == EXTRA_REG_RSP_0) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= 0x01b7; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; } else if (idx == EXTRA_REG_RSP_1) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= 0x01bb; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; } } @@ -2176,6 +2323,21 @@ __init int intel_pmu_init(void) pr_cont("Atom events, "); break; + case 55: /* Atom 22nm "Silvermont" */ + memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_atom(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; + x86_pmu.extra_regs = intel_slm_extra_regs; + x86_pmu.er_flags |= ERF_HAS_RSP_1; + pr_cont("Silvermont events, "); + break; + case 37: /* 32 nm nehalem, "Clarkdale" */ case 44: /* 32 nm nehalem, "Gulftown" */ case 47: /* 32 nm Xeon E7 */ @@ -2270,6 +2432,7 @@ __init int intel_pmu_init(void) case 70: case 71: case 63: + case 69: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 3065c57a63c..63438aad177 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -224,7 +224,7 @@ static int alloc_pebs_buffer(int cpu) if (!x86_pmu.pebs) return 0; - buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -262,7 +262,7 @@ static int alloc_bts_buffer(int cpu) if (!x86_pmu.bts) return 0; - buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -295,7 +295,7 @@ static int alloc_ds_buffer(int cpu) int node = cpu_to_node(cpu); struct debug_store *ds; - ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); + ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); if (unlikely(!ds)) return -ENOMEM; @@ -517,6 +517,32 @@ struct event_constraint intel_atom_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_slm_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x0103, 0x1), /* REHABQ.LD_BLOCK_ST_FORWARD_PS */ + INTEL_UEVENT_CONSTRAINT(0x0803, 0x1), /* REHABQ.LD_SPLITS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0204, 0x1), /* MEM_UOPS_RETIRED.L2_HIT_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0404, 0x1), /* MEM_UOPS_RETIRED.L2_MISS_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0804, 0x1), /* MEM_UOPS_RETIRED.DTLB_MISS_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x2004, 0x1), /* MEM_UOPS_RETIRED.HITM_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c4, 0x1), /* BR_INST_RETIRED.ALL_BRANCHES_PS */ + INTEL_UEVENT_CONSTRAINT(0x7ec4, 0x1), /* BR_INST_RETIRED.JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0xbfc4, 0x1), /* BR_INST_RETIRED.FAR_BRANCH_PS */ + INTEL_UEVENT_CONSTRAINT(0xebc4, 0x1), /* BR_INST_RETIRED.NON_RETURN_IND_PS */ + INTEL_UEVENT_CONSTRAINT(0xf7c4, 0x1), /* BR_INST_RETIRED.RETURN_PS */ + INTEL_UEVENT_CONSTRAINT(0xf9c4, 0x1), /* BR_INST_RETIRED.CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfbc4, 0x1), /* BR_INST_RETIRED.IND_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfdc4, 0x1), /* BR_INST_RETIRED.REL_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfec4, 0x1), /* BR_INST_RETIRED.TAKEN_JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_MISP_RETIRED.ALL_BRANCHES_PS */ + INTEL_UEVENT_CONSTRAINT(0x7ec5, 0x1), /* BR_INST_MISP_RETIRED.JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0xebc5, 0x1), /* BR_INST_MISP_RETIRED.NON_RETURN_IND_PS */ + INTEL_UEVENT_CONSTRAINT(0xf7c5, 0x1), /* BR_INST_MISP_RETIRED.RETURN_PS */ + INTEL_UEVENT_CONSTRAINT(0xfbc5, 0x1), /* BR_INST_MISP_RETIRED.IND_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfec5, 0x1), /* BR_INST_MISP_RETIRED.TAKEN_JCC_PS */ + EVENT_CONSTRAINT_END +}; + struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index cad791dbde9..fd8011ed4dc 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -6,6 +6,8 @@ static struct intel_uncore_type **pci_uncores = empty_uncore; /* pci bus to socket mapping */ static int pcibus_to_physid[256] = { [0 ... 255] = -1, }; +static struct pci_dev *extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; + static DEFINE_RAW_SPINLOCK(uncore_box_lock); /* mask of cpus that collect uncore events */ @@ -45,6 +47,24 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31"); +DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31"); +DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) { @@ -281,7 +301,7 @@ static struct attribute *snbep_uncore_cbox_formats_attr[] = { }; static struct attribute *snbep_uncore_pcu_formats_attr[] = { - &format_attr_event.attr, + &format_attr_event_ext.attr, &format_attr_occ_sel.attr, &format_attr_edge.attr, &format_attr_inv.attr, @@ -301,6 +321,24 @@ static struct attribute *snbep_uncore_qpi_formats_attr[] = { &format_attr_edge.attr, &format_attr_inv.attr, &format_attr_thresh8.attr, + &format_attr_match_rds.attr, + &format_attr_match_rnid30.attr, + &format_attr_match_rnid4.attr, + &format_attr_match_dnid.attr, + &format_attr_match_mc.attr, + &format_attr_match_opc.attr, + &format_attr_match_vnw.attr, + &format_attr_match0.attr, + &format_attr_match1.attr, + &format_attr_mask_rds.attr, + &format_attr_mask_rnid30.attr, + &format_attr_mask_rnid4.attr, + &format_attr_mask_dnid.attr, + &format_attr_mask_mc.attr, + &format_attr_mask_opc.attr, + &format_attr_mask_vnw.attr, + &format_attr_mask0.attr, + &format_attr_mask1.attr, NULL, }; @@ -314,8 +352,8 @@ static struct uncore_event_desc snbep_uncore_imc_events[] = { static struct uncore_event_desc snbep_uncore_qpi_events[] = { INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), - INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"), - INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"), + INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"), { /* end: all zeroes */ }, }; @@ -356,13 +394,16 @@ static struct intel_uncore_ops snbep_uncore_msr_ops = { SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), }; +#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \ + .init_box = snbep_uncore_pci_init_box, \ + .disable_box = snbep_uncore_pci_disable_box, \ + .enable_box = snbep_uncore_pci_enable_box, \ + .disable_event = snbep_uncore_pci_disable_event, \ + .read_counter = snbep_uncore_pci_read_counter + static struct intel_uncore_ops snbep_uncore_pci_ops = { - .init_box = snbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = snbep_uncore_pci_disable_event, - .enable_event = snbep_uncore_pci_enable_event, - .read_counter = snbep_uncore_pci_read_counter, + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_uncore_pci_enable_event, \ }; static struct event_constraint snbep_uncore_cbox_constraints[] = { @@ -726,6 +767,61 @@ static struct intel_uncore_type *snbep_msr_uncores[] = { NULL, }; +enum { + SNBEP_PCI_QPI_PORT0_FILTER, + SNBEP_PCI_QPI_PORT1_FILTER, +}; + +static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) { + reg1->idx = 0; + reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0; + reg1->config = event->attr.config1; + reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0; + reg2->config = event->attr.config2; + } + return 0; +} + +static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; + struct pci_dev *filter_pdev = extra_pci_dev[box->phys_id][idx]; + WARN_ON_ONCE(!filter_pdev); + if (filter_pdev) { + pci_write_config_dword(filter_pdev, reg1->reg, + (u32)reg1->config); + pci_write_config_dword(filter_pdev, reg1->reg + 4, + (u32)(reg1->config >> 32)); + pci_write_config_dword(filter_pdev, reg2->reg, + (u32)reg2->config); + pci_write_config_dword(filter_pdev, reg2->reg + 4, + (u32)(reg2->config >> 32)); + } + } + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops snbep_uncore_qpi_ops = { + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_qpi_enable_event, + .hw_config = snbep_qpi_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + #define SNBEP_UNCORE_PCI_COMMON_INIT() \ .perf_ctr = SNBEP_PCI_PMON_CTR0, \ .event_ctl = SNBEP_PCI_PMON_CTL0, \ @@ -755,17 +851,18 @@ static struct intel_uncore_type snbep_uncore_imc = { }; static struct intel_uncore_type snbep_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .ops = &snbep_uncore_pci_ops, - .event_descs = snbep_uncore_qpi_events, - .format_group = &snbep_uncore_qpi_format_group, + .name = "qpi", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .event_descs = snbep_uncore_qpi_events, + .format_group = &snbep_uncore_qpi_format_group, }; @@ -807,43 +904,53 @@ static struct intel_uncore_type *snbep_pci_uncores[] = { static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { { /* Home Agent */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), - .driver_data = SNBEP_PCI_UNCORE_HA, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0), }, { /* MC Channel 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0), }, { /* MC Channel 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1), }, { /* MC Channel 2 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2), }, { /* MC Channel 3 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), - .driver_data = SNBEP_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3), }, { /* QPI Port 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), - .driver_data = SNBEP_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0), }, { /* QPI Port 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), - .driver_data = SNBEP_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1), }, { /* R2PCIe */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), - .driver_data = SNBEP_PCI_UNCORE_R2PCIE, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0), }, { /* R3QPI Link 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), - .driver_data = SNBEP_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0), }, { /* R3QPI Link 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), - .driver_data = SNBEP_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), }, { /* end: all zeroes */ } }; @@ -1256,71 +1363,71 @@ static struct intel_uncore_type *ivt_pci_uncores[] = { static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = { { /* Home Agent 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30), - .driver_data = IVT_PCI_UNCORE_HA, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 0), }, { /* Home Agent 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38), - .driver_data = IVT_PCI_UNCORE_HA, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 1), }, { /* MC0 Channel 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 0), }, { /* MC0 Channel 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 1), }, { /* MC0 Channel 3 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 2), }, { /* MC0 Channel 4 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 3), }, { /* MC1 Channel 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 4), }, { /* MC1 Channel 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 5), }, { /* MC1 Channel 3 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 6), }, { /* MC1 Channel 4 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1), - .driver_data = IVT_PCI_UNCORE_IMC, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7), }, { /* QPI0 Port 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32), - .driver_data = IVT_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0), }, { /* QPI0 Port 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33), - .driver_data = IVT_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 1), }, { /* QPI1 Port 2 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a), - .driver_data = IVT_PCI_UNCORE_QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 2), }, { /* R2PCIe */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34), - .driver_data = IVT_PCI_UNCORE_R2PCIE, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R2PCIE, 0), }, { /* R3QPI0 Link 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36), - .driver_data = IVT_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 0), }, { /* R3QPI0 Link 1 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37), - .driver_data = IVT_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 1), }, { /* R3QPI1 Link 2 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e), - .driver_data = IVT_PCI_UNCORE_R3QPI, + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2), }, { /* end: all zeroes */ } }; @@ -2606,7 +2713,7 @@ struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cp size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); - box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); + box = kzalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (!box) return NULL; @@ -3167,16 +3274,24 @@ static bool pcidrv_registered; /* * add a pci uncore device */ -static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) +static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - int i, phys_id; + struct intel_uncore_type *type; + int phys_id; phys_id = pcibus_to_physid[pdev->bus->number]; if (phys_id < 0) return -ENODEV; + if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { + extra_pci_dev[phys_id][UNCORE_PCI_DEV_IDX(id->driver_data)] = pdev; + pci_set_drvdata(pdev, NULL); + return 0; + } + + type = pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; box = uncore_alloc_box(type, 0); if (!box) return -ENOMEM; @@ -3185,21 +3300,11 @@ static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) * for performance monitoring unit with multiple boxes, * each box has a different function id. */ - for (i = 0; i < type->num_boxes; i++) { - pmu = &type->pmus[i]; - if (pmu->func_id == pdev->devfn) - break; - if (pmu->func_id < 0) { - pmu->func_id = pdev->devfn; - break; - } - pmu = NULL; - } - - if (!pmu) { - kfree(box); - return -EINVAL; - } + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + if (pmu->func_id < 0) + pmu->func_id = pdev->devfn; + else + WARN_ON_ONCE(pmu->func_id != pdev->devfn); box->phys_id = phys_id; box->pci_dev = pdev; @@ -3217,9 +3322,22 @@ static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box = pci_get_drvdata(pdev); - struct intel_uncore_pmu *pmu = box->pmu; - int cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + struct intel_uncore_pmu *pmu; + int i, cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + box = pci_get_drvdata(pdev); + if (!box) { + for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { + if (extra_pci_dev[phys_id][i] == pdev) { + extra_pci_dev[phys_id][i] = NULL; + break; + } + } + WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX); + return; + } + + pmu = box->pmu; if (WARN_ON_ONCE(phys_id != box->phys_id)) return; @@ -3240,12 +3358,6 @@ static void uncore_pci_remove(struct pci_dev *pdev) kfree(box); } -static int uncore_pci_probe(struct pci_dev *pdev, - const struct pci_device_id *id) -{ - return uncore_pci_add(pci_uncores[id->driver_data], pdev); -} - static int __init uncore_pci_init(void) { int ret; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 47b3d00c9d8..a80ab71a883 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -12,6 +12,15 @@ #define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC #define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) +#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx) +#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) +#define UNCORE_PCI_DEV_IDX(data) (data & 0xff) +#define UNCORE_EXTRA_PCI_DEV 0xff +#define UNCORE_EXTRA_PCI_DEV_MAX 2 + +/* support up to 8 sockets */ +#define UNCORE_SOCKET_MAX 8 + #define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) /* SNB event control */ @@ -108,6 +117,7 @@ (SNBEP_PMON_CTL_EV_SEL_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ SNBEP_PMON_CTL_INVERT | \ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 7076878404e..628a059a9a0 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -93,7 +93,7 @@ static void __init vmware_platform_setup(void) * serial key should be enough, as this will always have a VMware * specific string when running under VMware hypervisor. */ -static bool __init vmware_platform(void) +static uint32_t __init vmware_platform(void) { if (cpu_has_hypervisor) { unsigned int eax; @@ -102,12 +102,12 @@ static bool __init vmware_platform(void) cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0], &hyper_vendor_id[1], &hyper_vendor_id[2]); if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) - return true; + return CPUID_VMWARE_INFO_LEAF; } else if (dmi_available && dmi_name_in_serial("VMware") && __vmware_platform()) - return true; + return 1; - return false; + return 0; } /* diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 74467feb4dc..e0e0841eef4 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -128,7 +128,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs) cpu_emergency_svm_disable(); lapic_shutdown(); -#if defined(CONFIG_X86_IO_APIC) +#ifdef CONFIG_X86_IO_APIC + /* Prevent crash_kexec() from deadlocking on ioapic_lock. */ + ioapic_zap_locks(); disable_IO_APIC(); #endif #ifdef CONFIG_HPET_TIMER diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d32abeabbda..174da5fc5a7 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -658,15 +658,18 @@ __init void e820_setup_gap(void) * boot_params.e820_map, others are passed via SETUP_E820_EXT node of * linked list of struct setup_data, which is parsed here. */ -void __init parse_e820_ext(struct setup_data *sdata) +void __init parse_e820_ext(u64 phys_addr, u32 data_len) { int entries; struct e820entry *extmap; + struct setup_data *sdata; + sdata = early_memremap(phys_addr, data_len); entries = sdata->len / sizeof(struct e820entry); extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + early_iounmap(sdata, data_len); printk(KERN_INFO "e820: extended physical RAM map:\n"); e820_print_map("extended"); } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 138463a2487..06f87bece92 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,7 +29,7 @@ static void __init i386_default_early_setup(void) reserve_ebda_region(); } -void __init i386_start_kernel(void) +asmlinkage void __init i386_start_kernel(void) { sanitize_boot_params(&boot_params); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 55b67614ed9..1be8e43b669 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -137,7 +137,7 @@ static void __init copy_bootdata(char *real_mode_data) } } -void __init x86_64_start_kernel(char * real_mode_data) +asmlinkage void __init x86_64_start_kernel(char * real_mode_data) { int i; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 5dd87a89f01..81ba27679f1 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -409,6 +409,7 @@ enable_paging: /* * Check if it is 486 */ + movb $4,X86 # at least 486 cmpl $-1,X86_CPUID je is486 @@ -436,7 +437,6 @@ enable_paging: movl %edx,X86_CAPABILITY is486: - movb $4,X86 movl $0x50022,%ecx # set AM, WP, NE and MP movl %cr0,%eax andl $0x80000011,%eax # Save PG,PE,ET diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3a8185c042a..22d0687e7fd 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -177,7 +177,7 @@ u64 arch_irq_stat(void) * SMP cross-CPU interrupts have their own specific * handlers). */ -unsigned int __irq_entry do_IRQ(struct pt_regs *regs) +__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -215,7 +215,7 @@ void __smp_x86_platform_ipi(void) x86_platform_ipi_callback(); } -void smp_x86_platform_ipi(struct pt_regs *regs) +__visible void smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -229,7 +229,7 @@ void smp_x86_platform_ipi(struct pt_regs *regs) /* * Handler for POSTED_INTERRUPT_VECTOR. */ -void smp_kvm_posted_intr_ipi(struct pt_regs *regs) +__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -247,7 +247,7 @@ void smp_kvm_posted_intr_ipi(struct pt_regs *regs) } #endif -void smp_trace_x86_platform_ipi(struct pt_regs *regs) +__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 636a55e4a13..1de84e3ab4e 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -22,14 +22,14 @@ static inline void __smp_irq_work_interrupt(void) irq_work_run(); } -void smp_irq_work_interrupt(struct pt_regs *regs) +__visible void smp_irq_work_interrupt(struct pt_regs *regs) { irq_work_entering_irq(); __smp_irq_work_interrupt(); exiting_irq(); } -void smp_trace_irq_work_interrupt(struct pt_regs *regs) +__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) { irq_work_entering_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 2889b3d4388..460f5d9ceeb 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -37,7 +37,19 @@ static void __jump_label_transform(struct jump_entry *entry, } else memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); - (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); + /* + * Make text_poke_bp() a default fallback poker. + * + * At the time the change is being done, just ignore whether we + * are doing nop -> jump or jump -> nop transition, and assume + * always nop being the 'currently valid' instruction + * + */ + if (poker) + (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); + else + text_poke_bp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE, + (void *)entry->code + JUMP_LABEL_NOP_SIZE); } void arch_jump_label_transform(struct jump_entry *entry, @@ -45,7 +57,7 @@ void arch_jump_label_transform(struct jump_entry *entry, { get_online_cpus(); mutex_lock(&text_mutex); - __jump_label_transform(entry, type, text_poke_smp); + __jump_label_transform(entry, type, NULL); mutex_unlock(&text_mutex); put_online_cpus(); } diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index 2e9d4b5af03..c6ee63f927a 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -82,14 +82,9 @@ extern void synthesize_reljump(void *from, void *to); extern void synthesize_relcall(void *from, void *to); #ifdef CONFIG_OPTPROBES -extern int arch_init_optprobes(void); extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr); #else /* !CONFIG_OPTPROBES */ -static inline int arch_init_optprobes(void) -{ - return 0; -} static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) { return 0; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 211bce44552..79a3f968287 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -661,7 +661,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void) /* * Called from kretprobe_trampoline */ -static __used __kprobes void *trampoline_handler(struct pt_regs *regs) +__visible __used __kprobes void *trampoline_handler(struct pt_regs *regs) { struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; @@ -1068,7 +1068,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) int __init arch_init_kprobes(void) { - return arch_init_optprobes(); + return 0; } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 76dc6f09572..898160b42e4 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -88,9 +88,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long v *(unsigned long *)addr = val; } -static void __used __kprobes kprobes_optinsn_template_holder(void) -{ - asm volatile ( +asm ( ".global optprobe_template_entry\n" "optprobe_template_entry:\n" #ifdef CONFIG_X86_64 @@ -129,7 +127,6 @@ static void __used __kprobes kprobes_optinsn_template_holder(void) #endif ".global optprobe_template_end\n" "optprobe_template_end:\n"); -} #define TMPL_MOVE_IDX \ ((long)&optprobe_template_val - (long)&optprobe_template_entry) @@ -371,31 +368,6 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) return 0; } -#define MAX_OPTIMIZE_PROBES 256 -static struct text_poke_param *jump_poke_params; -static struct jump_poke_buffer { - u8 buf[RELATIVEJUMP_SIZE]; -} *jump_poke_bufs; - -static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, - u8 *insn_buf, - struct optimized_kprobe *op) -{ - s32 rel = (s32)((long)op->optinsn.insn - - ((long)op->kp.addr + RELATIVEJUMP_SIZE)); - - /* Backup instructions which will be replaced by jump address */ - memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, - RELATIVE_ADDR_SIZE); - - insn_buf[0] = RELATIVEJUMP_OPCODE; - *(s32 *)(&insn_buf[1]) = rel; - - tprm->addr = op->kp.addr; - tprm->opcode = insn_buf; - tprm->len = RELATIVEJUMP_SIZE; -} - /* * Replace breakpoints (int3) with relative jumps. * Caller must call with locking kprobe_mutex and text_mutex. @@ -403,37 +375,38 @@ static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, void __kprobes arch_optimize_kprobes(struct list_head *oplist) { struct optimized_kprobe *op, *tmp; - int c = 0; + u8 insn_buf[RELATIVEJUMP_SIZE]; list_for_each_entry_safe(op, tmp, oplist, list) { + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + WARN_ON(kprobe_disabled(&op->kp)); - /* Setup param */ - setup_optimize_kprobe(&jump_poke_params[c], - jump_poke_bufs[c].buf, op); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, + RELATIVE_ADDR_SIZE); + + insn_buf[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&insn_buf[1]) = rel; + + text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE, + op->optinsn.insn); + list_del_init(&op->list); - if (++c >= MAX_OPTIMIZE_PROBES) - break; } - - /* - * text_poke_smp doesn't support NMI/MCE code modifying. - * However, since kprobes itself also doesn't support NMI/MCE - * code probing, it's not a problem. - */ - text_poke_smp_batch(jump_poke_params, c); } -static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, - u8 *insn_buf, - struct optimized_kprobe *op) +/* Replace a relative jump with a breakpoint (int3). */ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) { + u8 insn_buf[RELATIVEJUMP_SIZE]; + /* Set int3 to first byte for kprobes */ insn_buf[0] = BREAKPOINT_INSTRUCTION; memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - - tprm->addr = op->kp.addr; - tprm->opcode = insn_buf; - tprm->len = RELATIVEJUMP_SIZE; + text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE, + op->optinsn.insn); } /* @@ -444,34 +417,11 @@ extern void arch_unoptimize_kprobes(struct list_head *oplist, struct list_head *done_list) { struct optimized_kprobe *op, *tmp; - int c = 0; list_for_each_entry_safe(op, tmp, oplist, list) { - /* Setup param */ - setup_unoptimize_kprobe(&jump_poke_params[c], - jump_poke_bufs[c].buf, op); + arch_unoptimize_kprobe(op); list_move(&op->list, done_list); - if (++c >= MAX_OPTIMIZE_PROBES) - break; } - - /* - * text_poke_smp doesn't support NMI/MCE code modifying. - * However, since kprobes itself also doesn't support NMI/MCE - * code probing, it's not a problem. - */ - text_poke_smp_batch(jump_poke_params, c); -} - -/* Replace a relative jump with a breakpoint (int3). */ -void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) -{ - u8 buf[RELATIVEJUMP_SIZE]; - - /* Set int3 to first byte for kprobes */ - buf[0] = BREAKPOINT_INSTRUCTION; - memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); } int __kprobes @@ -491,22 +441,3 @@ setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) } return 0; } - -int __kprobes arch_init_optprobes(void) -{ - /* Allocate code buffer and parameter array */ - jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * - MAX_OPTIMIZE_PROBES, GFP_KERNEL); - if (!jump_poke_bufs) - return -ENOMEM; - - jump_poke_params = kmalloc(sizeof(struct text_poke_param) * - MAX_OPTIMIZE_PROBES, GFP_KERNEL); - if (!jump_poke_params) { - kfree(jump_poke_bufs); - jump_poke_bufs = NULL; - return -ENOMEM; - } - - return 0; -} diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a96d32cc55b..697b93af02d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -34,6 +34,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/kprobes.h> +#include <linux/debugfs.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -419,6 +420,7 @@ static void __init kvm_smp_prepare_boot_cpu(void) WARN_ON(kvm_register_clock("primary cpu clock")); kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); + kvm_spinlock_init(); } static void kvm_guest_cpu_online(void *dummy) @@ -498,11 +500,9 @@ void __init kvm_guest_init(void) #endif } -static bool __init kvm_detect(void) +static uint32_t __init kvm_detect(void) { - if (!kvm_para_available()) - return false; - return true; + return kvm_cpuid_base(); } const struct hypervisor_x86 x86_hyper_kvm __refconst = { @@ -523,3 +523,263 @@ static __init int activate_jump_labels(void) return 0; } arch_initcall(activate_jump_labels); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + +/* Kick a cpu by its apicid. Used to wake up a halted vcpu */ +static void kvm_kick_cpu(int cpu) +{ + int apicid; + unsigned long flags = 0; + + apicid = per_cpu(x86_cpu_to_apicid, cpu); + kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); +} + +enum kvm_contention_stat { + TAKEN_SLOW, + TAKEN_SLOW_PICKUP, + RELEASED_SLOW, + RELEASED_SLOW_KICKED, + NR_CONTENTION_STATS +}; + +#ifdef CONFIG_KVM_DEBUG_FS +#define HISTO_BUCKETS 30 + +static struct kvm_spinlock_stats +{ + u32 contention_stats[NR_CONTENTION_STATS]; + u32 histo_spin_blocked[HISTO_BUCKETS+1]; + u64 time_blocked; +} spinlock_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + u8 ret; + u8 old; + + old = ACCESS_ONCE(zero_stats); + if (unlikely(old)) { + ret = cmpxchg(&zero_stats, old, 0); + /* This ensures only one fellow resets the stat */ + if (ret == old) + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); + } +} + +static inline void add_stats(enum kvm_contention_stat var, u32 val) +{ + check_zero(); + spinlock_stats.contention_stats[var] += val; +} + + +static inline u64 spin_time_start(void) +{ + return sched_clock(); +} + +static void __spin_time_accum(u64 delta, u32 *array) +{ + unsigned index; + + index = ilog2(delta); + check_zero(); + + if (index < HISTO_BUCKETS) + array[index]++; + else + array[HISTO_BUCKETS]++; +} + +static inline void spin_time_accum_blocked(u64 start) +{ + u32 delta; + + delta = sched_clock() - start; + __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); + spinlock_stats.time_blocked += delta; +} + +static struct dentry *d_spin_debug; +static struct dentry *d_kvm_debug; + +struct dentry *kvm_init_debugfs(void) +{ + d_kvm_debug = debugfs_create_dir("kvm", NULL); + if (!d_kvm_debug) + printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n"); + + return d_kvm_debug; +} + +static int __init kvm_spinlock_debugfs(void) +{ + struct dentry *d_kvm; + + d_kvm = kvm_init_debugfs(); + if (d_kvm == NULL) + return -ENOMEM; + + d_spin_debug = debugfs_create_dir("spinlocks", d_kvm); + + debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); + + debugfs_create_u32("taken_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW]); + debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); + + debugfs_create_u32("released_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW]); + debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); + + debugfs_create_u64("time_blocked", 0444, d_spin_debug, + &spinlock_stats.time_blocked); + + debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); + + return 0; +} +fs_initcall(kvm_spinlock_debugfs); +#else /* !CONFIG_KVM_DEBUG_FS */ +static inline void add_stats(enum kvm_contention_stat var, u32 val) +{ +} + +static inline u64 spin_time_start(void) +{ + return 0; +} + +static inline void spin_time_accum_blocked(u64 start) +{ +} +#endif /* CONFIG_KVM_DEBUG_FS */ + +struct kvm_lock_waiting { + struct arch_spinlock *lock; + __ticket_t want; +}; + +/* cpus 'waiting' on a spinlock to become available */ +static cpumask_t waiting_cpus; + +/* Track spinlock on which a cpu is waiting */ +static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting); + +static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) +{ + struct kvm_lock_waiting *w; + int cpu; + u64 start; + unsigned long flags; + + if (in_nmi()) + return; + + w = &__get_cpu_var(klock_waiting); + cpu = smp_processor_id(); + start = spin_time_start(); + + /* + * Make sure an interrupt handler can't upset things in a + * partially setup state. + */ + local_irq_save(flags); + + /* + * The ordering protocol on this is that the "lock" pointer + * may only be set non-NULL if the "want" ticket is correct. + * If we're updating "want", we must first clear "lock". + */ + w->lock = NULL; + smp_wmb(); + w->want = want; + smp_wmb(); + w->lock = lock; + + add_stats(TAKEN_SLOW, 1); + + /* + * This uses set_bit, which is atomic but we should not rely on its + * reordering gurantees. So barrier is needed after this call. + */ + cpumask_set_cpu(cpu, &waiting_cpus); + + barrier(); + + /* + * Mark entry to slowpath before doing the pickup test to make + * sure we don't deadlock with an unlocker. + */ + __ticket_enter_slowpath(lock); + + /* + * check again make sure it didn't become free while + * we weren't looking. + */ + if (ACCESS_ONCE(lock->tickets.head) == want) { + add_stats(TAKEN_SLOW_PICKUP, 1); + goto out; + } + + /* + * halt until it's our turn and kicked. Note that we do safe halt + * for irq enabled case to avoid hang when lock info is overwritten + * in irq spinlock slowpath and no spurious interrupt occur to save us. + */ + if (arch_irqs_disabled_flags(flags)) + halt(); + else + safe_halt(); + +out: + cpumask_clear_cpu(cpu, &waiting_cpus); + w->lock = NULL; + local_irq_restore(flags); + spin_time_accum_blocked(start); +} +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning); + +/* Kick vcpu waiting on @lock->head to reach value @ticket */ +static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) +{ + int cpu; + + add_stats(RELEASED_SLOW, 1); + for_each_cpu(cpu, &waiting_cpus) { + const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); + if (ACCESS_ONCE(w->lock) == lock && + ACCESS_ONCE(w->want) == ticket) { + add_stats(RELEASED_SLOW_KICKED, 1); + kvm_kick_cpu(cpu); + break; + } + } +} + +/* + * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. + */ +void __init kvm_spinlock_init(void) +{ + if (!kvm_para_available()) + return; + /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + return; + + printk(KERN_INFO "KVM setup paravirtual spinlock\n"); + + static_key_slow_inc(¶virt_ticketlocks_enabled); + + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); + pv_lock_ops.unlock_kick = kvm_unlock_kick; +} +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 7a0adb7ee43..7123b5df479 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -145,10 +145,9 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) return 0; } -static unsigned int verify_patch_size(int cpu, u32 patch_size, +static unsigned int verify_patch_size(u8 family, u32 patch_size, unsigned int size) { - struct cpuinfo_x86 *c = &cpu_data(cpu); u32 max_size; #define F1XH_MPB_MAX_SIZE 2048 @@ -156,7 +155,7 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, #define F15H_MPB_MAX_SIZE 4096 #define F16H_MPB_MAX_SIZE 3458 - switch (c->x86) { + switch (family) { case 0x14: max_size = F14H_MPB_MAX_SIZE; break; @@ -277,9 +276,8 @@ static void cleanup(void) * driver cannot continue functioning normally. In such cases, we tear * down everything we've used up so far and exit. */ -static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) +static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) { - struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_header_amd *mc_hdr; struct ucode_patch *patch; unsigned int patch_size, crnt_size, ret; @@ -299,7 +297,7 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) /* check if patch is for the current family */ proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff); - if (proc_fam != c->x86) + if (proc_fam != family) return crnt_size; if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { @@ -308,7 +306,7 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) return crnt_size; } - ret = verify_patch_size(cpu, patch_size, leftover); + ret = verify_patch_size(family, patch_size, leftover); if (!ret) { pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id); return crnt_size; @@ -339,7 +337,8 @@ static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) return crnt_size; } -static enum ucode_state __load_microcode_amd(int cpu, const u8 *data, size_t size) +static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, + size_t size) { enum ucode_state ret = UCODE_ERROR; unsigned int leftover; @@ -362,7 +361,7 @@ static enum ucode_state __load_microcode_amd(int cpu, const u8 *data, size_t siz } while (leftover) { - crnt_size = verify_and_add_patch(cpu, fw, leftover); + crnt_size = verify_and_add_patch(family, fw, leftover); if (crnt_size < 0) return ret; @@ -373,22 +372,22 @@ static enum ucode_state __load_microcode_amd(int cpu, const u8 *data, size_t siz return UCODE_OK; } -enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size) +enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) { enum ucode_state ret; /* free old equiv table */ free_equiv_cpu_table(); - ret = __load_microcode_amd(cpu, data, size); + ret = __load_microcode_amd(family, data, size); if (ret != UCODE_OK) cleanup(); #if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) /* save BSP's matching patch for early load */ - if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) { - struct ucode_patch *p = find_patch(cpu); + if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { + struct ucode_patch *p = find_patch(smp_processor_id()); if (p) { memset(amd_bsp_mpb, 0, MPB_MAX_SIZE); memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data), @@ -441,7 +440,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, goto fw_release; } - ret = load_microcode_amd(cpu, fw->data, fw->size); + ret = load_microcode_amd(c->x86, fw->data, fw->size); fw_release: release_firmware(fw); diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/microcode_amd_early.c index 1d14ffee574..6073104ccaa 100644 --- a/arch/x86/kernel/microcode_amd_early.c +++ b/arch/x86/kernel/microcode_amd_early.c @@ -238,25 +238,17 @@ static void __init collect_cpu_sig_on_bsp(void *arg) uci->cpu_sig.sig = cpuid_eax(0x00000001); } #else -static void collect_cpu_info_amd_early(struct cpuinfo_x86 *c, - struct ucode_cpu_info *uci) +void load_ucode_amd_ap(void) { + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; u32 rev, eax; rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); eax = cpuid_eax(0x00000001); - uci->cpu_sig.sig = eax; uci->cpu_sig.rev = rev; - c->microcode = rev; - c->x86 = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); -} - -void load_ucode_amd_ap(void) -{ - unsigned int cpu = smp_processor_id(); - - collect_cpu_info_amd_early(&cpu_data(cpu), ucode_cpu_info + cpu); + uci->cpu_sig.sig = eax; if (cpu && !ucode_loaded) { void *ucode; @@ -265,8 +257,10 @@ void load_ucode_amd_ap(void) return; ucode = (void *)(initrd_start + ucode_offset); - if (load_microcode_amd(0, ucode, ucode_size) != UCODE_OK) + eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + if (load_microcode_amd(eax, ucode, ucode_size) != UCODE_OK) return; + ucode_loaded = true; } @@ -278,6 +272,8 @@ int __init save_microcode_in_initrd_amd(void) { enum ucode_state ret; void *ucode; + u32 eax; + #ifdef CONFIG_X86_32 unsigned int bsp = boot_cpu_data.cpu_index; struct ucode_cpu_info *uci = ucode_cpu_info + bsp; @@ -293,7 +289,10 @@ int __init save_microcode_in_initrd_amd(void) return 0; ucode = (void *)(initrd_start + ucode_offset); - ret = load_microcode_amd(0, ucode, ucode_size); + eax = cpuid_eax(0x00000001); + eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + + ret = load_microcode_amd(eax, ucode, ucode_size); if (ret != UCODE_OK) return -EINVAL; diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 676b8c77a97..bbb6c731634 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -4,25 +4,17 @@ */ #include <linux/spinlock.h> #include <linux/module.h> +#include <linux/jump_label.h> #include <asm/paravirt.h> -static inline void -default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) -{ - arch_spin_lock(lock); -} - struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP - .spin_is_locked = __ticket_spin_is_locked, - .spin_is_contended = __ticket_spin_is_contended, - - .spin_lock = __ticket_spin_lock, - .spin_lock_flags = default_spin_lock_flags, - .spin_trylock = __ticket_spin_trylock, - .spin_unlock = __ticket_spin_unlock, + .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), + .unlock_kick = paravirt_nop, #endif }; EXPORT_SYMBOL(pv_lock_ops); +struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL(paravirt_ticketlocks_enabled); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index cd6de64cc48..1b10af835c3 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -62,11 +62,6 @@ void __init default_banner(void) pv_info.name); } -/* Simple instruction patching code. */ -#define DEF_NATIVE(ops, name, code) \ - extern const char start_##ops##_##name[], end_##ops##_##name[]; \ - asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") - /* Undefined instruction for dealing with missing ops pointers. */ static const unsigned char ud2a[] = { 0x0f, 0x0b }; @@ -324,7 +319,7 @@ struct pv_time_ops pv_time_ops = { .steal_clock = native_steal_clock, }; -struct pv_irq_ops pv_irq_ops = { +__visible struct pv_irq_ops pv_irq_ops = { .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), @@ -336,7 +331,7 @@ struct pv_irq_ops pv_irq_ops = { #endif }; -struct pv_cpu_ops pv_cpu_ops = { +__visible struct pv_cpu_ops pv_cpu_ops = { .cpuid = native_cpuid, .get_debugreg = native_get_debugreg, .set_debugreg = native_set_debugreg, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 83369e5a1d2..c83516be105 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -36,7 +36,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f8adefca71d..884f98f6935 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -247,7 +247,7 @@ EXPORT_SYMBOL_GPL(start_thread); * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -__notrace_funcgraph struct task_struct * +__visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 05646bab4ca..bb1dc51bab0 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ asmlinkage extern void ret_from_fork(void); -DEFINE_PER_CPU(unsigned long, old_rsp); +asmlinkage DEFINE_PER_CPU(unsigned long, old_rsp); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) @@ -274,7 +274,7 @@ void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) * Kprobes not supported here. Set the probe on schedule instead. * Function graph tracer not supported too. */ -__notrace_funcgraph struct task_struct * +__visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2cb9470ea85..a16bae3f83b 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } -static struct pvclock_vsyscall_time_info *pvclock_vdso_info; - -static struct pvclock_vsyscall_time_info * -pvclock_get_vsyscall_user_time_info(int cpu) -{ - if (!pvclock_vdso_info) { - BUG(); - return NULL; - } - - return &pvclock_vdso_info[cpu]; -} - -struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) -{ - return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; -} - #ifdef CONFIG_X86_64 -static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, - void *v) -{ - struct task_migration_notifier *mn = v; - struct pvclock_vsyscall_time_info *pvti; - - pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); - - /* this is NULL when pvclock vsyscall is not initialized */ - if (unlikely(pvti == NULL)) - return NOTIFY_DONE; - - pvti->migrate_count++; - - return NOTIFY_DONE; -} - -static struct notifier_block pvclock_migrate = { - .notifier_call = pvclock_task_migrate, -}; - /* * Initialize the generic pvclock vsyscall state. This will allocate * a/some page(s) for the per-vcpu pvclock information, set up a @@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); - pvclock_vdso_info = i; - for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, __pa(i) + (idx*PAGE_SIZE), PAGE_KERNEL_VVAR); } - - register_task_migration_notifier(&pvclock_migrate); - return 0; } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f8ec57815c0..f0de6294b95 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -206,9 +206,9 @@ EXPORT_SYMBOL(boot_cpu_data); #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -unsigned long mmu_cr4_features; +__visible unsigned long mmu_cr4_features; #else -unsigned long mmu_cr4_features = X86_CR4_PAE; +__visible unsigned long mmu_cr4_features = X86_CR4_PAE; #endif /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ @@ -426,25 +426,23 @@ static void __init reserve_initrd(void) static void __init parse_setup_data(void) { struct setup_data *data; - u64 pa_data; + u64 pa_data, pa_next; pa_data = boot_params.hdr.setup_data; while (pa_data) { - u32 data_len, map_len; + u32 data_len, map_len, data_type; map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK), (u64)sizeof(struct setup_data)); data = early_memremap(pa_data, map_len); data_len = data->len + sizeof(struct setup_data); - if (data_len > map_len) { - early_iounmap(data, map_len); - data = early_memremap(pa_data, data_len); - map_len = data_len; - } + data_type = data->type; + pa_next = data->next; + early_iounmap(data, map_len); - switch (data->type) { + switch (data_type) { case SETUP_E820_EXT: - parse_e820_ext(data); + parse_e820_ext(pa_data, data_len); break; case SETUP_DTB: add_dtb(pa_data); @@ -452,8 +450,7 @@ static void __init parse_setup_data(void) default: break; } - pa_data = data->next; - early_iounmap(data, map_len); + pa_data = pa_next; } } @@ -1070,7 +1067,7 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); - memblock.current_limit = ISA_END_ADDRESS; + memblock_set_current_limit(ISA_END_ADDRESS); memblock_x86_fill(); /* @@ -1103,7 +1100,7 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); - memblock.current_limit = get_max_mapped(); + memblock_set_current_limit(get_max_mapped()); dma_contiguous_reserve(0); /* diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cf913587d4d..9e5de6813e1 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -358,7 +358,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __save_altstack(&frame->uc.uc_stack, regs->sp); + save_altstack_ex(&frame->uc.uc_stack, regs->sp); /* Set up to return from userspace. */ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); @@ -423,7 +423,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __save_altstack(&frame->uc.uc_stack, regs->sp); + save_altstack_ex(&frame->uc.uc_stack, regs->sp); /* Set up to return from userspace. If provided, use a stub already in userspace. */ @@ -490,7 +490,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); + compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); if (ksig->ka.sa.sa_flags & SA_RESTORER) { @@ -533,7 +533,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, * Do a signal return; undo the signal stack. */ #ifdef CONFIG_X86_32 -unsigned long sys_sigreturn(void) +asmlinkage unsigned long sys_sigreturn(void) { struct pt_regs *regs = current_pt_regs(); struct sigframe __user *frame; @@ -562,7 +562,7 @@ badframe: } #endif /* CONFIG_X86_32 */ -long sys_rt_sigreturn(void) +asmlinkage long sys_rt_sigreturn(void) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; @@ -728,7 +728,7 @@ static void do_signal(struct pt_regs *regs) * notification of userspace execution resumption * - triggered by the TIF_WORK_MASK flags */ -void +__visible void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { user_exit(); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index cdaa347dfca..7c3a5a61f2e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -256,7 +256,7 @@ static inline void __smp_reschedule_interrupt(void) scheduler_ipi(); } -void smp_reschedule_interrupt(struct pt_regs *regs) +__visible void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); __smp_reschedule_interrupt(); @@ -271,7 +271,7 @@ static inline void smp_entering_irq(void) irq_enter(); } -void smp_trace_reschedule_interrupt(struct pt_regs *regs) +__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) { /* * Need to call irq_enter() before calling the trace point. @@ -295,14 +295,14 @@ static inline void __smp_call_function_interrupt(void) inc_irq_stat(irq_call_count); } -void smp_call_function_interrupt(struct pt_regs *regs) +__visible void smp_call_function_interrupt(struct pt_regs *regs) { smp_entering_irq(); __smp_call_function_interrupt(); exiting_irq(); } -void smp_trace_call_function_interrupt(struct pt_regs *regs) +__visible void smp_trace_call_function_interrupt(struct pt_regs *regs) { smp_entering_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); @@ -317,14 +317,14 @@ static inline void __smp_call_function_single_interrupt(void) inc_irq_stat(irq_call_count); } -void smp_call_function_single_interrupt(struct pt_regs *regs) +__visible void smp_call_function_single_interrupt(struct pt_regs *regs) { smp_entering_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } -void smp_trace_call_function_single_interrupt(struct pt_regs *regs) +__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) { smp_entering_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index dbded5aedb8..30277e27431 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -101,7 +101,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin, *begin = new_begin; } } else { - *begin = TASK_UNMAPPED_BASE; + *begin = current->mm->mmap_legacy_base; *end = TASK_SIZE; } } diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c index 147fcd4941c..e9bcd57d8a9 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/kernel/syscall_32.c @@ -15,7 +15,7 @@ typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { +__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 5c7f8c20da7..4ac730b37f0 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -4,6 +4,7 @@ #include <linux/sys.h> #include <linux/cache.h> #include <asm/asm-offsets.h> +#include <asm/syscall.h> #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) @@ -19,11 +20,9 @@ #define __SYSCALL_64(nr, sym, compat) [nr] = sym, -typedef void (*sys_call_ptr_t)(void); - extern void sys_ni_syscall(void); -const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { +asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c new file mode 100644 index 00000000000..193ec2ce46c --- /dev/null +++ b/arch/x86/kernel/sysfb.c @@ -0,0 +1,74 @@ +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +/* + * Simple-Framebuffer support for x86 systems + * Create a platform-device for any available boot framebuffer. The + * simple-framebuffer platform device is already available on DT systems, so + * this module parses the global "screen_info" object and creates a suitable + * platform device compatible with the "simple-framebuffer" DT object. If + * the framebuffer is incompatible, we instead create a legacy + * "vesa-framebuffer", "efi-framebuffer" or "platform-framebuffer" device and + * pass the screen_info as platform_data. This allows legacy drivers + * to pick these devices up without messing with simple-framebuffer drivers. + * The global "screen_info" is still valid at all times. + * + * If CONFIG_X86_SYSFB is not selected, we never register "simple-framebuffer" + * platform devices, but only use legacy framebuffer devices for + * backwards compatibility. + * + * TODO: We set the dev_id field of all platform-devices to 0. This allows + * other x86 OF/DT parsers to create such devices, too. However, they must + * start at offset 1 for this to work. + */ + +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/platform_data/simplefb.h> +#include <linux/platform_device.h> +#include <linux/screen_info.h> +#include <asm/sysfb.h> + +static __init int sysfb_init(void) +{ + struct screen_info *si = &screen_info; + struct simplefb_platform_data mode; + struct platform_device *pd; + const char *name; + bool compatible; + int ret; + + sysfb_apply_efi_quirks(); + + /* try to create a simple-framebuffer device */ + compatible = parse_mode(si, &mode); + if (compatible) { + ret = create_simplefb(si, &mode); + if (!ret) + return 0; + } + + /* if the FB is incompatible, create a legacy framebuffer device */ + if (si->orig_video_isVGA == VIDEO_TYPE_EFI) + name = "efi-framebuffer"; + else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB) + name = "vesa-framebuffer"; + else + name = "platform-framebuffer"; + + pd = platform_device_register_resndata(NULL, name, 0, + NULL, 0, si, sizeof(*si)); + return IS_ERR(pd) ? PTR_ERR(pd) : 0; +} + +/* must execute after PCI subsystem for EFI quirks */ +device_initcall(sysfb_init); diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c new file mode 100644 index 00000000000..b285d4e8c68 --- /dev/null +++ b/arch/x86/kernel/sysfb_efi.c @@ -0,0 +1,214 @@ +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * EFI Quirks Copyright (c) 2006 Edgar Hucek <gimli@dark-green.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +/* + * EFI Quirks + * Several EFI systems do not correctly advertise their boot framebuffers. + * Hence, we use this static table of known broken machines and fix up the + * information so framebuffer drivers can load corectly. + */ + +#include <linux/dmi.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/pci.h> +#include <linux/screen_info.h> +#include <video/vga.h> +#include <asm/sysfb.h> + +enum { + OVERRIDE_NONE = 0x0, + OVERRIDE_BASE = 0x1, + OVERRIDE_STRIDE = 0x2, + OVERRIDE_HEIGHT = 0x4, + OVERRIDE_WIDTH = 0x8, +}; + +struct efifb_dmi_info efifb_dmi_list[] = { + [M_I17] = { "i17", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_I20] = { "i20", 0x80010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, /* guess */ + [M_I20_SR] = { "imac7", 0x40010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, + [M_I24] = { "i24", 0x80010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, /* guess */ + [M_I24_8_1] = { "imac8", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_I24_10_1] = { "imac10", 0xc0010000, 2048 * 4, 1920, 1080, OVERRIDE_NONE }, + [M_I27_11_1] = { "imac11", 0xc0010000, 2560 * 4, 2560, 1440, OVERRIDE_NONE }, + [M_MINI]= { "mini", 0x80000000, 2048 * 4, 1024, 768, OVERRIDE_NONE }, + [M_MINI_3_1] = { "mini31", 0x40010000, 1024 * 4, 1024, 768, OVERRIDE_NONE }, + [M_MINI_4_1] = { "mini41", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MB] = { "macbook", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MB_5_1] = { "macbook51", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MB_6_1] = { "macbook61", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MB_7_1] = { "macbook71", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MBA] = { "mba", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + /* 11" Macbook Air 3,1 passes the wrong stride */ + [M_MBA_3] = { "mba3", 0, 2048 * 4, 0, 0, OVERRIDE_STRIDE }, + [M_MBP] = { "mbp", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_2] = { "mbp2", 0, 0, 0, 0, OVERRIDE_NONE }, /* placeholder */ + [M_MBP_2_2] = { "mbp22", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_SR] = { "mbp3", 0x80030000, 2048 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_4] = { "mbp4", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MBP_5_1] = { "mbp51", 0xc0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_5_2] = { "mbp52", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MBP_5_3] = { "mbp53", 0xd0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE }, + [M_MBP_6_1] = { "mbp61", 0x90030000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, + [M_MBP_6_2] = { "mbp62", 0x90030000, 2048 * 4, 1680, 1050, OVERRIDE_NONE }, + [M_MBP_7_1] = { "mbp71", 0xc0010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, + [M_MBP_8_2] = { "mbp82", 0x90010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, + [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE } +}; + +#define choose_value(dmivalue, fwvalue, field, flags) ({ \ + typeof(fwvalue) _ret_ = fwvalue; \ + if ((flags) & (field)) \ + _ret_ = dmivalue; \ + else if ((fwvalue) == 0) \ + _ret_ = dmivalue; \ + _ret_; \ + }) + +static int __init efifb_set_system(const struct dmi_system_id *id) +{ + struct efifb_dmi_info *info = id->driver_data; + + if (info->base == 0 && info->height == 0 && info->width == 0 && + info->stride == 0) + return 0; + + /* Trust the bootloader over the DMI tables */ + if (screen_info.lfb_base == 0) { +#if defined(CONFIG_PCI) + struct pci_dev *dev = NULL; + int found_bar = 0; +#endif + if (info->base) { + screen_info.lfb_base = choose_value(info->base, + screen_info.lfb_base, OVERRIDE_BASE, + info->flags); + +#if defined(CONFIG_PCI) + /* make sure that the address in the table is actually + * on a VGA device's PCI BAR */ + + for_each_pci_dev(dev) { + int i; + if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) + continue; + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + resource_size_t start, end; + + start = pci_resource_start(dev, i); + if (start == 0) + break; + end = pci_resource_end(dev, i); + if (screen_info.lfb_base >= start && + screen_info.lfb_base < end) { + found_bar = 1; + } + } + } + if (!found_bar) + screen_info.lfb_base = 0; +#endif + } + } + if (screen_info.lfb_base) { + screen_info.lfb_linelength = choose_value(info->stride, + screen_info.lfb_linelength, OVERRIDE_STRIDE, + info->flags); + screen_info.lfb_width = choose_value(info->width, + screen_info.lfb_width, OVERRIDE_WIDTH, + info->flags); + screen_info.lfb_height = choose_value(info->height, + screen_info.lfb_height, OVERRIDE_HEIGHT, + info->flags); + if (screen_info.orig_video_isVGA == 0) + screen_info.orig_video_isVGA = VIDEO_TYPE_EFI; + } else { + screen_info.lfb_linelength = 0; + screen_info.lfb_width = 0; + screen_info.lfb_height = 0; + screen_info.orig_video_isVGA = 0; + return 0; + } + + printk(KERN_INFO "efifb: dmi detected %s - framebuffer at 0x%08x " + "(%dx%d, stride %d)\n", id->ident, + screen_info.lfb_base, screen_info.lfb_width, + screen_info.lfb_height, screen_info.lfb_linelength); + + return 1; +} + +#define EFIFB_DMI_SYSTEM_ID(vendor, name, enumid) \ + { \ + efifb_set_system, \ + name, \ + { \ + DMI_MATCH(DMI_BIOS_VENDOR, vendor), \ + DMI_MATCH(DMI_PRODUCT_NAME, name) \ + }, \ + &efifb_dmi_list[enumid] \ + } + +static const struct dmi_system_id efifb_dmi_system_table[] __initconst = { + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac4,1", M_I17), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac5,1", M_I20), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac5,1", M_I20), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac6,1", M_I24), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac6,1", M_I24), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac7,1", M_I20_SR), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac8,1", M_I24_8_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac10,1", M_I24_10_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac11,1", M_I27_11_1), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "Macmini1,1", M_MINI), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini3,1", M_MINI_3_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini4,1", M_MINI_4_1), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook1,1", M_MB), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook2,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook2,1", M_MB), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook3,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook3,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook4,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook5,1", M_MB_5_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook6,1", M_MB_6_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook7,1", M_MB_7_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir1,1", M_MBA), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir3,1", M_MBA_3), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro1,1", M_MBP), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,1", M_MBP_2), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,2", M_MBP_2_2), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro2,1", M_MBP_2), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro3,1", M_MBP_SR), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro3,1", M_MBP_SR), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro4,1", M_MBP_4), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,1", M_MBP_5_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,2", M_MBP_5_2), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,3", M_MBP_5_3), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,1", M_MBP_6_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,2", M_MBP_6_2), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro7,1", M_MBP_7_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro8,2", M_MBP_8_2), + {}, +}; + +__init void sysfb_apply_efi_quirks(void) +{ + if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI || + !(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS)) + dmi_check_system(efifb_dmi_system_table); +} diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c new file mode 100644 index 00000000000..22513e96b01 --- /dev/null +++ b/arch/x86/kernel/sysfb_simplefb.c @@ -0,0 +1,95 @@ +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +/* + * simple-framebuffer probing + * Try to convert "screen_info" into a "simple-framebuffer" compatible mode. + * If the mode is incompatible, we return "false" and let the caller create + * legacy nodes instead. + */ + +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/platform_data/simplefb.h> +#include <linux/platform_device.h> +#include <linux/screen_info.h> +#include <asm/sysfb.h> + +static const char simplefb_resname[] = "BOOTFB"; +static const struct simplefb_format formats[] = SIMPLEFB_FORMATS; + +/* try parsing x86 screen_info into a simple-framebuffer mode struct */ +__init bool parse_mode(const struct screen_info *si, + struct simplefb_platform_data *mode) +{ + const struct simplefb_format *f; + __u8 type; + unsigned int i; + + type = si->orig_video_isVGA; + if (type != VIDEO_TYPE_VLFB && type != VIDEO_TYPE_EFI) + return false; + + for (i = 0; i < ARRAY_SIZE(formats); ++i) { + f = &formats[i]; + if (si->lfb_depth == f->bits_per_pixel && + si->red_size == f->red.length && + si->red_pos == f->red.offset && + si->green_size == f->green.length && + si->green_pos == f->green.offset && + si->blue_size == f->blue.length && + si->blue_pos == f->blue.offset && + si->rsvd_size == f->transp.length && + si->rsvd_pos == f->transp.offset) { + mode->format = f->name; + mode->width = si->lfb_width; + mode->height = si->lfb_height; + mode->stride = si->lfb_linelength; + return true; + } + } + + return false; +} + +__init int create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode) +{ + struct platform_device *pd; + struct resource res; + unsigned long len; + + /* don't use lfb_size as it may contain the whole VMEM instead of only + * the part that is occupied by the framebuffer */ + len = mode->height * mode->stride; + len = PAGE_ALIGN(len); + if (len > si->lfb_size << 16) { + printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n"); + return -EINVAL; + } + + /* setup IORESOURCE_MEM as framebuffer memory */ + memset(&res, 0, sizeof(res)); + res.flags = IORESOURCE_MEM; + res.name = simplefb_resname; + res.start = si->lfb_base; + res.end = si->lfb_base + len - 1; + if (res.end <= res.start) + return -EINVAL; + + pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0, + &res, 1, mode, sizeof(*mode)); + if (IS_ERR(pd)) + return PTR_ERR(pd); + + return 0; +} diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index addf7b58f4e..91a4496db43 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -301,6 +301,15 @@ static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) return 0; } +static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b) +{ + if (!tboot_enabled()) + return 0; + + pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)"); + return -ENODEV; +} + static atomic_t ap_wfs_count; static int tboot_wait_for_aps(int num_aps) @@ -422,6 +431,7 @@ static __init int tboot_late_init(void) #endif acpi_os_set_prepare_sleep(&tboot_sleep); + acpi_os_set_prepare_extended_sleep(&tboot_extended_sleep); return 0; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1b23a1c9274..8c8093b146c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -58,6 +58,7 @@ #include <asm/mce.h> #include <asm/fixmap.h> #include <asm/mach_traps.h> +#include <asm/alternative.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -327,6 +328,9 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co ftrace_int3_handler(regs)) return; #endif + if (poke_int3_handler(regs)) + return; + prev_state = exception_enter(); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6ff49247edf..930e5d48f56 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -89,6 +89,12 @@ int check_tsc_unstable(void) } EXPORT_SYMBOL_GPL(check_tsc_unstable); +int check_tsc_disabled(void) +{ + return tsc_disabled; +} +EXPORT_SYMBOL_GPL(check_tsc_disabled); + #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a20ecb5b6cb..b110fe6c03d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -413,7 +413,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_PV_EOI) | - (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | + (1 << KVM_FEATURE_PV_UNHALT); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index afc11245827..5439117d5c4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) *((u32 *) (apic->regs + reg_off)) = val; } -static inline int apic_test_and_set_vector(int vec, void *bitmap) -{ - return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - -static inline int apic_test_and_clear_vector(int vec, void *bitmap) -{ - return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - static inline int apic_test_vector(int vec, void *bitmap) { return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); -static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) +static inline void apic_set_irr(int vec, struct kvm_lapic *apic) { apic->irr_pending = true; - return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); + apic_set_vector(vec, apic->regs + APIC_IRR); } static inline int apic_search_irr(struct kvm_lapic *apic) @@ -681,32 +671,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (unlikely(!apic_enabled(apic))) break; + result = 1; + if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); - if (kvm_x86_ops->deliver_posted_interrupt) { - result = 1; + if (kvm_x86_ops->deliver_posted_interrupt) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); - } else { - result = !apic_test_and_set_irr(vector, apic); - - if (!result) { - if (trig_mode) - apic_debug("level trig mode repeatedly " - "for vector %d", vector); - goto out; - } + else { + apic_set_irr(vector, apic); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } -out: trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector, !result); + trig_mode, vector, false); break; case APIC_DM_REMRD: - apic_debug("Ignoring delivery mode 3\n"); + result = 1; + vcpu->arch.pv.pv_unhalted = 1; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); break; case APIC_DM_SMI: diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9e9285ae9b9..6e2d2c8f230 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -132,8 +132,8 @@ module_param(dbg, bool, 0644); (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ - | PT64_NX_MASK) +#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ + | shadow_x_mask | shadow_nx_mask) #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK @@ -331,11 +331,6 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_dirty_gpte(unsigned long pte) -{ - return pte & PT_DIRTY_MASK; -} - static int is_rmap_spte(u64 pte) { return is_shadow_present_pte(pte); @@ -2052,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) return __shadow_walk_next(iterator, *iterator->sptep); } -static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) +static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed) { u64 spte; + BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK || + VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_accessed_mask; + shadow_user_mask | shadow_x_mask; + + if (accessed) + spte |= shadow_accessed_mask; mmu_spte_set(sptep, spte); } @@ -2574,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7; - - bit7 = (gpte >> 7) & 1; - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; -} - static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { @@ -2594,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, return gfn_to_pfn_memslot_atomic(slot, gfn); } -static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - u64 gpte) -{ - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; - - if (!is_present_gpte(gpte)) - goto no_present; - - if (!(gpte & PT_ACCESSED_MASK)) - goto no_present; - - return false; - -no_present: - drop_spte(vcpu->kvm, spte); - return true; -} - static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -2710,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, iterator.level - 1, 1, ACC_ALL, iterator.sptep); - link_shadow_page(iterator.sptep, sp); + link_shadow_page(iterator.sptep, sp, true); } } return emulate; @@ -2808,7 +2781,7 @@ exit: return ret; } -static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) +static bool page_fault_can_be_fast(u32 error_code) { /* * Do not fix the mmio spte with invalid generation number which @@ -2861,7 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, bool ret = false; u64 spte = 0ull; - if (!page_fault_can_be_fast(vcpu, error_code)) + if (!page_fault_can_be_fast(error_code)) return false; walk_shadow_page_lockless_begin(vcpu); @@ -3209,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); } +EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -3478,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) ++vcpu->stat.tlb_flush; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } +EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb); static void paging_new_cr3(struct kvm_vcpu *vcpu) { @@ -3501,18 +3476,6 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static inline void protect_clean_gpte(unsigned *access, unsigned gpte) -{ - unsigned mask; - - BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); - - mask = (unsigned)~ACC_WRITE_MASK; - /* Allow write access to dirty gptes */ - mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK; - *access &= mask; -} - static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { @@ -3530,16 +3493,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, return false; } -static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte) -{ - unsigned access; - - access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - access &= ~(gpte >> PT64_NX_SHIFT); - - return access; -} - static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) { unsigned index; @@ -3549,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp return mmu->last_pte_bitmap & (1 << index); } +#define PTTYPE_EPT 18 /* arbitrary */ +#define PTTYPE PTTYPE_EPT +#include "paging_tmpl.h" +#undef PTTYPE + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE @@ -3563,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; + context->bad_mt_xwr = 0; + if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); switch (context->root_level) { @@ -3618,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, } } -static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + int maxphyaddr = cpuid_maxphyaddr(vcpu); + int pte; + + context->rsvd_bits_mask[0][3] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); + context->rsvd_bits_mask[0][2] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); + context->rsvd_bits_mask[0][1] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); + context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); + + /* large page */ + context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; + context->rsvd_bits_mask[1][2] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); + context->rsvd_bits_mask[1][1] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + + for (pte = 0; pte < 64; pte++) { + int rwx_bits = pte & 7; + int mt = pte >> 3; + if (mt == 0x2 || mt == 0x3 || mt == 0x7 || + rwx_bits == 0x2 || rwx_bits == 0x6 || + (rwx_bits == 0x4 && !execonly)) + context->bad_mt_xwr |= (1ull << pte); + } +} + +static void update_permission_bitmask(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, bool ept) { unsigned bit, byte, pfec; u8 map; @@ -3636,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu w = bit & ACC_WRITE_MASK; u = bit & ACC_USER_MASK; - /* Not really needed: !nx will cause pte.nx to fault */ - x |= !mmu->nx; - /* Allow supervisor writes if !cr0.wp */ - w |= !is_write_protection(vcpu) && !uf; - /* Disallow supervisor fetches of user code if cr4.smep */ - x &= !(smep && u && !uf); + if (!ept) { + /* Not really needed: !nx will cause pte.nx to fault */ + x |= !mmu->nx; + /* Allow supervisor writes if !cr0.wp */ + w |= !is_write_protection(vcpu) && !uf; + /* Disallow supervisor fetches of user code if cr4.smep */ + x &= !(smep && u && !uf); + } else + /* Not really needed: no U/S accesses on ept */ + u = 1; fault = (ff && !x) || (uf && !u) || (wf && !w); map |= fault << bit; @@ -3676,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->root_level = level; reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context); + update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); ASSERT(is_pae(vcpu)); @@ -3706,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->root_level = PT32_ROOT_LEVEL; reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context); + update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); context->new_cr3 = paging_new_cr3; @@ -3768,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; } - update_permission_bitmask(vcpu, context); + update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); return 0; @@ -3800,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); +int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + bool execonly) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + context->shadow_root_level = kvm_x86_ops->get_tdp_level(); + + context->nx = true; + context->new_cr3 = paging_new_cr3; + context->page_fault = ept_page_fault; + context->gva_to_gpa = ept_gva_to_gpa; + context->sync_page = ept_sync_page; + context->invlpg = ept_invlpg; + context->update_pte = ept_update_pte; + context->free = paging_free; + context->root_level = context->shadow_root_level; + context->root_hpa = INVALID_PAGE; + context->direct_map = false; + + update_permission_bitmask(vcpu, context, true); + reset_rsvds_bits_mask_ept(vcpu, context, execonly); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); + static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); @@ -3847,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->gva_to_gpa = paging32_gva_to_gpa_nested; } - update_permission_bitmask(vcpu, g_context); + update_permission_bitmask(vcpu, g_context, false); update_last_pte_bitmap(vcpu, g_context); return 0; @@ -3923,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new) return true; if ((old ^ new) & PT64_BASE_ADDR_MASK) return true; - old ^= PT64_NX_MASK; - new ^= PT64_NX_MASK; + old ^= shadow_nx_mask; + new ^= shadow_nx_mask; return (old & ~new & PT64_PERM_MASK) != 0; } @@ -4182,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, switch (er) { case EMULATE_DONE: return 1; - case EMULATE_DO_MMIO: + case EMULATE_USER_EXIT: ++vcpu->stat.mmio_exits; /* fall through */ case EMULATE_FAIL: @@ -4390,11 +4414,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) /* * The very rare case: if the generation-number is round, * zap all shadow pages. - * - * The max value is MMIO_MAX_GEN - 1 since it is not called - * when mark memslot invalid. */ - if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) { + if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) { printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 5b59c573aba..77e044a0f5f 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -71,6 +71,8 @@ enum { int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); +int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + bool execonly); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 7769699d48a..04333015917 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -23,6 +23,13 @@ * so the code in this file is compiled twice, once per pte size. */ +/* + * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro + * uses for EPT without A/D paging type. + */ +extern u64 __pure __using_nonexistent_pte_bit(void) + __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT"); + #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 @@ -32,6 +39,10 @@ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 #define CMPXCHG cmpxchg @@ -49,7 +60,26 @@ #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 + #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK + #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK + #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT + #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define CMPXCHG cmpxchg +#elif PTTYPE == PTTYPE_EPT + #define pt_element_t u64 + #define guest_walker guest_walkerEPT + #define FNAME(name) ept_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_GUEST_ACCESSED_MASK 0 + #define PT_GUEST_DIRTY_MASK 0 + #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit() + #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() + #define CMPXCHG cmpxchg64 + #define PT_MAX_FULL_LEVELS 4 #else #error Invalid PTTYPE value #endif @@ -80,6 +110,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } +static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) +{ + unsigned mask; + + /* dirty bit is not supported, so no need to track it */ + if (!PT_GUEST_DIRTY_MASK) + return; + + BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); + + mask = (unsigned)~ACC_WRITE_MASK; + /* Allow write access to dirty gptes */ + mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & + PT_WRITABLE_MASK; + *access &= mask; +} + +static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) +{ + int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f; + + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) | + ((mmu->bad_mt_xwr & (1ull << low6)) != 0); +} + +static inline int FNAME(is_present_gpte)(unsigned long pte) +{ +#if PTTYPE != PTTYPE_EPT + return is_present_gpte(pte); +#else + return pte & 7; +#endif +} + static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t __user *ptep_user, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) @@ -103,6 +167,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } +static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + u64 gpte) +{ + if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + + if (!FNAME(is_present_gpte)(gpte)) + goto no_present; + + /* if accessed bit is not supported prefetch non accessed gpte */ + if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK)) + goto no_present; + + return false; + +no_present: + drop_spte(vcpu->kvm, spte); + return true; +} + +static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned access; +#if PTTYPE == PTTYPE_EPT + access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | + ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | + ACC_USER_MASK; +#else + access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; + access &= ~(gpte >> PT64_NX_SHIFT); +#endif + + return access; +} + static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, @@ -114,18 +214,23 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, gfn_t table_gfn; int ret; + /* dirty/accessed bits are not supported, so no need to update them */ + if (!PT_GUEST_DIRTY_MASK) + return 0; + for (level = walker->max_level; level >= walker->level; --level) { pte = orig_pte = walker->ptes[level - 1]; table_gfn = walker->table_gfn[level - 1]; ptep_user = walker->ptep_user[level - 1]; index = offset_in_page(ptep_user) / sizeof(pt_element_t); - if (!(pte & PT_ACCESSED_MASK)) { + if (!(pte & PT_GUEST_ACCESSED_MASK)) { trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); - pte |= PT_ACCESSED_MASK; + pte |= PT_GUEST_ACCESSED_MASK; } - if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { + if (level == walker->level && write_fault && + !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - pte |= PT_DIRTY_MASK; + pte |= PT_GUEST_DIRTY_MASK; } if (pte == orig_pte) continue; @@ -170,7 +275,7 @@ retry_walk: if (walker->level == PT32E_ROOT_LEVEL) { pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) + if (!FNAME(is_present_gpte)(pte)) goto error; --walker->level; } @@ -179,7 +284,7 @@ retry_walk: ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); - accessed_dirty = PT_ACCESSED_MASK; + accessed_dirty = PT_GUEST_ACCESSED_MASK; pt_access = pte_access = ACC_ALL; ++walker->level; @@ -215,17 +320,17 @@ retry_walk: trace_kvm_mmu_paging_element(pte, walker->level); - if (unlikely(!is_present_gpte(pte))) + if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; - if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, - walker->level))) { + if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, + walker->level))) { errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } accessed_dirty &= pte; - pte_access = pt_access & gpte_access(vcpu, pte); + pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); @@ -248,13 +353,15 @@ retry_walk: walker->gfn = real_gpa >> PAGE_SHIFT; if (!write_fault) - protect_clean_gpte(&pte_access, pte); + FNAME(protect_clean_gpte)(&pte_access, pte); else /* - * On a write fault, fold the dirty bit into accessed_dirty by - * shifting it one place right. + * On a write fault, fold the dirty bit into accessed_dirty. + * For modes without A/D bits support accessed_dirty will be + * always clear. */ - accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); + accessed_dirty &= pte >> + (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT); if (unlikely(!accessed_dirty)) { ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); @@ -279,6 +386,25 @@ error: walker->fault.vector = PF_VECTOR; walker->fault.error_code_valid = true; walker->fault.error_code = errcode; + +#if PTTYPE == PTTYPE_EPT + /* + * Use PFERR_RSVD_MASK in error_code to to tell if EPT + * misconfiguration requires to be injected. The detection is + * done by is_rsvd_bits_set() above. + * + * We set up the value of exit_qualification to inject: + * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation + * [5:3] - Calculated by the page walk of the guest EPT page tables + * [7:8] - Derived from [7:8] of real exit_qualification + * + * The other bits are set to 0. + */ + if (!(errcode & PFERR_RSVD_MASK)) { + vcpu->arch.exit_qualification &= 0x187; + vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3; + } +#endif walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; @@ -293,6 +419,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, access); } +#if PTTYPE != PTTYPE_EPT static int FNAME(walk_addr_nested)(struct guest_walker *walker, struct kvm_vcpu *vcpu, gva_t addr, u32 access) @@ -300,6 +427,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, addr, access); } +#endif static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -309,14 +437,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn_t gfn; pfn_t pfn; - if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access & gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + FNAME(protect_clean_gpte)(&pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); if (is_error_pfn(pfn)) @@ -446,7 +574,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, goto out_gpte_changed; if (sp) - link_shadow_page(it.sptep, sp); + link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); } for (; @@ -466,7 +594,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, true, direct_access, it.sptep); - link_shadow_page(it.sptep, sp); + link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); } clear_sp_write_flooding_count(it.sptep); @@ -727,6 +855,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, return gpa; } +#if PTTYPE != PTTYPE_EPT static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -745,6 +874,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, return gpa; } +#endif /* * Using the cached information from sp->gfns is safe because: @@ -785,15 +915,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { vcpu->kvm->tlbs_dirty++; continue; } gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); + pte_access &= FNAME(gpte_access)(vcpu, gpte); + FNAME(protect_clean_gpte)(&pte_access, gpte); if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, &nr_present)) @@ -830,3 +960,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef gpte_to_gfn #undef gpte_to_gfn_lvl #undef CMPXCHG +#undef PT_GUEST_ACCESSED_MASK +#undef PT_GUEST_DIRTY_MASK +#undef PT_GUEST_DIRTY_SHIFT +#undef PT_GUEST_ACCESSED_SHIFT diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index c53e797e736..5c4f63151b4 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -160,7 +160,7 @@ static void stop_counter(struct kvm_pmc *pmc) static void reprogram_counter(struct kvm_pmc *pmc, u32 type, unsigned config, bool exclude_user, bool exclude_kernel, - bool intr) + bool intr, bool in_tx, bool in_tx_cp) { struct perf_event *event; struct perf_event_attr attr = { @@ -173,6 +173,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; + if (in_tx) + attr.config |= HSW_IN_TX; + if (in_tx_cp) + attr.config |= HSW_IN_TX_CHECKPOINTED; attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); @@ -226,7 +230,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK))) { + ARCH_PERFMON_EVENTSEL_CMASK | + HSW_IN_TX | + HSW_IN_TX_CHECKPOINTED))) { config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, unit_mask); if (config != PERF_COUNT_HW_MAX) @@ -239,7 +245,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) reprogram_counter(pmc, type, config, !(eventsel & ARCH_PERFMON_EVENTSEL_USR), !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); + eventsel & ARCH_PERFMON_EVENTSEL_INT, + (eventsel & HSW_IN_TX), + (eventsel & HSW_IN_TX_CHECKPOINTED)); } static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) @@ -256,7 +264,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) arch_events[fixed_pmc_events[idx]].event_type, !(en & 0x2), /* exclude user */ !(en & 0x1), /* exclude kernel */ - pmi); + pmi, false, false); } static inline u8 fixed_en_pmi(u64 ctrl, int idx) @@ -408,7 +416,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) return 0; - if (!(data & 0xffffffff00200000ull)) { + if (!(data & pmu->reserved_bits)) { reprogram_gp_counter(pmc, data); return 0; } @@ -450,6 +458,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_GP] = 0; pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->version = 0; + pmu->reserved_bits = 0xffffffff00200000ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry) @@ -478,6 +487,12 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; + + entry = kvm_find_cpuid_entry(vcpu, 7, 0); + if (entry && + (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) + pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; } void kvm_pmu_init(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 064d0be67ec..1f1da43ff2a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -373,6 +373,7 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + u64 msr_ia32_feature_control; }; #define POSTED_INTR_ON 0 @@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page) kvm_release_page_clean(page); } +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) (vmcs12->secondary_vm_exec_control & bit); } -static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, - struct kvm_vcpu *vcpu) +static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) { return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } +static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -2155,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; static u32 nested_vmx_misc_low, nested_vmx_misc_high; +static u32 nested_vmx_ept_caps; static __init void nested_vmx_setup_ctls_msrs(void) { /* @@ -2190,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void) * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and * 17 must be 1. */ + rdmsr(MSR_IA32_VMX_EXIT_CTLS, + nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ + nested_vmx_exit_ctls_high &= #ifdef CONFIG_X86_64 - nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; -#else - nested_vmx_exit_ctls_high = 0; + VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER); /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2205,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_entry_ctls_high &= - VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; - nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; +#ifdef CONFIG_X86_64 + VM_ENTRY_IA32E_MODE | +#endif + VM_ENTRY_LOAD_IA32_PAT; + nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | + VM_ENTRY_LOAD_IA32_EFER); /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -2241,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_WBINVD_EXITING; + if (enable_ept) { + /* nested EPT: emulate EPT also to L1 */ + nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; + nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; + nested_vmx_ept_caps &= vmx_capability.ept; + /* + * Since invept is completely emulated we support both global + * and context invalidation independent of what host cpu + * supports + */ + nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | + VMX_EPT_EXTENT_CONTEXT_BIT; + } else + nested_vmx_ept_caps = 0; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | @@ -2282,8 +2311,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_FEATURE_CONTROL: - *pdata = 0; - break; + if (nested_vmx_allowed(vcpu)) { + *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control; + break; + } + return 0; case MSR_IA32_VMX_BASIC: /* * This MSR reports some information about VMX support. We @@ -2346,8 +2378,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) nested_vmx_secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: - /* Currently, no nested ept or nested vpid */ - *pdata = 0; + /* Currently, no nested vpid support */ + *pdata = nested_vmx_ept_caps; break; default: return 0; @@ -2356,14 +2388,24 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 1; } -static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + u32 msr_index = msr_info->index; + u64 data = msr_info->data; + bool host_initialized = msr_info->host_initiated; + if (!nested_vmx_allowed(vcpu)) return 0; - if (msr_index == MSR_IA32_FEATURE_CONTROL) - /* TODO: the right thing. */ + if (msr_index == MSR_IA32_FEATURE_CONTROL) { + if (!host_initialized && + to_vmx(vcpu)->nested.msr_ia32_feature_control + & FEATURE_CONTROL_LOCKED) + return 0; + to_vmx(vcpu)->nested.msr_ia32_feature_control = data; return 1; + } + /* * No need to treat VMX capability MSRs specially: If we don't handle * them, handle_wrmsr will #GP(0), which is correct (they are readonly) @@ -2494,7 +2536,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; /* Otherwise falls through */ default: - if (vmx_set_vmx_msr(vcpu, msr_index, data)) + if (vmx_set_vmx_msr(vcpu, msr_info)) break; msr = find_msr_entry(vmx, msr_index); if (msr) { @@ -5302,9 +5344,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) /* It is a write fault? */ error_code = exit_qualification & (1U << 1); + /* It is a fetch fault? */ + error_code |= (exit_qualification & (1U << 2)) << 2; /* ept page table is present? */ error_code |= (exit_qualification >> 3) & 0x1; + vcpu->arch.exit_qualification = exit_qualification; + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } @@ -5438,7 +5484,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); - if (err == EMULATE_DO_MMIO) { + if (err == EMULATE_USER_EXIT) { + ++vcpu->stat.mmio_exits; ret = 0; goto out; } @@ -5567,8 +5614,47 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) free_loaded_vmcs(&vmx->vmcs01); } +/* + * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), + * set the success or error code of an emulated VMX instruction, as specified + * by Vol 2B, VMX Instruction Reference, "Conventions". + */ +static void nested_vmx_succeed(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); +} + +static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) +{ + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_CF); +} + static void nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error); + u32 vm_instruction_error) +{ + if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { + /* + * failValid writes the error number to the current VMCS, which + * can't be done there isn't a current VMCS. + */ + nested_vmx_failInvalid(vcpu); + return; + } + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) + & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_SF | X86_EFLAGS_OF)) + | X86_EFLAGS_ZF); + get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; + /* + * We don't need to force a shadow sync because + * VM_INSTRUCTION_ERROR is not shadowed + */ +} /* * Emulate the VMXON instruction. @@ -5583,6 +5669,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu) struct kvm_segment cs; struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs *shadow_vmcs; + const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED + | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; /* The Intel VMX Instruction Reference lists a bunch of bits that * are prerequisite to running VMXON, most notably cr4.VMXE must be @@ -5611,6 +5699,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } + + if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) + != VMXON_NEEDED_FEATURES) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) @@ -5628,6 +5723,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) vmx->nested.vmxon = true; skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); return 1; } @@ -5712,6 +5808,7 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) return 1; free_nested(to_vmx(vcpu)); skip_emulated_instruction(vcpu); + nested_vmx_succeed(vcpu); return 1; } @@ -5768,48 +5865,6 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, return 0; } -/* - * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), - * set the success or error code of an emulated VMX instruction, as specified - * by Vol 2B, VMX Instruction Reference, "Conventions". - */ -static void nested_vmx_succeed(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); -} - -static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_CF); -} - -static void nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error) -{ - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { - /* - * failValid writes the error number to the current VMCS, which - * can't be done there isn't a current VMCS. - */ - nested_vmx_failInvalid(vcpu); - return; - } - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_ZF); - get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; - /* - * We don't need to force a shadow sync because - * VM_INSTRUCTION_ERROR is not shadowed - */ -} - /* Emulate the VMCLEAR instruction */ static int handle_vmclear(struct kvm_vcpu *vcpu) { @@ -5972,8 +6027,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) unsigned long field; u64 field_value; struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; - unsigned long *fields = (unsigned long *)shadow_read_write_fields; - int num_fields = max_shadow_read_write_fields; + const unsigned long *fields = shadow_read_write_fields; + const int num_fields = max_shadow_read_write_fields; vmcs_load(shadow_vmcs); @@ -6002,12 +6057,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) { - unsigned long *fields[] = { - (unsigned long *)shadow_read_write_fields, - (unsigned long *)shadow_read_only_fields + const unsigned long *fields[] = { + shadow_read_write_fields, + shadow_read_only_fields }; - int num_lists = ARRAY_SIZE(fields); - int max_fields[] = { + const int max_fields[] = { max_shadow_read_write_fields, max_shadow_read_only_fields }; @@ -6018,7 +6072,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) vmcs_load(shadow_vmcs); - for (q = 0; q < num_lists; q++) { + for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; vmcs12_read_any(&vmx->vcpu, field, &field_value); @@ -6248,6 +6302,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; } +/* Emulate the INVEPT instruction */ +static int handle_invept(struct kvm_vcpu *vcpu) +{ + u32 vmx_instruction_info, types; + unsigned long type; + gva_t gva; + struct x86_exception e; + struct { + u64 eptp, gpa; + } operand; + u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK; + + if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || + !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + + if (!(types & (1UL << type))) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + + /* According to the Intel VMX instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, &gva)) + return 1; + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, + sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (type) { + case VMX_EPT_EXTENT_CONTEXT: + if ((operand.eptp & eptp_mask) != + (nested_ept_get_cr3(vcpu) & eptp_mask)) + break; + case VMX_EPT_EXTENT_GLOBAL: + kvm_mmu_sync_roots(vcpu); + kvm_mmu_flush_tlb(vcpu); + nested_vmx_succeed(vcpu); + break; + default: + BUG_ON(1); + break; + } + + skip_emulated_instruction(vcpu); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -6292,6 +6414,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_INVEPT] = handle_invept, }; static const int kvm_vmx_max_exit_handlers = @@ -6518,6 +6641,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: + case EXIT_REASON_INVEPT: /* * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting! @@ -6550,7 +6674,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); case EXIT_REASON_EPT_VIOLATION: + /* + * L0 always deals with the EPT violation. If nested EPT is + * used, and the nested mmu code discovers that the address is + * missing in the guest EPT table (EPT12), the EPT violation + * will be injected with nested_ept_inject_page_fault() + */ + return 0; case EXIT_REASON_EPT_MISCONFIG: + /* + * L2 never uses directly L1's EPT, but rather L0's own EPT + * table (shadow on EPT) or a merged EPT table that L0 built + * (EPT on EPT). So any problems with the structure of the + * table is L0's fault. + */ return 0; case EXIT_REASON_PREEMPTION_TIMER: return vmcs12->pin_based_vm_exec_control & @@ -6638,7 +6775,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( - get_vmcs12(vcpu), vcpu)))) { + get_vmcs12(vcpu))))) { if (vmx_interrupt_allowed(vcpu)) { vmx->soft_vnmi_blocked = 0; } else if (vmx->vnmi_blocked_time > 1000000000LL && @@ -7326,6 +7463,48 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) entry->ecx |= bit(X86_FEATURE_VMX); } +static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12; + nested_vmx_vmexit(vcpu); + vmcs12 = get_vmcs12(vcpu); + + if (fault->error_code & PFERR_RSVD_MASK) + vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; + else + vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + vmcs12->exit_qualification = vcpu->arch.exit_qualification; + vmcs12->guest_physical_address = fault->address; +} + +/* Callbacks for nested_ept_init_mmu_context: */ + +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) +{ + /* return the page table to be shadowed - in our case, EPT12 */ + return get_vmcs12(vcpu)->ept_pointer; +} + +static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +{ + int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, + nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); + + vcpu->arch.mmu.set_cr3 = vmx_set_cr3; + vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; + vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; + + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + + return r; +} + +static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.walk_mmu = &vcpu->arch.mmu; +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -7388,7 +7567,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_interruptibility_info); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); - vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); + vmx_set_rflags(vcpu, vmcs12->guest_rflags); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vmcs12->guest_pending_dbg_exceptions); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); @@ -7508,15 +7687,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ - vmcs_write32(VM_EXIT_CONTROLS, - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | + /* L2->L1 exit controls are emulated - the hardware exit is to L0 so + * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER + * bits are further modified by vmx_set_efer() below. + */ + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are + * emulated by vmx_set_efer(), below. + */ + vmcs_write32(VM_ENTRY_CONTROLS, + (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & + ~VM_ENTRY_IA32E_MODE) | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); - else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + vcpu->arch.pat = vmcs12->guest_ia32_pat; + } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); @@ -7538,6 +7726,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_flush_tlb(vcpu); } + if (nested_cpu_has_ept(vmcs12)) { + kvm_mmu_unload(vcpu); + nested_ept_init_mmu_context(vcpu); + } + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->guest_ia32_efer; else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) @@ -7565,6 +7758,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_set_cr3(vcpu, vmcs12->guest_cr3); kvm_mmu_reset_context(vcpu); + /* + * L1 may access the L2's PDPTR, so save them to construct vmcs12 + */ + if (enable_ept) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } + kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); } @@ -7887,6 +8090,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + /* + * In some cases (usually, nested EPT), L2 is allowed to change its + * own CR3 without exiting. If it has changed it, we must keep it. + * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined + * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. + * + * Additionally, restore L2's PDPTR to vmcs12. + */ + if (enable_ept) { + vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3); + vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); + vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); + vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); + vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + } + vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); @@ -7948,6 +8167,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + struct kvm_segment seg; + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) @@ -7982,7 +8203,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); kvm_set_cr4(vcpu, vmcs12->host_cr4); - /* shadow page tables on either EPT or shadow page tables */ + if (nested_cpu_has_ept(vmcs12)) + nested_ept_uninit_mmu_context(vcpu); + kvm_set_cr3(vcpu, vmcs12->host_cr3); kvm_mmu_reset_context(vcpu); @@ -8001,23 +8224,61 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); - vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); - vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); - vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base); - vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector); - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector); - vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector); - vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector); - vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector); - vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector); - vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector); - - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) + + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); + vcpu->arch.pat = vmcs12->host_ia32_pat; + } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl); + /* Set L1 segment info according to Intel SDM + 27.5.2 Loading Host Segment and Descriptor-Table Registers */ + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .selector = vmcs12->host_cs_selector, + .type = 11, + .present = 1, + .s = 1, + .g = 1 + }; + if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + seg.l = 1; + else + seg.db = 1; + vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + seg = (struct kvm_segment) { + .base = 0, + .limit = 0xFFFFFFFF, + .type = 3, + .present = 1, + .s = 1, + .db = 1, + .g = 1 + }; + seg.selector = vmcs12->host_ds_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + seg.selector = vmcs12->host_es_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + seg.selector = vmcs12->host_ss_selector; + vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + seg.selector = vmcs12->host_fs_selector; + seg.base = vmcs12->host_fs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + seg.selector = vmcs12->host_gs_selector; + seg.base = vmcs12->host_gs_base; + vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + seg = (struct kvm_segment) { + .base = vmcs12->host_tr_base, + .limit = 0x67, + .selector = vmcs12->host_tr_selector, + .type = 11, + .present = 1 + }; + vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d21bce50531..e5ca72a5cdb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -682,17 +682,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) */ } - /* - * Does the new cr3 value map to physical memory? (Note, we - * catch an invalid cr3 even in real-mode, because it would - * cause trouble later on when we turn on paging anyway.) - * - * A real CPU would silently accept an invalid cr3 and would - * attempt to use it - with largely undefined (and often hard - * to debug) behavior on the guest side. - */ - if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) - return 1; vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); vcpu->arch.mmu.new_cr3(vcpu); @@ -850,7 +839,8 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEATURE_CONTROL }; static unsigned num_msrs_to_save; @@ -1457,6 +1447,29 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) #endif } +static void kvm_gen_update_masterclock(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + int i; + struct kvm_vcpu *vcpu; + struct kvm_arch *ka = &kvm->arch; + + spin_lock(&ka->pvclock_gtod_sync_lock); + kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ + pvclock_update_vm_gtod_copy(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + + /* guest entries allowed */ + kvm_for_each_vcpu(i, vcpu, kvm) + clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + + spin_unlock(&ka->pvclock_gtod_sync_lock); +#endif +} + static int kvm_guest_time_update(struct kvm_vcpu *v) { unsigned long flags, this_tsc_khz; @@ -3806,6 +3819,7 @@ long kvm_arch_vm_ioctl(struct file *filp, delta = user_ns.clock - now_ns; local_irq_enable(); kvm->arch.kvmclock_offset = delta; + kvm_gen_update_masterclock(kvm); break; } case KVM_GET_CLOCK: { @@ -4955,6 +4969,97 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); +static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, + unsigned long *db) +{ + u32 dr6 = 0; + int i; + u32 enable, rwlen; + + enable = dr7; + rwlen = dr7 >> 16; + for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4) + if ((enable & 3) && (rwlen & 15) == type && db[i] == addr) + dr6 |= (1 << i); + return dr6; +} + +static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r) +{ + struct kvm_run *kvm_run = vcpu->run; + + /* + * Use the "raw" value to see if TF was passed to the processor. + * Note that the new value of the flags has not been saved yet. + * + * This is correct even for TF set by the guest, because "the + * processor will not generate this exception after the instruction + * that sets the TF flag". + */ + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + + if (unlikely(rflags & X86_EFLAGS_TF)) { + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1; + kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + } else { + vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; + /* + * "Certain debug exceptions may clear bit 0-3. The + * remaining contents of the DR6 register are never + * cleared by the processor". + */ + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= DR6_BS; + kvm_queue_exception(vcpu, DB_VECTOR); + } + } +} + +static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) +{ + struct kvm_run *kvm_run = vcpu->run; + unsigned long eip = vcpu->arch.emulate_ctxt.eip; + u32 dr6 = 0; + + if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && + (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { + dr6 = kvm_vcpu_check_hw_bp(eip, 0, + vcpu->arch.guest_debug_dr7, + vcpu->arch.eff_db); + + if (dr6 != 0) { + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.pc = kvm_rip_read(vcpu) + + get_segment_base(vcpu, VCPU_SREG_CS); + + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + return true; + } + } + + if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) { + dr6 = kvm_vcpu_check_hw_bp(eip, 0, + vcpu->arch.dr7, + vcpu->arch.db); + + if (dr6 != 0) { + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= dr6; + kvm_queue_exception(vcpu, DB_VECTOR); + *r = EMULATE_DONE; + return true; + } + } + + return false; +} + int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, @@ -4975,6 +5080,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (!(emulation_type & EMULTYPE_NO_DECODE)) { init_emulate_ctxt(vcpu); + + /* + * We will reenter on the same instruction since + * we do not set complete_userspace_io. This does not + * handle watchpoints yet, those would be handled in + * the emulate_ops. + */ + if (kvm_vcpu_check_breakpoint(vcpu, &r)) + return r; + ctxt->interruptibility = 0; ctxt->have_exception = false; ctxt->perm_ok = false; @@ -5031,17 +5146,18 @@ restart: inject_emulated_exception(vcpu); r = EMULATE_DONE; } else if (vcpu->arch.pio.count) { - if (!vcpu->arch.pio.in) + if (!vcpu->arch.pio.in) { + /* FIXME: return into emulator if single-stepping. */ vcpu->arch.pio.count = 0; - else { + } else { writeback = false; vcpu->arch.complete_userspace_io = complete_emulated_pio; } - r = EMULATE_DO_MMIO; + r = EMULATE_USER_EXIT; } else if (vcpu->mmio_needed) { if (!vcpu->mmio_is_write) writeback = false; - r = EMULATE_DO_MMIO; + r = EMULATE_USER_EXIT; vcpu->arch.complete_userspace_io = complete_emulated_mmio; } else if (r == EMULATION_RESTART) goto restart; @@ -5050,10 +5166,12 @@ restart: if (writeback) { toggle_interruptibility(vcpu, ctxt->interruptibility); - kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); + if (r == EMULATE_DONE) + kvm_vcpu_check_singlestep(vcpu, &r); + kvm_set_rflags(vcpu, ctxt->eflags); } else vcpu->arch.emulate_regs_need_sync_to_vcpu = true; @@ -5347,7 +5465,7 @@ static struct notifier_block pvclock_gtod_notifier = { int kvm_arch_init(void *opaque) { int r; - struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + struct kvm_x86_ops *ops = opaque; if (kvm_x86_ops) { printk(KERN_ERR "kvm: already loaded the other module\n"); @@ -5495,6 +5613,23 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) return 1; } +/* + * kvm_pv_kick_cpu_op: Kick a vcpu. + * + * @apicid - apicid of vcpu to be kicked. + */ +static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) +{ + struct kvm_lapic_irq lapic_irq; + + lapic_irq.shorthand = 0; + lapic_irq.dest_mode = 0; + lapic_irq.dest_id = apicid; + + lapic_irq.delivery_mode = APIC_DM_REMRD; + kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL); +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; @@ -5528,6 +5663,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; + case KVM_HC_KICK_CPU: + kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); + ret = 0; + break; default: ret = -KVM_ENOSYS; break; @@ -5689,29 +5828,6 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } -static void kvm_gen_update_masterclock(struct kvm *kvm) -{ -#ifdef CONFIG_X86_64 - int i; - struct kvm_vcpu *vcpu; - struct kvm_arch *ka = &kvm->arch; - - spin_lock(&ka->pvclock_gtod_sync_lock); - kvm_make_mclock_inprogress_request(kvm); - /* no guest entries from this point */ - pvclock_update_vm_gtod_copy(kvm); - - kvm_for_each_vcpu(i, vcpu, kvm) - set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); - - /* guest entries allowed */ - kvm_for_each_vcpu(i, vcpu, kvm) - clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); - - spin_unlock(&ka->pvclock_gtod_sync_lock); -#endif -} - static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; @@ -5950,6 +6066,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) kvm_apic_accept_events(vcpu); switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: + vcpu->arch.pv.pv_unhalted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; case KVM_MP_STATE_RUNNABLE: @@ -6061,6 +6178,8 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { vcpu->mmio_needed = 0; + + /* FIXME: return into emulator if single-stepping. */ if (vcpu->mmio_is_write) return 1; vcpu->mmio_read_completed = 1; @@ -6249,7 +6368,12 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { kvm_apic_accept_events(vcpu); - mp_state->mp_state = vcpu->arch.mp_state; + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && + vcpu->arch.pv.pv_unhalted) + mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + else + mp_state->mp_state = vcpu->arch.mp_state; + return 0; } @@ -6770,6 +6894,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; + vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -7019,6 +7144,15 @@ out_free: return -ENOMEM; } +void kvm_arch_memslots_updated(struct kvm *kvm) +{ + /* + * memslots->generation has been incremented. + * mmio generation may have reached its maximum value. + */ + kvm_mmu_invalidate_mmio_sptes(kvm); +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_userspace_memory_region *mem, @@ -7079,11 +7213,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, */ if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_slot_remove_write_access(kvm, mem->slot); - /* - * If memory slot is created, or moved, we need to clear all - * mmio sptes. - */ - kvm_mmu_invalidate_mmio_sptes(kvm); } void kvm_arch_flush_shadow_all(struct kvm *kvm) @@ -7103,6 +7232,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) || kvm_apic_has_events(vcpu) + || vcpu->arch.pv.pv_unhalted || atomic_read(&vcpu->arch.nmi_queued) || (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)); diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 25b7ae8d058..7609e0e421e 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -6,6 +6,7 @@ */ #include <asm/checksum.h> #include <linux/module.h> +#include <asm/smap.h> /** * csum_partial_copy_from_user - Copy and checksum from user space. @@ -52,8 +53,10 @@ csum_partial_copy_from_user(const void __user *src, void *dst, len -= 2; } } + stac(); isum = csum_partial_copy_generic((__force const void *)src, dst, len, isum, errp, NULL); + clac(); if (unlikely(*errp)) goto out_err; @@ -82,6 +85,8 @@ __wsum csum_partial_copy_to_user(const void *src, void __user *dst, int len, __wsum isum, int *errp) { + __wsum ret; + might_sleep(); if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { @@ -105,8 +110,11 @@ csum_partial_copy_to_user(const void *src, void __user *dst, } *errp = 0; - return csum_partial_copy_generic(src, (void __force *)dst, - len, isum, NULL, errp); + stac(); + ret = csum_partial_copy_generic(src, (void __force *)dst, + len, isum, NULL, errp); + clac(); + return ret; } EXPORT_SYMBOL(csum_partial_copy_to_user); diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 906fea31579..c905e89e19f 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -68,7 +68,7 @@ EXPORT_SYMBOL(copy_in_user); * Since protection fault in copy_from/to_user is not a normal situation, * it is not necessary to optimize tail handling. */ -unsigned long +__visible unsigned long copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest) { char c; diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 5d7e51f3fd2..533a85e3a07 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -1,10 +1,8 @@ # x86 Opcode Maps # # This is (mostly) based on following documentations. -# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2 -# (#325383-040US, October 2011) -# - Intel(R) Advanced Vector Extensions Programming Reference -# (#319433-011,JUNE 2011). +# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2C +# (#326018-047US, June 2013) # #<Opcode maps> # Table: table-name @@ -29,6 +27,7 @@ # - (F3): the last prefix is 0xF3 # - (F2): the last prefix is 0xF2 # - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) +# - (66&F2): Both 0x66 and 0xF2 prefixes are specified. Table: one byte opcode Referrer: @@ -246,8 +245,8 @@ c2: RETN Iw (f64) c3: RETN c4: LES Gz,Mp (i64) | VEX+2byte (Prefix) c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix) -c6: Grp11 Eb,Ib (1A) -c7: Grp11 Ev,Iz (1A) +c6: Grp11A Eb,Ib (1A) +c7: Grp11B Ev,Iz (1A) c8: ENTER Iw,Ib c9: LEAVE (d64) ca: RETF Iw @@ -293,8 +292,8 @@ ef: OUT DX,eAX # 0xf0 - 0xff f0: LOCK (Prefix) f1: -f2: REPNE (Prefix) -f3: REP/REPE (Prefix) +f2: REPNE (Prefix) | XACQUIRE (Prefix) +f3: REP/REPE (Prefix) | XRELEASE (Prefix) f4: HLT f5: CMC f6: Grp3_1 Eb (1A) @@ -326,7 +325,8 @@ AVXcode: 1 0a: 0b: UD2 (1B) 0c: -0d: NOP Ev | GrpP +# AMD's prefetch group. Intel supports prefetchw(/1) only. +0d: GrpP 0e: FEMMS # 3DNow! uses the last imm byte as opcode extension. 0f: 3DNow! Pq,Qq,Ib @@ -729,12 +729,12 @@ dc: VAESENC Vdq,Hdq,Wdq (66),(v1) dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) de: VAESDEC Vdq,Hdq,Wdq (66),(v1) df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1) -f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) -f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) +f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2) +f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2) f2: ANDN Gy,By,Ey (v) f3: Grp17 (1A) f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) -f6: MULX By,Gy,rDX,Ey (F2),(v) +f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) EndTable @@ -861,8 +861,8 @@ EndTable GrpTable: Grp7 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) -1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) -2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) 3: LIDT Ms 4: SMSW Mw/Rv 5: @@ -880,15 +880,21 @@ EndTable GrpTable: Grp9 1: CMPXCHG8B/16B Mq/Mdq 6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) -7: VMPTRST Mq | VMPTRST Mq (F3) +7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B) EndTable GrpTable: Grp10 EndTable -GrpTable: Grp11 -# Note: the operands are given by group opcode -0: MOV +# Grp11A and Grp11B are expressed as Grp11 in Intel SDM +GrpTable: Grp11A +0: MOV Eb,Ib +7: XABORT Ib (000),(11B) +EndTable + +GrpTable: Grp11B +0: MOV Eb,Iz +7: XBEGIN Jz (000),(11B) EndTable GrpTable: Grp12 diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 2ec29ac78ae..04664cdb7fd 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -78,8 +78,8 @@ __ref void *alloc_low_pages(unsigned int num) return __va(pfn << PAGE_SHIFT); } -/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ -#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE) +/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */ +#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); void __init early_alloc_pgt_buf(void) { diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0215e2c563e..799580cabc7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -487,7 +487,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) unsigned long offset; resource_size_t last_addr; unsigned int nrpages; - enum fixed_addresses idx0, idx; + enum fixed_addresses idx; int i, slot; WARN_ON(system_state != SYSTEM_BOOTING); @@ -540,8 +540,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) /* * Ok, go for it.. */ - idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; - idx = idx0; + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; while (nrpages > 0) { early_set_fixmap(idx, phys_addr, prot); phys_addr += PAGE_SIZE; diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 62c29a5bfe2..25e7e1372bb 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -112,11 +112,13 @@ static unsigned long mmap_legacy_base(void) */ void arch_pick_mmap_layout(struct mm_struct *mm) { + mm->mmap_legacy_base = mmap_legacy_base(); + mm->mmap_base = mmap_base(); + if (mmap_is_legacy()) { - mm->mmap_base = mmap_legacy_base(); + mm->mmap_base = mm->mmap_legacy_base; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index cdd0da9dd53..266ca912f62 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -146,6 +146,7 @@ int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { u64 start, end; + u32 hotpluggable; int node, pxm; if (srat_disabled()) @@ -154,7 +155,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) goto out_err_bad_srat; if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) goto out_err; - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) + hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; + if (hotpluggable && !save_add_info()) goto out_err; start = ma->base_address; @@ -174,9 +176,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", - node, pxm, - (unsigned long long) start, (unsigned long long) end - 1); + pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n", + node, pxm, + (unsigned long long) start, (unsigned long long) end - 1, + hotpluggable ? " hotplug" : ""); return 0; out_err_bad_srat: diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 48768df2471..6890d8498e0 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -403,7 +403,7 @@ static void nmi_cpu_down(void *dummy) nmi_cpu_shutdown(dummy); } -static int nmi_create_files(struct super_block *sb, struct dentry *root) +static int nmi_create_files(struct dentry *root) { unsigned int i; @@ -420,14 +420,14 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) continue; snprintf(buf, sizeof(buf), "%d", i); - dir = oprofilefs_mkdir(sb, root, buf); - oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); - oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); - oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); - oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); - oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); - oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); - oprofilefs_create_ulong(sb, dir, "extra", &counter_config[i].extra); + dir = oprofilefs_mkdir(root, buf); + oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled); + oprofilefs_create_ulong(dir, "event", &counter_config[i].event); + oprofilefs_create_ulong(dir, "count", &counter_config[i].count); + oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask); + oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel); + oprofilefs_create_ulong(dir, "user", &counter_config[i].user); + oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra); } return 0; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index b2b94438ff0..50d86c0e9ba 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -454,16 +454,16 @@ static void init_ibs(void) printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); } -static int (*create_arch_files)(struct super_block *sb, struct dentry *root); +static int (*create_arch_files)(struct dentry *root); -static int setup_ibs_files(struct super_block *sb, struct dentry *root) +static int setup_ibs_files(struct dentry *root) { struct dentry *dir; int ret = 0; /* architecture specific files */ if (create_arch_files) - ret = create_arch_files(sb, root); + ret = create_arch_files(root); if (ret) return ret; @@ -479,26 +479,26 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) ibs_config.max_cnt_op = 250000; if (ibs_caps & IBS_CAPS_FETCHSAM) { - dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); - oprofilefs_create_ulong(sb, dir, "enable", + dir = oprofilefs_mkdir(root, "ibs_fetch"); + oprofilefs_create_ulong(dir, "enable", &ibs_config.fetch_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", + oprofilefs_create_ulong(dir, "max_count", &ibs_config.max_cnt_fetch); - oprofilefs_create_ulong(sb, dir, "rand_enable", + oprofilefs_create_ulong(dir, "rand_enable", &ibs_config.rand_en); } if (ibs_caps & IBS_CAPS_OPSAM) { - dir = oprofilefs_mkdir(sb, root, "ibs_op"); - oprofilefs_create_ulong(sb, dir, "enable", + dir = oprofilefs_mkdir(root, "ibs_op"); + oprofilefs_create_ulong(dir, "enable", &ibs_config.op_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", + oprofilefs_create_ulong(dir, "max_count", &ibs_config.max_cnt_op); if (ibs_caps & IBS_CAPS_OPCNT) - oprofilefs_create_ulong(sb, dir, "dispatched_ops", + oprofilefs_create_ulong(dir, "dispatched_ops", &ibs_config.dispatched_ops); if (ibs_caps & IBS_CAPS_BRNTRGT) - oprofilefs_create_ulong(sb, dir, "branch_target", + oprofilefs_create_ulong(dir, "branch_target", &ibs_config.branch_target); } diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index d641897a1f4..b30e937689d 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -568,13 +568,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) */ if (bus) { struct pci_bus *child; - list_for_each_entry(child, &bus->children, node) { - struct pci_dev *self = child->self; - if (!self) - continue; - - pcie_bus_configure_settings(child, self->pcie_mpss); - } + list_for_each_entry(child, &bus->children, node) + pcie_bus_configure_settings(child); } if (bus && node != -1) { diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 94919e307f8..db6b1ab4325 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -210,6 +210,8 @@ static void pcibios_allocate_bridge_resources(struct pci_dev *dev) r = &dev->resource[idx]; if (!r->flags) continue; + if (r->parent) /* Already allocated */ + continue; if (!r->start || pci_claim_resource(dev, idx) < 0) { /* * Something is wrong with the region. @@ -318,6 +320,8 @@ static void pcibios_allocate_dev_rom_resource(struct pci_dev *dev) r = &dev->resource[PCI_ROM_RESOURCE]; if (!r->flags || !r->start) return; + if (r->parent) /* Already allocated */ + return; if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) { r->end -= r->start; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 082e8812971..5596c7bdd32 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -700,7 +700,7 @@ int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end, if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed) return -ENODEV; - if (start > end) + if (start > end || !addr) return -EINVAL; mutex_lock(&pci_mmcfg_lock); @@ -716,11 +716,6 @@ int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end, return -EEXIST; } - if (!addr) { - mutex_unlock(&pci_mmcfg_lock); - return -EINVAL; - } - rc = -EBUSY; cfg = pci_mmconfig_alloc(seg, start, end, addr); if (cfg == NULL) { diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c index 6eb18c42a28..903fded5078 100644 --- a/arch/x86/pci/mrst.c +++ b/arch/x86/pci/mrst.c @@ -23,11 +23,11 @@ #include <linux/ioport.h> #include <linux/init.h> #include <linux/dmi.h> +#include <linux/acpi.h> +#include <linux/io.h> +#include <linux/smp.h> -#include <asm/acpi.h> #include <asm/segment.h> -#include <asm/io.h> -#include <asm/smp.h> #include <asm/pci_x86.h> #include <asm/hw_irq.h> #include <asm/io_apic.h> @@ -43,7 +43,7 @@ #define PCI_FIXED_BAR_4_SIZE 0x14 #define PCI_FIXED_BAR_5_SIZE 0x1c -static int pci_soc_mode = 0; +static int pci_soc_mode; /** * fixed_bar_cap - return the offset of the fixed BAR cap if found @@ -141,7 +141,8 @@ static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn, */ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) { - /* This is a workaround for A0 LNC bug where PCI status register does + /* + * This is a workaround for A0 LNC bug where PCI status register does * not have new CAP bit set. can not be written by SW either. * * PCI header type in real LNC indicates a single function device, this @@ -154,7 +155,7 @@ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) || devfn == PCI_DEVFN(0, 0) || devfn == PCI_DEVFN(3, 0))) return 1; - return 0; /* langwell on others */ + return 0; /* Langwell on others */ } static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, @@ -172,7 +173,8 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, { int offset; - /* On MRST, there is no PCI ROM BAR, this will cause a subsequent read + /* + * On MRST, there is no PCI ROM BAR, this will cause a subsequent read * to ROM BAR return 0 then being ignored. */ if (where == PCI_ROM_ADDRESS) @@ -210,7 +212,8 @@ static int mrst_pci_irq_enable(struct pci_dev *dev) pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); - /* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to + /* + * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to * IOAPIC RTE entries, so we just enable RTE for the device. */ irq_attr.ioapic = mp_find_ioapic(dev->irq); @@ -235,7 +238,7 @@ struct pci_ops pci_mrst_ops = { */ int __init pci_mrst_init(void) { - printk(KERN_INFO "Intel MID platform detected, using MID PCI ops\n"); + pr_info("Intel MID platform detected, using MID PCI ops\n"); pci_mmcfg_late_init(); pcibios_enable_irq = mrst_pci_irq_enable; pci_root_ops = pci_mrst_ops; @@ -244,17 +247,21 @@ int __init pci_mrst_init(void) return 1; } -/* Langwell devices are not true pci devices, they are not subject to 10 ms - * d3 to d0 delay required by pci spec. +/* + * Langwell devices are not true PCI devices; they are not subject to 10 ms + * d3 to d0 delay required by PCI spec. */ static void pci_d3delay_fixup(struct pci_dev *dev) { - /* PCI fixups are effectively decided compile time. If we have a dual - SoC/non-SoC kernel we don't want to mangle d3 on non SoC devices */ - if (!pci_soc_mode) - return; - /* true pci devices in lincroft should allow type 1 access, the rest - * are langwell fake pci devices. + /* + * PCI fixups are effectively decided compile time. If we have a dual + * SoC/non-SoC kernel we don't want to mangle d3 on non-SoC devices. + */ + if (!pci_soc_mode) + return; + /* + * True PCI devices in Lincroft should allow type 1 access, the rest + * are Langwell fake PCI devices. */ if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID)) return; diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 47fe66fe61f..3ca5957b7a3 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -20,7 +20,7 @@ #include <linux/intel_pmic_gpio.h> #include <linux/spi/spi.h> #include <linux/i2c.h> -#include <linux/i2c/pca953x.h> +#include <linux/platform_data/pca953x.h> #include <linux/gpio_keys.h> #include <linux/input.h> #include <linux/platform_device.h> diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 1cf5b300305..424f4c97a44 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -25,10 +25,10 @@ #include <asm/cpu.h> #ifdef CONFIG_X86_32 -unsigned long saved_context_ebx; -unsigned long saved_context_esp, saved_context_ebp; -unsigned long saved_context_esi, saved_context_edi; -unsigned long saved_context_eflags; +__visible unsigned long saved_context_ebx; +__visible unsigned long saved_context_esp, saved_context_ebp; +__visible unsigned long saved_context_esi, saved_context_edi; +__visible unsigned long saved_context_eflags; #endif struct saved_context saved_context; diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index a0fde91c16c..304fca20d96 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -20,26 +20,26 @@ #include <asm/suspend.h> /* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; +extern __visible const void __nosave_begin, __nosave_end; /* Defined in hibernate_asm_64.S */ -extern int restore_image(void); +extern asmlinkage int restore_image(void); /* * Address to jump to in the last phase of restore in order to get to the image * kernel's text (this value is passed in the image header). */ -unsigned long restore_jump_address; +unsigned long restore_jump_address __visible; /* * Value of the cr3 register from before the hibernation (this value is passed * in the image header). */ -unsigned long restore_cr3; +unsigned long restore_cr3 __visible; -pgd_t *temp_level4_pgt; +pgd_t *temp_level4_pgt __visible; -void *relocated_restore_code; +void *relocated_restore_code __visible; static void *alloc_pgt_page(void *context) { diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index e6773dc8ac4..093a892026f 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -68,7 +68,7 @@ BEGIN { lprefix1_expr = "\\((66|!F3)\\)" lprefix2_expr = "\\(F3\\)" - lprefix3_expr = "\\((F2|!F3)\\)" + lprefix3_expr = "\\((F2|!F3|66\\&F2)\\)" lprefix_expr = "\\((66|F2|F3)\\)" max_lprefix = 4 @@ -83,6 +83,8 @@ BEGIN { prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" prefix_num["REPNE"] = "INAT_PFX_REPNE" prefix_num["REP/REPE"] = "INAT_PFX_REPE" + prefix_num["XACQUIRE"] = "INAT_PFX_REPNE" + prefix_num["XRELEASE"] = "INAT_PFX_REPE" prefix_num["LOCK"] = "INAT_PFX_LOCK" prefix_num["SEG=CS"] = "INAT_PFX_CS" prefix_num["SEG=DS"] = "INAT_PFX_DS" diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index c74436e687b..72074d52840 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode) cycle_t ret; u64 last; u32 version; - u32 migrate_count; u8 flags; unsigned cpu, cpu1; /* - * When looping to get a consistent (time-info, tsc) pair, we - * also need to deal with the possibility we can switch vcpus, - * so make sure we always re-fetch time-info for the current vcpu. + * Note: hypervisor must guarantee that: + * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. + * 2. that per-CPU pvclock time info is updated if the + * underlying CPU changes. + * 3. that version is increased whenever underlying CPU + * changes. + * */ do { cpu = __getcpu() & VGETCPU_CPU_MASK; @@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode) pvti = get_pvti(cpu); - migrate_count = pvti->migrate_count; - version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); /* @@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode) cpu1 = __getcpu() & VGETCPU_CPU_MASK; } while (unlikely(cpu != cpu1 || (pvti->pvti.version & 1) || - pvti->pvti.version != version || - pvti->migrate_count != migrate_count)); + pvti->pvti.version != version)); if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) *mode = VCLOCK_NONE; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 193097ef3d7..2fc216dfbd9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -427,8 +427,7 @@ static void __init xen_init_cpuid_mask(void) if (!xen_initial_domain()) cpuid_leaf1_edx_mask &= - ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ - (1 << X86_FEATURE_ACPI)); /* disable ACPI */ + ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */ cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32)); @@ -735,8 +734,7 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, addr = (unsigned long)xen_int3; else if (addr == (unsigned long)stack_segment) addr = (unsigned long)xen_stack_segment; - else if (addr == (unsigned long)double_fault || - addr == (unsigned long)nmi) { + else if (addr == (unsigned long)double_fault) { /* Don't need to handle these */ return 0; #ifdef CONFIG_X86_MCE @@ -747,7 +745,12 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, */ ; #endif - } else { + } else if (addr == (unsigned long)nmi) + /* + * Use the native version as well. + */ + ; + else { /* Some other trap using IST? */ if (WARN_ON(val->ist != 0)) return 0; @@ -1710,6 +1713,8 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_shared_info(); + xen_panic_handler_init(); + if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; xen_hvm_smp_init(); @@ -1720,15 +1725,12 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_mmu_ops(); } -static bool __init xen_hvm_platform(void) +static uint32_t __init xen_hvm_platform(void) { if (xen_pv_domain()) - return false; - - if (!xen_cpuid_base()) - return false; + return 0; - return true; + return xen_cpuid_base(); } bool xen_hvm_need_lapic(void) diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 01a4dc015ae..0da7f863056 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -47,23 +47,18 @@ static void xen_restore_fl(unsigned long flags) /* convert from IF type flag */ flags = !(flags & X86_EFLAGS_IF); - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ + /* See xen_irq_enable() for why preemption must be disabled. */ preempt_disable(); vcpu = this_cpu_read(xen_vcpu); vcpu->evtchn_upcall_mask = flags; - preempt_enable_no_resched(); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ if (flags == 0) { - preempt_check_resched(); barrier(); /* unmask then check (avoid races) */ if (unlikely(vcpu->evtchn_upcall_pending)) xen_force_evtchn_callback(); - } + preempt_enable(); + } else + preempt_enable_no_resched(); } PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); @@ -82,10 +77,12 @@ static void xen_irq_enable(void) { struct vcpu_info *vcpu; - /* We don't need to worry about being preempted here, since - either a) interrupts are disabled, so no preemption, or b) - the caller is confused and is trying to re-enable interrupts - on an indeterminate processor. */ + /* + * We may be preempted as soon as vcpu->evtchn_upcall_mask is + * cleared, so disable preemption to ensure we check for + * events on the VCPU we are still running on. + */ + preempt_disable(); vcpu = this_cpu_read(xen_vcpu); vcpu->evtchn_upcall_mask = 0; @@ -96,6 +93,8 @@ static void xen_irq_enable(void) barrier(); /* unmask then check (avoid races) */ if (unlikely(vcpu->evtchn_upcall_pending)) xen_force_evtchn_callback(); + + preempt_enable(); } PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 95fb2aa5927..0d4ec35895d 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -161,6 +161,7 @@ #include <asm/xen/page.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> +#include <xen/balloon.h> #include <xen/grant_table.h> #include "multicalls.h" @@ -967,7 +968,10 @@ int m2p_remove_override(struct page *page, if (kmap_op != NULL) { if (!PageHighMem(page)) { struct multicall_space mcs; - struct gnttab_unmap_grant_ref *unmap_op; + struct gnttab_unmap_and_replace *unmap_op; + struct page *scratch_page = get_balloon_scratch_page(); + unsigned long scratch_page_address = (unsigned long) + __va(page_to_pfn(scratch_page) << PAGE_SHIFT); /* * It might be that we queued all the m2p grant table @@ -990,21 +994,25 @@ int m2p_remove_override(struct page *page, } mcs = xen_mc_entry( - sizeof(struct gnttab_unmap_grant_ref)); + sizeof(struct gnttab_unmap_and_replace)); unmap_op = mcs.args; unmap_op->host_addr = kmap_op->host_addr; + unmap_op->new_addr = scratch_page_address; unmap_op->handle = kmap_op->handle; - unmap_op->dev_bus_addr = 0; MULTI_grant_table_op(mcs.mc, - GNTTABOP_unmap_grant_ref, unmap_op, 1); + GNTTABOP_unmap_and_replace, unmap_op, 1); xen_mc_issue(PARAVIRT_LAZY_MMU); - set_pte_at(&init_mm, address, ptep, - pfn_pte(pfn, PAGE_KERNEL)); - __flush_tlb_single(address); + mcs = __xen_mc_entry(0); + MULTI_update_va_mapping(mcs.mc, scratch_page_address, + pfn_pte(page_to_pfn(get_balloon_scratch_page()), + PAGE_KERNEL_RO), 0); + xen_mc_issue(PARAVIRT_LAZY_MMU); + kmap_op->host_addr = 0; + put_balloon_scratch_page(); } } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 056d11faef2..09f3059cb00 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -33,6 +33,9 @@ /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +#ifdef CONFIG_X86_64 +extern const char nmi[]; +#endif extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); @@ -215,13 +218,19 @@ static void __init xen_set_identity_and_release_chunk( unsigned long pfn; /* - * If the PFNs are currently mapped, the VA mapping also needs - * to be updated to be 1:1. + * If the PFNs are currently mapped, clear the mappings + * (except for the ISA region which must be 1:1 mapped) to + * release the refcounts (in Xen) on the original frames. */ - for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) + for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { + pte_t pte = __pte_ma(0); + + if (pfn < PFN_UP(ISA_END_ADDRESS)) + pte = mfn_pte(pfn, PAGE_KERNEL_IO); + (void)HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - mfn_pte(pfn, PAGE_KERNEL_IO), 0); + (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0); + } if (start_pfn < nr_pages) *released += xen_release_chunk( @@ -313,6 +322,17 @@ static void xen_align_and_add_e820_region(u64 start, u64 size, int type) e820_add_region(start, end - start, type); } +void xen_ignore_unusable(struct e820entry *list, size_t map_size) +{ + struct e820entry *entry; + unsigned int i; + + for (i = 0, entry = list; i < map_size; i++, entry++) { + if (entry->type == E820_UNUSABLE) + entry->type = E820_RAM; + } +} + /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ @@ -353,6 +373,17 @@ char * __init xen_memory_setup(void) } BUG_ON(rc); + /* + * Xen won't allow a 1:1 mapping to be created to UNUSABLE + * regions, so if we're using the machine memory map leave the + * region as RAM as it is in the pseudo-physical map. + * + * UNUSABLE regions in domUs are not handled and will need + * a patch in the future. + */ + if (xen_initial_domain()) + xen_ignore_unusable(map, memmap.nr_entries); + /* Make sure the Xen-supplied memory map is well-ordered. */ sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); @@ -525,7 +556,13 @@ void xen_enable_syscall(void) } #endif /* CONFIG_X86_64 */ } - +void __cpuinit xen_enable_nmi(void) +{ +#ifdef CONFIG_X86_64 + if (register_callback(CALLBACKTYPE_nmi, nmi)) + BUG(); +#endif +} void __init xen_arch_setup(void) { xen_panic_handler_init(); @@ -543,7 +580,7 @@ void __init xen_arch_setup(void) xen_enable_sysenter(); xen_enable_syscall(); - + xen_enable_nmi(); #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index ca92754eb84..9235842cd76 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -279,6 +279,7 @@ static void __init xen_smp_prepare_boot_cpu(void) xen_filter_cpu_maps(); xen_setup_vcpu_info_placement(); + xen_init_spinlocks(); } static void __init xen_smp_prepare_cpus(unsigned int max_cpus) @@ -572,6 +573,12 @@ static inline int xen_map_vector(int vector) case IRQ_WORK_VECTOR: xen_vector = XEN_IRQ_WORK_VECTOR; break; +#ifdef CONFIG_X86_64 + case NMI_VECTOR: + case APIC_DM_NMI: /* Some use that instead of NMI_VECTOR */ + xen_vector = XEN_NMI_VECTOR; + break; +#endif default: xen_vector = -1; printk(KERN_ERR "xen: vector 0x%x is not implemented\n", @@ -680,7 +687,6 @@ void __init xen_smp_init(void) { smp_ops = xen_smp_ops; xen_fill_possible_map(); - xen_init_spinlocks(); } static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) @@ -694,8 +700,15 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) { int rc; - rc = native_cpu_up(cpu, tidle); - WARN_ON (xen_smp_intr_init(cpu)); + /* + * xen_smp_intr_init() needs to run before native_cpu_up() + * so that IPI vectors are set up on the booting CPU before + * it is marked online in native_cpu_up(). + */ + rc = xen_smp_intr_init(cpu); + WARN_ON(rc); + if (!rc) + rc = native_cpu_up(cpu, tidle); return rc; } diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index cf3caee356b..0438b9324a7 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -17,45 +17,44 @@ #include "xen-ops.h" #include "debugfs.h" -#ifdef CONFIG_XEN_DEBUG_FS -static struct xen_spinlock_stats -{ - u64 taken; - u32 taken_slow; - u32 taken_slow_nested; - u32 taken_slow_pickup; - u32 taken_slow_spurious; - u32 taken_slow_irqenable; +enum xen_contention_stat { + TAKEN_SLOW, + TAKEN_SLOW_PICKUP, + TAKEN_SLOW_SPURIOUS, + RELEASED_SLOW, + RELEASED_SLOW_KICKED, + NR_CONTENTION_STATS +}; - u64 released; - u32 released_slow; - u32 released_slow_kicked; +#ifdef CONFIG_XEN_DEBUG_FS #define HISTO_BUCKETS 30 - u32 histo_spin_total[HISTO_BUCKETS+1]; - u32 histo_spin_spinning[HISTO_BUCKETS+1]; +static struct xen_spinlock_stats +{ + u32 contention_stats[NR_CONTENTION_STATS]; u32 histo_spin_blocked[HISTO_BUCKETS+1]; - - u64 time_total; - u64 time_spinning; u64 time_blocked; } spinlock_stats; static u8 zero_stats; -static unsigned lock_timeout = 1 << 10; -#define TIMEOUT lock_timeout - static inline void check_zero(void) { - if (unlikely(zero_stats)) { - memset(&spinlock_stats, 0, sizeof(spinlock_stats)); - zero_stats = 0; + u8 ret; + u8 old = ACCESS_ONCE(zero_stats); + if (unlikely(old)) { + ret = cmpxchg(&zero_stats, old, 0); + /* This ensures only one fellow resets the stat */ + if (ret == old) + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); } } -#define ADD_STATS(elem, val) \ - do { check_zero(); spinlock_stats.elem += (val); } while(0) +static inline void add_stats(enum xen_contention_stat var, u32 val) +{ + check_zero(); + spinlock_stats.contention_stats[var] += val; +} static inline u64 spin_time_start(void) { @@ -74,22 +73,6 @@ static void __spin_time_accum(u64 delta, u32 *array) array[HISTO_BUCKETS]++; } -static inline void spin_time_accum_spinning(u64 start) -{ - u32 delta = xen_clocksource_read() - start; - - __spin_time_accum(delta, spinlock_stats.histo_spin_spinning); - spinlock_stats.time_spinning += delta; -} - -static inline void spin_time_accum_total(u64 start) -{ - u32 delta = xen_clocksource_read() - start; - - __spin_time_accum(delta, spinlock_stats.histo_spin_total); - spinlock_stats.time_total += delta; -} - static inline void spin_time_accum_blocked(u64 start) { u32 delta = xen_clocksource_read() - start; @@ -99,19 +82,15 @@ static inline void spin_time_accum_blocked(u64 start) } #else /* !CONFIG_XEN_DEBUG_FS */ #define TIMEOUT (1 << 10) -#define ADD_STATS(elem, val) do { (void)(val); } while(0) +static inline void add_stats(enum xen_contention_stat var, u32 val) +{ +} static inline u64 spin_time_start(void) { return 0; } -static inline void spin_time_accum_total(u64 start) -{ -} -static inline void spin_time_accum_spinning(u64 start) -{ -} static inline void spin_time_accum_blocked(u64 start) { } @@ -134,227 +113,123 @@ typedef u16 xen_spinners_t; asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory"); #endif -struct xen_spinlock { - unsigned char lock; /* 0 -> free; 1 -> locked */ - xen_spinners_t spinners; /* count of waiting cpus */ +struct xen_lock_waiting { + struct arch_spinlock *lock; + __ticket_t want; }; -static int xen_spin_is_locked(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - return xl->lock != 0; -} - -static int xen_spin_is_contended(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - /* Not strictly true; this is only the count of contended - lock-takers entering the slow path. */ - return xl->spinners != 0; -} - -static int xen_spin_trylock(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - u8 old = 1; - - asm("xchgb %b0,%1" - : "+q" (old), "+m" (xl->lock) : : "memory"); - - return old == 0; -} - -static DEFINE_PER_CPU(char *, irq_name); static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); - -/* - * Mark a cpu as interested in a lock. Returns the CPU's previous - * lock of interest, in case we got preempted by an interrupt. - */ -static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) -{ - struct xen_spinlock *prev; - - prev = __this_cpu_read(lock_spinners); - __this_cpu_write(lock_spinners, xl); - - wmb(); /* set lock of interest before count */ - - inc_spinners(xl); - - return prev; -} - -/* - * Mark a cpu as no longer interested in a lock. Restores previous - * lock of interest (NULL for none). - */ -static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) -{ - dec_spinners(xl); - wmb(); /* decrement count before restoring lock */ - __this_cpu_write(lock_spinners, prev); -} +static DEFINE_PER_CPU(char *, irq_name); +static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); +static cpumask_t waiting_cpus; -static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) +static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - struct xen_spinlock *prev; int irq = __this_cpu_read(lock_kicker_irq); - int ret; + struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting); + int cpu = smp_processor_id(); u64 start; + unsigned long flags; /* If kicker interrupts not initialized yet, just spin */ if (irq == -1) - return 0; + return; start = spin_time_start(); - /* announce we're spinning */ - prev = spinning_lock(xl); + /* + * Make sure an interrupt handler can't upset things in a + * partially setup state. + */ + local_irq_save(flags); + /* + * We don't really care if we're overwriting some other + * (lock,want) pair, as that would mean that we're currently + * in an interrupt context, and the outer context had + * interrupts enabled. That has already kicked the VCPU out + * of xen_poll_irq(), so it will just return spuriously and + * retry with newly setup (lock,want). + * + * The ordering protocol on this is that the "lock" pointer + * may only be set non-NULL if the "want" ticket is correct. + * If we're updating "want", we must first clear "lock". + */ + w->lock = NULL; + smp_wmb(); + w->want = want; + smp_wmb(); + w->lock = lock; - ADD_STATS(taken_slow, 1); - ADD_STATS(taken_slow_nested, prev != NULL); + /* This uses set_bit, which atomic and therefore a barrier */ + cpumask_set_cpu(cpu, &waiting_cpus); + add_stats(TAKEN_SLOW, 1); - do { - unsigned long flags; + /* clear pending */ + xen_clear_irq_pending(irq); - /* clear pending */ - xen_clear_irq_pending(irq); + /* Only check lock once pending cleared */ + barrier(); - /* check again make sure it didn't become free while - we weren't looking */ - ret = xen_spin_trylock(lock); - if (ret) { - ADD_STATS(taken_slow_pickup, 1); + /* + * Mark entry to slowpath before doing the pickup test to make + * sure we don't deadlock with an unlocker. + */ + __ticket_enter_slowpath(lock); - /* - * If we interrupted another spinlock while it - * was blocking, make sure it doesn't block - * without rechecking the lock. - */ - if (prev != NULL) - xen_set_irq_pending(irq); - goto out; - } + /* + * check again make sure it didn't become free while + * we weren't looking + */ + if (ACCESS_ONCE(lock->tickets.head) == want) { + add_stats(TAKEN_SLOW_PICKUP, 1); + goto out; + } - flags = arch_local_save_flags(); - if (irq_enable) { - ADD_STATS(taken_slow_irqenable, 1); - raw_local_irq_enable(); - } + /* Allow interrupts while blocked */ + local_irq_restore(flags); - /* - * Block until irq becomes pending. If we're - * interrupted at this point (after the trylock but - * before entering the block), then the nested lock - * handler guarantees that the irq will be left - * pending if there's any chance the lock became free; - * xen_poll_irq() returns immediately if the irq is - * pending. - */ - xen_poll_irq(irq); + /* + * If an interrupt happens here, it will leave the wakeup irq + * pending, which will cause xen_poll_irq() to return + * immediately. + */ - raw_local_irq_restore(flags); + /* Block until irq becomes pending (or perhaps a spurious wakeup) */ + xen_poll_irq(irq); + add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq)); - ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); - } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ + local_irq_save(flags); kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); - out: - unspinning_lock(xl, prev); - spin_time_accum_blocked(start); - - return ret; -} - -static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - unsigned timeout; - u8 oldval; - u64 start_spin; - - ADD_STATS(taken, 1); - - start_spin = spin_time_start(); - - do { - u64 start_spin_fast = spin_time_start(); - - timeout = TIMEOUT; - - asm("1: xchgb %1,%0\n" - " testb %1,%1\n" - " jz 3f\n" - "2: rep;nop\n" - " cmpb $0,%0\n" - " je 1b\n" - " dec %2\n" - " jnz 2b\n" - "3:\n" - : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) - : "1" (1) - : "memory"); + cpumask_clear_cpu(cpu, &waiting_cpus); + w->lock = NULL; - spin_time_accum_spinning(start_spin_fast); + local_irq_restore(flags); - } while (unlikely(oldval != 0 && - (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); - - spin_time_accum_total(start_spin); -} - -static void xen_spin_lock(struct arch_spinlock *lock) -{ - __xen_spin_lock(lock, false); -} - -static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags) -{ - __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); + spin_time_accum_blocked(start); } +PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning); -static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) +static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) { int cpu; - ADD_STATS(released_slow, 1); + add_stats(RELEASED_SLOW, 1); + + for_each_cpu(cpu, &waiting_cpus) { + const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu); - for_each_online_cpu(cpu) { - /* XXX should mix up next cpu selection */ - if (per_cpu(lock_spinners, cpu) == xl) { - ADD_STATS(released_slow_kicked, 1); + /* Make sure we read lock before want */ + if (ACCESS_ONCE(w->lock) == lock && + ACCESS_ONCE(w->want) == next) { + add_stats(RELEASED_SLOW_KICKED, 1); xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); + break; } } } -static void xen_spin_unlock(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - ADD_STATS(released, 1); - - smp_wmb(); /* make sure no writes get moved after unlock */ - xl->lock = 0; /* release lock */ - - /* - * Make sure unlock happens before checking for waiting - * spinners. We need a strong barrier to enforce the - * write-read ordering to different memory locations, as the - * CPU makes no implied guarantees about their ordering. - */ - mb(); - - if (unlikely(xl->spinners)) - xen_spin_unlock_slow(xl); -} - static irqreturn_t dummy_handler(int irq, void *dev_id) { BUG(); @@ -408,6 +283,8 @@ void xen_uninit_lock_cpu(int cpu) per_cpu(irq_name, cpu) = NULL; } +static bool xen_pvspin __initdata = true; + void __init xen_init_spinlocks(void) { /* @@ -417,15 +294,23 @@ void __init xen_init_spinlocks(void) if (xen_hvm_domain()) return; - BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t)); + if (!xen_pvspin) { + printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); + return; + } - pv_lock_ops.spin_is_locked = xen_spin_is_locked; - pv_lock_ops.spin_is_contended = xen_spin_is_contended; - pv_lock_ops.spin_lock = xen_spin_lock; - pv_lock_ops.spin_lock_flags = xen_spin_lock_flags; - pv_lock_ops.spin_trylock = xen_spin_trylock; - pv_lock_ops.spin_unlock = xen_spin_unlock; + static_key_slow_inc(¶virt_ticketlocks_enabled); + + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); + pv_lock_ops.unlock_kick = xen_unlock_kick; +} + +static __init int xen_parse_nopvspin(char *arg) +{ + xen_pvspin = false; + return 0; } +early_param("xen_nopvspin", xen_parse_nopvspin); #ifdef CONFIG_XEN_DEBUG_FS @@ -442,37 +327,21 @@ static int __init xen_spinlock_debugfs(void) debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); - debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout); - - debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken); debugfs_create_u32("taken_slow", 0444, d_spin_debug, - &spinlock_stats.taken_slow); - debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug, - &spinlock_stats.taken_slow_nested); + &spinlock_stats.contention_stats[TAKEN_SLOW]); debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, - &spinlock_stats.taken_slow_pickup); + &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, - &spinlock_stats.taken_slow_spurious); - debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug, - &spinlock_stats.taken_slow_irqenable); + &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]); - debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released); debugfs_create_u32("released_slow", 0444, d_spin_debug, - &spinlock_stats.released_slow); + &spinlock_stats.contention_stats[RELEASED_SLOW]); debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, - &spinlock_stats.released_slow_kicked); + &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); - debugfs_create_u64("time_spinning", 0444, d_spin_debug, - &spinlock_stats.time_spinning); debugfs_create_u64("time_blocked", 0444, d_spin_debug, &spinlock_stats.time_blocked); - debugfs_create_u64("time_total", 0444, d_spin_debug, - &spinlock_stats.time_total); - debugfs_create_u32_array("histo_total", 0444, d_spin_debug, - spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); - debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, - spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 86782c5d7e2..95f8c614232 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -105,9 +105,9 @@ static inline void __init xen_init_apic(void) /* Declare an asm function, along with symbols needed to make it inlineable */ #define DECL_ASM(ret, name, ...) \ - ret name(__VA_ARGS__); \ - extern char name##_end[]; \ - extern char name##_reloc[] \ + __visible ret name(__VA_ARGS__); \ + extern char name##_end[] __visible; \ + extern char name##_reloc[] __visible DECL_ASM(void, xen_irq_enable_direct, void); DECL_ASM(void, xen_irq_disable_direct, void); @@ -115,11 +115,11 @@ DECL_ASM(unsigned long, xen_save_fl_direct, void); DECL_ASM(void, xen_restore_fl_direct, unsigned long); /* These are not functions, and cannot be called normally */ -void xen_iret(void); -void xen_sysexit(void); -void xen_sysret32(void); -void xen_sysret64(void); -void xen_adjust_exception_frame(void); +__visible void xen_iret(void); +__visible void xen_sysexit(void); +__visible void xen_sysret32(void); +__visible void xen_sysret64(void); +__visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); |