diff options
Diffstat (limited to 'arch/x86')
285 files changed, 17842 insertions, 8781 deletions
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore index 7cab8c08e6d..aff152c87cf 100644 --- a/arch/x86/.gitignore +++ b/arch/x86/.gitignore @@ -1,4 +1,6 @@ boot/compressed/vmlinux tools/test_get_len tools/insn_sanity +purgatory/kexec-purgatory.c +purgatory/purgatory.ro diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index e5287d8517a..3942f74c92d 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -16,3 +16,5 @@ obj-$(CONFIG_IA32_EMULATION) += ia32/ obj-y += platform/ obj-y += net/ + +obj-$(CONFIG_KEXEC_FILE) += purgatory/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d24887b645d..f2327e88e07 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -21,14 +21,15 @@ config X86_64 ### Arch settings config X86 def_bool y + select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_FAST_MULTIPLIER select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_INT128 if X86_64 - select ARCH_WANTS_PROT_NUMA_PROT_NONE select HAVE_IDE select HAVE_OPROFILE select HAVE_PCSPKR_PLATFORM @@ -54,7 +55,6 @@ config X86 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_FP_TEST - select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_SYSCALL_TRACEPOINTS select SYSCTL_EXCEPTION_TRACE select HAVE_KVM @@ -96,6 +96,7 @@ config X86 select IRQ_FORCED_THREADING select HAVE_BPF_JIT if X86_64 select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select ARCH_HAS_SG_CHAIN select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP @@ -109,9 +110,9 @@ config X86 select CLOCKSOURCE_WATCHDOG select GENERIC_CLOCKEVENTS select ARCH_CLOCKSOURCE_DATA + select CLOCKSOURCE_VALIDATE_LAST_CYCLE select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) select GENERIC_TIME_VSYSCALL - select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select HAVE_CONTEXT_TRACKING if X86_64 @@ -132,6 +133,10 @@ config X86 select GENERIC_CPU_AUTOPROBE select HAVE_ARCH_AUDITSYSCALL select ARCH_SUPPORTS_ATOMIC_RMW + select HAVE_ACPI_APEI if ACPI + select HAVE_ACPI_APEI_NMI if ACPI + select ACPI_LEGACY_TABLES_LOOKUP if ACPI + select X86_FEATURE_NAMES if PROC_FS config INSTRUCTION_DECODER def_bool y @@ -309,6 +314,17 @@ config SMP If you don't know what to do here, say N. +config X86_FEATURE_NAMES + bool "Processor feature human-readable names" if EMBEDDED + default y + ---help--- + This option compiles in a table of x86 feature bits and corresponding + names. This is required to support /proc/cpuinfo and a few kernel + messages. You can disable this to save space, at the expense of + making those few kernel messages show numeric feature bits instead. + + If in doubt, say Y. + config X86_X2APIC bool "Support x2apic" depends on X86_LOCAL_APIC && X86_64 && IRQ_REMAP @@ -430,6 +446,7 @@ config X86_INTEL_CE bool "CE4100 TV platform" depends on PCI depends on PCI_GODIRECT + depends on X86_IO_APIC depends on X86_32 depends on X86_EXTENDED_PLATFORM select X86_REBOOTFIXUPS @@ -474,6 +491,36 @@ config X86_INTEL_LPSS things like clock tree (common clock framework) and pincontrol which are needed by the LPSS peripheral drivers. +config IOSF_MBI + tristate "Intel SoC IOSF Sideband support for SoC platforms" + depends on PCI + ---help--- + This option enables sideband register access support for Intel SoC + platforms. On these platforms the IOSF sideband is used in lieu of + MSR's for some register accesses, mostly but not limited to thermal + and power. Drivers may query the availability of this device to + determine if they need the sideband in order to work on these + platforms. The sideband is available on the following SoC products. + This list is not meant to be exclusive. + - BayTrail + - Braswell + - Quark + + You should say Y if you are running a kernel on one of these SoC's. + +config IOSF_MBI_DEBUG + bool "Enable IOSF sideband access through debugfs" + depends on IOSF_MBI && DEBUG_FS + ---help--- + Select this option to expose the IOSF sideband access registers (MCR, + MDR, MCRX) through debugfs to write and read register information from + different units on the SoC. This is most useful for obtaining device + state information for debug and analysis. As this is a general access + mechanism, users of this option would have specific knowledge of the + device they want to access. + + If you don't require the option or are in doubt, say N. + config X86_RDC321X bool "RDC R-321x SoC" depends on X86_32 @@ -836,6 +883,7 @@ config X86_IO_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ + select IRQ_DOMAIN config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" @@ -1523,6 +1571,7 @@ config EFI bool "EFI runtime service support" depends on ACPI select UCS2_STRING + select EFI_RUNTIME_WRAPPERS ---help--- This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). @@ -1536,7 +1585,8 @@ config EFI config EFI_STUB bool "EFI stub support" - depends on EFI + depends on EFI && !X86_USE_3DNOW + select RELOCATABLE ---help--- This kernel feature allows a bzImage to be loaded directly by EFI firmware without the use of a bootloader. @@ -1591,6 +1641,41 @@ config KEXEC interface is strongly in flux, so no good recommendation can be made. +config KEXEC_FILE + bool "kexec file based system call" + select BUILD_BIN2C + depends on KEXEC + depends on X86_64 + depends on CRYPTO=y + depends on CRYPTO_SHA256=y + ---help--- + This is new version of kexec system call. This system call is + file based and takes file descriptors as system call argument + for kernel and initramfs as opposed to list of segments as + accepted by previous system call. + +config KEXEC_VERIFY_SIG + bool "Verify kernel signature during kexec_file_load() syscall" + depends on KEXEC_FILE + ---help--- + This option makes kernel signature verification mandatory for + kexec_file_load() syscall. If kernel is signature can not be + verified, kexec_file_load() will fail. + + This option enforces signature verification at generic level. + One needs to enable signature verification for type of kernel + image being loaded to make sure it works. For example, enable + bzImage signature verification option to be able to load and + verify signatures of bzImage. Otherwise kernel loading will fail. + +config KEXEC_BZIMAGE_VERIFY_SIG + bool "Enable bzImage signature verification support" + depends on KEXEC_VERIFY_SIG + depends on SIGNED_PE_FILE_VERIFICATION + select SYSTEM_TRUSTED_KEYRING + ---help--- + Enable bzImage signature verification support. + config CRASH_DUMP bool "kernel crash dumps" depends on X86_64 || (X86_32 && HIGHMEM) @@ -2399,10 +2484,9 @@ config X86_DMA_REMAP bool depends on STA2X11 -config IOSF_MBI - tristate - default m - depends on PCI +config PMC_ATOM + def_bool y + depends on PCI source "net/Kconfig" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 33f71b01fd2..920e6160c53 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -15,12 +15,9 @@ endif # that way we can complain to the user if the CPU is insufficient. # # The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For -# older versions of GCC, we need to play evil and unreliable tricks to -# attempt to ensure that our asm(".code16gcc") is first in the asm -# output. -CODE16GCC_CFLAGS := -m32 -include $(srctree)/arch/x86/boot/code16gcc.h \ - $(call cc-option, -fno-toplevel-reorder,\ - $(call cc-option, -fno-unit-at-a-time)) +# older versions of GCC, include an *assembly* header to make sure that +# gcc doesn't play any games behind our back. +CODE16GCC_CFLAGS := -m32 -Wa,$(srctree)/arch/x86/boot/code16gcc.h M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS)) REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ @@ -53,9 +50,6 @@ ifeq ($(CONFIG_X86_32),y) KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return - # Don't autogenerate MMX or SSE instructions - KBUILD_CFLAGS += -mno-mmx -mno-sse - # Never want PIC in a 32-bit kernel, prevent breakage with GCC built # with nonstandard options KBUILD_CFLAGS += -fno-pic @@ -83,8 +77,7 @@ else KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 - # Don't autogenerate traditional x87, MMX or SSE instructions - KBUILD_CFLAGS += -mno-mmx -mno-sse + # Don't autogenerate traditional x87 instructions KBUILD_CFLAGS += $(call cc-option,-mno-80387) KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) @@ -171,7 +164,7 @@ KBUILD_CFLAGS += -Wno-sign-compare # KBUILD_CFLAGS += -fno-asynchronous-unwind-tables # prevent gcc from generating any FP code by mistake -KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) +KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow KBUILD_CFLAGS += $(call cc-option,-mno-avx,) KBUILD_CFLAGS += $(mflags-y) @@ -186,6 +179,11 @@ archscripts: scripts_basic archheaders: $(Q)$(MAKE) $(build)=arch/x86/syscalls all +archprepare: +ifeq ($(CONFIG_KEXEC_FILE),y) + $(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c +endif + ### # Kernel objects @@ -249,12 +247,7 @@ archclean: $(Q)rm -rf $(objtree)/arch/x86_64 $(Q)$(MAKE) $(clean)=$(boot) $(Q)$(MAKE) $(clean)=arch/x86/tools - -PHONY += kvmconfig -kvmconfig: - $(if $(wildcard $(objtree)/.config),, $(error You need an existing .config for this target)) - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m -O $(objtree) $(objtree)/.config $(srctree)/arch/x86/configs/kvm_guest.config - $(Q)yes "" | $(MAKE) -f $(srctree)/Makefile oldconfig + $(Q)$(MAKE) $(clean)=arch/x86/purgatory define archhelp echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' @@ -269,5 +262,4 @@ define archhelp echo ' bzdisk/fdimage*/isoimage also accept:' echo ' FDARGS="..." arguments for the booted kernel' echo ' FDINITRD=file initrd for the booted kernel' - echo ' kvmconfig - Enable additional options for guest kernel support' endef diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index dbe8dd2fe24..5b016e2498f 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -35,19 +35,22 @@ setup-y += video-vesa.o setup-y += video-bios.o targets += $(setup-y) -hostprogs-y := mkcpustr tools/build +hostprogs-y := tools/build +hostprogs-$(CONFIG_X86_FEATURE_NAMES) += mkcpustr HOST_EXTRACFLAGS += -I$(srctree)/tools/include \ -include include/generated/autoconf.h \ -D__EXPORTED_HEADERS__ +ifdef CONFIG_X86_FEATURE_NAMES $(obj)/cpu.o: $(obj)/cpustr.h quiet_cmd_cpustr = CPUSTR $@ cmd_cpustr = $(obj)/mkcpustr > $@ -targets += cpustr.h +targets += cpustr.h $(obj)/cpustr.h: $(obj)/mkcpustr FORCE $(call if_changed,cpustr) +endif # --------------------------------------------------------------------------- diff --git a/arch/x86/boot/code16gcc.h b/arch/x86/boot/code16gcc.h index d93e48010b6..5ff42653539 100644 --- a/arch/x86/boot/code16gcc.h +++ b/arch/x86/boot/code16gcc.h @@ -1,15 +1,11 @@ -/* - * code16gcc.h - * - * This file is -include'd when compiling 16-bit C code. - * Note: this asm() needs to be emitted before gcc emits any code. - * Depending on gcc version, this requires -fno-unit-at-a-time or - * -fno-toplevel-reorder. - * - * Hopefully gcc will eventually have a real -m16 option so we can - * drop this hack long term. - */ +# +# code16gcc.h +# +# This file is added to the assembler via -Wa when compiling 16-bit C code. +# This is done this way instead via asm() to make sure gcc does not reorder +# things around us. +# +# gcc 4.9+ has a real -m16 option so we can drop this hack long term. +# -#ifndef __ASSEMBLY__ -asm(".code16gcc"); -#endif + .code16gcc diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 0fcd9133790..704f58aa79c 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -26,17 +26,18 @@ LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include -VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ - $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \ - $(obj)/piggy.o $(obj)/cpuflags.o $(obj)/aslr.o +vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ + $(obj)/string.o $(obj)/cmdline.o \ + $(obj)/piggy.o $(obj)/cpuflags.o + +vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o +vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone -ifeq ($(CONFIG_EFI_STUB), y) - VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o -endif +vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o -$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE +$(obj)/vmlinux: $(vmlinux-objs-y) FORCE $(call if_changed,ld) @: @@ -44,7 +45,7 @@ OBJCOPYFLAGS_vmlinux.bin := -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) -targets += $(patsubst $(obj)/%,%,$(VMLINUX_OBJS)) vmlinux.bin.all vmlinux.relocs +targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs CMD_RELOCS = arch/x86/tools/relocs quiet_cmd_relocs = RELOCS $@ diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index fc6091abedb..bb137638198 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -1,6 +1,5 @@ #include "misc.h" -#ifdef CONFIG_RANDOMIZE_BASE #include <asm/msr.h> #include <asm/archrandom.h> #include <asm/e820.h> @@ -183,12 +182,27 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, static bool mem_avoid_overlap(struct mem_vector *img) { int i; + struct setup_data *ptr; for (i = 0; i < MEM_AVOID_MAX; i++) { if (mem_overlaps(img, &mem_avoid[i])) return true; } + /* Avoid all entries in the setup_data linked list. */ + ptr = (struct setup_data *)(unsigned long)real_mode->hdr.setup_data; + while (ptr) { + struct mem_vector avoid; + + avoid.start = (unsigned long)ptr; + avoid.size = sizeof(*ptr) + ptr->len; + + if (mem_overlaps(img, &avoid)) + return true; + + ptr = (struct setup_data *)(unsigned long)ptr->next; + } + return false; } @@ -320,5 +334,3 @@ unsigned char *choose_kernel_location(unsigned char *input, out: return (unsigned char *)choice; } - -#endif /* CONFIG_RANDOMIZE_BASE */ diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c index d3d003cb548..261e81fb958 100644 --- a/arch/x86/boot/compressed/early_serial_console.c +++ b/arch/x86/boot/compressed/early_serial_console.c @@ -1,9 +1,5 @@ #include "misc.h" -#ifdef CONFIG_EARLY_PRINTK - int early_serial_base; #include "../early_serial_console.c" - -#endif diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 0331d765c2b..de8eebd6f67 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -48,8 +48,7 @@ static void setup_boot_services##bits(struct efi_config *c) \ BOOT_SERVICES(32); BOOT_SERVICES(64); -static void efi_printk(efi_system_table_t *, char *); -static void efi_char16_printk(efi_system_table_t *, efi_char16_t *); +void efi_char16_printk(efi_system_table_t *, efi_char16_t *); static efi_status_t __file_size32(void *__fh, efi_char16_t *filename_16, @@ -156,7 +155,7 @@ grow: return status; } -static efi_status_t +efi_status_t efi_file_size(efi_system_table_t *sys_table, void *__fh, efi_char16_t *filename_16, void **handle, u64 *file_sz) { @@ -166,7 +165,7 @@ efi_file_size(efi_system_table_t *sys_table, void *__fh, return __file_size32(__fh, filename_16, handle, file_sz); } -static inline efi_status_t +efi_status_t efi_file_read(void *handle, unsigned long *size, void *addr) { unsigned long func; @@ -184,7 +183,7 @@ efi_file_read(void *handle, unsigned long *size, void *addr) } } -static inline efi_status_t efi_file_close(void *handle) +efi_status_t efi_file_close(void *handle) { if (efi_early->is64) { efi_file_handle_64_t *fh = handle; @@ -249,7 +248,7 @@ static inline efi_status_t __open_volume64(void *__image, void **__fh) return status; } -static inline efi_status_t +efi_status_t efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh) { if (efi_early->is64) @@ -258,7 +257,7 @@ efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh) return __open_volume32(__image, __fh); } -static void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) +void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) { unsigned long output_string; size_t offset; @@ -269,22 +268,24 @@ static void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) offset = offsetof(typeof(*out), output_string); output_string = efi_early->text_output + offset; + out = (typeof(out))(unsigned long)efi_early->text_output; func = (u64 *)output_string; - efi_early->call(*func, efi_early->text_output, str); + efi_early->call(*func, out, str); } else { struct efi_simple_text_output_protocol_32 *out; u32 *func; offset = offsetof(typeof(*out), output_string); output_string = efi_early->text_output + offset; + out = (typeof(out))(unsigned long)efi_early->text_output; func = (u32 *)output_string; - efi_early->call(*func, efi_early->text_output, str); + efi_early->call(*func, out, str); } } -#include "../../../../drivers/firmware/efi/efi-stub-helper.c" +#include "../../../../drivers/firmware/efi/libstub/efi-stub-helper.c" static void find_bits(unsigned long mask, u8 *pos, u8 *size) { @@ -366,7 +367,7 @@ free_struct: return status; } -static efi_status_t +static void setup_efi_pci32(struct boot_params *params, void **pci_handle, unsigned long size) { @@ -409,8 +410,6 @@ setup_efi_pci32(struct boot_params *params, void **pci_handle, data = (struct setup_data *)rom; } - - return status; } static efi_status_t @@ -469,7 +468,7 @@ free_struct: } -static efi_status_t +static void setup_efi_pci64(struct boot_params *params, void **pci_handle, unsigned long size) { @@ -512,11 +511,18 @@ setup_efi_pci64(struct boot_params *params, void **pci_handle, data = (struct setup_data *)rom; } - - return status; } -static efi_status_t setup_efi_pci(struct boot_params *params) +/* + * There's no way to return an informative status from this function, + * because any analysis (and printing of error messages) needs to be + * done directly at the EFI function call-site. + * + * For example, EFI_INVALID_PARAMETER could indicate a bug or maybe we + * just didn't find any PCI devices, but there's no way to tell outside + * the context of the call. + */ +static void setup_efi_pci(struct boot_params *params) { efi_status_t status; void **pci_handle = NULL; @@ -533,7 +539,7 @@ static efi_status_t setup_efi_pci(struct boot_params *params) size, (void **)&pci_handle); if (status != EFI_SUCCESS) - return status; + return; status = efi_call_early(locate_handle, EFI_LOCATE_BY_PROTOCOL, &pci_proto, @@ -544,13 +550,12 @@ static efi_status_t setup_efi_pci(struct boot_params *params) goto free_handle; if (efi_early->is64) - status = setup_efi_pci64(params, pci_handle, size); + setup_efi_pci64(params, pci_handle, size); else - status = setup_efi_pci32(params, pci_handle, size); + setup_efi_pci32(params, pci_handle, size); free_handle: efi_call_early(free_pool, pci_handle); - return status; } static void @@ -1104,10 +1109,22 @@ struct boot_params *make_boot_params(struct efi_config *c) (char *)(unsigned long)hdr->cmd_line_ptr, "initrd=", hdr->initrd_addr_max, &ramdisk_addr, &ramdisk_size); + + if (status != EFI_SUCCESS && + hdr->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G) { + efi_printk(sys_table, "Trying to load files to higher address\n"); + status = handle_cmdline_files(sys_table, image, + (char *)(unsigned long)hdr->cmd_line_ptr, + "initrd=", -1UL, + &ramdisk_addr, &ramdisk_size); + } + if (status != EFI_SUCCESS) goto fail2; - hdr->ramdisk_image = ramdisk_addr; - hdr->ramdisk_size = ramdisk_size; + hdr->ramdisk_image = ramdisk_addr & 0xffffffff; + hdr->ramdisk_size = ramdisk_size & 0xffffffff; + boot_params->ext_ramdisk_image = (u64)ramdisk_addr >> 32; + boot_params->ext_ramdisk_size = (u64)ramdisk_size >> 32; return boot_params; fail2: @@ -1401,16 +1418,20 @@ struct boot_params *efi_main(struct efi_config *c, hdr->init_size, hdr->init_size, hdr->pref_address, hdr->kernel_alignment); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "efi_relocate_kernel() failed!\n"); goto fail; + } hdr->pref_address = hdr->code32_start; hdr->code32_start = bzimage_addr; } status = exit_boot(boot_params, handle, is64); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "exit_boot() failed!\n"); goto fail; + } memset((char *)gdt->address, 0x0, gdt->size); desc = (struct desc_struct *)gdt->address; @@ -1470,5 +1491,6 @@ struct boot_params *efi_main(struct efi_config *c, return boot_params; fail: + efi_printk(sys_table, "efi_main() failed!\n"); return NULL; } diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 6ec6bb6e995..29207f69ae8 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -16,7 +16,9 @@ */ #include "boot.h" +#ifdef CONFIG_X86_FEATURE_NAMES #include "cpustr.h" +#endif static char *cpu_name(int level) { @@ -32,11 +34,48 @@ static char *cpu_name(int level) } } +static void show_cap_strs(u32 *err_flags) +{ + int i, j; +#ifdef CONFIG_X86_FEATURE_NAMES + const unsigned char *msg_strs = (const unsigned char *)x86_cap_strs; + for (i = 0; i < NCAPINTS; i++) { + u32 e = err_flags[i]; + for (j = 0; j < 32; j++) { + if (msg_strs[0] < i || + (msg_strs[0] == i && msg_strs[1] < j)) { + /* Skip to the next string */ + msg_strs += 2; + while (*msg_strs++) + ; + } + if (e & 1) { + if (msg_strs[0] == i && + msg_strs[1] == j && + msg_strs[2]) + printf("%s ", msg_strs+2); + else + printf("%d:%d ", i, j); + } + e >>= 1; + } + } +#else + for (i = 0; i < NCAPINTS; i++) { + u32 e = err_flags[i]; + for (j = 0; j < 32; j++) { + if (e & 1) + printf("%d:%d ", i, j); + e >>= 1; + } + } +#endif +} + int validate_cpu(void) { u32 *err_flags; int cpu_level, req_level; - const unsigned char *msg_strs; check_cpu(&cpu_level, &req_level, &err_flags); @@ -49,34 +88,9 @@ int validate_cpu(void) } if (err_flags) { - int i, j; puts("This kernel requires the following features " "not present on the CPU:\n"); - - msg_strs = (const unsigned char *)x86_cap_strs; - - for (i = 0; i < NCAPINTS; i++) { - u32 e = err_flags[i]; - - for (j = 0; j < 32; j++) { - if (msg_strs[0] < i || - (msg_strs[0] == i && msg_strs[1] < j)) { - /* Skip to the next string */ - msg_strs += 2; - while (*msg_strs++) - ; - } - if (e & 1) { - if (msg_strs[0] == i && - msg_strs[1] == j && - msg_strs[2]) - printf("%s ", msg_strs+2); - else - printf("%d:%d ", i, j); - } - e >>= 1; - } - } + show_cap_strs(err_flags); putchar('\n'); return -1; } else { diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 7a6d43a554d..16ef02596db 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -154,7 +154,7 @@ extra_header_fields: #else .quad 0 # ImageBase #endif - .long 0x20 # SectionAlignment + .long CONFIG_PHYSICAL_ALIGN # SectionAlignment .long 0x20 # FileAlignment .word 0 # MajorOperatingSystemVersion .word 0 # MinorOperatingSystemVersion diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index 4579eff0ef4..637097e66a6 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -16,6 +16,7 @@ #include <stdio.h> #include "../include/asm/required-features.h" +#include "../include/asm/disabled-features.h" #include "../include/asm/cpufeature.h" #include "../kernel/cpu/capflags.c" diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config new file mode 100644 index 00000000000..4e2ecfa23c1 --- /dev/null +++ b/arch/x86/configs/tiny.config @@ -0,0 +1 @@ +CONFIG_NOHIGHMEM=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 61d6e281898..fd0f848938c 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o +obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o @@ -25,6 +26,7 @@ obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o +obj-$(CONFIG_CRYPTO_SHA1_MB) += sha-mb/ obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o @@ -52,6 +54,7 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o +des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o @@ -76,7 +79,7 @@ ifeq ($(avx2_supported),yes) endif aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o -aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o +aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o ifeq ($(avx2_supported),yes) diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S new file mode 100644 index 00000000000..2df2a0298f5 --- /dev/null +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -0,0 +1,556 @@ +/* + * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64) + * + * This is AES128/192/256 CTR mode optimization implementation. It requires + * the support of Intel(R) AESNI and AVX instructions. + * + * This work was inspired by the AES CTR mode optimization published + * in Intel Optimized IPSEC Cryptograhpic library. + * Additional information on it can be found at: + * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972 + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * James Guilford <james.guilford@intel.com> + * Sean Gulley <sean.m.gulley@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/linkage.h> +#include <asm/inst.h> + +#define CONCAT(a,b) a##b +#define VMOVDQ vmovdqu + +#define xdata0 %xmm0 +#define xdata1 %xmm1 +#define xdata2 %xmm2 +#define xdata3 %xmm3 +#define xdata4 %xmm4 +#define xdata5 %xmm5 +#define xdata6 %xmm6 +#define xdata7 %xmm7 +#define xcounter %xmm8 +#define xbyteswap %xmm9 +#define xkey0 %xmm10 +#define xkey4 %xmm11 +#define xkey8 %xmm12 +#define xkey12 %xmm13 +#define xkeyA %xmm14 +#define xkeyB %xmm15 + +#define p_in %rdi +#define p_iv %rsi +#define p_keys %rdx +#define p_out %rcx +#define num_bytes %r8 + +#define tmp %r10 +#define DDQ(i) CONCAT(ddq_add_,i) +#define XMM(i) CONCAT(%xmm, i) +#define DDQ_DATA 0 +#define XDATA 1 +#define KEY_128 1 +#define KEY_192 2 +#define KEY_256 3 + +.section .rodata +.align 16 + +byteswap_const: + .octa 0x000102030405060708090A0B0C0D0E0F +ddq_low_msk: + .octa 0x0000000000000000FFFFFFFFFFFFFFFF +ddq_high_add_1: + .octa 0x00000000000000010000000000000000 +ddq_add_1: + .octa 0x00000000000000000000000000000001 +ddq_add_2: + .octa 0x00000000000000000000000000000002 +ddq_add_3: + .octa 0x00000000000000000000000000000003 +ddq_add_4: + .octa 0x00000000000000000000000000000004 +ddq_add_5: + .octa 0x00000000000000000000000000000005 +ddq_add_6: + .octa 0x00000000000000000000000000000006 +ddq_add_7: + .octa 0x00000000000000000000000000000007 +ddq_add_8: + .octa 0x00000000000000000000000000000008 + +.text + +/* generate a unique variable for ddq_add_x */ + +.macro setddq n + var_ddq_add = DDQ(\n) +.endm + +/* generate a unique variable for xmm register */ +.macro setxdata n + var_xdata = XMM(\n) +.endm + +/* club the numeric 'id' to the symbol 'name' */ + +.macro club name, id +.altmacro + .if \name == DDQ_DATA + setddq %\id + .elseif \name == XDATA + setxdata %\id + .endif +.noaltmacro +.endm + +/* + * do_aes num_in_par load_keys key_len + * This increments p_in, but not p_out + */ +.macro do_aes b, k, key_len + .set by, \b + .set load_keys, \k + .set klen, \key_len + + .if (load_keys) + vmovdqa 0*16(p_keys), xkey0 + .endif + + vpshufb xbyteswap, xcounter, xdata0 + + .set i, 1 + .rept (by - 1) + club DDQ_DATA, i + club XDATA, i + vpaddq var_ddq_add(%rip), xcounter, var_xdata + vptest ddq_low_msk(%rip), var_xdata + jnz 1f + vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata + vpaddq ddq_high_add_1(%rip), xcounter, xcounter + 1: + vpshufb xbyteswap, var_xdata, var_xdata + .set i, (i +1) + .endr + + vmovdqa 1*16(p_keys), xkeyA + + vpxor xkey0, xdata0, xdata0 + club DDQ_DATA, by + vpaddq var_ddq_add(%rip), xcounter, xcounter + vptest ddq_low_msk(%rip), xcounter + jnz 1f + vpaddq ddq_high_add_1(%rip), xcounter, xcounter + 1: + + .set i, 1 + .rept (by - 1) + club XDATA, i + vpxor xkey0, var_xdata, var_xdata + .set i, (i +1) + .endr + + vmovdqa 2*16(p_keys), xkeyB + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyA, var_xdata, var_xdata /* key 1 */ + .set i, (i +1) + .endr + + .if (klen == KEY_128) + .if (load_keys) + vmovdqa 3*16(p_keys), xkeyA + .endif + .else + vmovdqa 3*16(p_keys), xkeyA + .endif + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyB, var_xdata, var_xdata /* key 2 */ + .set i, (i +1) + .endr + + add $(16*by), p_in + + .if (klen == KEY_128) + vmovdqa 4*16(p_keys), xkey4 + .else + .if (load_keys) + vmovdqa 4*16(p_keys), xkey4 + .endif + .endif + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyA, var_xdata, var_xdata /* key 3 */ + .set i, (i +1) + .endr + + vmovdqa 5*16(p_keys), xkeyA + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkey4, var_xdata, var_xdata /* key 4 */ + .set i, (i +1) + .endr + + .if (klen == KEY_128) + .if (load_keys) + vmovdqa 6*16(p_keys), xkeyB + .endif + .else + vmovdqa 6*16(p_keys), xkeyB + .endif + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyA, var_xdata, var_xdata /* key 5 */ + .set i, (i +1) + .endr + + vmovdqa 7*16(p_keys), xkeyA + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyB, var_xdata, var_xdata /* key 6 */ + .set i, (i +1) + .endr + + .if (klen == KEY_128) + vmovdqa 8*16(p_keys), xkey8 + .else + .if (load_keys) + vmovdqa 8*16(p_keys), xkey8 + .endif + .endif + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyA, var_xdata, var_xdata /* key 7 */ + .set i, (i +1) + .endr + + .if (klen == KEY_128) + .if (load_keys) + vmovdqa 9*16(p_keys), xkeyA + .endif + .else + vmovdqa 9*16(p_keys), xkeyA + .endif + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkey8, var_xdata, var_xdata /* key 8 */ + .set i, (i +1) + .endr + + vmovdqa 10*16(p_keys), xkeyB + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyA, var_xdata, var_xdata /* key 9 */ + .set i, (i +1) + .endr + + .if (klen != KEY_128) + vmovdqa 11*16(p_keys), xkeyA + .endif + + .set i, 0 + .rept by + club XDATA, i + /* key 10 */ + .if (klen == KEY_128) + vaesenclast xkeyB, var_xdata, var_xdata + .else + vaesenc xkeyB, var_xdata, var_xdata + .endif + .set i, (i +1) + .endr + + .if (klen != KEY_128) + .if (load_keys) + vmovdqa 12*16(p_keys), xkey12 + .endif + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyA, var_xdata, var_xdata /* key 11 */ + .set i, (i +1) + .endr + + .if (klen == KEY_256) + vmovdqa 13*16(p_keys), xkeyA + .endif + + .set i, 0 + .rept by + club XDATA, i + .if (klen == KEY_256) + /* key 12 */ + vaesenc xkey12, var_xdata, var_xdata + .else + vaesenclast xkey12, var_xdata, var_xdata + .endif + .set i, (i +1) + .endr + + .if (klen == KEY_256) + vmovdqa 14*16(p_keys), xkeyB + + .set i, 0 + .rept by + club XDATA, i + /* key 13 */ + vaesenc xkeyA, var_xdata, var_xdata + .set i, (i +1) + .endr + + .set i, 0 + .rept by + club XDATA, i + /* key 14 */ + vaesenclast xkeyB, var_xdata, var_xdata + .set i, (i +1) + .endr + .endif + .endif + + .set i, 0 + .rept (by / 2) + .set j, (i+1) + VMOVDQ (i*16 - 16*by)(p_in), xkeyA + VMOVDQ (j*16 - 16*by)(p_in), xkeyB + club XDATA, i + vpxor xkeyA, var_xdata, var_xdata + club XDATA, j + vpxor xkeyB, var_xdata, var_xdata + .set i, (i+2) + .endr + + .if (i < by) + VMOVDQ (i*16 - 16*by)(p_in), xkeyA + club XDATA, i + vpxor xkeyA, var_xdata, var_xdata + .endif + + .set i, 0 + .rept by + club XDATA, i + VMOVDQ var_xdata, i*16(p_out) + .set i, (i+1) + .endr +.endm + +.macro do_aes_load val, key_len + do_aes \val, 1, \key_len +.endm + +.macro do_aes_noload val, key_len + do_aes \val, 0, \key_len +.endm + +/* main body of aes ctr load */ + +.macro do_aes_ctrmain key_len + + cmp $16, num_bytes + jb .Ldo_return2\key_len + + vmovdqa byteswap_const(%rip), xbyteswap + vmovdqu (p_iv), xcounter + vpshufb xbyteswap, xcounter, xcounter + + mov num_bytes, tmp + and $(7*16), tmp + jz .Lmult_of_8_blks\key_len + + /* 1 <= tmp <= 7 */ + cmp $(4*16), tmp + jg .Lgt4\key_len + je .Leq4\key_len + +.Llt4\key_len: + cmp $(2*16), tmp + jg .Leq3\key_len + je .Leq2\key_len + +.Leq1\key_len: + do_aes_load 1, \key_len + add $(1*16), p_out + and $(~7*16), num_bytes + jz .Ldo_return2\key_len + jmp .Lmain_loop2\key_len + +.Leq2\key_len: + do_aes_load 2, \key_len + add $(2*16), p_out + and $(~7*16), num_bytes + jz .Ldo_return2\key_len + jmp .Lmain_loop2\key_len + + +.Leq3\key_len: + do_aes_load 3, \key_len + add $(3*16), p_out + and $(~7*16), num_bytes + jz .Ldo_return2\key_len + jmp .Lmain_loop2\key_len + +.Leq4\key_len: + do_aes_load 4, \key_len + add $(4*16), p_out + and $(~7*16), num_bytes + jz .Ldo_return2\key_len + jmp .Lmain_loop2\key_len + +.Lgt4\key_len: + cmp $(6*16), tmp + jg .Leq7\key_len + je .Leq6\key_len + +.Leq5\key_len: + do_aes_load 5, \key_len + add $(5*16), p_out + and $(~7*16), num_bytes + jz .Ldo_return2\key_len + jmp .Lmain_loop2\key_len + +.Leq6\key_len: + do_aes_load 6, \key_len + add $(6*16), p_out + and $(~7*16), num_bytes + jz .Ldo_return2\key_len + jmp .Lmain_loop2\key_len + +.Leq7\key_len: + do_aes_load 7, \key_len + add $(7*16), p_out + and $(~7*16), num_bytes + jz .Ldo_return2\key_len + jmp .Lmain_loop2\key_len + +.Lmult_of_8_blks\key_len: + .if (\key_len != KEY_128) + vmovdqa 0*16(p_keys), xkey0 + vmovdqa 4*16(p_keys), xkey4 + vmovdqa 8*16(p_keys), xkey8 + vmovdqa 12*16(p_keys), xkey12 + .else + vmovdqa 0*16(p_keys), xkey0 + vmovdqa 3*16(p_keys), xkey4 + vmovdqa 6*16(p_keys), xkey8 + vmovdqa 9*16(p_keys), xkey12 + .endif +.align 16 +.Lmain_loop2\key_len: + /* num_bytes is a multiple of 8 and >0 */ + do_aes_noload 8, \key_len + add $(8*16), p_out + sub $(8*16), num_bytes + jne .Lmain_loop2\key_len + +.Ldo_return2\key_len: + /* return updated IV */ + vpshufb xbyteswap, xcounter, xcounter + vmovdqu xcounter, (p_iv) + ret +.endm + +/* + * routine to do AES128 CTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out, + * unsigned int num_bytes) + */ +ENTRY(aes_ctr_enc_128_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_128 + +ENDPROC(aes_ctr_enc_128_avx_by8) + +/* + * routine to do AES192 CTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out, + * unsigned int num_bytes) + */ +ENTRY(aes_ctr_enc_192_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_192 + +ENDPROC(aes_ctr_enc_192_avx_by8) + +/* + * routine to do AES256 CTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out, + * unsigned int num_bytes) + */ +ENTRY(aes_ctr_enc_256_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_256 + +ENDPROC(aes_ctr_enc_256_avx_by8) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 948ad0e7774..888950f29fd 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -105,6 +105,9 @@ void crypto_fpu_exit(void); #define AVX_GEN4_OPTSIZE 4096 #ifdef CONFIG_X86_64 + +static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); @@ -155,6 +158,12 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out, #ifdef CONFIG_AS_AVX +asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, + void *keys, u8 *out, unsigned int num_bytes); +asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, + void *keys, u8 *out, unsigned int num_bytes); +asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, + void *keys, u8 *out, unsigned int num_bytes); /* * asmlinkage void aesni_gcm_precomp_avx_gen2() * gcm_data *my_ctx_data, context data @@ -472,6 +481,25 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx, crypto_inc(ctrblk, AES_BLOCK_SIZE); } +#ifdef CONFIG_AS_AVX +static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv) +{ + /* + * based on key length, override with the by8 version + * of ctr mode encryption/decryption for improved performance + * aes_set_key_common() ensures that key length is one of + * {128,192,256} + */ + if (ctx->key_length == AES_KEYSIZE_128) + aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len); + else if (ctx->key_length == AES_KEYSIZE_192) + aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len); + else + aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len); +} +#endif + static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -486,8 +514,8 @@ static int ctr_crypt(struct blkcipher_desc *desc, kernel_fpu_begin(); while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { - aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, - nbytes & AES_BLOCK_MASK, walk.iv); + aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } @@ -1493,6 +1521,14 @@ static int __init aesni_init(void) aesni_gcm_enc_tfm = aesni_gcm_enc; aesni_gcm_dec_tfm = aesni_gcm_dec; } + aesni_ctr_enc_tfm = aesni_ctr_enc; +#ifdef CONFIG_AS_AVX + if (cpu_has_avx) { + /* optimize performance of ctr mode encryption transform */ + aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm; + pr_info("AES CTR mode by8 optimization enabled\n"); + } +#endif #endif err = crypto_fpu_init(); diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index dbc4339b541..26d49ebae04 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -72,6 +72,7 @@ # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); +.text ENTRY(crc_pcl) #define bufp %rdi #define bufp_dw %edi @@ -216,15 +217,11 @@ LABEL crc_ %i ## 4) Combine three results: ################################################################ - lea (K_table-16)(%rip), bufp # first entry is for idx 1 + lea (K_table-8)(%rip), bufp # first entry is for idx 1 shlq $3, %rax # rax *= 8 - subq %rax, tmp # tmp -= rax*8 - shlq $1, %rax - subq %rax, tmp # tmp -= rax*16 - # (total tmp -= rax*24) - addq %rax, bufp - - movdqa (bufp), %xmm0 # 2 consts: K1:K2 + pmovzxdq (bufp,%rax), %xmm0 # 2 consts: K1:K2 + leal (%eax,%eax,2), %eax # rax *= 3 (total *24) + subq %rax, tmp # tmp -= rax*24 movq crc_init, %xmm1 # CRC for block 1 PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2 @@ -238,9 +235,9 @@ LABEL crc_ %i mov crc2, crc_init crc32 %rax, crc_init -################################################################ -## 5) Check for end: -################################################################ + ################################################################ + ## 5) Check for end: + ################################################################ LABEL crc_ 0 mov tmp, len @@ -331,136 +328,136 @@ ENDPROC(crc_pcl) ################################################################ ## PCLMULQDQ tables - ## Table is 128 entries x 2 quad words each + ## Table is 128 entries x 2 words (8 bytes) each ################################################################ -.data -.align 64 +.section .rotata, "a", %progbits +.align 8 K_table: - .quad 0x14cd00bd6,0x105ec76f0 - .quad 0x0ba4fc28e,0x14cd00bd6 - .quad 0x1d82c63da,0x0f20c0dfe - .quad 0x09e4addf8,0x0ba4fc28e - .quad 0x039d3b296,0x1384aa63a - .quad 0x102f9b8a2,0x1d82c63da - .quad 0x14237f5e6,0x01c291d04 - .quad 0x00d3b6092,0x09e4addf8 - .quad 0x0c96cfdc0,0x0740eef02 - .quad 0x18266e456,0x039d3b296 - .quad 0x0daece73e,0x0083a6eec - .quad 0x0ab7aff2a,0x102f9b8a2 - .quad 0x1248ea574,0x1c1733996 - .quad 0x083348832,0x14237f5e6 - .quad 0x12c743124,0x02ad91c30 - .quad 0x0b9e02b86,0x00d3b6092 - .quad 0x018b33a4e,0x06992cea2 - .quad 0x1b331e26a,0x0c96cfdc0 - .quad 0x17d35ba46,0x07e908048 - .quad 0x1bf2e8b8a,0x18266e456 - .quad 0x1a3e0968a,0x11ed1f9d8 - .quad 0x0ce7f39f4,0x0daece73e - .quad 0x061d82e56,0x0f1d0f55e - .quad 0x0d270f1a2,0x0ab7aff2a - .quad 0x1c3f5f66c,0x0a87ab8a8 - .quad 0x12ed0daac,0x1248ea574 - .quad 0x065863b64,0x08462d800 - .quad 0x11eef4f8e,0x083348832 - .quad 0x1ee54f54c,0x071d111a8 - .quad 0x0b3e32c28,0x12c743124 - .quad 0x0064f7f26,0x0ffd852c6 - .quad 0x0dd7e3b0c,0x0b9e02b86 - .quad 0x0f285651c,0x0dcb17aa4 - .quad 0x010746f3c,0x018b33a4e - .quad 0x1c24afea4,0x0f37c5aee - .quad 0x0271d9844,0x1b331e26a - .quad 0x08e766a0c,0x06051d5a2 - .quad 0x093a5f730,0x17d35ba46 - .quad 0x06cb08e5c,0x11d5ca20e - .quad 0x06b749fb2,0x1bf2e8b8a - .quad 0x1167f94f2,0x021f3d99c - .quad 0x0cec3662e,0x1a3e0968a - .quad 0x19329634a,0x08f158014 - .quad 0x0e6fc4e6a,0x0ce7f39f4 - .quad 0x08227bb8a,0x1a5e82106 - .quad 0x0b0cd4768,0x061d82e56 - .quad 0x13c2b89c4,0x188815ab2 - .quad 0x0d7a4825c,0x0d270f1a2 - .quad 0x10f5ff2ba,0x105405f3e - .quad 0x00167d312,0x1c3f5f66c - .quad 0x0f6076544,0x0e9adf796 - .quad 0x026f6a60a,0x12ed0daac - .quad 0x1a2adb74e,0x096638b34 - .quad 0x19d34af3a,0x065863b64 - .quad 0x049c3cc9c,0x1e50585a0 - .quad 0x068bce87a,0x11eef4f8e - .quad 0x1524fa6c6,0x19f1c69dc - .quad 0x16cba8aca,0x1ee54f54c - .quad 0x042d98888,0x12913343e - .quad 0x1329d9f7e,0x0b3e32c28 - .quad 0x1b1c69528,0x088f25a3a - .quad 0x02178513a,0x0064f7f26 - .quad 0x0e0ac139e,0x04e36f0b0 - .quad 0x0170076fa,0x0dd7e3b0c - .quad 0x141a1a2e2,0x0bd6f81f8 - .quad 0x16ad828b4,0x0f285651c - .quad 0x041d17b64,0x19425cbba - .quad 0x1fae1cc66,0x010746f3c - .quad 0x1a75b4b00,0x18db37e8a - .quad 0x0f872e54c,0x1c24afea4 - .quad 0x01e41e9fc,0x04c144932 - .quad 0x086d8e4d2,0x0271d9844 - .quad 0x160f7af7a,0x052148f02 - .quad 0x05bb8f1bc,0x08e766a0c - .quad 0x0a90fd27a,0x0a3c6f37a - .quad 0x0b3af077a,0x093a5f730 - .quad 0x04984d782,0x1d22c238e - .quad 0x0ca6ef3ac,0x06cb08e5c - .quad 0x0234e0b26,0x063ded06a - .quad 0x1d88abd4a,0x06b749fb2 - .quad 0x04597456a,0x04d56973c - .quad 0x0e9e28eb4,0x1167f94f2 - .quad 0x07b3ff57a,0x19385bf2e - .quad 0x0c9c8b782,0x0cec3662e - .quad 0x13a9cba9e,0x0e417f38a - .quad 0x093e106a4,0x19329634a - .quad 0x167001a9c,0x14e727980 - .quad 0x1ddffc5d4,0x0e6fc4e6a - .quad 0x00df04680,0x0d104b8fc - .quad 0x02342001e,0x08227bb8a - .quad 0x00a2a8d7e,0x05b397730 - .quad 0x168763fa6,0x0b0cd4768 - .quad 0x1ed5a407a,0x0e78eb416 - .quad 0x0d2c3ed1a,0x13c2b89c4 - .quad 0x0995a5724,0x1641378f0 - .quad 0x19b1afbc4,0x0d7a4825c - .quad 0x109ffedc0,0x08d96551c - .quad 0x0f2271e60,0x10f5ff2ba - .quad 0x00b0bf8ca,0x00bf80dd2 - .quad 0x123888b7a,0x00167d312 - .quad 0x1e888f7dc,0x18dcddd1c - .quad 0x002ee03b2,0x0f6076544 - .quad 0x183e8d8fe,0x06a45d2b2 - .quad 0x133d7a042,0x026f6a60a - .quad 0x116b0f50c,0x1dd3e10e8 - .quad 0x05fabe670,0x1a2adb74e - .quad 0x130004488,0x0de87806c - .quad 0x000bcf5f6,0x19d34af3a - .quad 0x18f0c7078,0x014338754 - .quad 0x017f27698,0x049c3cc9c - .quad 0x058ca5f00,0x15e3e77ee - .quad 0x1af900c24,0x068bce87a - .quad 0x0b5cfca28,0x0dd07448e - .quad 0x0ded288f8,0x1524fa6c6 - .quad 0x059f229bc,0x1d8048348 - .quad 0x06d390dec,0x16cba8aca - .quad 0x037170390,0x0a3e3e02c - .quad 0x06353c1cc,0x042d98888 - .quad 0x0c4584f5c,0x0d73c7bea - .quad 0x1f16a3418,0x1329d9f7e - .quad 0x0531377e2,0x185137662 - .quad 0x1d8d9ca7c,0x1b1c69528 - .quad 0x0b25b29f2,0x18a08b5bc - .quad 0x19fb2a8b0,0x02178513a - .quad 0x1a08fe6ac,0x1da758ae0 - .quad 0x045cddf4e,0x0e0ac139e - .quad 0x1a91647f2,0x169cf9eb0 - .quad 0x1a0f717c4,0x0170076fa + .long 0x493c7d27, 0x00000001 + .long 0xba4fc28e, 0x493c7d27 + .long 0xddc0152b, 0xf20c0dfe + .long 0x9e4addf8, 0xba4fc28e + .long 0x39d3b296, 0x3da6d0cb + .long 0x0715ce53, 0xddc0152b + .long 0x47db8317, 0x1c291d04 + .long 0x0d3b6092, 0x9e4addf8 + .long 0xc96cfdc0, 0x740eef02 + .long 0x878a92a7, 0x39d3b296 + .long 0xdaece73e, 0x083a6eec + .long 0xab7aff2a, 0x0715ce53 + .long 0x2162d385, 0xc49f4f67 + .long 0x83348832, 0x47db8317 + .long 0x299847d5, 0x2ad91c30 + .long 0xb9e02b86, 0x0d3b6092 + .long 0x18b33a4e, 0x6992cea2 + .long 0xb6dd949b, 0xc96cfdc0 + .long 0x78d9ccb7, 0x7e908048 + .long 0xbac2fd7b, 0x878a92a7 + .long 0xa60ce07b, 0x1b3d8f29 + .long 0xce7f39f4, 0xdaece73e + .long 0x61d82e56, 0xf1d0f55e + .long 0xd270f1a2, 0xab7aff2a + .long 0xc619809d, 0xa87ab8a8 + .long 0x2b3cac5d, 0x2162d385 + .long 0x65863b64, 0x8462d800 + .long 0x1b03397f, 0x83348832 + .long 0xebb883bd, 0x71d111a8 + .long 0xb3e32c28, 0x299847d5 + .long 0x064f7f26, 0xffd852c6 + .long 0xdd7e3b0c, 0xb9e02b86 + .long 0xf285651c, 0xdcb17aa4 + .long 0x10746f3c, 0x18b33a4e + .long 0xc7a68855, 0xf37c5aee + .long 0x271d9844, 0xb6dd949b + .long 0x8e766a0c, 0x6051d5a2 + .long 0x93a5f730, 0x78d9ccb7 + .long 0x6cb08e5c, 0x18b0d4ff + .long 0x6b749fb2, 0xbac2fd7b + .long 0x1393e203, 0x21f3d99c + .long 0xcec3662e, 0xa60ce07b + .long 0x96c515bb, 0x8f158014 + .long 0xe6fc4e6a, 0xce7f39f4 + .long 0x8227bb8a, 0xa00457f7 + .long 0xb0cd4768, 0x61d82e56 + .long 0x39c7ff35, 0x8d6d2c43 + .long 0xd7a4825c, 0xd270f1a2 + .long 0x0ab3844b, 0x00ac29cf + .long 0x0167d312, 0xc619809d + .long 0xf6076544, 0xe9adf796 + .long 0x26f6a60a, 0x2b3cac5d + .long 0xa741c1bf, 0x96638b34 + .long 0x98d8d9cb, 0x65863b64 + .long 0x49c3cc9c, 0xe0e9f351 + .long 0x68bce87a, 0x1b03397f + .long 0x57a3d037, 0x9af01f2d + .long 0x6956fc3b, 0xebb883bd + .long 0x42d98888, 0x2cff42cf + .long 0x3771e98f, 0xb3e32c28 + .long 0xb42ae3d9, 0x88f25a3a + .long 0x2178513a, 0x064f7f26 + .long 0xe0ac139e, 0x4e36f0b0 + .long 0x170076fa, 0xdd7e3b0c + .long 0x444dd413, 0xbd6f81f8 + .long 0x6f345e45, 0xf285651c + .long 0x41d17b64, 0x91c9bd4b + .long 0xff0dba97, 0x10746f3c + .long 0xa2b73df1, 0x885f087b + .long 0xf872e54c, 0xc7a68855 + .long 0x1e41e9fc, 0x4c144932 + .long 0x86d8e4d2, 0x271d9844 + .long 0x651bd98b, 0x52148f02 + .long 0x5bb8f1bc, 0x8e766a0c + .long 0xa90fd27a, 0xa3c6f37a + .long 0xb3af077a, 0x93a5f730 + .long 0x4984d782, 0xd7c0557f + .long 0xca6ef3ac, 0x6cb08e5c + .long 0x234e0b26, 0x63ded06a + .long 0xdd66cbbb, 0x6b749fb2 + .long 0x4597456a, 0x4d56973c + .long 0xe9e28eb4, 0x1393e203 + .long 0x7b3ff57a, 0x9669c9df + .long 0xc9c8b782, 0xcec3662e + .long 0x3f70cc6f, 0xe417f38a + .long 0x93e106a4, 0x96c515bb + .long 0x62ec6c6d, 0x4b9e0f71 + .long 0xd813b325, 0xe6fc4e6a + .long 0x0df04680, 0xd104b8fc + .long 0x2342001e, 0x8227bb8a + .long 0x0a2a8d7e, 0x5b397730 + .long 0x6d9a4957, 0xb0cd4768 + .long 0xe8b6368b, 0xe78eb416 + .long 0xd2c3ed1a, 0x39c7ff35 + .long 0x995a5724, 0x61ff0e01 + .long 0x9ef68d35, 0xd7a4825c + .long 0x0c139b31, 0x8d96551c + .long 0xf2271e60, 0x0ab3844b + .long 0x0b0bf8ca, 0x0bf80dd2 + .long 0x2664fd8b, 0x0167d312 + .long 0xed64812d, 0x8821abed + .long 0x02ee03b2, 0xf6076544 + .long 0x8604ae0f, 0x6a45d2b2 + .long 0x363bd6b3, 0x26f6a60a + .long 0x135c83fd, 0xd8d26619 + .long 0x5fabe670, 0xa741c1bf + .long 0x35ec3279, 0xde87806c + .long 0x00bcf5f6, 0x98d8d9cb + .long 0x8ae00689, 0x14338754 + .long 0x17f27698, 0x49c3cc9c + .long 0x58ca5f00, 0x5bd2011f + .long 0xaa7c7ad5, 0x68bce87a + .long 0xb5cfca28, 0xdd07448e + .long 0xded288f8, 0x57a3d037 + .long 0x59f229bc, 0xdde8f5b9 + .long 0x6d390dec, 0x6956fc3b + .long 0x37170390, 0xa3e3e02c + .long 0x6353c1cc, 0x42d98888 + .long 0xc4584f5c, 0xd73c7bea + .long 0xf48642e9, 0x3771e98f + .long 0x531377e2, 0x80ff0093 + .long 0xdd35bc8d, 0xb42ae3d9 + .long 0xb25b29f2, 0x8fe4c34d + .long 0x9a5ede41, 0x2178513a + .long 0xa563905d, 0xdf99fc11 + .long 0x45cddf4e, 0xe0ac139e + .long 0xacfa3103, 0x6c23e841 + .long 0xa51b6135, 0x170076fa diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S new file mode 100644 index 00000000000..038f6ae87c5 --- /dev/null +++ b/arch/x86/crypto/des3_ede-asm_64.S @@ -0,0 +1,805 @@ +/* + * des3_ede-asm_64.S - x86-64 assembly implementation of 3DES cipher + * + * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/linkage.h> + +.file "des3_ede-asm_64.S" +.text + +#define s1 .L_s1 +#define s2 ((s1) + (64*8)) +#define s3 ((s2) + (64*8)) +#define s4 ((s3) + (64*8)) +#define s5 ((s4) + (64*8)) +#define s6 ((s5) + (64*8)) +#define s7 ((s6) + (64*8)) +#define s8 ((s7) + (64*8)) + +/* register macros */ +#define CTX %rdi + +#define RL0 %r8 +#define RL1 %r9 +#define RL2 %r10 + +#define RL0d %r8d +#define RL1d %r9d +#define RL2d %r10d + +#define RR0 %r11 +#define RR1 %r12 +#define RR2 %r13 + +#define RR0d %r11d +#define RR1d %r12d +#define RR2d %r13d + +#define RW0 %rax +#define RW1 %rbx +#define RW2 %rcx + +#define RW0d %eax +#define RW1d %ebx +#define RW2d %ecx + +#define RW0bl %al +#define RW1bl %bl +#define RW2bl %cl + +#define RW0bh %ah +#define RW1bh %bh +#define RW2bh %ch + +#define RT0 %r15 +#define RT1 %rbp +#define RT2 %r14 +#define RT3 %rdx + +#define RT0d %r15d +#define RT1d %ebp +#define RT2d %r14d +#define RT3d %edx + +/*********************************************************************** + * 1-way 3DES + ***********************************************************************/ +#define do_permutation(a, b, offset, mask) \ + movl a, RT0d; \ + shrl $(offset), RT0d; \ + xorl b, RT0d; \ + andl $(mask), RT0d; \ + xorl RT0d, b; \ + shll $(offset), RT0d; \ + xorl RT0d, a; + +#define expand_to_64bits(val, mask) \ + movl val##d, RT0d; \ + rorl $4, RT0d; \ + shlq $32, RT0; \ + orq RT0, val; \ + andq mask, val; + +#define compress_to_64bits(val) \ + movq val, RT0; \ + shrq $32, RT0; \ + roll $4, RT0d; \ + orl RT0d, val##d; + +#define initial_permutation(left, right) \ + do_permutation(left##d, right##d, 4, 0x0f0f0f0f); \ + do_permutation(left##d, right##d, 16, 0x0000ffff); \ + do_permutation(right##d, left##d, 2, 0x33333333); \ + do_permutation(right##d, left##d, 8, 0x00ff00ff); \ + movabs $0x3f3f3f3f3f3f3f3f, RT3; \ + movl left##d, RW0d; \ + roll $1, right##d; \ + xorl right##d, RW0d; \ + andl $0xaaaaaaaa, RW0d; \ + xorl RW0d, left##d; \ + xorl RW0d, right##d; \ + roll $1, left##d; \ + expand_to_64bits(right, RT3); \ + expand_to_64bits(left, RT3); + +#define final_permutation(left, right) \ + compress_to_64bits(right); \ + compress_to_64bits(left); \ + movl right##d, RW0d; \ + rorl $1, left##d; \ + xorl left##d, RW0d; \ + andl $0xaaaaaaaa, RW0d; \ + xorl RW0d, right##d; \ + xorl RW0d, left##d; \ + rorl $1, right##d; \ + do_permutation(right##d, left##d, 8, 0x00ff00ff); \ + do_permutation(right##d, left##d, 2, 0x33333333); \ + do_permutation(left##d, right##d, 16, 0x0000ffff); \ + do_permutation(left##d, right##d, 4, 0x0f0f0f0f); + +#define round1(n, from, to, load_next_key) \ + xorq from, RW0; \ + \ + movzbl RW0bl, RT0d; \ + movzbl RW0bh, RT1d; \ + shrq $16, RW0; \ + movzbl RW0bl, RT2d; \ + movzbl RW0bh, RT3d; \ + shrq $16, RW0; \ + movq s8(, RT0, 8), RT0; \ + xorq s6(, RT1, 8), to; \ + movzbl RW0bl, RL1d; \ + movzbl RW0bh, RT1d; \ + shrl $16, RW0d; \ + xorq s4(, RT2, 8), RT0; \ + xorq s2(, RT3, 8), to; \ + movzbl RW0bl, RT2d; \ + movzbl RW0bh, RT3d; \ + xorq s7(, RL1, 8), RT0; \ + xorq s5(, RT1, 8), to; \ + xorq s3(, RT2, 8), RT0; \ + load_next_key(n, RW0); \ + xorq RT0, to; \ + xorq s1(, RT3, 8), to; \ + +#define load_next_key(n, RWx) \ + movq (((n) + 1) * 8)(CTX), RWx; + +#define dummy2(a, b) /*_*/ + +#define read_block(io, left, right) \ + movl (io), left##d; \ + movl 4(io), right##d; \ + bswapl left##d; \ + bswapl right##d; + +#define write_block(io, left, right) \ + bswapl left##d; \ + bswapl right##d; \ + movl left##d, (io); \ + movl right##d, 4(io); + +ENTRY(des3_ede_x86_64_crypt_blk) + /* input: + * %rdi: round keys, CTX + * %rsi: dst + * %rdx: src + */ + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + pushq %r14; + pushq %r15; + + read_block(%rdx, RL0, RR0); + initial_permutation(RL0, RR0); + + movq (CTX), RW0; + + round1(0, RR0, RL0, load_next_key); + round1(1, RL0, RR0, load_next_key); + round1(2, RR0, RL0, load_next_key); + round1(3, RL0, RR0, load_next_key); + round1(4, RR0, RL0, load_next_key); + round1(5, RL0, RR0, load_next_key); + round1(6, RR0, RL0, load_next_key); + round1(7, RL0, RR0, load_next_key); + round1(8, RR0, RL0, load_next_key); + round1(9, RL0, RR0, load_next_key); + round1(10, RR0, RL0, load_next_key); + round1(11, RL0, RR0, load_next_key); + round1(12, RR0, RL0, load_next_key); + round1(13, RL0, RR0, load_next_key); + round1(14, RR0, RL0, load_next_key); + round1(15, RL0, RR0, load_next_key); + + round1(16+0, RL0, RR0, load_next_key); + round1(16+1, RR0, RL0, load_next_key); + round1(16+2, RL0, RR0, load_next_key); + round1(16+3, RR0, RL0, load_next_key); + round1(16+4, RL0, RR0, load_next_key); + round1(16+5, RR0, RL0, load_next_key); + round1(16+6, RL0, RR0, load_next_key); + round1(16+7, RR0, RL0, load_next_key); + round1(16+8, RL0, RR0, load_next_key); + round1(16+9, RR0, RL0, load_next_key); + round1(16+10, RL0, RR0, load_next_key); + round1(16+11, RR0, RL0, load_next_key); + round1(16+12, RL0, RR0, load_next_key); + round1(16+13, RR0, RL0, load_next_key); + round1(16+14, RL0, RR0, load_next_key); + round1(16+15, RR0, RL0, load_next_key); + + round1(32+0, RR0, RL0, load_next_key); + round1(32+1, RL0, RR0, load_next_key); + round1(32+2, RR0, RL0, load_next_key); + round1(32+3, RL0, RR0, load_next_key); + round1(32+4, RR0, RL0, load_next_key); + round1(32+5, RL0, RR0, load_next_key); + round1(32+6, RR0, RL0, load_next_key); + round1(32+7, RL0, RR0, load_next_key); + round1(32+8, RR0, RL0, load_next_key); + round1(32+9, RL0, RR0, load_next_key); + round1(32+10, RR0, RL0, load_next_key); + round1(32+11, RL0, RR0, load_next_key); + round1(32+12, RR0, RL0, load_next_key); + round1(32+13, RL0, RR0, load_next_key); + round1(32+14, RR0, RL0, load_next_key); + round1(32+15, RL0, RR0, dummy2); + + final_permutation(RR0, RL0); + write_block(%rsi, RR0, RL0); + + popq %r15; + popq %r14; + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + + ret; +ENDPROC(des3_ede_x86_64_crypt_blk) + +/*********************************************************************** + * 3-way 3DES + ***********************************************************************/ +#define expand_to_64bits(val, mask) \ + movl val##d, RT0d; \ + rorl $4, RT0d; \ + shlq $32, RT0; \ + orq RT0, val; \ + andq mask, val; + +#define compress_to_64bits(val) \ + movq val, RT0; \ + shrq $32, RT0; \ + roll $4, RT0d; \ + orl RT0d, val##d; + +#define initial_permutation3(left, right) \ + do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \ + do_permutation(left##0d, right##0d, 16, 0x0000ffff); \ + do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \ + do_permutation(left##1d, right##1d, 16, 0x0000ffff); \ + do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); \ + do_permutation(left##2d, right##2d, 16, 0x0000ffff); \ + \ + do_permutation(right##0d, left##0d, 2, 0x33333333); \ + do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \ + do_permutation(right##1d, left##1d, 2, 0x33333333); \ + do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \ + do_permutation(right##2d, left##2d, 2, 0x33333333); \ + do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \ + \ + movabs $0x3f3f3f3f3f3f3f3f, RT3; \ + \ + movl left##0d, RW0d; \ + roll $1, right##0d; \ + xorl right##0d, RW0d; \ + andl $0xaaaaaaaa, RW0d; \ + xorl RW0d, left##0d; \ + xorl RW0d, right##0d; \ + roll $1, left##0d; \ + expand_to_64bits(right##0, RT3); \ + expand_to_64bits(left##0, RT3); \ + movl left##1d, RW1d; \ + roll $1, right##1d; \ + xorl right##1d, RW1d; \ + andl $0xaaaaaaaa, RW1d; \ + xorl RW1d, left##1d; \ + xorl RW1d, right##1d; \ + roll $1, left##1d; \ + expand_to_64bits(right##1, RT3); \ + expand_to_64bits(left##1, RT3); \ + movl left##2d, RW2d; \ + roll $1, right##2d; \ + xorl right##2d, RW2d; \ + andl $0xaaaaaaaa, RW2d; \ + xorl RW2d, left##2d; \ + xorl RW2d, right##2d; \ + roll $1, left##2d; \ + expand_to_64bits(right##2, RT3); \ + expand_to_64bits(left##2, RT3); + +#define final_permutation3(left, right) \ + compress_to_64bits(right##0); \ + compress_to_64bits(left##0); \ + movl right##0d, RW0d; \ + rorl $1, left##0d; \ + xorl left##0d, RW0d; \ + andl $0xaaaaaaaa, RW0d; \ + xorl RW0d, right##0d; \ + xorl RW0d, left##0d; \ + rorl $1, right##0d; \ + compress_to_64bits(right##1); \ + compress_to_64bits(left##1); \ + movl right##1d, RW1d; \ + rorl $1, left##1d; \ + xorl left##1d, RW1d; \ + andl $0xaaaaaaaa, RW1d; \ + xorl RW1d, right##1d; \ + xorl RW1d, left##1d; \ + rorl $1, right##1d; \ + compress_to_64bits(right##2); \ + compress_to_64bits(left##2); \ + movl right##2d, RW2d; \ + rorl $1, left##2d; \ + xorl left##2d, RW2d; \ + andl $0xaaaaaaaa, RW2d; \ + xorl RW2d, right##2d; \ + xorl RW2d, left##2d; \ + rorl $1, right##2d; \ + \ + do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \ + do_permutation(right##0d, left##0d, 2, 0x33333333); \ + do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \ + do_permutation(right##1d, left##1d, 2, 0x33333333); \ + do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \ + do_permutation(right##2d, left##2d, 2, 0x33333333); \ + \ + do_permutation(left##0d, right##0d, 16, 0x0000ffff); \ + do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \ + do_permutation(left##1d, right##1d, 16, 0x0000ffff); \ + do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \ + do_permutation(left##2d, right##2d, 16, 0x0000ffff); \ + do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); + +#define round3(n, from, to, load_next_key, do_movq) \ + xorq from##0, RW0; \ + movzbl RW0bl, RT3d; \ + movzbl RW0bh, RT1d; \ + shrq $16, RW0; \ + xorq s8(, RT3, 8), to##0; \ + xorq s6(, RT1, 8), to##0; \ + movzbl RW0bl, RT3d; \ + movzbl RW0bh, RT1d; \ + shrq $16, RW0; \ + xorq s4(, RT3, 8), to##0; \ + xorq s2(, RT1, 8), to##0; \ + movzbl RW0bl, RT3d; \ + movzbl RW0bh, RT1d; \ + shrl $16, RW0d; \ + xorq s7(, RT3, 8), to##0; \ + xorq s5(, RT1, 8), to##0; \ + movzbl RW0bl, RT3d; \ + movzbl RW0bh, RT1d; \ + load_next_key(n, RW0); \ + xorq s3(, RT3, 8), to##0; \ + xorq s1(, RT1, 8), to##0; \ + xorq from##1, RW1; \ + movzbl RW1bl, RT3d; \ + movzbl RW1bh, RT1d; \ + shrq $16, RW1; \ + xorq s8(, RT3, 8), to##1; \ + xorq s6(, RT1, 8), to##1; \ + movzbl RW1bl, RT3d; \ + movzbl RW1bh, RT1d; \ + shrq $16, RW1; \ + xorq s4(, RT3, 8), to##1; \ + xorq s2(, RT1, 8), to##1; \ + movzbl RW1bl, RT3d; \ + movzbl RW1bh, RT1d; \ + shrl $16, RW1d; \ + xorq s7(, RT3, 8), to##1; \ + xorq s5(, RT1, 8), to##1; \ + movzbl RW1bl, RT3d; \ + movzbl RW1bh, RT1d; \ + do_movq(RW0, RW1); \ + xorq s3(, RT3, 8), to##1; \ + xorq s1(, RT1, 8), to##1; \ + xorq from##2, RW2; \ + movzbl RW2bl, RT3d; \ + movzbl RW2bh, RT1d; \ + shrq $16, RW2; \ + xorq s8(, RT3, 8), to##2; \ + xorq s6(, RT1, 8), to##2; \ + movzbl RW2bl, RT3d; \ + movzbl RW2bh, RT1d; \ + shrq $16, RW2; \ + xorq s4(, RT3, 8), to##2; \ + xorq s2(, RT1, 8), to##2; \ + movzbl RW2bl, RT3d; \ + movzbl RW2bh, RT1d; \ + shrl $16, RW2d; \ + xorq s7(, RT3, 8), to##2; \ + xorq s5(, RT1, 8), to##2; \ + movzbl RW2bl, RT3d; \ + movzbl RW2bh, RT1d; \ + do_movq(RW0, RW2); \ + xorq s3(, RT3, 8), to##2; \ + xorq s1(, RT1, 8), to##2; + +#define __movq(src, dst) \ + movq src, dst; + +ENTRY(des3_ede_x86_64_crypt_blk_3way) + /* input: + * %rdi: ctx, round keys + * %rsi: dst (3 blocks) + * %rdx: src (3 blocks) + */ + + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + pushq %r14; + pushq %r15; + + /* load input */ + movl 0 * 4(%rdx), RL0d; + movl 1 * 4(%rdx), RR0d; + movl 2 * 4(%rdx), RL1d; + movl 3 * 4(%rdx), RR1d; + movl 4 * 4(%rdx), RL2d; + movl 5 * 4(%rdx), RR2d; + + bswapl RL0d; + bswapl RR0d; + bswapl RL1d; + bswapl RR1d; + bswapl RL2d; + bswapl RR2d; + + initial_permutation3(RL, RR); + + movq 0(CTX), RW0; + movq RW0, RW1; + movq RW0, RW2; + + round3(0, RR, RL, load_next_key, __movq); + round3(1, RL, RR, load_next_key, __movq); + round3(2, RR, RL, load_next_key, __movq); + round3(3, RL, RR, load_next_key, __movq); + round3(4, RR, RL, load_next_key, __movq); + round3(5, RL, RR, load_next_key, __movq); + round3(6, RR, RL, load_next_key, __movq); + round3(7, RL, RR, load_next_key, __movq); + round3(8, RR, RL, load_next_key, __movq); + round3(9, RL, RR, load_next_key, __movq); + round3(10, RR, RL, load_next_key, __movq); + round3(11, RL, RR, load_next_key, __movq); + round3(12, RR, RL, load_next_key, __movq); + round3(13, RL, RR, load_next_key, __movq); + round3(14, RR, RL, load_next_key, __movq); + round3(15, RL, RR, load_next_key, __movq); + + round3(16+0, RL, RR, load_next_key, __movq); + round3(16+1, RR, RL, load_next_key, __movq); + round3(16+2, RL, RR, load_next_key, __movq); + round3(16+3, RR, RL, load_next_key, __movq); + round3(16+4, RL, RR, load_next_key, __movq); + round3(16+5, RR, RL, load_next_key, __movq); + round3(16+6, RL, RR, load_next_key, __movq); + round3(16+7, RR, RL, load_next_key, __movq); + round3(16+8, RL, RR, load_next_key, __movq); + round3(16+9, RR, RL, load_next_key, __movq); + round3(16+10, RL, RR, load_next_key, __movq); + round3(16+11, RR, RL, load_next_key, __movq); + round3(16+12, RL, RR, load_next_key, __movq); + round3(16+13, RR, RL, load_next_key, __movq); + round3(16+14, RL, RR, load_next_key, __movq); + round3(16+15, RR, RL, load_next_key, __movq); + + round3(32+0, RR, RL, load_next_key, __movq); + round3(32+1, RL, RR, load_next_key, __movq); + round3(32+2, RR, RL, load_next_key, __movq); + round3(32+3, RL, RR, load_next_key, __movq); + round3(32+4, RR, RL, load_next_key, __movq); + round3(32+5, RL, RR, load_next_key, __movq); + round3(32+6, RR, RL, load_next_key, __movq); + round3(32+7, RL, RR, load_next_key, __movq); + round3(32+8, RR, RL, load_next_key, __movq); + round3(32+9, RL, RR, load_next_key, __movq); + round3(32+10, RR, RL, load_next_key, __movq); + round3(32+11, RL, RR, load_next_key, __movq); + round3(32+12, RR, RL, load_next_key, __movq); + round3(32+13, RL, RR, load_next_key, __movq); + round3(32+14, RR, RL, load_next_key, __movq); + round3(32+15, RL, RR, dummy2, dummy2); + + final_permutation3(RR, RL); + + bswapl RR0d; + bswapl RL0d; + bswapl RR1d; + bswapl RL1d; + bswapl RR2d; + bswapl RL2d; + + movl RR0d, 0 * 4(%rsi); + movl RL0d, 1 * 4(%rsi); + movl RR1d, 2 * 4(%rsi); + movl RL1d, 3 * 4(%rsi); + movl RR2d, 4 * 4(%rsi); + movl RL2d, 5 * 4(%rsi); + + popq %r15; + popq %r14; + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + + ret; +ENDPROC(des3_ede_x86_64_crypt_blk_3way) + +.data +.align 16 +.L_s1: + .quad 0x0010100001010400, 0x0000000000000000 + .quad 0x0000100000010000, 0x0010100001010404 + .quad 0x0010100001010004, 0x0000100000010404 + .quad 0x0000000000000004, 0x0000100000010000 + .quad 0x0000000000000400, 0x0010100001010400 + .quad 0x0010100001010404, 0x0000000000000400 + .quad 0x0010000001000404, 0x0010100001010004 + .quad 0x0010000001000000, 0x0000000000000004 + .quad 0x0000000000000404, 0x0010000001000400 + .quad 0x0010000001000400, 0x0000100000010400 + .quad 0x0000100000010400, 0x0010100001010000 + .quad 0x0010100001010000, 0x0010000001000404 + .quad 0x0000100000010004, 0x0010000001000004 + .quad 0x0010000001000004, 0x0000100000010004 + .quad 0x0000000000000000, 0x0000000000000404 + .quad 0x0000100000010404, 0x0010000001000000 + .quad 0x0000100000010000, 0x0010100001010404 + .quad 0x0000000000000004, 0x0010100001010000 + .quad 0x0010100001010400, 0x0010000001000000 + .quad 0x0010000001000000, 0x0000000000000400 + .quad 0x0010100001010004, 0x0000100000010000 + .quad 0x0000100000010400, 0x0010000001000004 + .quad 0x0000000000000400, 0x0000000000000004 + .quad 0x0010000001000404, 0x0000100000010404 + .quad 0x0010100001010404, 0x0000100000010004 + .quad 0x0010100001010000, 0x0010000001000404 + .quad 0x0010000001000004, 0x0000000000000404 + .quad 0x0000100000010404, 0x0010100001010400 + .quad 0x0000000000000404, 0x0010000001000400 + .quad 0x0010000001000400, 0x0000000000000000 + .quad 0x0000100000010004, 0x0000100000010400 + .quad 0x0000000000000000, 0x0010100001010004 +.L_s2: + .quad 0x0801080200100020, 0x0800080000000000 + .quad 0x0000080000000000, 0x0001080200100020 + .quad 0x0001000000100000, 0x0000000200000020 + .quad 0x0801000200100020, 0x0800080200000020 + .quad 0x0800000200000020, 0x0801080200100020 + .quad 0x0801080000100000, 0x0800000000000000 + .quad 0x0800080000000000, 0x0001000000100000 + .quad 0x0000000200000020, 0x0801000200100020 + .quad 0x0001080000100000, 0x0001000200100020 + .quad 0x0800080200000020, 0x0000000000000000 + .quad 0x0800000000000000, 0x0000080000000000 + .quad 0x0001080200100020, 0x0801000000100000 + .quad 0x0001000200100020, 0x0800000200000020 + .quad 0x0000000000000000, 0x0001080000100000 + .quad 0x0000080200000020, 0x0801080000100000 + .quad 0x0801000000100000, 0x0000080200000020 + .quad 0x0000000000000000, 0x0001080200100020 + .quad 0x0801000200100020, 0x0001000000100000 + .quad 0x0800080200000020, 0x0801000000100000 + .quad 0x0801080000100000, 0x0000080000000000 + .quad 0x0801000000100000, 0x0800080000000000 + .quad 0x0000000200000020, 0x0801080200100020 + .quad 0x0001080200100020, 0x0000000200000020 + .quad 0x0000080000000000, 0x0800000000000000 + .quad 0x0000080200000020, 0x0801080000100000 + .quad 0x0001000000100000, 0x0800000200000020 + .quad 0x0001000200100020, 0x0800080200000020 + .quad 0x0800000200000020, 0x0001000200100020 + .quad 0x0001080000100000, 0x0000000000000000 + .quad 0x0800080000000000, 0x0000080200000020 + .quad 0x0800000000000000, 0x0801000200100020 + .quad 0x0801080200100020, 0x0001080000100000 +.L_s3: + .quad 0x0000002000000208, 0x0000202008020200 + .quad 0x0000000000000000, 0x0000200008020008 + .quad 0x0000002008000200, 0x0000000000000000 + .quad 0x0000202000020208, 0x0000002008000200 + .quad 0x0000200000020008, 0x0000000008000008 + .quad 0x0000000008000008, 0x0000200000020000 + .quad 0x0000202008020208, 0x0000200000020008 + .quad 0x0000200008020000, 0x0000002000000208 + .quad 0x0000000008000000, 0x0000000000000008 + .quad 0x0000202008020200, 0x0000002000000200 + .quad 0x0000202000020200, 0x0000200008020000 + .quad 0x0000200008020008, 0x0000202000020208 + .quad 0x0000002008000208, 0x0000202000020200 + .quad 0x0000200000020000, 0x0000002008000208 + .quad 0x0000000000000008, 0x0000202008020208 + .quad 0x0000002000000200, 0x0000000008000000 + .quad 0x0000202008020200, 0x0000000008000000 + .quad 0x0000200000020008, 0x0000002000000208 + .quad 0x0000200000020000, 0x0000202008020200 + .quad 0x0000002008000200, 0x0000000000000000 + .quad 0x0000002000000200, 0x0000200000020008 + .quad 0x0000202008020208, 0x0000002008000200 + .quad 0x0000000008000008, 0x0000002000000200 + .quad 0x0000000000000000, 0x0000200008020008 + .quad 0x0000002008000208, 0x0000200000020000 + .quad 0x0000000008000000, 0x0000202008020208 + .quad 0x0000000000000008, 0x0000202000020208 + .quad 0x0000202000020200, 0x0000000008000008 + .quad 0x0000200008020000, 0x0000002008000208 + .quad 0x0000002000000208, 0x0000200008020000 + .quad 0x0000202000020208, 0x0000000000000008 + .quad 0x0000200008020008, 0x0000202000020200 +.L_s4: + .quad 0x1008020000002001, 0x1000020800002001 + .quad 0x1000020800002001, 0x0000000800000000 + .quad 0x0008020800002000, 0x1008000800000001 + .quad 0x1008000000000001, 0x1000020000002001 + .quad 0x0000000000000000, 0x0008020000002000 + .quad 0x0008020000002000, 0x1008020800002001 + .quad 0x1000000800000001, 0x0000000000000000 + .quad 0x0008000800000000, 0x1008000000000001 + .quad 0x1000000000000001, 0x0000020000002000 + .quad 0x0008000000000000, 0x1008020000002001 + .quad 0x0000000800000000, 0x0008000000000000 + .quad 0x1000020000002001, 0x0000020800002000 + .quad 0x1008000800000001, 0x1000000000000001 + .quad 0x0000020800002000, 0x0008000800000000 + .quad 0x0000020000002000, 0x0008020800002000 + .quad 0x1008020800002001, 0x1000000800000001 + .quad 0x0008000800000000, 0x1008000000000001 + .quad 0x0008020000002000, 0x1008020800002001 + .quad 0x1000000800000001, 0x0000000000000000 + .quad 0x0000000000000000, 0x0008020000002000 + .quad 0x0000020800002000, 0x0008000800000000 + .quad 0x1008000800000001, 0x1000000000000001 + .quad 0x1008020000002001, 0x1000020800002001 + .quad 0x1000020800002001, 0x0000000800000000 + .quad 0x1008020800002001, 0x1000000800000001 + .quad 0x1000000000000001, 0x0000020000002000 + .quad 0x1008000000000001, 0x1000020000002001 + .quad 0x0008020800002000, 0x1008000800000001 + .quad 0x1000020000002001, 0x0000020800002000 + .quad 0x0008000000000000, 0x1008020000002001 + .quad 0x0000000800000000, 0x0008000000000000 + .quad 0x0000020000002000, 0x0008020800002000 +.L_s5: + .quad 0x0000001000000100, 0x0020001002080100 + .quad 0x0020000002080000, 0x0420001002000100 + .quad 0x0000000000080000, 0x0000001000000100 + .quad 0x0400000000000000, 0x0020000002080000 + .quad 0x0400001000080100, 0x0000000000080000 + .quad 0x0020001002000100, 0x0400001000080100 + .quad 0x0420001002000100, 0x0420000002080000 + .quad 0x0000001000080100, 0x0400000000000000 + .quad 0x0020000002000000, 0x0400000000080000 + .quad 0x0400000000080000, 0x0000000000000000 + .quad 0x0400001000000100, 0x0420001002080100 + .quad 0x0420001002080100, 0x0020001002000100 + .quad 0x0420000002080000, 0x0400001000000100 + .quad 0x0000000000000000, 0x0420000002000000 + .quad 0x0020001002080100, 0x0020000002000000 + .quad 0x0420000002000000, 0x0000001000080100 + .quad 0x0000000000080000, 0x0420001002000100 + .quad 0x0000001000000100, 0x0020000002000000 + .quad 0x0400000000000000, 0x0020000002080000 + .quad 0x0420001002000100, 0x0400001000080100 + .quad 0x0020001002000100, 0x0400000000000000 + .quad 0x0420000002080000, 0x0020001002080100 + .quad 0x0400001000080100, 0x0000001000000100 + .quad 0x0020000002000000, 0x0420000002080000 + .quad 0x0420001002080100, 0x0000001000080100 + .quad 0x0420000002000000, 0x0420001002080100 + .quad 0x0020000002080000, 0x0000000000000000 + .quad 0x0400000000080000, 0x0420000002000000 + .quad 0x0000001000080100, 0x0020001002000100 + .quad 0x0400001000000100, 0x0000000000080000 + .quad 0x0000000000000000, 0x0400000000080000 + .quad 0x0020001002080100, 0x0400001000000100 +.L_s6: + .quad 0x0200000120000010, 0x0204000020000000 + .quad 0x0000040000000000, 0x0204040120000010 + .quad 0x0204000020000000, 0x0000000100000010 + .quad 0x0204040120000010, 0x0004000000000000 + .quad 0x0200040020000000, 0x0004040100000010 + .quad 0x0004000000000000, 0x0200000120000010 + .quad 0x0004000100000010, 0x0200040020000000 + .quad 0x0200000020000000, 0x0000040100000010 + .quad 0x0000000000000000, 0x0004000100000010 + .quad 0x0200040120000010, 0x0000040000000000 + .quad 0x0004040000000000, 0x0200040120000010 + .quad 0x0000000100000010, 0x0204000120000010 + .quad 0x0204000120000010, 0x0000000000000000 + .quad 0x0004040100000010, 0x0204040020000000 + .quad 0x0000040100000010, 0x0004040000000000 + .quad 0x0204040020000000, 0x0200000020000000 + .quad 0x0200040020000000, 0x0000000100000010 + .quad 0x0204000120000010, 0x0004040000000000 + .quad 0x0204040120000010, 0x0004000000000000 + .quad 0x0000040100000010, 0x0200000120000010 + .quad 0x0004000000000000, 0x0200040020000000 + .quad 0x0200000020000000, 0x0000040100000010 + .quad 0x0200000120000010, 0x0204040120000010 + .quad 0x0004040000000000, 0x0204000020000000 + .quad 0x0004040100000010, 0x0204040020000000 + .quad 0x0000000000000000, 0x0204000120000010 + .quad 0x0000000100000010, 0x0000040000000000 + .quad 0x0204000020000000, 0x0004040100000010 + .quad 0x0000040000000000, 0x0004000100000010 + .quad 0x0200040120000010, 0x0000000000000000 + .quad 0x0204040020000000, 0x0200000020000000 + .quad 0x0004000100000010, 0x0200040120000010 +.L_s7: + .quad 0x0002000000200000, 0x2002000004200002 + .quad 0x2000000004000802, 0x0000000000000000 + .quad 0x0000000000000800, 0x2000000004000802 + .quad 0x2002000000200802, 0x0002000004200800 + .quad 0x2002000004200802, 0x0002000000200000 + .quad 0x0000000000000000, 0x2000000004000002 + .quad 0x2000000000000002, 0x0000000004000000 + .quad 0x2002000004200002, 0x2000000000000802 + .quad 0x0000000004000800, 0x2002000000200802 + .quad 0x2002000000200002, 0x0000000004000800 + .quad 0x2000000004000002, 0x0002000004200000 + .quad 0x0002000004200800, 0x2002000000200002 + .quad 0x0002000004200000, 0x0000000000000800 + .quad 0x2000000000000802, 0x2002000004200802 + .quad 0x0002000000200800, 0x2000000000000002 + .quad 0x0000000004000000, 0x0002000000200800 + .quad 0x0000000004000000, 0x0002000000200800 + .quad 0x0002000000200000, 0x2000000004000802 + .quad 0x2000000004000802, 0x2002000004200002 + .quad 0x2002000004200002, 0x2000000000000002 + .quad 0x2002000000200002, 0x0000000004000000 + .quad 0x0000000004000800, 0x0002000000200000 + .quad 0x0002000004200800, 0x2000000000000802 + .quad 0x2002000000200802, 0x0002000004200800 + .quad 0x2000000000000802, 0x2000000004000002 + .quad 0x2002000004200802, 0x0002000004200000 + .quad 0x0002000000200800, 0x0000000000000000 + .quad 0x2000000000000002, 0x2002000004200802 + .quad 0x0000000000000000, 0x2002000000200802 + .quad 0x0002000004200000, 0x0000000000000800 + .quad 0x2000000004000002, 0x0000000004000800 + .quad 0x0000000000000800, 0x2002000000200002 +.L_s8: + .quad 0x0100010410001000, 0x0000010000001000 + .quad 0x0000000000040000, 0x0100010410041000 + .quad 0x0100000010000000, 0x0100010410001000 + .quad 0x0000000400000000, 0x0100000010000000 + .quad 0x0000000400040000, 0x0100000010040000 + .quad 0x0100010410041000, 0x0000010000041000 + .quad 0x0100010010041000, 0x0000010400041000 + .quad 0x0000010000001000, 0x0000000400000000 + .quad 0x0100000010040000, 0x0100000410000000 + .quad 0x0100010010001000, 0x0000010400001000 + .quad 0x0000010000041000, 0x0000000400040000 + .quad 0x0100000410040000, 0x0100010010041000 + .quad 0x0000010400001000, 0x0000000000000000 + .quad 0x0000000000000000, 0x0100000410040000 + .quad 0x0100000410000000, 0x0100010010001000 + .quad 0x0000010400041000, 0x0000000000040000 + .quad 0x0000010400041000, 0x0000000000040000 + .quad 0x0100010010041000, 0x0000010000001000 + .quad 0x0000000400000000, 0x0100000410040000 + .quad 0x0000010000001000, 0x0000010400041000 + .quad 0x0100010010001000, 0x0000000400000000 + .quad 0x0100000410000000, 0x0100000010040000 + .quad 0x0100000410040000, 0x0100000010000000 + .quad 0x0000000000040000, 0x0100010410001000 + .quad 0x0000000000000000, 0x0100010410041000 + .quad 0x0000000400040000, 0x0100000410000000 + .quad 0x0100000010040000, 0x0100010010001000 + .quad 0x0100010410001000, 0x0000000000000000 + .quad 0x0100010410041000, 0x0000010000041000 + .quad 0x0000010000041000, 0x0000010400001000 + .quad 0x0000010400001000, 0x0000000400040000 + .quad 0x0100000010000000, 0x0100010010041000 diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c new file mode 100644 index 00000000000..0e9c0668fe4 --- /dev/null +++ b/arch/x86/crypto/des3_ede_glue.c @@ -0,0 +1,509 @@ +/* + * Glue Code for assembler optimized version of 3DES + * + * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: + * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> + * CTR part based on code (crypto/ctr.c) by: + * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <asm/processor.h> +#include <crypto/des.h> +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/types.h> +#include <crypto/algapi.h> + +struct des3_ede_x86_ctx { + u32 enc_expkey[DES3_EDE_EXPKEY_WORDS]; + u32 dec_expkey[DES3_EDE_EXPKEY_WORDS]; +}; + +/* regular block cipher functions */ +asmlinkage void des3_ede_x86_64_crypt_blk(const u32 *expkey, u8 *dst, + const u8 *src); + +/* 3-way parallel cipher functions */ +asmlinkage void des3_ede_x86_64_crypt_blk_3way(const u32 *expkey, u8 *dst, + const u8 *src); + +static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, + const u8 *src) +{ + u32 *enc_ctx = ctx->enc_expkey; + + des3_ede_x86_64_crypt_blk(enc_ctx, dst, src); +} + +static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, + const u8 *src) +{ + u32 *dec_ctx = ctx->dec_expkey; + + des3_ede_x86_64_crypt_blk(dec_ctx, dst, src); +} + +static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst, + const u8 *src) +{ + u32 *enc_ctx = ctx->enc_expkey; + + des3_ede_x86_64_crypt_blk_3way(enc_ctx, dst, src); +} + +static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst, + const u8 *src) +{ + u32 *dec_ctx = ctx->dec_expkey; + + des3_ede_x86_64_crypt_blk_3way(dec_ctx, dst, src); +} + +static void des3_ede_x86_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + des3_ede_enc_blk(crypto_tfm_ctx(tfm), dst, src); +} + +static void des3_ede_x86_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + des3_ede_dec_blk(crypto_tfm_ctx(tfm), dst, src); +} + +static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, + const u32 *expkey) +{ + unsigned int bsize = DES3_EDE_BLOCK_SIZE; + unsigned int nbytes; + int err; + + err = blkcipher_walk_virt(desc, walk); + + while ((nbytes = walk->nbytes)) { + u8 *wsrc = walk->src.virt.addr; + u8 *wdst = walk->dst.virt.addr; + + /* Process four block batch */ + if (nbytes >= bsize * 3) { + do { + des3_ede_x86_64_crypt_blk_3way(expkey, wdst, + wsrc); + + wsrc += bsize * 3; + wdst += bsize * 3; + nbytes -= bsize * 3; + } while (nbytes >= bsize * 3); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + des3_ede_x86_64_crypt_blk(expkey, wdst, wsrc); + + wsrc += bsize; + wdst += bsize; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + err = blkcipher_walk_done(desc, walk, nbytes); + } + + return err; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, ctx->enc_expkey); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, ctx->dec_expkey); +} + +static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = DES3_EDE_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u64 *src = (u64 *)walk->src.virt.addr; + u64 *dst = (u64 *)walk->dst.virt.addr; + u64 *iv = (u64 *)walk->iv; + + do { + *dst = *src ^ *iv; + des3_ede_enc_blk(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + + *(u64 *)walk->iv = *iv; + return nbytes; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_encrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = DES3_EDE_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u64 *src = (u64 *)walk->src.virt.addr; + u64 *dst = (u64 *)walk->dst.virt.addr; + u64 ivs[3 - 1]; + u64 last_iv; + + /* Start of the last block. */ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; + + last_iv = *src; + + /* Process four block batch */ + if (nbytes >= bsize * 3) { + do { + nbytes -= bsize * 3 - bsize; + src -= 3 - 1; + dst -= 3 - 1; + + ivs[0] = src[0]; + ivs[1] = src[1]; + + des3_ede_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); + + dst[1] ^= ivs[0]; + dst[2] ^= ivs[1]; + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + *dst ^= *(src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= bsize * 3); + } + + /* Handle leftovers */ + for (;;) { + des3_ede_dec_blk(ctx, (u8 *)dst, (u8 *)src); + + nbytes -= bsize; + if (nbytes < bsize) + break; + + *dst ^= *(src - 1); + src -= 1; + dst -= 1; + } + +done: + *dst ^= *(u64 *)walk->iv; + *(u64 *)walk->iv = last_iv; + + return nbytes; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_decrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx, + struct blkcipher_walk *walk) +{ + u8 *ctrblk = walk->iv; + u8 keystream[DES3_EDE_BLOCK_SIZE]; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + + des3_ede_enc_blk(ctx, keystream, ctrblk); + crypto_xor(keystream, src, nbytes); + memcpy(dst, keystream, nbytes); + + crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE); +} + +static unsigned int __ctr_crypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = DES3_EDE_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + __be64 *src = (__be64 *)walk->src.virt.addr; + __be64 *dst = (__be64 *)walk->dst.virt.addr; + u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); + __be64 ctrblocks[3]; + + /* Process four block batch */ + if (nbytes >= bsize * 3) { + do { + /* create ctrblks for parallel encrypt */ + ctrblocks[0] = cpu_to_be64(ctrblk++); + ctrblocks[1] = cpu_to_be64(ctrblk++); + ctrblocks[2] = cpu_to_be64(ctrblk++); + + des3_ede_enc_blk_3way(ctx, (u8 *)ctrblocks, + (u8 *)ctrblocks); + + dst[0] = src[0] ^ ctrblocks[0]; + dst[1] = src[1] ^ ctrblocks[1]; + dst[2] = src[2] ^ ctrblocks[2]; + + src += 3; + dst += 3; + } while ((nbytes -= bsize * 3) >= bsize * 3); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + ctrblocks[0] = cpu_to_be64(ctrblk++); + + des3_ede_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); + + dst[0] = src[0] ^ ctrblocks[0]; + + src += 1; + dst += 1; + } while ((nbytes -= bsize) >= bsize); + +done: + *(__be64 *)walk->iv = cpu_to_be64(ctrblk); + return nbytes; +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, DES3_EDE_BLOCK_SIZE); + + while ((nbytes = walk.nbytes) >= DES3_EDE_BLOCK_SIZE) { + nbytes = __ctr_crypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + if (walk.nbytes) { + ctr_crypt_final(crypto_blkcipher_ctx(desc->tfm), &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} + +static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct des3_ede_x86_ctx *ctx = crypto_tfm_ctx(tfm); + u32 i, j, tmp; + int err; + + /* Generate encryption context using generic implementation. */ + err = __des3_ede_setkey(ctx->enc_expkey, &tfm->crt_flags, key, keylen); + if (err < 0) + return err; + + /* Fix encryption context for this implementation and form decryption + * context. */ + j = DES3_EDE_EXPKEY_WORDS - 2; + for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2, j -= 2) { + tmp = ror32(ctx->enc_expkey[i + 1], 4); + ctx->enc_expkey[i + 1] = tmp; + + ctx->dec_expkey[j + 0] = ctx->enc_expkey[i + 0]; + ctx->dec_expkey[j + 1] = tmp; + } + + return 0; +} + +static struct crypto_alg des3_ede_algs[4] = { { + .cra_name = "des3_ede", + .cra_driver_name = "des3_ede-asm", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct des3_ede_x86_ctx), + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + .cra_u = { + .cipher = { + .cia_min_keysize = DES3_EDE_KEY_SIZE, + .cia_max_keysize = DES3_EDE_KEY_SIZE, + .cia_setkey = des3_ede_x86_setkey, + .cia_encrypt = des3_ede_x86_encrypt, + .cia_decrypt = des3_ede_x86_decrypt, + } + } +}, { + .cra_name = "ecb(des3_ede)", + .cra_driver_name = "ecb-des3_ede-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct des3_ede_x86_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .setkey = des3_ede_x86_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "cbc(des3_ede)", + .cra_driver_name = "cbc-des3_ede-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct des3_ede_x86_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .ivsize = DES3_EDE_BLOCK_SIZE, + .setkey = des3_ede_x86_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "ctr(des3_ede)", + .cra_driver_name = "ctr-des3_ede-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct des3_ede_x86_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .ivsize = DES3_EDE_BLOCK_SIZE, + .setkey = des3_ede_x86_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +} }; + +static bool is_blacklisted_cpu(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + if (boot_cpu_data.x86 == 0x0f) { + /* + * On Pentium 4, des3_ede-x86_64 is slower than generic C + * implementation because use of 64bit rotates (which are really + * slow on P4). Therefore blacklist P4s. + */ + return true; + } + + return false; +} + +static int force; +module_param(force, int, 0); +MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); + +static int __init des3_ede_x86_init(void) +{ + if (!force && is_blacklisted_cpu()) { + pr_info("des3_ede-x86_64: performance on this CPU would be suboptimal: disabling des3_ede-x86_64.\n"); + return -ENODEV; + } + + return crypto_register_algs(des3_ede_algs, ARRAY_SIZE(des3_ede_algs)); +} + +static void __exit des3_ede_x86_fini(void) +{ + crypto_unregister_algs(des3_ede_algs, ARRAY_SIZE(des3_ede_algs)); +} + +module_init(des3_ede_x86_init); +module_exit(des3_ede_x86_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized"); +MODULE_ALIAS("des3_ede"); +MODULE_ALIAS("des3_ede-asm"); +MODULE_ALIAS("des"); +MODULE_ALIAS("des-asm"); +MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>"); diff --git a/arch/x86/crypto/sha-mb/Makefile b/arch/x86/crypto/sha-mb/Makefile new file mode 100644 index 00000000000..2f8756375df --- /dev/null +++ b/arch/x86/crypto/sha-mb/Makefile @@ -0,0 +1,11 @@ +# +# Arch-specific CryptoAPI modules. +# + +avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ + $(comma)4)$(comma)%ymm2,yes,no) +ifeq ($(avx2_supported),yes) + obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb.o + sha1-mb-y := sha1_mb.o sha1_mb_mgr_flush_avx2.o \ + sha1_mb_mgr_init_avx2.o sha1_mb_mgr_submit_avx2.o sha1_x8_avx2.o +endif diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c new file mode 100644 index 00000000000..99eefd81295 --- /dev/null +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -0,0 +1,935 @@ +/* + * Multi buffer SHA1 algorithm Glue Code + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <linux/list.h> +#include <crypto/scatterwalk.h> +#include <crypto/sha.h> +#include <crypto/mcryptd.h> +#include <crypto/crypto_wq.h> +#include <asm/byteorder.h> +#include <asm/i387.h> +#include <asm/xcr.h> +#include <asm/xsave.h> +#include <linux/hardirq.h> +#include <asm/fpu-internal.h> +#include "sha_mb_ctx.h" + +#define FLUSH_INTERVAL 1000 /* in usec */ + +static struct mcryptd_alg_state sha1_mb_alg_state; + +struct sha1_mb_ctx { + struct mcryptd_ahash *mcryptd_tfm; +}; + +static inline struct mcryptd_hash_request_ctx *cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx) +{ + struct shash_desc *desc; + + desc = container_of((void *) hash_ctx, struct shash_desc, __ctx); + return container_of(desc, struct mcryptd_hash_request_ctx, desc); +} + +static inline struct ahash_request *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx) +{ + return container_of((void *) ctx, struct ahash_request, __ctx); +} + +static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx, + struct shash_desc *desc) +{ + rctx->flag = HASH_UPDATE; +} + +static asmlinkage void (*sha1_job_mgr_init)(struct sha1_mb_mgr *state); +static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)(struct sha1_mb_mgr *state, + struct job_sha1 *job); +static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)(struct sha1_mb_mgr *state); +static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)(struct sha1_mb_mgr *state); + +inline void sha1_init_digest(uint32_t *digest) +{ + static const uint32_t initial_digest[SHA1_DIGEST_LENGTH] = {SHA1_H0, + SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }; + memcpy(digest, initial_digest, sizeof(initial_digest)); +} + +inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], + uint32_t total_len) +{ + uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1); + + memset(&padblock[i], 0, SHA1_BLOCK_SIZE); + padblock[i] = 0x80; + + i += ((SHA1_BLOCK_SIZE - 1) & + (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + + 1 + SHA1_PADLENGTHFIELD_SIZE; + +#if SHA1_PADLENGTHFIELD_SIZE == 16 + *((uint64_t *) &padblock[i - 16]) = 0; +#endif + + *((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3); + + /* Number of extra blocks to hash */ + return i >> SHA1_LOG2_BLOCK_SIZE; +} + +static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, struct sha1_hash_ctx *ctx) +{ + while (ctx) { + if (ctx->status & HASH_CTX_STS_COMPLETE) { + /* Clear PROCESSING bit */ + ctx->status = HASH_CTX_STS_COMPLETE; + return ctx; + } + + /* + * If the extra blocks are empty, begin hashing what remains + * in the user's buffer. + */ + if (ctx->partial_block_buffer_length == 0 && + ctx->incoming_buffer_length) { + + const void *buffer = ctx->incoming_buffer; + uint32_t len = ctx->incoming_buffer_length; + uint32_t copy_len; + + /* + * Only entire blocks can be hashed. + * Copy remainder to extra blocks buffer. + */ + copy_len = len & (SHA1_BLOCK_SIZE-1); + + if (copy_len) { + len -= copy_len; + memcpy(ctx->partial_block_buffer, + ((const char *) buffer + len), + copy_len); + ctx->partial_block_buffer_length = copy_len; + } + + ctx->incoming_buffer_length = 0; + + /* len should be a multiple of the block size now */ + assert((len % SHA1_BLOCK_SIZE) == 0); + + /* Set len to the number of blocks to be hashed */ + len >>= SHA1_LOG2_BLOCK_SIZE; + + if (len) { + + ctx->job.buffer = (uint8_t *) buffer; + ctx->job.len = len; + ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr, + &ctx->job); + continue; + } + } + + /* + * If the extra blocks are not empty, then we are + * either on the last block(s) or we need more + * user input before continuing. + */ + if (ctx->status & HASH_CTX_STS_LAST) { + + uint8_t *buf = ctx->partial_block_buffer; + uint32_t n_extra_blocks = sha1_pad(buf, ctx->total_length); + + ctx->status = (HASH_CTX_STS_PROCESSING | + HASH_CTX_STS_COMPLETE); + ctx->job.buffer = buf; + ctx->job.len = (uint32_t) n_extra_blocks; + ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr, &ctx->job); + continue; + } + + if (ctx) + ctx->status = HASH_CTX_STS_IDLE; + return ctx; + } + + return NULL; +} + +static struct sha1_hash_ctx *sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr) +{ + /* + * If get_comp_job returns NULL, there are no jobs complete. + * If get_comp_job returns a job, verify that it is safe to return to the user. + * If it is not ready, resubmit the job to finish processing. + * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. + * Otherwise, all jobs currently being managed by the hash_ctx_mgr still need processing. + */ + struct sha1_hash_ctx *ctx; + + ctx = (struct sha1_hash_ctx *) sha1_job_mgr_get_comp_job(&mgr->mgr); + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +static void sha1_ctx_mgr_init(struct sha1_ctx_mgr *mgr) +{ + sha1_job_mgr_init(&mgr->mgr); +} + +static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr, + struct sha1_hash_ctx *ctx, + const void *buffer, + uint32_t len, + int flags) +{ + if (flags & (~HASH_ENTIRE)) { + /* User should not pass anything other than FIRST, UPDATE, or LAST */ + ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; + return ctx; + } + + if (ctx->status & HASH_CTX_STS_PROCESSING) { + /* Cannot submit to a currently processing job. */ + ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; + return ctx; + } + + if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { + /* Cannot update a finished job. */ + ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; + return ctx; + } + + + if (flags & HASH_FIRST) { + /* Init digest */ + sha1_init_digest(ctx->job.result_digest); + + /* Reset byte counter */ + ctx->total_length = 0; + + /* Clear extra blocks */ + ctx->partial_block_buffer_length = 0; + } + + /* If we made it here, there were no errors during this call to submit */ + ctx->error = HASH_CTX_ERROR_NONE; + + /* Store buffer ptr info from user */ + ctx->incoming_buffer = buffer; + ctx->incoming_buffer_length = len; + + /* Store the user's request flags and mark this ctx as currently being processed. */ + ctx->status = (flags & HASH_LAST) ? + (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) : + HASH_CTX_STS_PROCESSING; + + /* Advance byte counter */ + ctx->total_length += len; + + /* + * If there is anything currently buffered in the extra blocks, + * append to it until it contains a whole block. + * Or if the user's buffer contains less than a whole block, + * append as much as possible to the extra block. + */ + if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) { + /* Compute how many bytes to copy from user buffer into extra block */ + uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length; + if (len < copy_len) + copy_len = len; + + if (copy_len) { + /* Copy and update relevant pointers and counters */ + memcpy(&ctx->partial_block_buffer[ctx->partial_block_buffer_length], + buffer, copy_len); + + ctx->partial_block_buffer_length += copy_len; + ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len); + ctx->incoming_buffer_length = len - copy_len; + } + + /* The extra block should never contain more than 1 block here */ + assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE); + + /* If the extra block buffer contains exactly 1 block, it can be hashed. */ + if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) { + ctx->partial_block_buffer_length = 0; + + ctx->job.buffer = ctx->partial_block_buffer; + ctx->job.len = 1; + ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr, &ctx->job); + } + } + + return sha1_ctx_mgr_resubmit(mgr, ctx); +} + +static struct sha1_hash_ctx *sha1_ctx_mgr_flush(struct sha1_ctx_mgr *mgr) +{ + struct sha1_hash_ctx *ctx; + + while (1) { + ctx = (struct sha1_hash_ctx *) sha1_job_mgr_flush(&mgr->mgr); + + /* If flush returned 0, there are no more jobs in flight. */ + if (!ctx) + return NULL; + + /* + * If flush returned a job, resubmit the job to finish processing. + */ + ctx = sha1_ctx_mgr_resubmit(mgr, ctx); + + /* + * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned. + * Otherwise, all jobs currently being managed by the sha1_ctx_mgr + * still need processing. Loop. + */ + if (ctx) + return ctx; + } +} + +static int sha1_mb_init(struct shash_desc *desc) +{ + struct sha1_hash_ctx *sctx = shash_desc_ctx(desc); + + hash_ctx_init(sctx); + sctx->job.result_digest[0] = SHA1_H0; + sctx->job.result_digest[1] = SHA1_H1; + sctx->job.result_digest[2] = SHA1_H2; + sctx->job.result_digest[3] = SHA1_H3; + sctx->job.result_digest[4] = SHA1_H4; + sctx->total_length = 0; + sctx->partial_block_buffer_length = 0; + sctx->status = HASH_CTX_STS_IDLE; + + return 0; +} + +static int sha1_mb_set_results(struct mcryptd_hash_request_ctx *rctx) +{ + int i; + struct sha1_hash_ctx *sctx = shash_desc_ctx(&rctx->desc); + __be32 *dst = (__be32 *) rctx->out; + + for (i = 0; i < 5; ++i) + dst[i] = cpu_to_be32(sctx->job.result_digest[i]); + + return 0; +} + +static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx, + struct mcryptd_alg_cstate *cstate, bool flush) +{ + int flag = HASH_UPDATE; + int nbytes, err = 0; + struct mcryptd_hash_request_ctx *rctx = *ret_rctx; + struct sha1_hash_ctx *sha_ctx; + + /* more work ? */ + while (!(rctx->flag & HASH_DONE)) { + nbytes = crypto_ahash_walk_done(&rctx->walk, 0); + if (nbytes < 0) { + err = nbytes; + goto out; + } + /* check if the walk is done */ + if (crypto_ahash_walk_last(&rctx->walk)) { + rctx->flag |= HASH_DONE; + if (rctx->flag & HASH_FINAL) + flag |= HASH_LAST; + + } + sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(&rctx->desc); + kernel_fpu_begin(); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, flag); + if (!sha_ctx) { + if (flush) + sha_ctx = sha1_ctx_mgr_flush(cstate->mgr); + } + kernel_fpu_end(); + if (sha_ctx) + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + else { + rctx = NULL; + goto out; + } + } + + /* copy the results */ + if (rctx->flag & HASH_FINAL) + sha1_mb_set_results(rctx); + +out: + *ret_rctx = rctx; + return err; +} + +static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, + struct mcryptd_alg_cstate *cstate, + int err) +{ + struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx); + struct sha1_hash_ctx *sha_ctx; + struct mcryptd_hash_request_ctx *req_ctx; + int ret; + + /* remove from work list */ + spin_lock(&cstate->work_lock); + list_del(&rctx->waiter); + spin_unlock(&cstate->work_lock); + + if (irqs_disabled()) + rctx->complete(&req->base, err); + else { + local_bh_disable(); + rctx->complete(&req->base, err); + local_bh_enable(); + } + + /* check to see if there are other jobs that are done */ + sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr); + while (sha_ctx) { + req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&req_ctx, cstate, false); + if (req_ctx) { + spin_lock(&cstate->work_lock); + list_del(&req_ctx->waiter); + spin_unlock(&cstate->work_lock); + + req = cast_mcryptd_ctx_to_req(req_ctx); + if (irqs_disabled()) + rctx->complete(&req->base, ret); + else { + local_bh_disable(); + rctx->complete(&req->base, ret); + local_bh_enable(); + } + } + sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr); + } + + return 0; +} + +static void sha1_mb_add_list(struct mcryptd_hash_request_ctx *rctx, + struct mcryptd_alg_cstate *cstate) +{ + unsigned long next_flush; + unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL); + + /* initialize tag */ + rctx->tag.arrival = jiffies; /* tag the arrival time */ + rctx->tag.seq_num = cstate->next_seq_num++; + next_flush = rctx->tag.arrival + delay; + rctx->tag.expire = next_flush; + + spin_lock(&cstate->work_lock); + list_add_tail(&rctx->waiter, &cstate->work_list); + spin_unlock(&cstate->work_lock); + + mcryptd_arm_flusher(cstate, delay); +} + +static int sha1_mb_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct mcryptd_hash_request_ctx *rctx = + container_of(desc, struct mcryptd_hash_request_ctx, desc); + struct mcryptd_alg_cstate *cstate = + this_cpu_ptr(sha1_mb_alg_state.alg_cstate); + + struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx); + struct sha1_hash_ctx *sha_ctx; + int ret = 0, nbytes; + + + /* sanity check */ + if (rctx->tag.cpu != smp_processor_id()) { + pr_err("mcryptd error: cpu clash\n"); + goto done; + } + + /* need to init context */ + req_ctx_init(rctx, desc); + + nbytes = crypto_ahash_walk_first(req, &rctx->walk); + + if (nbytes < 0) { + ret = nbytes; + goto done; + } + + if (crypto_ahash_walk_last(&rctx->walk)) + rctx->flag |= HASH_DONE; + + /* submit */ + sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc); + sha1_mb_add_list(rctx, cstate); + kernel_fpu_begin(); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, HASH_UPDATE); + kernel_fpu_end(); + + /* check if anything is returned */ + if (!sha_ctx) + return -EINPROGRESS; + + if (sha_ctx->error) { + ret = sha_ctx->error; + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + goto done; + } + + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&rctx, cstate, false); + + if (!rctx) + return -EINPROGRESS; +done: + sha_complete_job(rctx, cstate, ret); + return ret; +} + +static int sha1_mb_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct mcryptd_hash_request_ctx *rctx = + container_of(desc, struct mcryptd_hash_request_ctx, desc); + struct mcryptd_alg_cstate *cstate = + this_cpu_ptr(sha1_mb_alg_state.alg_cstate); + + struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx); + struct sha1_hash_ctx *sha_ctx; + int ret = 0, flag = HASH_UPDATE, nbytes; + + /* sanity check */ + if (rctx->tag.cpu != smp_processor_id()) { + pr_err("mcryptd error: cpu clash\n"); + goto done; + } + + /* need to init context */ + req_ctx_init(rctx, desc); + + nbytes = crypto_ahash_walk_first(req, &rctx->walk); + + if (nbytes < 0) { + ret = nbytes; + goto done; + } + + if (crypto_ahash_walk_last(&rctx->walk)) { + rctx->flag |= HASH_DONE; + flag = HASH_LAST; + } + rctx->out = out; + + /* submit */ + rctx->flag |= HASH_FINAL; + sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc); + sha1_mb_add_list(rctx, cstate); + + kernel_fpu_begin(); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, flag); + kernel_fpu_end(); + + /* check if anything is returned */ + if (!sha_ctx) + return -EINPROGRESS; + + if (sha_ctx->error) { + ret = sha_ctx->error; + goto done; + } + + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&rctx, cstate, false); + if (!rctx) + return -EINPROGRESS; +done: + sha_complete_job(rctx, cstate, ret); + return ret; +} + +static int sha1_mb_final(struct shash_desc *desc, u8 *out) +{ + struct mcryptd_hash_request_ctx *rctx = + container_of(desc, struct mcryptd_hash_request_ctx, desc); + struct mcryptd_alg_cstate *cstate = + this_cpu_ptr(sha1_mb_alg_state.alg_cstate); + + struct sha1_hash_ctx *sha_ctx; + int ret = 0; + u8 data; + + /* sanity check */ + if (rctx->tag.cpu != smp_processor_id()) { + pr_err("mcryptd error: cpu clash\n"); + goto done; + } + + /* need to init context */ + req_ctx_init(rctx, desc); + + rctx->out = out; + rctx->flag |= HASH_DONE | HASH_FINAL; + + sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc); + /* flag HASH_FINAL and 0 data size */ + sha1_mb_add_list(rctx, cstate); + kernel_fpu_begin(); + sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0, HASH_LAST); + kernel_fpu_end(); + + /* check if anything is returned */ + if (!sha_ctx) + return -EINPROGRESS; + + if (sha_ctx->error) { + ret = sha_ctx->error; + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + goto done; + } + + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + ret = sha_finish_walk(&rctx, cstate, false); + if (!rctx) + return -EINPROGRESS; +done: + sha_complete_job(rctx, cstate, ret); + return ret; +} + +static int sha1_mb_export(struct shash_desc *desc, void *out) +{ + struct sha1_hash_ctx *sctx = shash_desc_ctx(desc); + + memcpy(out, sctx, sizeof(*sctx)); + + return 0; +} + +static int sha1_mb_import(struct shash_desc *desc, const void *in) +{ + struct sha1_hash_ctx *sctx = shash_desc_ctx(desc); + + memcpy(sctx, in, sizeof(*sctx)); + + return 0; +} + + +static struct shash_alg sha1_mb_shash_alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_mb_init, + .update = sha1_mb_update, + .final = sha1_mb_final, + .finup = sha1_mb_finup, + .export = sha1_mb_export, + .import = sha1_mb_import, + .descsize = sizeof(struct sha1_hash_ctx), + .statesize = sizeof(struct sha1_hash_ctx), + .base = { + .cra_name = "__sha1-mb", + .cra_driver_name = "__intel_sha1-mb", + .cra_priority = 100, + /* + * use ASYNC flag as some buffers in multi-buffer + * algo may not have completed before hashing thread sleep + */ + .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list), + } +}; + +static int sha1_mb_async_init(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_init(mcryptd_req); +} + +static int sha1_mb_async_update(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_update(mcryptd_req); +} + +static int sha1_mb_async_finup(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_finup(mcryptd_req); +} + +static int sha1_mb_async_final(struct ahash_request *req) +{ + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_final(mcryptd_req); +} + +static int sha1_mb_async_digest(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *mcryptd_req = ahash_request_ctx(req); + struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm; + + memcpy(mcryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base); + return crypto_ahash_digest(mcryptd_req); +} + +static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm) +{ + struct mcryptd_ahash *mcryptd_tfm; + struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); + struct mcryptd_hash_ctx *mctx; + + mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", 0, 0); + if (IS_ERR(mcryptd_tfm)) + return PTR_ERR(mcryptd_tfm); + mctx = crypto_ahash_ctx(&mcryptd_tfm->base); + mctx->alg_state = &sha1_mb_alg_state; + ctx->mcryptd_tfm = mcryptd_tfm; + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + crypto_ahash_reqsize(&mcryptd_tfm->base)); + + return 0; +} + +static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm) +{ + struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); + + mcryptd_free_ahash(ctx->mcryptd_tfm); +} + +static struct ahash_alg sha1_mb_async_alg = { + .init = sha1_mb_async_init, + .update = sha1_mb_async_update, + .final = sha1_mb_async_final, + .finup = sha1_mb_async_finup, + .digest = sha1_mb_async_digest, + .halg = { + .digestsize = SHA1_DIGEST_SIZE, + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1_mb", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_type = &crypto_ahash_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(sha1_mb_async_alg.halg.base.cra_list), + .cra_init = sha1_mb_async_init_tfm, + .cra_exit = sha1_mb_async_exit_tfm, + .cra_ctxsize = sizeof(struct sha1_mb_ctx), + .cra_alignmask = 0, + }, + }, +}; + +static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate) +{ + struct mcryptd_hash_request_ctx *rctx; + unsigned long cur_time; + unsigned long next_flush = 0; + struct sha1_hash_ctx *sha_ctx; + + + cur_time = jiffies; + + while (!list_empty(&cstate->work_list)) { + rctx = list_entry(cstate->work_list.next, + struct mcryptd_hash_request_ctx, waiter); + if time_before(cur_time, rctx->tag.expire) + break; + kernel_fpu_begin(); + sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr); + kernel_fpu_end(); + if (!sha_ctx) { + pr_err("sha1_mb error: nothing got flushed for non-empty list\n"); + break; + } + rctx = cast_hash_to_mcryptd_ctx(sha_ctx); + sha_finish_walk(&rctx, cstate, true); + sha_complete_job(rctx, cstate, 0); + } + + if (!list_empty(&cstate->work_list)) { + rctx = list_entry(cstate->work_list.next, + struct mcryptd_hash_request_ctx, waiter); + /* get the hash context and then flush time */ + next_flush = rctx->tag.expire; + mcryptd_arm_flusher(cstate, get_delay(next_flush)); + } + return next_flush; +} + +static int __init sha1_mb_mod_init(void) +{ + + int cpu; + int err; + struct mcryptd_alg_cstate *cpu_state; + + /* check for dependent cpu features */ + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_BMI2)) + return -ENODEV; + + /* initialize multibuffer structures */ + sha1_mb_alg_state.alg_cstate = alloc_percpu(struct mcryptd_alg_cstate); + + sha1_job_mgr_init = sha1_mb_mgr_init_avx2; + sha1_job_mgr_submit = sha1_mb_mgr_submit_avx2; + sha1_job_mgr_flush = sha1_mb_mgr_flush_avx2; + sha1_job_mgr_get_comp_job = sha1_mb_mgr_get_comp_job_avx2; + + if (!sha1_mb_alg_state.alg_cstate) + return -ENOMEM; + for_each_possible_cpu(cpu) { + cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu); + cpu_state->next_flush = 0; + cpu_state->next_seq_num = 0; + cpu_state->flusher_engaged = false; + INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher); + cpu_state->cpu = cpu; + cpu_state->alg_state = &sha1_mb_alg_state; + cpu_state->mgr = (struct sha1_ctx_mgr *) kzalloc(sizeof(struct sha1_ctx_mgr), GFP_KERNEL); + if (!cpu_state->mgr) + goto err2; + sha1_ctx_mgr_init(cpu_state->mgr); + INIT_LIST_HEAD(&cpu_state->work_list); + spin_lock_init(&cpu_state->work_lock); + } + sha1_mb_alg_state.flusher = &sha1_mb_flusher; + + err = crypto_register_shash(&sha1_mb_shash_alg); + if (err) + goto err2; + err = crypto_register_ahash(&sha1_mb_async_alg); + if (err) + goto err1; + + + return 0; +err1: + crypto_unregister_shash(&sha1_mb_shash_alg); +err2: + for_each_possible_cpu(cpu) { + cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu); + kfree(cpu_state->mgr); + } + free_percpu(sha1_mb_alg_state.alg_cstate); + return -ENODEV; +} + +static void __exit sha1_mb_mod_fini(void) +{ + int cpu; + struct mcryptd_alg_cstate *cpu_state; + + crypto_unregister_ahash(&sha1_mb_async_alg); + crypto_unregister_shash(&sha1_mb_shash_alg); + for_each_possible_cpu(cpu) { + cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu); + kfree(cpu_state->mgr); + } + free_percpu(sha1_mb_alg_state.alg_cstate); +} + +module_init(sha1_mb_mod_init); +module_exit(sha1_mb_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, multi buffer accelerated"); + +MODULE_ALIAS("sha1"); diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S new file mode 100644 index 00000000000..86688c6e7a2 --- /dev/null +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S @@ -0,0 +1,287 @@ +/* + * Header file for multi buffer SHA1 algorithm data structure + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * James Guilford <james.guilford@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +# Macros for defining data structures + +# Usage example + +#START_FIELDS # JOB_AES +### name size align +#FIELD _plaintext, 8, 8 # pointer to plaintext +#FIELD _ciphertext, 8, 8 # pointer to ciphertext +#FIELD _IV, 16, 8 # IV +#FIELD _keys, 8, 8 # pointer to keys +#FIELD _len, 4, 4 # length in bytes +#FIELD _status, 4, 4 # status enumeration +#FIELD _user_data, 8, 8 # pointer to user data +#UNION _union, size1, align1, \ +# size2, align2, \ +# size3, align3, \ +# ... +#END_FIELDS +#%assign _JOB_AES_size _FIELD_OFFSET +#%assign _JOB_AES_align _STRUCT_ALIGN + +######################################################################### + +# Alternate "struc-like" syntax: +# STRUCT job_aes2 +# RES_Q .plaintext, 1 +# RES_Q .ciphertext, 1 +# RES_DQ .IV, 1 +# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN +# RES_U .union, size1, align1, \ +# size2, align2, \ +# ... +# ENDSTRUCT +# # Following only needed if nesting +# %assign job_aes2_size _FIELD_OFFSET +# %assign job_aes2_align _STRUCT_ALIGN +# +# RES_* macros take a name, a count and an optional alignment. +# The count in in terms of the base size of the macro, and the +# default alignment is the base size. +# The macros are: +# Macro Base size +# RES_B 1 +# RES_W 2 +# RES_D 4 +# RES_Q 8 +# RES_DQ 16 +# RES_Y 32 +# RES_Z 64 +# +# RES_U defines a union. It's arguments are a name and two or more +# pairs of "size, alignment" +# +# The two assigns are only needed if this structure is being nested +# within another. Even if the assigns are not done, one can still use +# STRUCT_NAME_size as the size of the structure. +# +# Note that for nesting, you still need to assign to STRUCT_NAME_size. +# +# The differences between this and using "struc" directly are that each +# type is implicitly aligned to its natural length (although this can be +# over-ridden with an explicit third parameter), and that the structure +# is padded at the end to its overall alignment. +# + +######################################################################### + +#ifndef _SHA1_MB_MGR_DATASTRUCT_ASM_ +#define _SHA1_MB_MGR_DATASTRUCT_ASM_ + +## START_FIELDS +.macro START_FIELDS + _FIELD_OFFSET = 0 + _STRUCT_ALIGN = 0 +.endm + +## FIELD name size align +.macro FIELD name size align + _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1)) + \name = _FIELD_OFFSET + _FIELD_OFFSET = _FIELD_OFFSET + (\size) +.if (\align > _STRUCT_ALIGN) + _STRUCT_ALIGN = \align +.endif +.endm + +## END_FIELDS +.macro END_FIELDS + _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1)) +.endm + +######################################################################## + +.macro STRUCT p1 +START_FIELDS +.struc \p1 +.endm + +.macro ENDSTRUCT + tmp = _FIELD_OFFSET + END_FIELDS + tmp = (_FIELD_OFFSET - %%tmp) +.if (tmp > 0) + .lcomm tmp +.endif +.endstruc +.endm + +## RES_int name size align +.macro RES_int p1 p2 p3 + name = \p1 + size = \p2 + align = .\p3 + + _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1)) +.align align +.lcomm name size + _FIELD_OFFSET = _FIELD_OFFSET + (size) +.if (align > _STRUCT_ALIGN) + _STRUCT_ALIGN = align +.endif +.endm + + + +# macro RES_B name, size [, align] +.macro RES_B _name, _size, _align=1 +RES_int _name _size _align +.endm + +# macro RES_W name, size [, align] +.macro RES_W _name, _size, _align=2 +RES_int _name 2*(_size) _align +.endm + +# macro RES_D name, size [, align] +.macro RES_D _name, _size, _align=4 +RES_int _name 4*(_size) _align +.endm + +# macro RES_Q name, size [, align] +.macro RES_Q _name, _size, _align=8 +RES_int _name 8*(_size) _align +.endm + +# macro RES_DQ name, size [, align] +.macro RES_DQ _name, _size, _align=16 +RES_int _name 16*(_size) _align +.endm + +# macro RES_Y name, size [, align] +.macro RES_Y _name, _size, _align=32 +RES_int _name 32*(_size) _align +.endm + +# macro RES_Z name, size [, align] +.macro RES_Z _name, _size, _align=64 +RES_int _name 64*(_size) _align +.endm + + +#endif + +######################################################################## +#### Define constants +######################################################################## + +######################################################################## +#### Define SHA1 Out Of Order Data Structures +######################################################################## + +START_FIELDS # LANE_DATA +### name size align +FIELD _job_in_lane, 8, 8 # pointer to job object +END_FIELDS + +_LANE_DATA_size = _FIELD_OFFSET +_LANE_DATA_align = _STRUCT_ALIGN + +######################################################################## + +START_FIELDS # SHA1_ARGS_X8 +### name size align +FIELD _digest, 4*5*8, 16 # transposed digest +FIELD _data_ptr, 8*8, 8 # array of pointers to data +END_FIELDS + +_SHA1_ARGS_X4_size = _FIELD_OFFSET +_SHA1_ARGS_X4_align = _STRUCT_ALIGN +_SHA1_ARGS_X8_size = _FIELD_OFFSET +_SHA1_ARGS_X8_align = _STRUCT_ALIGN + +######################################################################## + +START_FIELDS # MB_MGR +### name size align +FIELD _args, _SHA1_ARGS_X4_size, _SHA1_ARGS_X4_align +FIELD _lens, 4*8, 8 +FIELD _unused_lanes, 8, 8 +FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align +END_FIELDS + +_MB_MGR_size = _FIELD_OFFSET +_MB_MGR_align = _STRUCT_ALIGN + +_args_digest = _args + _digest +_args_data_ptr = _args + _data_ptr + + +######################################################################## +#### Define constants +######################################################################## + +#define STS_UNKNOWN 0 +#define STS_BEING_PROCESSED 1 +#define STS_COMPLETED 2 + +######################################################################## +#### Define JOB_SHA1 structure +######################################################################## + +START_FIELDS # JOB_SHA1 + +### name size align +FIELD _buffer, 8, 8 # pointer to buffer +FIELD _len, 4, 4 # length in bytes +FIELD _result_digest, 5*4, 32 # Digest (output) +FIELD _status, 4, 4 +FIELD _user_data, 8, 8 +END_FIELDS + +_JOB_SHA1_size = _FIELD_OFFSET +_JOB_SHA1_align = _STRUCT_ALIGN diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S new file mode 100644 index 00000000000..85c4e1cf717 --- /dev/null +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S @@ -0,0 +1,327 @@ +/* + * Flush routine for SHA1 multibuffer + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * James Guilford <james.guilford@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <linux/linkage.h> +#include "sha1_mb_mgr_datastruct.S" + + +.extern sha1_x8_avx2 + +# LINUX register definitions +#define arg1 %rdi +#define arg2 %rsi + +# Common definitions +#define state arg1 +#define job arg2 +#define len2 arg2 + +# idx must be a register not clobbered by sha1_x8_avx2 +#define idx %r8 +#define DWORD_idx %r8d + +#define unused_lanes %rbx +#define lane_data %rbx +#define tmp2 %rbx +#define tmp2_w %ebx + +#define job_rax %rax +#define tmp1 %rax +#define size_offset %rax +#define tmp %rax +#define start_offset %rax + +#define tmp3 %arg1 + +#define extra_blocks %arg2 +#define p %arg2 + + +# STACK_SPACE needs to be an odd multiple of 8 +_XMM_SAVE_SIZE = 10*16 +_GPR_SAVE_SIZE = 8*8 +_ALIGN_SIZE = 8 + +_XMM_SAVE = 0 +_GPR_SAVE = _XMM_SAVE + _XMM_SAVE_SIZE +STACK_SPACE = _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE + +.macro LABEL prefix n +\prefix\n\(): +.endm + +.macro JNE_SKIP i +jne skip_\i +.endm + +.altmacro +.macro SET_OFFSET _offset +offset = \_offset +.endm +.noaltmacro + +# JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state) +# arg 1 : rcx : state +ENTRY(sha1_mb_mgr_flush_avx2) + mov %rsp, %r10 + sub $STACK_SPACE, %rsp + and $~31, %rsp + mov %rbx, _GPR_SAVE(%rsp) + mov %r10, _GPR_SAVE+8*1(%rsp) #save rsp + mov %rbp, _GPR_SAVE+8*3(%rsp) + mov %r12, _GPR_SAVE+8*4(%rsp) + mov %r13, _GPR_SAVE+8*5(%rsp) + mov %r14, _GPR_SAVE+8*6(%rsp) + mov %r15, _GPR_SAVE+8*7(%rsp) + + # If bit (32+3) is set, then all lanes are empty + mov _unused_lanes(state), unused_lanes + bt $32+3, unused_lanes + jc return_null + + # find a lane with a non-null job + xor idx, idx + offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne one(%rip), idx + offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne two(%rip), idx + offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne three(%rip), idx + offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne four(%rip), idx + offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne five(%rip), idx + offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne six(%rip), idx + offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) + cmovne seven(%rip), idx + + # copy idx to empty lanes +copy_lane_data: + offset = (_args + _data_ptr) + mov offset(state,idx,8), tmp + + I = 0 +.rep 8 + offset = (_ldata + I * _LANE_DATA_size + _job_in_lane) + cmpq $0, offset(state) +.altmacro + JNE_SKIP %I + offset = (_args + _data_ptr + 8*I) + mov tmp, offset(state) + offset = (_lens + 4*I) + movl $0xFFFFFFFF, offset(state) +LABEL skip_ %I + I = (I+1) +.noaltmacro +.endr + + # Find min length + vmovdqa _lens+0*16(state), %xmm0 + vmovdqa _lens+1*16(state), %xmm1 + + vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} + vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F} + vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword + + vmovd %xmm2, DWORD_idx + mov idx, len2 + and $0xF, idx + shr $4, len2 + jz len_is_0 + + vpand clear_low_nibble(%rip), %xmm2, %xmm2 + vpshufd $0, %xmm2, %xmm2 + + vpsubd %xmm2, %xmm0, %xmm0 + vpsubd %xmm2, %xmm1, %xmm1 + + vmovdqa %xmm0, _lens+0*16(state) + vmovdqa %xmm1, _lens+1*16(state) + + # "state" and "args" are the same address, arg1 + # len is arg2 + call sha1_x8_avx2 + # state and idx are intact + + +len_is_0: + # process completed job "idx" + imul $_LANE_DATA_size, idx, lane_data + lea _ldata(state, lane_data), lane_data + + mov _job_in_lane(lane_data), job_rax + movq $0, _job_in_lane(lane_data) + movl $STS_COMPLETED, _status(job_rax) + mov _unused_lanes(state), unused_lanes + shl $4, unused_lanes + or idx, unused_lanes + mov unused_lanes, _unused_lanes(state) + + movl $0xFFFFFFFF, _lens(state, idx, 4) + + vmovd _args_digest(state , idx, 4) , %xmm0 + vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0 + vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0 + vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0 + movl _args_digest+4*32(state, idx, 4), tmp2_w + + vmovdqu %xmm0, _result_digest(job_rax) + offset = (_result_digest + 1*16) + mov tmp2_w, offset(job_rax) + +return: + + mov _GPR_SAVE(%rsp), %rbx + mov _GPR_SAVE+8*1(%rsp), %r10 #saved rsp + mov _GPR_SAVE+8*3(%rsp), %rbp + mov _GPR_SAVE+8*4(%rsp), %r12 + mov _GPR_SAVE+8*5(%rsp), %r13 + mov _GPR_SAVE+8*6(%rsp), %r14 + mov _GPR_SAVE+8*7(%rsp), %r15 + mov %r10, %rsp + + ret + +return_null: + xor job_rax, job_rax + jmp return +ENDPROC(sha1_mb_mgr_flush_avx2) + + +################################################################# + +.align 16 +ENTRY(sha1_mb_mgr_get_comp_job_avx2) + push %rbx + + ## if bit 32+3 is set, then all lanes are empty + mov _unused_lanes(state), unused_lanes + bt $(32+3), unused_lanes + jc .return_null + + # Find min length + vmovdqa _lens(state), %xmm0 + vmovdqa _lens+1*16(state), %xmm1 + + vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} + vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F} + vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword + + vmovd %xmm2, DWORD_idx + test $~0xF, idx + jnz .return_null + + # process completed job "idx" + imul $_LANE_DATA_size, idx, lane_data + lea _ldata(state, lane_data), lane_data + + mov _job_in_lane(lane_data), job_rax + movq $0, _job_in_lane(lane_data) + movl $STS_COMPLETED, _status(job_rax) + mov _unused_lanes(state), unused_lanes + shl $4, unused_lanes + or idx, unused_lanes + mov unused_lanes, _unused_lanes(state) + + movl $0xFFFFFFFF, _lens(state, idx, 4) + + vmovd _args_digest(state, idx, 4), %xmm0 + vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0 + vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0 + vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0 + movl _args_digest+4*32(state, idx, 4), tmp2_w + + vmovdqu %xmm0, _result_digest(job_rax) + movl tmp2_w, _result_digest+1*16(job_rax) + + pop %rbx + + ret + +.return_null: + xor job_rax, job_rax + pop %rbx + ret +ENDPROC(sha1_mb_mgr_get_comp_job_avx2) + +.data + +.align 16 +clear_low_nibble: +.octa 0x000000000000000000000000FFFFFFF0 +one: +.quad 1 +two: +.quad 2 +three: +.quad 3 +four: +.quad 4 +five: +.quad 5 +six: +.quad 6 +seven: +.quad 7 diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c new file mode 100644 index 00000000000..4ca7e166a2a --- /dev/null +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c @@ -0,0 +1,64 @@ +/* + * Initialization code for multi buffer SHA1 algorithm for AVX2 + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "sha_mb_mgr.h" + +void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state) +{ + unsigned int j; + state->unused_lanes = 0xF76543210; + for (j = 0; j < 8; j++) { + state->lens[j] = 0xFFFFFFFF; + state->ldata[j].job_in_lane = NULL; + } +} diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S new file mode 100644 index 00000000000..2ab9560b53c --- /dev/null +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S @@ -0,0 +1,228 @@ +/* + * Buffer submit code for multi buffer SHA1 algorithm + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * James Guilford <james.guilford@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/linkage.h> +#include "sha1_mb_mgr_datastruct.S" + + +.extern sha1_x8_avx + +# LINUX register definitions +arg1 = %rdi +arg2 = %rsi +size_offset = %rcx +tmp2 = %rcx +extra_blocks = %rdx + +# Common definitions +#define state arg1 +#define job %rsi +#define len2 arg2 +#define p2 arg2 + +# idx must be a register not clobberred by sha1_x8_avx2 +idx = %r8 +DWORD_idx = %r8d +last_len = %r8 + +p = %r11 +start_offset = %r11 + +unused_lanes = %rbx +BYTE_unused_lanes = %bl + +job_rax = %rax +len = %rax +DWORD_len = %eax + +lane = %rbp +tmp3 = %rbp + +tmp = %r9 +DWORD_tmp = %r9d + +lane_data = %r10 + +# STACK_SPACE needs to be an odd multiple of 8 +STACK_SPACE = 8*8 + 16*10 + 8 + +# JOB* submit_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job) +# arg 1 : rcx : state +# arg 2 : rdx : job +ENTRY(sha1_mb_mgr_submit_avx2) + + mov %rsp, %r10 + sub $STACK_SPACE, %rsp + and $~31, %rsp + + mov %rbx, (%rsp) + mov %r10, 8*2(%rsp) #save old rsp + mov %rbp, 8*3(%rsp) + mov %r12, 8*4(%rsp) + mov %r13, 8*5(%rsp) + mov %r14, 8*6(%rsp) + mov %r15, 8*7(%rsp) + + mov _unused_lanes(state), unused_lanes + mov unused_lanes, lane + and $0xF, lane + shr $4, unused_lanes + imul $_LANE_DATA_size, lane, lane_data + movl $STS_BEING_PROCESSED, _status(job) + lea _ldata(state, lane_data), lane_data + mov unused_lanes, _unused_lanes(state) + movl _len(job), DWORD_len + + mov job, _job_in_lane(lane_data) + shl $4, len + or lane, len + + movl DWORD_len, _lens(state , lane, 4) + + # Load digest words from result_digest + vmovdqu _result_digest(job), %xmm0 + mov _result_digest+1*16(job), DWORD_tmp + vmovd %xmm0, _args_digest(state, lane, 4) + vpextrd $1, %xmm0, _args_digest+1*32(state , lane, 4) + vpextrd $2, %xmm0, _args_digest+2*32(state , lane, 4) + vpextrd $3, %xmm0, _args_digest+3*32(state , lane, 4) + movl DWORD_tmp, _args_digest+4*32(state , lane, 4) + + mov _buffer(job), p + mov p, _args_data_ptr(state, lane, 8) + + cmp $0xF, unused_lanes + jne return_null + +start_loop: + # Find min length + vmovdqa _lens(state), %xmm0 + vmovdqa _lens+1*16(state), %xmm1 + + vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} + vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F} + vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E} + vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword + + vmovd %xmm2, DWORD_idx + mov idx, len2 + and $0xF, idx + shr $4, len2 + jz len_is_0 + + vpand clear_low_nibble(%rip), %xmm2, %xmm2 + vpshufd $0, %xmm2, %xmm2 + + vpsubd %xmm2, %xmm0, %xmm0 + vpsubd %xmm2, %xmm1, %xmm1 + + vmovdqa %xmm0, _lens + 0*16(state) + vmovdqa %xmm1, _lens + 1*16(state) + + + # "state" and "args" are the same address, arg1 + # len is arg2 + call sha1_x8_avx2 + + # state and idx are intact + +len_is_0: + # process completed job "idx" + imul $_LANE_DATA_size, idx, lane_data + lea _ldata(state, lane_data), lane_data + + mov _job_in_lane(lane_data), job_rax + mov _unused_lanes(state), unused_lanes + movq $0, _job_in_lane(lane_data) + movl $STS_COMPLETED, _status(job_rax) + shl $4, unused_lanes + or idx, unused_lanes + mov unused_lanes, _unused_lanes(state) + + movl $0xFFFFFFFF, _lens(state, idx, 4) + + vmovd _args_digest(state, idx, 4), %xmm0 + vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0 + vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0 + vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0 + movl 4*32(state, idx, 4), DWORD_tmp + + vmovdqu %xmm0, _result_digest(job_rax) + movl DWORD_tmp, _result_digest+1*16(job_rax) + +return: + + mov (%rsp), %rbx + mov 8*2(%rsp), %r10 #save old rsp + mov 8*3(%rsp), %rbp + mov 8*4(%rsp), %r12 + mov 8*5(%rsp), %r13 + mov 8*6(%rsp), %r14 + mov 8*7(%rsp), %r15 + mov %r10, %rsp + + ret + +return_null: + xor job_rax, job_rax + jmp return + +ENDPROC(sha1_mb_mgr_submit_avx2) + +.data + +.align 16 +clear_low_nibble: + .octa 0x000000000000000000000000FFFFFFF0 diff --git a/arch/x86/crypto/sha-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha-mb/sha1_x8_avx2.S new file mode 100644 index 00000000000..8e1b47792b3 --- /dev/null +++ b/arch/x86/crypto/sha-mb/sha1_x8_avx2.S @@ -0,0 +1,472 @@ +/* + * Multi-buffer SHA1 algorithm hash compute routine + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * James Guilford <james.guilford@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/linkage.h> +#include "sha1_mb_mgr_datastruct.S" + +## code to compute oct SHA1 using SSE-256 +## outer calling routine takes care of save and restore of XMM registers + +## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15# ymm0-15 +## +## Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15 +## Linux preserves: rdi rbp r8 +## +## clobbers ymm0-15 + + +# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 +# "transpose" data in {r0...r7} using temps {t0...t1} +# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +# r0 = {a7 a6 a5 a4 a3 a2 a1 a0} +# r1 = {b7 b6 b5 b4 b3 b2 b1 b0} +# r2 = {c7 c6 c5 c4 c3 c2 c1 c0} +# r3 = {d7 d6 d5 d4 d3 d2 d1 d0} +# r4 = {e7 e6 e5 e4 e3 e2 e1 e0} +# r5 = {f7 f6 f5 f4 f3 f2 f1 f0} +# r6 = {g7 g6 g5 g4 g3 g2 g1 g0} +# r7 = {h7 h6 h5 h4 h3 h2 h1 h0} +# +# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} +# r0 = {h0 g0 f0 e0 d0 c0 b0 a0} +# r1 = {h1 g1 f1 e1 d1 c1 b1 a1} +# r2 = {h2 g2 f2 e2 d2 c2 b2 a2} +# r3 = {h3 g3 f3 e3 d3 c3 b3 a3} +# r4 = {h4 g4 f4 e4 d4 c4 b4 a4} +# r5 = {h5 g5 f5 e5 d5 c5 b5 a5} +# r6 = {h6 g6 f6 e6 d6 c6 b6 a6} +# r7 = {h7 g7 f7 e7 d7 c7 b7 a7} +# + +.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1 + # process top half (r0..r3) {a...d} + vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0} + vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2} + vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0} + vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2} + vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1} + vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2} + vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3} + vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0} + + # use r2 in place of t0 + # process bottom half (r4..r7) {e...h} + vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0} + vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2} + vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0} + vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2} + vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1} + vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2} + vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3} + vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0} + + vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6 + vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2 + vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5 + vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1 + vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7 + vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3 + vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4 + vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0 + +.endm +## +## Magic functions defined in FIPS 180-1 +## +# macro MAGIC_F0 F,B,C,D,T ## F = (D ^ (B & (C ^ D))) +.macro MAGIC_F0 regF regB regC regD regT + vpxor \regD, \regC, \regF + vpand \regB, \regF, \regF + vpxor \regD, \regF, \regF +.endm + +# macro MAGIC_F1 F,B,C,D,T ## F = (B ^ C ^ D) +.macro MAGIC_F1 regF regB regC regD regT + vpxor \regC, \regD, \regF + vpxor \regB, \regF, \regF +.endm + +# macro MAGIC_F2 F,B,C,D,T ## F = ((B & C) | (B & D) | (C & D)) +.macro MAGIC_F2 regF regB regC regD regT + vpor \regC, \regB, \regF + vpand \regC, \regB, \regT + vpand \regD, \regF, \regF + vpor \regT, \regF, \regF +.endm + +# macro MAGIC_F3 F,B,C,D,T ## F = (B ^ C ^ D) +.macro MAGIC_F3 regF regB regC regD regT + MAGIC_F1 \regF,\regB,\regC,\regD,\regT +.endm + +# PROLD reg, imm, tmp +.macro PROLD reg imm tmp + vpsrld $(32-\imm), \reg, \tmp + vpslld $\imm, \reg, \reg + vpor \tmp, \reg, \reg +.endm + +.macro PROLD_nd reg imm tmp src + vpsrld $(32-\imm), \src, \tmp + vpslld $\imm, \src, \reg + vpor \tmp, \reg, \reg +.endm + +.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC + vpaddd \immCNT, \regE, \regE + vpaddd \memW*32(%rsp), \regE, \regE + PROLD_nd \regT, 5, \regF, \regA + vpaddd \regT, \regE, \regE + \MAGIC \regF, \regB, \regC, \regD, \regT + PROLD \regB, 30, \regT + vpaddd \regF, \regE, \regE +.endm + +.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC + vpaddd \immCNT, \regE, \regE + offset = ((\memW - 14) & 15) * 32 + vmovdqu offset(%rsp), W14 + vpxor W14, W16, W16 + offset = ((\memW - 8) & 15) * 32 + vpxor offset(%rsp), W16, W16 + offset = ((\memW - 3) & 15) * 32 + vpxor offset(%rsp), W16, W16 + vpsrld $(32-1), W16, \regF + vpslld $1, W16, W16 + vpor W16, \regF, \regF + + ROTATE_W + + offset = ((\memW - 0) & 15) * 32 + vmovdqu \regF, offset(%rsp) + vpaddd \regF, \regE, \regE + PROLD_nd \regT, 5, \regF, \regA + vpaddd \regT, \regE, \regE + \MAGIC \regF,\regB,\regC,\regD,\regT ## FUN = MAGIC_Fi(B,C,D) + PROLD \regB,30, \regT + vpaddd \regF, \regE, \regE +.endm + +######################################################################## +######################################################################## +######################################################################## + +## FRAMESZ plus pushes must be an odd multiple of 8 +YMM_SAVE = (15-15)*32 +FRAMESZ = 32*16 + YMM_SAVE +_YMM = FRAMESZ - YMM_SAVE + +#define VMOVPS vmovups + +IDX = %rax +inp0 = %r9 +inp1 = %r10 +inp2 = %r11 +inp3 = %r12 +inp4 = %r13 +inp5 = %r14 +inp6 = %r15 +inp7 = %rcx +arg1 = %rdi +arg2 = %rsi +RSP_SAVE = %rdx + +# ymm0 A +# ymm1 B +# ymm2 C +# ymm3 D +# ymm4 E +# ymm5 F AA +# ymm6 T0 BB +# ymm7 T1 CC +# ymm8 T2 DD +# ymm9 T3 EE +# ymm10 T4 TMP +# ymm11 T5 FUN +# ymm12 T6 K +# ymm13 T7 W14 +# ymm14 T8 W15 +# ymm15 T9 W16 + + +A = %ymm0 +B = %ymm1 +C = %ymm2 +D = %ymm3 +E = %ymm4 +F = %ymm5 +T0 = %ymm6 +T1 = %ymm7 +T2 = %ymm8 +T3 = %ymm9 +T4 = %ymm10 +T5 = %ymm11 +T6 = %ymm12 +T7 = %ymm13 +T8 = %ymm14 +T9 = %ymm15 + +AA = %ymm5 +BB = %ymm6 +CC = %ymm7 +DD = %ymm8 +EE = %ymm9 +TMP = %ymm10 +FUN = %ymm11 +K = %ymm12 +W14 = %ymm13 +W15 = %ymm14 +W16 = %ymm15 + +.macro ROTATE_ARGS + TMP_ = E + E = D + D = C + C = B + B = A + A = TMP_ +.endm + +.macro ROTATE_W +TMP_ = W16 +W16 = W15 +W15 = W14 +W14 = TMP_ +.endm + +# 8 streams x 5 32bit words per digest x 4 bytes per word +#define DIGEST_SIZE (8*5*4) + +.align 32 + +# void sha1_x8_avx2(void **input_data, UINT128 *digest, UINT32 size) +# arg 1 : pointer to array[4] of pointer to input data +# arg 2 : size (in blocks) ;; assumed to be >= 1 +# +ENTRY(sha1_x8_avx2) + + push RSP_SAVE + + #save rsp + mov %rsp, RSP_SAVE + sub $FRAMESZ, %rsp + + #align rsp to 32 Bytes + and $~0x1F, %rsp + + ## Initialize digests + vmovdqu 0*32(arg1), A + vmovdqu 1*32(arg1), B + vmovdqu 2*32(arg1), C + vmovdqu 3*32(arg1), D + vmovdqu 4*32(arg1), E + + ## transpose input onto stack + mov _data_ptr+0*8(arg1),inp0 + mov _data_ptr+1*8(arg1),inp1 + mov _data_ptr+2*8(arg1),inp2 + mov _data_ptr+3*8(arg1),inp3 + mov _data_ptr+4*8(arg1),inp4 + mov _data_ptr+5*8(arg1),inp5 + mov _data_ptr+6*8(arg1),inp6 + mov _data_ptr+7*8(arg1),inp7 + + xor IDX, IDX +lloop: + vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), F + I=0 +.rep 2 + VMOVPS (inp0, IDX), T0 + VMOVPS (inp1, IDX), T1 + VMOVPS (inp2, IDX), T2 + VMOVPS (inp3, IDX), T3 + VMOVPS (inp4, IDX), T4 + VMOVPS (inp5, IDX), T5 + VMOVPS (inp6, IDX), T6 + VMOVPS (inp7, IDX), T7 + + TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 + vpshufb F, T0, T0 + vmovdqu T0, (I*8)*32(%rsp) + vpshufb F, T1, T1 + vmovdqu T1, (I*8+1)*32(%rsp) + vpshufb F, T2, T2 + vmovdqu T2, (I*8+2)*32(%rsp) + vpshufb F, T3, T3 + vmovdqu T3, (I*8+3)*32(%rsp) + vpshufb F, T4, T4 + vmovdqu T4, (I*8+4)*32(%rsp) + vpshufb F, T5, T5 + vmovdqu T5, (I*8+5)*32(%rsp) + vpshufb F, T6, T6 + vmovdqu T6, (I*8+6)*32(%rsp) + vpshufb F, T7, T7 + vmovdqu T7, (I*8+7)*32(%rsp) + add $32, IDX + I = (I+1) +.endr + # save old digests + vmovdqu A,AA + vmovdqu B,BB + vmovdqu C,CC + vmovdqu D,DD + vmovdqu E,EE + +## +## perform 0-79 steps +## + vmovdqu K00_19(%rip), K +## do rounds 0...15 + I = 0 +.rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS + I = (I+1) +.endr + +## do rounds 16...19 + vmovdqu ((16 - 16) & 15) * 32 (%rsp), W16 + vmovdqu ((16 - 15) & 15) * 32 (%rsp), W15 +.rep 4 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS + I = (I+1) +.endr + +## do rounds 20...39 + vmovdqu K20_39(%rip), K +.rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 + ROTATE_ARGS + I = (I+1) +.endr + +## do rounds 40...59 + vmovdqu K40_59(%rip), K +.rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 + ROTATE_ARGS + I = (I+1) +.endr + +## do rounds 60...79 + vmovdqu K60_79(%rip), K +.rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 + ROTATE_ARGS + I = (I+1) +.endr + + vpaddd AA,A,A + vpaddd BB,B,B + vpaddd CC,C,C + vpaddd DD,D,D + vpaddd EE,E,E + + sub $1, arg2 + jne lloop + + # write out digests + vmovdqu A, 0*32(arg1) + vmovdqu B, 1*32(arg1) + vmovdqu C, 2*32(arg1) + vmovdqu D, 3*32(arg1) + vmovdqu E, 4*32(arg1) + + # update input pointers + add IDX, inp0 + add IDX, inp1 + add IDX, inp2 + add IDX, inp3 + add IDX, inp4 + add IDX, inp5 + add IDX, inp6 + add IDX, inp7 + mov inp0, _data_ptr (arg1) + mov inp1, _data_ptr + 1*8(arg1) + mov inp2, _data_ptr + 2*8(arg1) + mov inp3, _data_ptr + 3*8(arg1) + mov inp4, _data_ptr + 4*8(arg1) + mov inp5, _data_ptr + 5*8(arg1) + mov inp6, _data_ptr + 6*8(arg1) + mov inp7, _data_ptr + 7*8(arg1) + + ################ + ## Postamble + + mov RSP_SAVE, %rsp + pop RSP_SAVE + + ret +ENDPROC(sha1_x8_avx2) + + +.data + +.align 32 +K00_19: +.octa 0x5A8279995A8279995A8279995A827999 +.octa 0x5A8279995A8279995A8279995A827999 +K20_39: +.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 +.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 +K40_59: +.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC +.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC +K60_79: +.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 +.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 +PSHUFFLE_BYTE_FLIP_MASK: +.octa 0x0c0d0e0f08090a0b0405060700010203 +.octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/arch/x86/crypto/sha-mb/sha_mb_ctx.h b/arch/x86/crypto/sha-mb/sha_mb_ctx.h new file mode 100644 index 00000000000..e36069d0c1b --- /dev/null +++ b/arch/x86/crypto/sha-mb/sha_mb_ctx.h @@ -0,0 +1,136 @@ +/* + * Header file for multi buffer SHA context + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SHA_MB_CTX_INTERNAL_H +#define _SHA_MB_CTX_INTERNAL_H + +#include "sha_mb_mgr.h" + +#define HASH_UPDATE 0x00 +#define HASH_FIRST 0x01 +#define HASH_LAST 0x02 +#define HASH_ENTIRE 0x03 +#define HASH_DONE 0x04 +#define HASH_FINAL 0x08 + +#define HASH_CTX_STS_IDLE 0x00 +#define HASH_CTX_STS_PROCESSING 0x01 +#define HASH_CTX_STS_LAST 0x02 +#define HASH_CTX_STS_COMPLETE 0x04 + +enum hash_ctx_error { + HASH_CTX_ERROR_NONE = 0, + HASH_CTX_ERROR_INVALID_FLAGS = -1, + HASH_CTX_ERROR_ALREADY_PROCESSING = -2, + HASH_CTX_ERROR_ALREADY_COMPLETED = -3, + +#ifdef HASH_CTX_DEBUG + HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4, +#endif +}; + + +#define hash_ctx_user_data(ctx) ((ctx)->user_data) +#define hash_ctx_digest(ctx) ((ctx)->job.result_digest) +#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING) +#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE) +#define hash_ctx_status(ctx) ((ctx)->status) +#define hash_ctx_error(ctx) ((ctx)->error) +#define hash_ctx_init(ctx) \ + do { \ + (ctx)->error = HASH_CTX_ERROR_NONE; \ + (ctx)->status = HASH_CTX_STS_COMPLETE; \ + } while (0) + + +/* Hash Constants and Typedefs */ +#define SHA1_DIGEST_LENGTH 5 +#define SHA1_LOG2_BLOCK_SIZE 6 + +#define SHA1_PADLENGTHFIELD_SIZE 8 + +#ifdef SHA_MB_DEBUG +#define assert(expr) \ +do { \ + if (unlikely(!(expr))) { \ + printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \ + #expr, __FILE__, __func__, __LINE__); \ + } \ +} while (0) +#else +#define assert(expr) do {} while (0) +#endif + +struct sha1_ctx_mgr { + struct sha1_mb_mgr mgr; +}; + +/* typedef struct sha1_ctx_mgr sha1_ctx_mgr; */ + +struct sha1_hash_ctx { + /* Must be at struct offset 0 */ + struct job_sha1 job; + /* status flag */ + int status; + /* error flag */ + int error; + + uint32_t total_length; + const void *incoming_buffer; + uint32_t incoming_buffer_length; + uint8_t partial_block_buffer[SHA1_BLOCK_SIZE * 2]; + uint32_t partial_block_buffer_length; + void *user_data; +}; + +#endif diff --git a/arch/x86/crypto/sha-mb/sha_mb_mgr.h b/arch/x86/crypto/sha-mb/sha_mb_mgr.h new file mode 100644 index 00000000000..08ad1a9acfd --- /dev/null +++ b/arch/x86/crypto/sha-mb/sha_mb_mgr.h @@ -0,0 +1,110 @@ +/* + * Header file for multi buffer SHA1 algorithm manager + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * James Guilford <james.guilford@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef __SHA_MB_MGR_H +#define __SHA_MB_MGR_H + + +#include <linux/types.h> + +#define NUM_SHA1_DIGEST_WORDS 5 + +enum job_sts { STS_UNKNOWN = 0, + STS_BEING_PROCESSED = 1, + STS_COMPLETED = 2, + STS_INTERNAL_ERROR = 3, + STS_ERROR = 4 +}; + +struct job_sha1 { + u8 *buffer; + u32 len; + u32 result_digest[NUM_SHA1_DIGEST_WORDS] __aligned(32); + enum job_sts status; + void *user_data; +}; + +/* SHA1 out-of-order scheduler */ + +/* typedef uint32_t sha1_digest_array[5][8]; */ + +struct sha1_args_x8 { + uint32_t digest[5][8]; + uint8_t *data_ptr[8]; +}; + +struct sha1_lane_data { + struct job_sha1 *job_in_lane; +}; + +struct sha1_mb_mgr { + struct sha1_args_x8 args; + + uint32_t lens[8]; + + /* each byte is index (0...7) of unused lanes */ + uint64_t unused_lanes; + /* byte 4 is set to FF as a flag */ + struct sha1_lane_data ldata[8]; +}; + + +#define SHA1_MB_MGR_NUM_LANES_AVX2 8 + +void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state); +struct job_sha1 *sha1_mb_mgr_submit_avx2(struct sha1_mb_mgr *state, + struct job_sha1 *job); +struct job_sha1 *sha1_mb_mgr_flush_avx2(struct sha1_mb_mgr *state); +struct job_sha1 *sha1_mb_mgr_get_comp_job_avx2(struct sha1_mb_mgr *state); + +#endif diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index d21ff89207c..df91466f973 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -308,11 +308,8 @@ static int load_aout_binary(struct linux_binprm *bprm) (current->mm->start_brk = N_BSSADDR(ex)); retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) { - /* Someone check-me: is this error path enough? */ - send_sig(SIGKILL, current, 0); + if (retval < 0) return retval; - } install_exec_creds(bprm); @@ -324,17 +321,13 @@ static int load_aout_binary(struct linux_binprm *bprm) error = vm_brk(text_addr & PAGE_MASK, map_size); - if (error != (text_addr & PAGE_MASK)) { - send_sig(SIGKILL, current, 0); + if (error != (text_addr & PAGE_MASK)) return error; - } error = read_code(bprm->file, text_addr, 32, ex.a_text + ex.a_data); - if ((signed long)error < 0) { - send_sig(SIGKILL, current, 0); + if ((signed long)error < 0) return error; - } } else { #ifdef WARN_OLD static unsigned long error_time, error_time2; @@ -368,20 +361,16 @@ static int load_aout_binary(struct linux_binprm *bprm) MAP_EXECUTABLE | MAP_32BIT, fd_offset); - if (error != N_TXTADDR(ex)) { - send_sig(SIGKILL, current, 0); + if (error != N_TXTADDR(ex)) return error; - } error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, fd_offset + ex.a_text); - if (error != N_DATADDR(ex)) { - send_sig(SIGKILL, current, 0); + if (error != N_DATADDR(ex)) return error; - } } beyond_if: set_binfmt(&aout_format); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index f5bdd288181..8ffba18395c 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -151,6 +151,16 @@ ENTRY(ia32_sysenter_target) 1: movl (%rbp),%ebp _ASM_EXTABLE(1b,ia32_badarg) ASM_CLAC + + /* + * Sysenter doesn't filter flags, so we need to clear NT + * ourselves. To save a few cycles, we can check whether + * NT was set instead of doing an unconditional popfq. + */ + testl $X86_EFLAGS_NT,EFLAGS(%rsp) /* saved EFLAGS match cpu */ + jnz sysenter_fix_flags +sysenter_flags_fixed: + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) CFI_REMEMBER_STATE @@ -184,6 +194,8 @@ sysexit_from_sys_call: TRACE_IRQS_ON ENABLE_INTERRUPTS_SYSEXIT32 + CFI_RESTORE_STATE + #ifdef CONFIG_AUDITSYSCALL .macro auditsys_entry_common movl %esi,%r8d /* 5th arg: 4th syscall arg */ @@ -226,7 +238,6 @@ sysexit_from_sys_call: .endm sysenter_auditsys: - CFI_RESTORE_STATE auditsys_entry_common movl %ebp,%r9d /* reload 6th syscall arg */ jmp sysenter_dispatch @@ -235,6 +246,11 @@ sysexit_audit: auditsys_exit sysexit_from_sys_call #endif +sysenter_fix_flags: + pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + popfq_cfi + jmp sysenter_flags_fixed + sysenter_tracesys: #ifdef CONFIG_AUDITSYSCALL testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 3ca9762e164..d55a210a49b 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -5,6 +5,8 @@ genhdr-y += unistd_64.h genhdr-y += unistd_x32.h generic-y += clkdev.h -generic-y += early_ioremap.h generic-y += cputime.h +generic-y += dma-contiguous.h +generic-y += early_ioremap.h generic-y += mcs_spinlock.h +generic-y += scatterlist.h diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h index 66873297e9f..1b010a859b8 100644 --- a/arch/x86/include/asm/acenv.h +++ b/arch/x86/include/asm/acenv.h @@ -18,8 +18,6 @@ #define ACPI_FLUSH_CPU_CACHE() wbinvd() -#ifdef CONFIG_ACPI - int __acpi_acquire_global_lock(unsigned int *lock); int __acpi_release_global_lock(unsigned int *lock); @@ -44,6 +42,4 @@ int __acpi_release_global_lock(unsigned int *lock); : "=r"(n_hi), "=r"(n_lo) \ : "0"(n_hi), "1"(n_lo)) -#endif - #endif /* _ASM_X86_ACENV_H */ diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index e06225eda63..0ab4f9fd268 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -121,6 +121,11 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf) buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); } +static inline bool acpi_has_cpu_in_madt(void) +{ + return !!acpi_lapic; +} + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 0a3f9c9f98d..473bdbee378 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -161,6 +161,20 @@ static inline int alternatives_text_reserved(void *start, void *end) asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ : : "i" (0), ## input) +/* + * This is similar to alternative_input. But it has two features and + * respective instructions. + * + * If CPU has feature2, newinstr2 is used. + * Otherwise, if CPU has feature1, newinstr1 is used. + * Otherwise, oldinstr is used. + */ +#define alternative_input_2(oldinstr, newinstr1, feature1, newinstr2, \ + feature2, input...) \ + asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, \ + newinstr2, feature2) \ + : : "i" (0), ## input) + /* Like alternative_input, but with a single output argument */ #define alternative_io(oldinstr, newinstr, feature, output, input...) \ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 19b0ebafcd3..465b309af25 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -85,21 +85,13 @@ static inline bool apic_from_smp_config(void) #include <asm/paravirt.h> #endif -#ifdef CONFIG_X86_64 -extern int is_vsmp_box(void); -#else -static inline int is_vsmp_box(void) -{ - return 0; -} -#endif extern int setup_profiling_timer(unsigned int); static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); - alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP, + alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, ASM_OUTPUT2("=r" (v), "=m" (*addr)), ASM_OUTPUT2("0" (v), "m" (*addr))); } @@ -300,7 +292,6 @@ struct apic { int dest_logical; unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid); - unsigned long (*check_apicid_present)(int apicid); void (*vector_allocation_domain)(int cpu, struct cpumask *retmask, const struct cpumask *mask); @@ -309,21 +300,11 @@ struct apic { void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); void (*setup_apic_routing)(void); - int (*multi_timer_check)(int apic, int irq); int (*cpu_present_to_apicid)(int mps_cpu); void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); - void (*setup_portio_remap)(void); int (*check_phys_apicid_present)(int phys_apicid); - void (*enable_apic_mode)(void); int (*phys_pkg_id)(int cpuid_apic, int index_msb); - /* - * When one of the next two hooks returns 1 the apic - * is switched to this. Essentially they are additional - * probe functions: - */ - int (*mps_oem_check)(struct mpc_table *mpc, char *oem, char *productid); - unsigned int (*get_apic_id)(unsigned long x); unsigned long (*set_apic_id)(unsigned int id); unsigned long apic_id_mask; @@ -343,11 +324,7 @@ struct apic { /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); - int trampoline_phys_low; - int trampoline_phys_high; - bool wait_for_init_deassert; - void (*smp_callin_clear_local_apic)(void); void (*inquire_remote_apic)(int apicid); /* apic ops */ @@ -378,14 +355,6 @@ struct apic { * won't be applied properly during early boot in this case. */ int (*x86_32_early_logical_apicid)(int cpu); - - /* - * Optional method called from setup_local_APIC() after logical - * apicid is guaranteed to be known to initialize apicid -> node - * mapping if NUMA initialization hasn't done so already. Don't - * add new users. - */ - int (*x86_32_numa_cpu_node)(int cpu); #endif }; @@ -496,14 +465,12 @@ static inline unsigned default_get_apic_id(unsigned long x) } /* - * Warm reset vector default position: + * Warm reset vector position: */ -#define DEFAULT_TRAMPOLINE_PHYS_LOW 0x467 -#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469 +#define TRAMPOLINE_PHYS_LOW 0x467 +#define TRAMPOLINE_PHYS_HIGH 0x469 #ifdef CONFIG_X86_64 -extern int default_acpi_madt_oem_check(char *, char *); - extern void apic_send_IPI_self(int vector); DECLARE_PER_CPU(int, x2apic_extra_bits); @@ -552,6 +519,8 @@ static inline int default_apic_id_valid(int apicid) return (apicid < 255); } +extern int default_acpi_madt_oem_check(char *, char *); + extern void default_setup_apic_routing(void); extern struct apic apic_noop; @@ -635,11 +604,6 @@ static inline unsigned long default_check_apicid_used(physid_mask_t *map, int ap return physid_isset(apicid, *map); } -static inline unsigned long default_check_apicid_present(int bit) -{ - return physid_isset(bit, phys_cpu_present_map); -} - static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { *retmap = *phys_map; diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 6dd1c7dd047..5e5cd123fdf 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -24,7 +24,7 @@ */ static inline int atomic_read(const atomic_t *v) { - return (*(volatile int *)&(v)->counter); + return ACCESS_ONCE((v)->counter); } /** @@ -219,21 +219,6 @@ static inline short int atomic_inc_short(short int *v) return *v; } -#ifdef CONFIG_X86_64 -/** - * atomic_or_long - OR of two long integers - * @v1: pointer to type unsigned long - * @v2: pointer to type unsigned long - * - * Atomically ORs @v1 and @v2 - * Returns the result of the OR - */ -static inline void atomic_or_long(unsigned long *v1, unsigned long v2) -{ - asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2)); -} -#endif - /* These are x86-specific, used by some header files */ #define atomic_clear_mask(mask, addr) \ asm volatile(LOCK_PREFIX "andl %0,%1" \ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 46e9052bbd2..f8d273e1851 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -18,7 +18,7 @@ */ static inline long atomic64_read(const atomic64_t *v) { - return (*(volatile long *)&(v)->counter); + return ACCESS_ONCE((v)->counter); } /** diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 5c7198cca5e..0f4460b5636 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -99,7 +99,7 @@ #if defined(CONFIG_X86_PPRO_FENCE) /* - * For either of these options x86 doesn't have a strong TSO memory + * For this option x86 doesn't have a strong TSO memory * model and we should fall back to full barriers. */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index afcd35d331d..cfe3b954d5e 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -497,8 +497,6 @@ static __always_inline int fls64(__u64 x) #include <asm-generic/bitops/sched.h> -#define ARCH_HAS_FAST_MULTIPLIER 1 - #include <asm/arch_hweight.h> #include <asm-generic/bitops/const_hweight.h> diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index cb4c73bfeb4..76659b67fd1 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -85,7 +85,7 @@ For 32-bit we have the following conventions - kernel is built with #define ARGOFFSET R11 #define SWFRAME ORIG_RAX - .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1 + .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 subq $9*8+\addskip, %rsp CFI_ADJUST_CFA_OFFSET 9*8+\addskip movq_cfi rdi, 8*8 @@ -96,7 +96,11 @@ For 32-bit we have the following conventions - kernel is built with movq_cfi rcx, 5*8 .endif + .if \rax_enosys + movq $-ENOSYS, 4*8(%rsp) + .else movq_cfi rax, 4*8 + .endif .if \save_r891011 movq_cfi r8, 3*8 diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index d47786acb01..99c105d78b7 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -4,6 +4,8 @@ #include <linux/compiler.h> #include <asm/alternative.h> /* Provides LOCK_PREFIX */ +#define __HAVE_ARCH_CMPXCHG 1 + /* * Non-existant functions to indicate usage errors at link time * (or compile-time if the compiler implements __compiletime_error(). @@ -143,7 +145,6 @@ extern void __add_wrong_size(void) # include <asm/cmpxchg_64.h> #endif -#ifdef __HAVE_ARCH_CMPXCHG #define cmpxchg(ptr, old, new) \ __cmpxchg(ptr, old, new, sizeof(*(ptr))) @@ -152,7 +153,6 @@ extern void __add_wrong_size(void) #define cmpxchg_local(ptr, old, new) \ __cmpxchg_local(ptr, old, new, sizeof(*(ptr))) -#endif /* * xadd() adds "inc" to "*ptr" and atomically returns the previous diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index f8bf2eecab8..f7e14292648 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -34,8 +34,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 value) : "memory"); } -#define __HAVE_ARCH_CMPXCHG 1 - #ifdef CONFIG_X86_CMPXCHG64 #define cmpxchg64(ptr, o, n) \ ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 614be87f1a9..1af94697aae 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -6,8 +6,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) *ptr = val; } -#define __HAVE_ARCH_CMPXCHG 1 - #define cmpxchg64(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e265ff95d16..0bb1335313b 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -8,7 +8,11 @@ #include <asm/required-features.h> #endif -#define NCAPINTS 10 /* N 32-bit words worth of info */ +#ifndef _ASM_X86_DISABLED_FEATURES_H +#include <asm/disabled-features.h> +#endif + +#define NCAPINTS 11 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -18,213 +22,219 @@ */ /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ -#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME (0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */ -#define X86_FEATURE_PGE (0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV (0*32+15) /* CMOV instructions */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ /* (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN (0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH (0*32+19) /* CLFLUSH instruction */ -#define X86_FEATURE_DS (0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ -#define X86_FEATURE_XMM (0*32+25) /* "sse" */ -#define X86_FEATURE_XMM2 (0*32+26) /* "sse2" */ -#define X86_FEATURE_SELFSNOOP (0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */ -#define X86_FEATURE_ACC (0*32+29) /* "tm" Automatic clock control */ -#define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */ +#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ +#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ +#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ +#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ +#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ +#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ +#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ /* Don't duplicate feature flags which are redundant with Intel! */ -#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP (1*32+19) /* MP Capable. */ -#define X86_FEATURE_NX (1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT (1*32+25) /* FXSAVE/FXRSTOR optimizations */ -#define X86_FEATURE_GBPAGES (1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */ -#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */ -#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ +#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY (2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN (2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI (2*32+ 3) /* LongRun table interface */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ /* Other features, Linux-defined mapping, word 3 */ /* This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX (3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard MTRRs */ -#define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ /* cpu types for specific tunings: */ -#define X86_FEATURE_K8 (3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 (3*32+ 5) /* "" Athlon */ -#define X86_FEATURE_P3 (3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 (3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */ -#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */ -#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */ -#define X86_FEATURE_SYSENTER32 (3*32+15) /* "" sysenter in ia32 userspace */ -#define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well */ -#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" Mfence synchronizes RDTSC */ -#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */ -#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */ -#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS (3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ -#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ -#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */ -#define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */ -#define X86_FEATURE_EAGER_FPU (3*32+29) /* "eagerfpu" Non lazy FPU restore */ -#define X86_FEATURE_NONSTOP_TSC_S3 (3*32+30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ +#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ +#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ +/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ +#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ +#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ +/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ +#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ +/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ +#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ -#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit Debug Store */ -#define X86_FEATURE_MWAIT (4*32+ 3) /* "monitor" Monitor/Mwait support */ -#define X86_FEATURE_DSCPL (4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ -#define X86_FEATURE_VMX (4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX (4*32+ 6) /* Safer mode */ -#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID (4*32+10) /* Context ID */ -#define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */ -#define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM (4*32+15) /* Performance Capabilities */ -#define X86_FEATURE_PCID (4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */ -#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */ -#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ -#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER (4*32+24) /* Tsc deadline timer */ -#define X86_FEATURE_AES (4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ -#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ -#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */ -#define X86_FEATURE_RDRAND (4*32+30) /* The RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ +#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ +#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ +#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ +#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ +#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ +#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ +#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */ -#define X86_FEATURE_XSTORE_EN (5*32+ 3) /* "rng_en" RNG enabled */ -#define X86_FEATURE_XCRYPT (5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ -#define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE (5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN (5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM (5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN (5*32+13) /* PMM enabled */ +#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ +#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ /* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ -#define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM (6*32+ 2) /* Secure virtual machine */ -#define X86_FEATURE_EXTAPIC (6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY (6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ -#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */ -#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE (6*32+17) /* translation cache extension */ -#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ -#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB (6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_PERFCTR_L2 (6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ /* * Auxiliary flags: Linux defined - For features scattered in various * CPUID levels like 0x6, 0xA etc, word 7 */ -#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ -#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ -#define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -#define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */ -#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */ -#define X86_FEATURE_HW_PSTATE (7*32+ 8) /* AMD HW-PState */ -#define X86_FEATURE_PROC_FEEDBACK (7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_IDA ( 7*32+ 0) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */ +#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */ +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ /* Virtualization flags: Linux defined, word 8 */ -#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */ -#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ -#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ -#define X86_FEATURE_NPT (8*32+ 5) /* AMD Nested Page Table support */ -#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */ -#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ -#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ -#define X86_FEATURE_TSCRATEMSR (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */ -#define X86_FEATURE_VMCBCLEAN (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */ +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ +#define X86_FEATURE_NPT ( 8*32+ 5) /* AMD Nested Page Table support */ +#define X86_FEATURE_LBRV ( 8*32+ 6) /* AMD LBR Virtualization support */ +#define X86_FEATURE_SVML ( 8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ +#define X86_FEATURE_NRIPS ( 8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR ( 8*32+ 9) /* "tsc_scale" AMD TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN ( 8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID ( 8*32+11) /* AMD flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS ( 8*32+12) /* AMD Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */ +#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ -#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ -#define X86_FEATURE_TSC_ADJUST (9*32+ 1) /* TSC adjustment MSR 0x3b */ -#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ -#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ -#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_MPX (9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */ -#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_CLFLUSHOPT (9*32+23) /* CLFLUSHOPT instruction */ -#define X86_FEATURE_AVX512PF (9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER (9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD (9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ +#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ +#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ + +/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ /* * BUG word(s) @@ -234,16 +244,32 @@ #define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ #define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ #define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ -#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* AMD Erratum 383 */ -#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* AMD Erratum 400 */ +#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ +#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ +#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) #include <asm/asm.h> #include <linux/bitops.h> +#ifdef CONFIG_X86_FEATURE_NAMES extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; +#define X86_CAP_FMT "%s" +#define x86_cap_flag(flag) x86_cap_flags[flag] +#else +#define X86_CAP_FMT "%d:%d" +#define x86_cap_flag(flag) ((flag) >> 5), ((flag) & 31) +#endif + +/* + * In order to save room, we index into this array by doing + * X86_BUG_<name> - NCAPINTS*32. + */ +extern const char * const x86_bug_flags[NBUGINTS*32]; #define test_cpu_cap(c, bit) \ test_bit(bit, (unsigned long *)((c)->x86_capability)) @@ -260,6 +286,18 @@ extern const char * const x86_power_flags[32]; (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) +#define DISABLED_MASK_BIT_SET(bit) \ + ( (((bit)>>5)==0 && (1UL<<((bit)&31) & DISABLED_MASK0)) || \ + (((bit)>>5)==1 && (1UL<<((bit)&31) & DISABLED_MASK1)) || \ + (((bit)>>5)==2 && (1UL<<((bit)&31) & DISABLED_MASK2)) || \ + (((bit)>>5)==3 && (1UL<<((bit)&31) & DISABLED_MASK3)) || \ + (((bit)>>5)==4 && (1UL<<((bit)&31) & DISABLED_MASK4)) || \ + (((bit)>>5)==5 && (1UL<<((bit)&31) & DISABLED_MASK5)) || \ + (((bit)>>5)==6 && (1UL<<((bit)&31) & DISABLED_MASK6)) || \ + (((bit)>>5)==7 && (1UL<<((bit)&31) & DISABLED_MASK7)) || \ + (((bit)>>5)==8 && (1UL<<((bit)&31) & DISABLED_MASK8)) || \ + (((bit)>>5)==9 && (1UL<<((bit)&31) & DISABLED_MASK9)) ) + #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ test_cpu_cap(c, bit)) @@ -268,6 +306,18 @@ extern const char * const x86_power_flags[32]; (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability)) +/* + * This macro is for detection of features which need kernel + * infrastructure to be used. It may *not* directly test the CPU + * itself. Use the cpu_has() family if you want true runtime + * testing of CPU features, like in hypervisor code where you are + * supporting a possible guest feature where host support for it + * is not relevant. + */ +#define cpu_feature_enabled(bit) \ + (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : \ + cpu_has(&boot_cpu_data, bit)) + #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) @@ -282,11 +332,9 @@ extern const char * const x86_power_flags[32]; } while (0) #define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) -#define cpu_has_vme boot_cpu_has(X86_FEATURE_VME) #define cpu_has_de boot_cpu_has(X86_FEATURE_DE) #define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE) #define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC) -#define cpu_has_pae boot_cpu_has(X86_FEATURE_PAE) #define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE) #define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) #define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP) @@ -301,11 +349,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) #define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) -#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) #define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) -#define cpu_has_k6_mtrr boot_cpu_has(X86_FEATURE_K6_MTRR) -#define cpu_has_cyrix_arr boot_cpu_has(X86_FEATURE_CYRIX_ARR) -#define cpu_has_centaur_mcr boot_cpu_has(X86_FEATURE_CENTAUR_MCR) #define cpu_has_xstore boot_cpu_has(X86_FEATURE_XSTORE) #define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN) #define cpu_has_xcrypt boot_cpu_has(X86_FEATURE_XCRYPT) @@ -328,6 +372,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_xsaveopt boot_cpu_has(X86_FEATURE_XSAVEOPT) +#define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) @@ -339,28 +384,6 @@ extern const char * const x86_power_flags[32]; #define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU) #define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT) -#ifdef CONFIG_X86_64 - -#undef cpu_has_vme -#define cpu_has_vme 0 - -#undef cpu_has_pae -#define cpu_has_pae ___BUG___ - -#undef cpu_has_mp -#define cpu_has_mp 1 - -#undef cpu_has_k6_mtrr -#define cpu_has_k6_mtrr 0 - -#undef cpu_has_cyrix_arr -#define cpu_has_cyrix_arr 0 - -#undef cpu_has_centaur_mcr -#define cpu_has_centaur_mcr 0 - -#endif /* CONFIG_X86_64 */ - #if __GNUC__ >= 4 extern void warn_pre_alternatives(void); extern bool __static_cpu_has_safe(u16 bit); @@ -539,20 +562,20 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) #define static_cpu_has_safe(bit) boot_cpu_has(bit) #endif -#define cpu_has_bug(c, bit) cpu_has(c, (bit)) -#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) -#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)); +#define cpu_has_bug(c, bit) cpu_has(c, (bit)) +#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) +#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) -#define static_cpu_has_bug(bit) static_cpu_has((bit)) -#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) +#define static_cpu_has_bug(bit) static_cpu_has((bit)) +#define static_cpu_has_bug_safe(bit) static_cpu_has_safe((bit)) +#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) -#define MAX_CPU_FEATURES (NCAPINTS * 32) -#define cpu_have_feature boot_cpu_has +#define MAX_CPU_FEATURES (NCAPINTS * 32) +#define cpu_have_feature boot_cpu_has -#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X" -#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ - boot_cpu_data.x86_model +#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X" +#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ + boot_cpu_data.x86_model #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ - #endif /* _ASM_X86_CPUFEATURE_H */ diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h new file mode 100644 index 00000000000..f498411f250 --- /dev/null +++ b/arch/x86/include/asm/crash.h @@ -0,0 +1,9 @@ +#ifndef _ASM_X86_CRASH_H +#define _ASM_X86_CRASH_H + +int crash_load_segments(struct kimage *image); +int crash_copy_backup_region(struct kimage *image); +int crash_setup_memmap_entries(struct kimage *image, + struct boot_params *params); + +#endif /* _ASM_X86_CRASH_H */ diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 4b528a970bd..61fd18b83b6 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -97,11 +97,11 @@ extern void hw_breakpoint_restore(void); DECLARE_PER_CPU(int, debug_stack_usage); static inline void debug_stack_usage_inc(void) { - __get_cpu_var(debug_stack_usage)++; + __this_cpu_inc(debug_stack_usage); } static inline void debug_stack_usage_dec(void) { - __get_cpu_var(debug_stack_usage)--; + __this_cpu_dec(debug_stack_usage); } int is_debug_stack(unsigned long addr); void debug_stack_set_zero(void); diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h new file mode 100644 index 00000000000..97534a7d38e --- /dev/null +++ b/arch/x86/include/asm/disabled-features.h @@ -0,0 +1,39 @@ +#ifndef _ASM_X86_DISABLED_FEATURES_H +#define _ASM_X86_DISABLED_FEATURES_H + +/* These features, although they might be available in a CPU + * will not be used because the compile options to support + * them are not present. + * + * This code allows them to be checked and disabled at + * compile time without an explicit #ifdef. Use + * cpu_feature_enabled(). + */ + +#ifdef CONFIG_X86_64 +# define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) +# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) +# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) +# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) +#else +# define DISABLE_VME 0 +# define DISABLE_K6_MTRR 0 +# define DISABLE_CYRIX_ARR 0 +# define DISABLE_CENTAUR_MCR 0 +#endif /* CONFIG_X86_64 */ + +/* + * Make sure to add features to the correct mask + */ +#define DISABLED_MASK0 (DISABLE_VME) +#define DISABLED_MASK1 0 +#define DISABLED_MASK2 0 +#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) +#define DISABLED_MASK4 0 +#define DISABLED_MASK5 0 +#define DISABLED_MASK6 0 +#define DISABLED_MASK7 0 +#define DISABLED_MASK8 0 +#define DISABLED_MASK9 0 + +#endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/dma-contiguous.h b/arch/x86/include/asm/dma-contiguous.h deleted file mode 100644 index b4b38bacb40..00000000000 --- a/arch/x86/include/asm/dma-contiguous.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef ASMX86_DMA_CONTIGUOUS_H -#define ASMX86_DMA_CONTIGUOUS_H - -#ifdef __KERNEL__ - -#include <linux/types.h> - -static inline void -dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) { } - -#endif -#endif diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 1eb5f6433ad..0ec241ede5a 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -104,6 +104,8 @@ extern void __init runtime_code_page_mkexec(void); extern void __init efi_runtime_mkexec(void); extern void __init efi_dump_pagetable(void); extern void __init efi_apply_memmap_quirks(void); +extern int __init efi_reuse_config(u64 tables, int nr_tables); +extern void efi_delete_dummy_variable(void); struct efi_setup_data { u64 fw_vendor; @@ -156,6 +158,9 @@ static inline efi_status_t efi_thunk_set_virtual_address_map( return EFI_SUCCESS; } #endif /* CONFIG_EFI_MIXED */ + +extern bool efi_reboot_required(void); + #else /* * IF EFI is not configured, have the EFI calls return -ENOSYS. @@ -168,6 +173,10 @@ static inline efi_status_t efi_thunk_set_virtual_address_map( #define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS) #define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS) static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} +static inline bool efi_reboot_required(void) +{ + return false; +} #endif /* CONFIG_EFI */ #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1a055c81d86..ca3347a9dab 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -160,8 +160,9 @@ do { \ #define elf_check_arch(x) \ ((x)->e_machine == EM_X86_64) -#define compat_elf_check_arch(x) \ - (elf_check_arch_ia32(x) || (x)->e_machine == EM_X86_64) +#define compat_elf_check_arch(x) \ + (elf_check_arch_ia32(x) || \ + (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)) #if __USER32_DS != __USER_DS # error "The following code assumes __USER32_DS == __USER_DS" diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b0910f97a3e..ffb1733ac91 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -106,14 +106,14 @@ enum fixed_addresses { __end_of_permanent_fixed_addresses, /* - * 256 temporary boot-time mappings, used by early_ioremap(), + * 512 temporary boot-time mappings, used by early_ioremap(), * before ioremap() is functional. * - * If necessary we round it up to the next 256 pages boundary so + * If necessary we round it up to the next 512 pages boundary so * that we can have a single pgd entry and a single pte table: */ #define NR_FIX_BTMAPS 64 -#define FIX_BTMAPS_SLOTS 4 +#define FIX_BTMAPS_SLOTS 8 #define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS) FIX_BTMAP_END = (__end_of_permanent_fixed_addresses ^ diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 115e3689cd5..e97622f5772 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -293,7 +293,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk) /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed values. "m" is a random variable that should be in L1 */ - if (unlikely(static_cpu_has_safe(X86_FEATURE_FXSAVE_LEAK))) { + if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" @@ -344,7 +344,7 @@ static inline void __thread_fpu_end(struct task_struct *tsk) static inline void __thread_fpu_begin(struct task_struct *tsk) { - if (!static_cpu_has_safe(X86_FEATURE_EAGER_FPU)) + if (!use_eager_fpu()) clts(); __thread_set_has_fpu(tsk); } @@ -508,9 +508,12 @@ static inline void user_fpu_begin(void) static inline void __save_fpu(struct task_struct *tsk) { - if (use_xsave()) - xsave_state(&tsk->thread.fpu.state->xsave, -1); - else + if (use_xsave()) { + if (unlikely(system_state == SYSTEM_BOOTING)) + xsave_state_booting(&tsk->thread.fpu.state->xsave, -1); + else + xsave_state(&tsk->thread.fpu.state->xsave, -1); + } else fpu_fxsave(&tsk->thread.fpu); } diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 0525a8bdf65..e1f7fecaa7d 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -68,6 +68,8 @@ struct dyn_arch_ftrace { int ftrace_int3_handler(struct pt_regs *regs); +#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR + #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 230853da4ec..0f5fb6b6567 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -40,9 +40,6 @@ typedef struct { DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); -/* We can have at most NR_VECTORS irqs routed to a cpu at a time */ -#define MAX_HARDIRQS_PER_CPU NR_VECTORS - #define __ARCH_IRQ_STAT #define inc_irq_stat(member) this_cpu_inc(irq_stat.member) diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index a20365953bf..ccffa53750a 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -67,4 +67,9 @@ struct legacy_pic { extern struct legacy_pic *legacy_pic; extern struct legacy_pic null_legacy_pic; +static inline int nr_legacy_irqs(void) +{ + return legacy_pic->nr_legacy_irqs; +} + #endif /* _ASM_X86_I8259_H */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 90f97b4b934..1733ab49ac5 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -98,6 +98,8 @@ struct IR_IO_APIC_route_entry { #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 +#define IOAPIC_MAP_ALLOC 0x1 +#define IOAPIC_MAP_CHECK 0x2 #ifdef CONFIG_X86_IO_APIC @@ -118,9 +120,6 @@ extern int mp_irq_entries; /* MP IRQ source entries */ extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; -/* non-0 if default (table-less) MP configuration */ -extern int mpc_default_type; - /* Older SiS APIC requires we rewrite the index register */ extern int sis_apic_bug; @@ -133,9 +132,6 @@ extern int noioapicquirk; /* -1 if "noapic" boot option passed */ extern int noioapicreroute; -/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ -extern int timer_through_8259; - /* * If we use the IO-APIC for IRQ routing, disable automatic * assignment of PCI IRQ's. @@ -145,24 +141,17 @@ extern int timer_through_8259; struct io_apic_irq_attr; struct irq_cfg; -extern int io_apic_set_pci_routing(struct device *dev, int irq, - struct io_apic_irq_attr *irq_attr); -void setup_IO_APIC_irq_extra(u32 gsi); extern void ioapic_insert_resources(void); extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *, unsigned int, int, struct io_apic_irq_attr *); -extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *, - unsigned int, int, - struct io_apic_irq_attr *); extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg); extern void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq, unsigned int dest, struct msi_msg *msg, u8 hpet_id); extern void native_eoi_ioapic_pin(int apic, int pin, int vector); -int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr); extern int save_ioapic_entries(void); extern void mask_ioapic_entries(void); @@ -171,15 +160,40 @@ extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); extern void setup_ioapic_ids_from_mpc_nocheck(void); +enum ioapic_domain_type { + IOAPIC_DOMAIN_INVALID, + IOAPIC_DOMAIN_LEGACY, + IOAPIC_DOMAIN_STRICT, + IOAPIC_DOMAIN_DYNAMIC, +}; + +struct device_node; +struct irq_domain; +struct irq_domain_ops; + +struct ioapic_domain_cfg { + enum ioapic_domain_type type; + const struct irq_domain_ops *ops; + struct device_node *dev; +}; + struct mp_ioapic_gsi{ u32 gsi_base; u32 gsi_end; }; -extern struct mp_ioapic_gsi mp_gsi_routing[]; extern u32 gsi_top; -int mp_find_ioapic(u32 gsi); -int mp_find_ioapic_pin(int ioapic, u32 gsi); -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); + +extern int mp_find_ioapic(u32 gsi); +extern int mp_find_ioapic_pin(int ioapic, u32 gsi); +extern u32 mp_pin_to_gsi(int ioapic, int pin); +extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags); +extern void mp_unmap_irq(int irq); +extern void __init mp_register_ioapic(int id, u32 address, u32 gsi_base, + struct ioapic_domain_cfg *cfg); +extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq); +extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq); +extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node); extern void __init pre_init_apic_IRQ0(void); extern void mp_save_irq(struct mpc_intsrc *m); @@ -213,18 +227,19 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned extern void io_apic_eoi(unsigned int apic, unsigned int vector); +extern bool mp_should_keep_irq(struct device *dev); + #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 #define setup_ioapic_ids_from_mpc x86_init_noop -static const int timer_through_8259 = 0; static inline void ioapic_insert_resources(void) { } #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } - -struct io_apic_irq_attr; -static inline int io_apic_set_pci_routing(struct device *dev, int irq, - struct io_apic_irq_attr *irq_attr) { return 0; } +static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; } +static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; } +static inline void mp_unmap_irq(int irq) { } +static inline bool mp_should_keep_irq(struct device *dev) { return 1; } static inline int save_ioapic_entries(void) { diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h new file mode 100644 index 00000000000..78162f8e248 --- /dev/null +++ b/arch/x86/include/asm/irq_work.h @@ -0,0 +1,11 @@ +#ifndef _ASM_IRQ_WORK_H +#define _ASM_IRQ_WORK_H + +#include <asm/processor.h> + +static inline bool arch_irq_work_has_interrupt(void) +{ + return cpu_has_apic; +} + +#endif /* _ASM_IRQ_WORK_H */ diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h new file mode 100644 index 00000000000..d1b5d194e31 --- /dev/null +++ b/arch/x86/include/asm/kexec-bzimage64.h @@ -0,0 +1,6 @@ +#ifndef _ASM_KEXEC_BZIMAGE64_H +#define _ASM_KEXEC_BZIMAGE64_H + +extern struct kexec_file_ops kexec_bzImage64_ops; + +#endif /* _ASM_KEXE_BZIMAGE64_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 17483a492f1..d2434c1cad0 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -23,6 +23,9 @@ #include <asm/page.h> #include <asm/ptrace.h> +#include <asm/bootparam.h> + +struct kimage; /* * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. @@ -61,6 +64,10 @@ # define KEXEC_ARCH KEXEC_ARCH_X86_64 #endif +/* Memory to backup during crash kdump */ +#define KEXEC_BACKUP_SRC_START (0UL) +#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */ + /* * CPU does not save ss and sp on stack if execution is already * running in kernel mode at the time of NMI occurrence. This code @@ -160,6 +167,44 @@ struct kimage_arch { pud_t *pud; pmd_t *pmd; pte_t *pte; + /* Details of backup region */ + unsigned long backup_src_start; + unsigned long backup_src_sz; + + /* Physical address of backup segment */ + unsigned long backup_load_addr; + + /* Core ELF header buffer */ + void *elf_headers; + unsigned long elf_headers_sz; + unsigned long elf_load_addr; +}; +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_64 +/* + * Number of elements and order of elements in this structure should match + * with the ones in arch/x86/purgatory/entry64.S. If you make a change here + * make an appropriate change in purgatory too. + */ +struct kexec_entry64_regs { + uint64_t rax; + uint64_t rcx; + uint64_t rdx; + uint64_t rbx; + uint64_t rsp; + uint64_t rbp; + uint64_t rsi; + uint64_t rdi; + uint64_t r8; + uint64_t r9; + uint64_t r10; + uint64_t r11; + uint64_t r12; + uint64_t r13; + uint64_t r14; + uint64_t r15; + uint64_t rip; }; #endif diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 53cdfb2857a..4421b5da409 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -27,7 +27,6 @@ #include <asm/insn.h> #define __ARCH_WANT_KPROBES_INSN_SLOT -#define ARCH_SUPPORTS_KPROBES_ON_FTRACE struct pt_regs; struct kprobe; diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index a04fe4eb237..eb181178fe0 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -37,6 +37,7 @@ struct x86_instruction_info { u8 modrm_reg; /* index of register used */ u8 modrm_rm; /* rm part of modrm */ u64 src_val; /* value of source operand */ + u64 dst_val; /* value of destination operand */ u8 src_bytes; /* size of source operand */ u8 dst_bytes; /* size of destination operand */ u8 ad_bytes; /* size of src/dst address */ @@ -194,6 +195,7 @@ struct x86_emulate_ops { int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); + int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc); int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata); void (*halt)(struct x86_emulate_ctxt *ctxt); void (*wbinvd)(struct x86_emulate_ctxt *ctxt); @@ -231,7 +233,7 @@ struct operand { union { unsigned long val; u64 val64; - char valptr[sizeof(unsigned long) + 2]; + char valptr[sizeof(sse128_t)]; sse128_t vec_val; u64 mm_val; void *data; @@ -240,8 +242,8 @@ struct operand { struct fetch_cache { u8 data[15]; - unsigned long start; - unsigned long end; + u8 *ptr; + u8 *end; }; struct read_cache { @@ -286,30 +288,36 @@ struct x86_emulate_ctxt { u8 opcode_len; u8 b; u8 intercept; - u8 lock_prefix; - u8 rep_prefix; u8 op_bytes; u8 ad_bytes; - u8 rex_prefix; struct operand src; struct operand src2; struct operand dst; - bool has_seg_override; - u8 seg_override; - u64 d; int (*execute)(struct x86_emulate_ctxt *ctxt); int (*check_perm)(struct x86_emulate_ctxt *ctxt); + /* + * The following six fields are cleared together, + * the rest are initialized unconditionally in x86_decode_insn + * or elsewhere + */ + bool rip_relative; + u8 rex_prefix; + u8 lock_prefix; + u8 rep_prefix; + /* bitmaps of registers in _regs[] that can be read */ + u32 regs_valid; + /* bitmaps of registers in _regs[] that have been written */ + u32 regs_dirty; /* modrm */ u8 modrm; u8 modrm_mod; u8 modrm_reg; u8 modrm_rm; u8 modrm_seg; - bool rip_relative; + u8 seg_override; + u64 d; unsigned long _eip; struct operand memop; - u32 regs_valid; /* bitmaps of registers in _regs[] that can be read */ - u32 regs_dirty; /* bitmaps of registers in _regs[] that have been written */ /* Fields above regs are cleared together. */ unsigned long _regs[NR_VCPU_REGS]; struct operand *memopp; @@ -407,6 +415,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_OK 0 #define EMULATION_RESTART 1 #define EMULATION_INTERCEPTED 2 +void init_decode_cache(struct x86_emulate_ctxt *ctxt); int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 49205d01b9a..7d603a71ab3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -95,14 +95,10 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 80 #define KVM_NR_FIXED_MTRR_REGION 88 -#define KVM_NR_VAR_MTRR 10 +#define KVM_NR_VAR_MTRR 8 #define ASYNC_PF_PER_VCPU 64 -struct kvm_vcpu; -struct kvm; -struct kvm_async_pf; - enum kvm_reg { VCPU_REGS_RAX = 0, VCPU_REGS_RCX = 1, @@ -152,14 +148,16 @@ enum { #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) -#define DR6_FIXED_1 0xffff0ff0 -#define DR6_VOLATILE 0x0000e00f +#define DR6_RTM (1 << 16) +#define DR6_FIXED_1 0xfffe0ff0 +#define DR6_INIT 0xffff0ff0 +#define DR6_VOLATILE 0x0001e00f #define DR7_BP_EN_MASK 0x000000ff #define DR7_GE (1 << 9) #define DR7_GD (1 << 13) #define DR7_FIXED_1 0x00000400 -#define DR7_VOLATILE 0xffff23ff +#define DR7_VOLATILE 0xffff2bff /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 @@ -264,7 +262,8 @@ struct kvm_mmu { struct x86_exception *fault); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, struct x86_exception *exception); - gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); + gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception); int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); @@ -448,7 +447,7 @@ struct kvm_vcpu_arch { u64 tsc_offset_adjustment; u64 this_tsc_nsec; u64 this_tsc_write; - u8 this_tsc_generation; + u64 this_tsc_generation; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; @@ -479,6 +478,7 @@ struct kvm_vcpu_arch { u64 mmio_gva; unsigned access; gfn_t mmio_gfn; + u64 mmio_gen; struct kvm_pmu pmu; @@ -574,11 +574,10 @@ struct kvm_arch { struct kvm_apic_map *apic_map; unsigned int tss_addr; - struct page *apic_access_page; + bool apic_access_page_done; gpa_t wall_clock; - struct page *ept_identity_pagetable; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; @@ -591,7 +590,7 @@ struct kvm_arch { u64 cur_tsc_nsec; u64 cur_tsc_write; u64 cur_tsc_offset; - u8 cur_tsc_generation; + u64 cur_tsc_generation; int nr_vcpus_matched_tsc; spinlock_t pvclock_gtod_sync_lock; @@ -663,8 +662,8 @@ struct msr_data { struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ - int (*hardware_enable)(void *dummy); - void (*hardware_disable)(void *dummy); + int (*hardware_enable)(void); + void (*hardware_disable)(void); void (*check_processor_compatibility)(void *rtn); int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ @@ -708,7 +707,6 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - void (*fpu_activate)(struct kvm_vcpu *vcpu); void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); @@ -717,7 +715,7 @@ struct kvm_x86_ops { int (*handle_exit)(struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); - u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); + u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); void (*set_irq)(struct kvm_vcpu *vcpu); @@ -738,6 +736,7 @@ struct kvm_x86_ops { void (*hwapic_isr_update)(struct kvm *kvm, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); + void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); @@ -770,6 +769,8 @@ struct kvm_x86_ops { bool (*mpx_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); + + void (*sched_in)(struct kvm_vcpu *kvm, int cpu); }; struct kvm_arch_async_pf { @@ -893,7 +894,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t gfn, void *data, int offset, int len, u32 access); -void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); static inline int __kvm_irq_line_state(unsigned long *irq_state, @@ -915,7 +915,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu); int fx_init(struct kvm_vcpu *vcpu); -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); @@ -924,7 +923,8 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, @@ -944,7 +944,8 @@ void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu); void kvm_enable_tdp(void); void kvm_disable_tdp(void); -static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception) { return gpa; } @@ -1035,7 +1036,7 @@ asmlinkage void kvm_spurious_fault(void); #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_age_hva(struct kvm *kvm, unsigned long hva); +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); @@ -1044,6 +1045,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu); +void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); +void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, + unsigned long address); void kvm_define_shared_msr(unsigned index, u32 msr); void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); @@ -1070,6 +1074,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc); int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); void kvm_deliver_pmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c7678e43465..e62cf897f78 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -2,6 +2,7 @@ #define _ASM_X86_KVM_PARA_H #include <asm/processor.h> +#include <asm/alternative.h> #include <uapi/asm/kvm_para.h> extern void kvmclock_init(void); @@ -16,10 +17,15 @@ static inline bool kvm_check_and_clear_guest_paused(void) } #endif /* CONFIG_KVM_GUEST */ -/* This instruction is vmcall. On non-VT architectures, it will generate a - * trap that we will then rewrite to the appropriate instruction. +#ifdef CONFIG_DEBUG_RODATA +#define KVM_HYPERCALL \ + ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL) +#else +/* On AMD processors, vmcall will generate a trap that we will + * then rewrite to the appropriate instruction. */ #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" +#endif /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall * instruction. The hypervisor may replace it with something else but only the diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index a55c7efcc4e..0f555cc3198 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -13,7 +13,7 @@ #define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */ #endif -#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG) +#if defined(CONFIG_X86_32) /* * This lock provides nmi access to the CMOS/RTC registers. It has some * special properties. It is owned by a CPU and stores the index register diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 9067166409b..bbe296e0bce 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -43,7 +43,7 @@ struct extended_sigtable { #define DWSIZE (sizeof(u32)) #define get_totalsize(mc) \ - (((struct microcode_intel *)mc)->hdr.totalsize ? \ + (((struct microcode_intel *)mc)->hdr.datasize ? \ ((struct microcode_intel *)mc)->hdr.totalsize : \ DEFAULT_UCODE_TOTALSIZE) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index be12c534fd5..166af2a8e86 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -3,6 +3,10 @@ #include <asm/desc.h> #include <linux/atomic.h> +#include <linux/mm_types.h> + +#include <trace/events/tlb.h> + #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/paravirt.h> @@ -44,6 +48,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Re-load page tables */ load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); /* Stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); @@ -71,6 +76,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, * to make sure to use no freed page tables. */ load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); load_LDT_nolock(&next->context); } } diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index f5a61795673..b07233b6457 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -40,8 +40,6 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES]; extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); extern unsigned int boot_cpu_physical_apicid; -extern unsigned int max_physical_apicid; -extern int mpc_default_type; extern unsigned long mp_lapic_addr; #ifdef CONFIG_X86_LOCAL_APIC @@ -88,15 +86,6 @@ static inline void early_reserve_e820_mpc_new(void) { } #endif int generic_processor_info(int apicid, int version); -#ifdef CONFIG_ACPI -extern void mp_register_ioapic(int id, u32 address, u32 gsi_base); -extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, - u32 gsi); -extern void mp_config_acpi_legacy_irqs(void); -struct device; -extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level, - int active_high_low); -#endif /* CONFIG_ACPI */ #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) @@ -161,8 +150,4 @@ static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map) extern physid_mask_t phys_cpu_present_map; -extern int generic_mps_oem_check(struct mpc_table *, char *, char *); - -extern int default_acpi_madt_oem_check(char *, char *); - #endif /* _ASM_X86_MPSPEC_H */ diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h index 0208c3c2cbc..85e6cda45a0 100644 --- a/arch/x86/include/asm/mutex_32.h +++ b/arch/x86/include/asm/mutex_32.h @@ -100,23 +100,11 @@ do { \ static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { - /* - * We have two variants here. The cmpxchg based one is the best one - * because it never induce a false contention state. It is included - * here because architectures using the inc/dec algorithms over the - * xchg ones are much more likely to support cmpxchg natively. - * - * If not we fall back to the spinlock based variant - that is - * just as efficient (and simpler) as a 'destructive' probing of - * the mutex state would be. - */ -#ifdef __HAVE_ARCH_CMPXCHG + /* cmpxchg because it never induces a false contention state. */ if (likely(atomic_cmpxchg(count, 1, 0) == 1)) return 1; + return 0; -#else - return fail_fn(count); -#endif } #endif /* _ASM_X86_MUTEX_32_H */ diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 1da25a5f96f..a1410db38a1 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -43,7 +43,7 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { if (!current_set_polling_and_test()) { - if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) { + if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) { mb(); clflush((void *)¤t_thread_info()->flags); mb(); diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 4064acae625..01b493e5a99 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -9,7 +9,6 @@ #ifdef CONFIG_NUMA #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) -#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) /* * Too small node sizes may confuse the VM badly. Usually they diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 775873d3be5..802dde30c92 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -70,7 +70,6 @@ extern bool __virt_addr_valid(unsigned long kaddr); #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> -#define __HAVE_ARCH_GATE_AREA 1 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 0f1ddee6a0c..f408caf7343 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -39,4 +39,6 @@ void copy_page(void *to, void *from); #endif /* !__ASSEMBLY__ */ +#define __HAVE_ARCH_GATE_AREA 1 + #endif /* _ASM_X86_PAGE_64_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 851bcdc5db0..fd472181a1d 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -52,10 +52,9 @@ * Compared to the generic __my_cpu_offset version, the following * saves one instruction and avoids clobbering a temp register. */ -#define raw_cpu_ptr(ptr) \ +#define arch_raw_cpu_ptr(ptr) \ ({ \ unsigned long tcp_ptr__; \ - __verify_pcpu_ptr(ptr); \ asm volatile("add " __percpu_arg(1) ", %0" \ : "=r" (tcp_ptr__) \ : "m" (this_cpu_off), "0" (ptr)); \ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8249df45d2f..8dfc9fd094a 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -51,6 +51,14 @@ ARCH_PERFMON_EVENTSEL_EDGE | \ ARCH_PERFMON_EVENTSEL_INV | \ ARCH_PERFMON_EVENTSEL_CMASK) +#define X86_ALL_EVENT_FLAGS \ + (ARCH_PERFMON_EVENTSEL_EDGE | \ + ARCH_PERFMON_EVENTSEL_INV | \ + ARCH_PERFMON_EVENTSEL_CMASK | \ + ARCH_PERFMON_EVENTSEL_ANY | \ + ARCH_PERFMON_EVENTSEL_PIN_CONTROL | \ + HSW_IN_TX | \ + HSW_IN_TX_CHECKPOINTED) #define AMD64_RAW_EVENT_MASK \ (X86_RAW_EVENT_MASK | \ AMD64_EVENTSEL_EVENT) diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 85e13ccf15c..d725382c2ae 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -189,7 +189,7 @@ static inline int p4_ht_thread(int cpu) { #ifdef CONFIG_SMP if (smp_num_siblings == 2) - return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map)); + return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map)); #endif return 0; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0ec05601261..aa97a070f09 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -131,8 +131,13 @@ static inline int pte_exec(pte_t pte) static inline int pte_special(pte_t pte) { - return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) == - (_PAGE_PRESENT|_PAGE_SPECIAL); + /* + * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h. + * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 == + * __PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL. + */ + return (pte_flags(pte) & _PAGE_SPECIAL) && + (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE)); } static inline unsigned long pte_pfn(pte_t pte) diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 9ee322103c6..b6c0b404898 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -32,9 +32,6 @@ static inline void pgtable_cache_init(void) { } static inline void check_pgt_cache(void) { } void paging_init(void); -extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); - - /* * Define this if things work differently on an i386 and an i486: * it will (on an i486) warn about kernel memory accesses that are diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 5be9063545d..4572b2f3023 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -19,6 +19,7 @@ extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pmd_t level2_fixmap_pgt[512]; extern pmd_t level2_ident_pgt[512]; +extern pte_t level1_fixmap_pgt[512]; extern pgd_t init_level4_pgt[]; #define swapper_pg_dir init_level4_pgt @@ -115,7 +116,8 @@ static inline void native_pgd_clear(pgd_t *pgd) native_set_pgd(pgd, native_make_pgd(0)); } -extern void sync_global_pgds(unsigned long start, unsigned long end); +extern void sync_global_pgds(unsigned long start, unsigned long end, + int removed); /* * Conversion functions: convert a page and protection to a page entry, diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f216963760e..07789647bf3 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -23,7 +23,6 @@ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 #define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ -#define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */ #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ @@ -52,7 +51,7 @@ #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) -#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) +#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) @@ -168,10 +167,10 @@ #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) -#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP) -#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP) -#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP) -#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO (__PAGE_KERNEL) +#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) +#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS) +#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC) #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) @@ -325,6 +324,20 @@ static inline pteval_t pte_flags(pte_t pte) return native_pte_val(pte) & PTE_FLAGS_MASK; } +#ifdef CONFIG_NUMA_BALANCING +/* Set of bits that distinguishes present, prot_none and numa ptes */ +#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT) +static inline pteval_t ptenuma_flags(pte_t pte) +{ + return pte_flags(pte) & _PAGE_NUMA_MASK; +} + +static inline pmdval_t pmdnuma_flags(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_NUMA_MASK; +} +#endif /* CONFIG_NUMA_BALANCING */ + #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h new file mode 100644 index 00000000000..0a4e140315b --- /dev/null +++ b/arch/x86/include/asm/platform_sst_audio.h @@ -0,0 +1,78 @@ +/* + * platform_sst_audio.h: sst audio platform data header file + * + * Copyright (C) 2012-14 Intel Corporation + * Author: Jeeja KP <jeeja.kp@intel.com> + * Omair Mohammed Abdullah <omair.m.abdullah@intel.com> + * Vinod Koul ,vinod.koul@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#ifndef _PLATFORM_SST_AUDIO_H_ +#define _PLATFORM_SST_AUDIO_H_ + +#include <linux/sfi.h> + +enum sst_audio_task_id_mrfld { + SST_TASK_ID_NONE = 0, + SST_TASK_ID_SBA = 1, + SST_TASK_ID_MEDIA = 3, + SST_TASK_ID_MAX = SST_TASK_ID_MEDIA, +}; + +/* Device IDs for Merrifield are Pipe IDs, + * ref: DSP spec v0.75 */ +enum sst_audio_device_id_mrfld { + /* Output pipeline IDs */ + PIPE_ID_OUT_START = 0x0, + PIPE_CODEC_OUT0 = 0x2, + PIPE_CODEC_OUT1 = 0x3, + PIPE_SPROT_LOOP_OUT = 0x4, + PIPE_MEDIA_LOOP1_OUT = 0x5, + PIPE_MEDIA_LOOP2_OUT = 0x6, + PIPE_VOIP_OUT = 0xC, + PIPE_PCM0_OUT = 0xD, + PIPE_PCM1_OUT = 0xE, + PIPE_PCM2_OUT = 0xF, + PIPE_MEDIA0_OUT = 0x12, + PIPE_MEDIA1_OUT = 0x13, +/* Input Pipeline IDs */ + PIPE_ID_IN_START = 0x80, + PIPE_CODEC_IN0 = 0x82, + PIPE_CODEC_IN1 = 0x83, + PIPE_SPROT_LOOP_IN = 0x84, + PIPE_MEDIA_LOOP1_IN = 0x85, + PIPE_MEDIA_LOOP2_IN = 0x86, + PIPE_VOIP_IN = 0x8C, + PIPE_PCM0_IN = 0x8D, + PIPE_PCM1_IN = 0x8E, + PIPE_MEDIA0_IN = 0x8F, + PIPE_MEDIA1_IN = 0x90, + PIPE_MEDIA2_IN = 0x91, + PIPE_RSVD = 0xFF, +}; + +/* The stream map for each platform consists of an array of the below + * stream map structure. + */ +struct sst_dev_stream_map { + u8 dev_num; /* device id */ + u8 subdev_num; /* substream */ + u8 direction; + u8 device_id; /* fw id */ + u8 task_id; /* fw task */ + u8 status; +}; + +struct sst_platform_data { + /* Intel software platform id*/ + struct sst_dev_stream_map *pdev_strm_map; + unsigned int strm_map_size; +}; + +int add_sst_platform_device(void); +#endif + diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h new file mode 100644 index 00000000000..fc7a17c05d3 --- /dev/null +++ b/arch/x86/include/asm/pmc_atom.h @@ -0,0 +1,107 @@ +/* + * Intel Atom SOC Power Management Controller Header File + * Copyright (c) 2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#ifndef PMC_ATOM_H +#define PMC_ATOM_H + +/* ValleyView Power Control Unit PCI Device ID */ +#define PCI_DEVICE_ID_VLV_PMC 0x0F1C + +/* PMC Memory mapped IO registers */ +#define PMC_BASE_ADDR_OFFSET 0x44 +#define PMC_BASE_ADDR_MASK 0xFFFFFE00 +#define PMC_MMIO_REG_LEN 0x100 +#define PMC_REG_BIT_WIDTH 32 + +/* BIOS uses FUNC_DIS to disable specific function */ +#define PMC_FUNC_DIS 0x34 +#define PMC_FUNC_DIS_2 0x38 + +/* S0ix wake event control */ +#define PMC_S0IX_WAKE_EN 0x3C + +#define BIT_LPC_CLOCK_RUN BIT(4) +#define BIT_SHARED_IRQ_GPSC BIT(5) +#define BIT_ORED_DEDICATED_IRQ_GPSS BIT(18) +#define BIT_ORED_DEDICATED_IRQ_GPSC BIT(19) +#define BIT_SHARED_IRQ_GPSS BIT(20) + +#define PMC_WAKE_EN_SETTING ~(BIT_LPC_CLOCK_RUN | \ + BIT_SHARED_IRQ_GPSC | \ + BIT_ORED_DEDICATED_IRQ_GPSS | \ + BIT_ORED_DEDICATED_IRQ_GPSC | \ + BIT_SHARED_IRQ_GPSS) + +/* The timers acumulate time spent in sleep state */ +#define PMC_S0IR_TMR 0x80 +#define PMC_S0I1_TMR 0x84 +#define PMC_S0I2_TMR 0x88 +#define PMC_S0I3_TMR 0x8C +#define PMC_S0_TMR 0x90 +/* Sleep state counter is in units of of 32us */ +#define PMC_TMR_SHIFT 5 + +/* These registers reflect D3 status of functions */ +#define PMC_D3_STS_0 0xA0 + +#define BIT_LPSS1_F0_DMA BIT(0) +#define BIT_LPSS1_F1_PWM1 BIT(1) +#define BIT_LPSS1_F2_PWM2 BIT(2) +#define BIT_LPSS1_F3_HSUART1 BIT(3) +#define BIT_LPSS1_F4_HSUART2 BIT(4) +#define BIT_LPSS1_F5_SPI BIT(5) +#define BIT_LPSS1_F6_XXX BIT(6) +#define BIT_LPSS1_F7_XXX BIT(7) +#define BIT_SCC_EMMC BIT(8) +#define BIT_SCC_SDIO BIT(9) +#define BIT_SCC_SDCARD BIT(10) +#define BIT_SCC_MIPI BIT(11) +#define BIT_HDA BIT(12) +#define BIT_LPE BIT(13) +#define BIT_OTG BIT(14) +#define BIT_USH BIT(15) +#define BIT_GBE BIT(16) +#define BIT_SATA BIT(17) +#define BIT_USB_EHCI BIT(18) +#define BIT_SEC BIT(19) +#define BIT_PCIE_PORT0 BIT(20) +#define BIT_PCIE_PORT1 BIT(21) +#define BIT_PCIE_PORT2 BIT(22) +#define BIT_PCIE_PORT3 BIT(23) +#define BIT_LPSS2_F0_DMA BIT(24) +#define BIT_LPSS2_F1_I2C1 BIT(25) +#define BIT_LPSS2_F2_I2C2 BIT(26) +#define BIT_LPSS2_F3_I2C3 BIT(27) +#define BIT_LPSS2_F4_I2C4 BIT(28) +#define BIT_LPSS2_F5_I2C5 BIT(29) +#define BIT_LPSS2_F6_I2C6 BIT(30) +#define BIT_LPSS2_F7_I2C7 BIT(31) + +#define PMC_D3_STS_1 0xA4 +#define BIT_SMB BIT(0) +#define BIT_OTG_SS_PHY BIT(1) +#define BIT_USH_SS_PHY BIT(2) +#define BIT_DFX BIT(3) + +/* PMC I/O Registers */ +#define ACPI_BASE_ADDR_OFFSET 0x40 +#define ACPI_BASE_ADDR_MASK 0xFFFFFE00 +#define ACPI_MMIO_REG_LEN 0x100 + +#define PM1_CNT 0x4 +#define SLEEP_TYPE_MASK 0xFFFFECFF +#define SLEEP_TYPE_S5 0x1C00 +#define SLEEP_ENABLE 0x2000 +#endif /* PMC_ATOM_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a4ea02351f4..eb71ec79473 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -72,7 +72,6 @@ extern u16 __read_mostly tlb_lld_4k[NR_INFO]; extern u16 __read_mostly tlb_lld_2m[NR_INFO]; extern u16 __read_mostly tlb_lld_4m[NR_INFO]; extern u16 __read_mostly tlb_lld_1g[NR_INFO]; -extern s8 __read_mostly tlb_flushall_shift; /* * CPU type and hardware bug flags. Kept separately for each CPU. @@ -386,8 +385,8 @@ struct bndcsr_struct { struct xsave_hdr_struct { u64 xstate_bv; - u64 reserved1[2]; - u64 reserved2[5]; + u64 xcomp_bv; + u64 reserved[6]; } __attribute__((packed)); struct xsave_struct { @@ -696,6 +695,8 @@ static inline void cpu_relax(void) rep_nop(); } +#define cpu_relax_lowlatency() cpu_relax() + /* Stop speculative execution and prefetching of modified code. */ static inline void sync_core(void) { diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index fbeb06ed0ea..1d081ac1cd6 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -26,12 +26,10 @@ extern int of_ioapic; extern u64 initial_dtb; extern void add_dtb(u64 data); -extern void x86_add_irq_domains(void); void x86_of_pci_init(void); void x86_dtb_init(void); #else static inline void add_dtb(u64 data) { } -static inline void x86_add_irq_domains(void) { } static inline void x86_of_pci_init(void) { } static inline void x86_dtb_init(void) { } #define of_ioapic 0 diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 6205f0c434d..86fc2bb8228 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -75,6 +75,11 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code, int si_code); + +extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch); +extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch, + unsigned long phase1_result); + extern long syscall_trace_enter(struct pt_regs *); extern void syscall_trace_leave(struct pt_regs *); diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h index 70f46f07f94..ae0e241e228 100644 --- a/arch/x86/include/asm/qrwlock.h +++ b/arch/x86/include/asm/qrwlock.h @@ -3,7 +3,7 @@ #include <asm-generic/qrwlock_types.h> -#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE) +#ifndef CONFIG_X86_PPRO_FENCE #define queue_write_unlock queue_write_unlock static inline void queue_write_unlock(struct qrwlock *lock) { diff --git a/arch/x86/include/asm/rwlock.h b/arch/x86/include/asm/rwlock.h deleted file mode 100644 index a5370a03d90..00000000000 --- a/arch/x86/include/asm/rwlock.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef _ASM_X86_RWLOCK_H -#define _ASM_X86_RWLOCK_H - -#include <asm/asm.h> - -#if CONFIG_NR_CPUS <= 2048 - -#ifndef __ASSEMBLY__ -typedef union { - s32 lock; - s32 write; -} arch_rwlock_t; -#endif - -#define RW_LOCK_BIAS 0x00100000 -#define READ_LOCK_SIZE(insn) __ASM_FORM(insn##l) -#define READ_LOCK_ATOMIC(n) atomic_##n -#define WRITE_LOCK_ADD(n) __ASM_FORM_COMMA(addl n) -#define WRITE_LOCK_SUB(n) __ASM_FORM_COMMA(subl n) -#define WRITE_LOCK_CMP RW_LOCK_BIAS - -#else /* CONFIG_NR_CPUS > 2048 */ - -#include <linux/const.h> - -#ifndef __ASSEMBLY__ -typedef union { - s64 lock; - struct { - u32 read; - s32 write; - }; -} arch_rwlock_t; -#endif - -#define RW_LOCK_BIAS (_AC(1,L) << 32) -#define READ_LOCK_SIZE(insn) __ASM_FORM(insn##q) -#define READ_LOCK_ATOMIC(n) atomic64_##n -#define WRITE_LOCK_ADD(n) __ASM_FORM(incl) -#define WRITE_LOCK_SUB(n) __ASM_FORM(decl) -#define WRITE_LOCK_CMP 1 - -#endif /* CONFIG_NR_CPUS */ - -#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } - -/* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */ - -#endif /* _ASM_X86_RWLOCK_H */ diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h deleted file mode 100644 index 4240878b9d7..00000000000 --- a/arch/x86/include/asm/scatterlist.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _ASM_X86_SCATTERLIST_H -#define _ASM_X86_SCATTERLIST_H - -#include <asm-generic/scatterlist.h> - -#define ARCH_HAS_SG_CHAIN - -#endif /* _ASM_X86_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h index 628c801535e..460b84f6455 100644 --- a/arch/x86/include/asm/serial.h +++ b/arch/x86/include/asm/serial.h @@ -6,24 +6,24 @@ * * It'd be nice if someone built a serial card with a 24.576 MHz * clock, since the 16550A is capable of handling a top speed of 1.5 - * megabits/second; but this requires the faster clock. + * megabits/second; but this requires a faster clock. */ -#define BASE_BAUD ( 1843200 / 16 ) +#define BASE_BAUD (1843200/16) /* Standard COM flags (except for COM4, because of the 8514 problem) */ #ifdef CONFIG_SERIAL_DETECT_IRQ -#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ) -#define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_AUTO_IRQ) +# define STD_COMX_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ) +# define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | 0 | ASYNC_AUTO_IRQ) #else -#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST) -#define STD_COM4_FLAGS ASYNC_BOOT_AUTOCONF +# define STD_COMX_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | 0 ) +# define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | 0 | 0 ) #endif -#define SERIAL_PORT_DFNS \ - /* UART CLK PORT IRQ FLAGS */ \ - { 0, BASE_BAUD, 0x3F8, 4, STD_COM_FLAGS }, /* ttyS0 */ \ - { 0, BASE_BAUD, 0x2F8, 3, STD_COM_FLAGS }, /* ttyS1 */ \ - { 0, BASE_BAUD, 0x3E8, 4, STD_COM_FLAGS }, /* ttyS2 */ \ - { 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */ +#define SERIAL_PORT_DFNS \ + /* UART CLK PORT IRQ FLAGS */ \ + { .uart = 0, BASE_BAUD, 0x3F8, 4, STD_COMX_FLAGS }, /* ttyS0 */ \ + { .uart = 0, BASE_BAUD, 0x2F8, 3, STD_COMX_FLAGS }, /* ttyS1 */ \ + { .uart = 0, BASE_BAUD, 0x3E8, 4, STD_COMX_FLAGS }, /* ttyS2 */ \ + { .uart = 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */ #endif /* _ASM_X86_SERIAL_H */ diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h index 49adfd7bb4a..0da7409f0be 100644 --- a/arch/x86/include/asm/smpboot_hooks.h +++ b/arch/x86/include/asm/smpboot_hooks.h @@ -17,11 +17,11 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) spin_unlock_irqrestore(&rtc_lock, flags); local_flush_tlb(); pr_debug("1.\n"); - *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) = - start_eip >> 4; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = + start_eip >> 4; pr_debug("2.\n"); - *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_low)) = - start_eip & 0xf; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = + start_eip & 0xf; pr_debug("3.\n"); } @@ -42,7 +42,7 @@ static inline void smpboot_restore_warm_reset_vector(void) CMOS_WRITE(0, 0xf); spin_unlock_irqrestore(&rtc_lock, flags); - *((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0; + *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } static inline void __init smpboot_setup_io_apic(void) diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 54f1c8068c0..9295016485c 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -187,7 +187,6 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) cpu_relax(); } -#ifndef CONFIG_QUEUE_RWLOCK /* * Read-write spinlocks, allowing multiple readers * but only one writer. @@ -198,91 +197,15 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) * irq-safe write-lock, but readers can get non-irqsafe * read-locks. * - * On x86, we implement read-write locks as a 32-bit counter - * with the high bit (sign) being the "contended" bit. + * On x86, we implement read-write locks using the generic qrwlock with + * x86 specific optimization. */ -/** - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -static inline int arch_read_can_lock(arch_rwlock_t *lock) -{ - return lock->lock > 0; -} - -/** - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -static inline int arch_write_can_lock(arch_rwlock_t *lock) -{ - return lock->write == WRITE_LOCK_CMP; -} - -static inline void arch_read_lock(arch_rwlock_t *rw) -{ - asm volatile(LOCK_PREFIX READ_LOCK_SIZE(dec) " (%0)\n\t" - "jns 1f\n" - "call __read_lock_failed\n\t" - "1:\n" - ::LOCK_PTR_REG (rw) : "memory"); -} - -static inline void arch_write_lock(arch_rwlock_t *rw) -{ - asm volatile(LOCK_PREFIX WRITE_LOCK_SUB(%1) "(%0)\n\t" - "jz 1f\n" - "call __write_lock_failed\n\t" - "1:\n" - ::LOCK_PTR_REG (&rw->write), "i" (RW_LOCK_BIAS) - : "memory"); -} - -static inline int arch_read_trylock(arch_rwlock_t *lock) -{ - READ_LOCK_ATOMIC(t) *count = (READ_LOCK_ATOMIC(t) *)lock; - - if (READ_LOCK_ATOMIC(dec_return)(count) >= 0) - return 1; - READ_LOCK_ATOMIC(inc)(count); - return 0; -} - -static inline int arch_write_trylock(arch_rwlock_t *lock) -{ - atomic_t *count = (atomic_t *)&lock->write; - - if (atomic_sub_and_test(WRITE_LOCK_CMP, count)) - return 1; - atomic_add(WRITE_LOCK_CMP, count); - return 0; -} - -static inline void arch_read_unlock(arch_rwlock_t *rw) -{ - asm volatile(LOCK_PREFIX READ_LOCK_SIZE(inc) " %0" - :"+m" (rw->lock) : : "memory"); -} - -static inline void arch_write_unlock(arch_rwlock_t *rw) -{ - asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0" - : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory"); -} -#else #include <asm/qrwlock.h> -#endif /* CONFIG_QUEUE_RWLOCK */ #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) -#undef READ_LOCK_SIZE -#undef READ_LOCK_ATOMIC -#undef WRITE_LOCK_ADD -#undef WRITE_LOCK_SUB -#undef WRITE_LOCK_CMP - #define arch_spin_relax(lock) cpu_relax() #define arch_read_relax(lock) cpu_relax() #define arch_write_relax(lock) cpu_relax() diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 73c4c007200..5f9d7572d82 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -34,10 +34,6 @@ typedef struct arch_spinlock { #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } -#ifdef CONFIG_QUEUE_RWLOCK #include <asm-generic/qrwlock_types.h> -#else -#include <asm/rwlock.h> -#endif #endif /* _ASM_X86_SPINLOCK_TYPES_H */ diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 0b46ef261c7..2d60a7813df 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -73,6 +73,7 @@ #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? \ UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \ UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD) +/* assuming UV3 is the same */ #define BAU_MISC_CONTROL_MULT_MASK 3 @@ -93,6 +94,8 @@ #define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT #define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT #define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD +#define PREFETCH_HINT_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_SHFT +#define SB_STATUS_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT #define write_gmmr uv_write_global_mmr64 #define write_lmmr uv_write_local_mmr #define read_lmmr uv_read_local_mmr @@ -322,8 +325,9 @@ struct uv1_bau_msg_header { /* * UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) * see figure 9-2 of harp_sys.pdf + * assuming UV3 is the same */ -struct uv2_bau_msg_header { +struct uv2_3_bau_msg_header { unsigned int base_dest_nasid:15; /* nasid of the first bit */ /* bits 14:0 */ /* in uvhub map */ unsigned int dest_subnodeid:5; /* must be 0x10, for the LB */ @@ -395,7 +399,7 @@ struct bau_desc { */ union bau_msg_header { struct uv1_bau_msg_header uv1_hdr; - struct uv2_bau_msg_header uv2_hdr; + struct uv2_3_bau_msg_header uv2_3_hdr; } header; struct bau_msg_payload payload; @@ -631,11 +635,6 @@ struct bau_control { struct hub_and_pnode *thp; }; -static inline unsigned long read_mmr_uv2_status(void) -{ - return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2); -} - static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image) { write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image); @@ -760,7 +759,11 @@ static inline int atomic_read_short(const struct atomic_short *v) */ static inline int atom_asr(short i, struct atomic_short *v) { - return i + xadd(&v->counter, i); + short __i = i; + asm volatile(LOCK_PREFIX "xaddw %0, %1" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; } /* diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index c63e925fd6b..a00ad8f2a65 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -164,7 +164,7 @@ struct uv_hub_info_s { }; DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); -#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) +#define uv_hub_info this_cpu_ptr(&__uv_hub_info) #define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) /* @@ -601,16 +601,16 @@ struct uv_hub_nmi_s { struct uv_cpu_nmi_s { struct uv_hub_nmi_s *hub; - atomic_t state; - atomic_t pinging; + int state; + int pinging; int queries; int pings; }; -DECLARE_PER_CPU(struct uv_cpu_nmi_s, __uv_cpu_nmi); -#define uv_cpu_nmi (__get_cpu_var(__uv_cpu_nmi)) +DECLARE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi); + #define uv_hub_nmi (uv_cpu_nmi.hub) -#define uv_cpu_nmi_per(cpu) (per_cpu(__uv_cpu_nmi, cpu)) +#define uv_cpu_nmi_per(cpu) (per_cpu(uv_cpu_nmi, cpu)) #define uv_hub_nmi_per(cpu) (uv_cpu_nmi_per(cpu).hub) /* uv_cpu_nmi_states */ diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 30be253dd28..8021bd28c0f 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -18,15 +18,15 @@ struct vdso_image { unsigned long alt, alt_len; - unsigned long sym_end_mapping; /* Total size of the mapping */ - - unsigned long sym_vvar_page; - unsigned long sym_hpet_page; - unsigned long sym_VDSO32_NOTE_MASK; - unsigned long sym___kernel_sigreturn; - unsigned long sym___kernel_rt_sigreturn; - unsigned long sym___kernel_vsyscall; - unsigned long sym_VDSO32_SYSENTER_RETURN; + long sym_vvar_start; /* Negative offset to the vvar area */ + + long sym_vvar_page; + long sym_hpet_page; + long sym_VDSO32_NOTE_MASK; + long sym___kernel_sigreturn; + long sym___kernel_rt_sigreturn; + long sym___kernel_vsyscall; + long sym_VDSO32_SYSENTER_RETURN; }; #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h index 44282fbf7bf..c4b9dc2f67c 100644 --- a/arch/x86/include/asm/vga.h +++ b/arch/x86/include/asm/vga.h @@ -17,10 +17,4 @@ #define vga_readb(x) (*(x)) #define vga_writeb(x, y) (*(y) = (x)) -#ifdef CONFIG_FB_EFI -#define __ARCH_HAS_VGA_DEFAULT_DEVICE -extern struct pci_dev *vga_default_device(void); -extern void vga_set_default_device(struct pci_dev *pdev); -#endif - #endif /* _ASM_X86_VGA_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 7004d21e621..bcbfade26d8 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -51,6 +51,9 @@ #define CPU_BASED_MONITOR_EXITING 0x20000000 #define CPU_BASED_PAUSE_EXITING 0x40000000 #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 + +#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x0401e172 + /* * Definitions of Secondary Processor-Based VM-Execution Controls. */ @@ -76,7 +79,7 @@ #define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016 -#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 +#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000004 #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 #define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000 #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 @@ -89,7 +92,7 @@ #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff -#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002 +#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004 #define VM_ENTRY_IA32E_MODE 0x00000200 #define VM_ENTRY_SMM 0x00000400 #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index d949ef28c48..7e7a79ada65 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -52,24 +52,170 @@ extern void xsave_init(void); extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); extern int init_fpu(struct task_struct *child); -static inline int fpu_xrstor_checking(struct xsave_struct *fx) +/* These macros all use (%edi)/(%rdi) as the single memory argument. */ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +#define xstate_fault ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err) + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask) { - int err; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XSAVES"\n\t" + "2:\n\t" + : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + else + asm volatile("1:"XSAVE"\n\t" + "2:\n\t" + : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + + asm volatile(xstate_fault + : "0" (0) + : "memory"); + + return err; +} + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); - asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err) - : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0) + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XRSTORS"\n\t" + "2:\n\t" + : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + else + asm volatile("1:"XRSTOR"\n\t" + "2:\n\t" + : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + + asm volatile(xstate_fault + : "0" (0) + : "memory"); + + return err; +} + +/* + * Save processor xstate to xsave area. + */ +static inline int xsave_state(struct xsave_struct *fx, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + /* + * If xsaves is enabled, xsaves replaces xsaveopt because + * it supports compact format and supervisor states in addition to + * modified optimization in xsaveopt. + * + * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave + * because xsaveopt supports modified optimization which is not + * supported by xsave. + * + * If none of xsaves and xsaveopt is enabled, use xsave. + */ + alternative_input_2( + "1:"XSAVE, + "1:"XSAVEOPT, + X86_FEATURE_XSAVEOPT, + "1:"XSAVES, + X86_FEATURE_XSAVES, + [fx] "D" (fx), "a" (lmask), "d" (hmask) : + "memory"); + asm volatile("2:\n\t" + xstate_fault + : "0" (0) : "memory"); return err; } +/* + * Restore processor xstate from xsave area. + */ +static inline int xrstor_state(struct xsave_struct *fx, u64 mask) +{ + int err = 0; + u32 lmask = mask; + u32 hmask = mask >> 32; + + /* + * Use xrstors to restore context if it is enabled. xrstors supports + * compacted format of xsave area which is not supported by xrstor. + */ + alternative_input( + "1: " XRSTOR, + "1: " XRSTORS, + X86_FEATURE_XSAVES, + "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + + asm volatile("2:\n" + xstate_fault + : "0" (0) + : "memory"); + + return err; +} + +/* + * Save xstate context for old process during context switch. + */ +static inline void fpu_xsave(struct fpu *fpu) +{ + xsave_state(&fpu->state->xsave, -1); +} + +/* + * Restore xstate context for new process during context switch. + */ +static inline int fpu_xrstor_checking(struct xsave_struct *fx) +{ + return xrstor_state(fx, -1); +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for + * backward compatibility for old applications which don't understand + * compacted format of xsave area. + */ static inline int xsave_user(struct xsave_struct __user *buf) { int err; @@ -83,69 +229,34 @@ static inline int xsave_user(struct xsave_struct __user *buf) return -EFAULT; __asm__ __volatile__(ASM_STAC "\n" - "1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" + "1:"XSAVE"\n" "2: " ASM_CLAC "\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b,3b) - : [err] "=r" (err) + xstate_fault : "D" (buf), "a" (-1), "d" (-1), "0" (0) : "memory"); return err; } +/* + * Restore xstate from user space xsave area. + */ static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask) { - int err; + int err = 0; struct xsave_struct *xstate = ((__force struct xsave_struct *)buf); u32 lmask = mask; u32 hmask = mask >> 32; __asm__ __volatile__(ASM_STAC "\n" - "1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n" + "1:"XRSTOR"\n" "2: " ASM_CLAC "\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b,3b) - : [err] "=r" (err) + xstate_fault : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0) : "memory"); /* memory required? */ return err; } -static inline void xrstor_state(struct xsave_struct *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - - asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t" - : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); -} - -static inline void xsave_state(struct xsave_struct *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; +void *get_xsave_addr(struct xsave_struct *xsave, int xstate); +void setup_xstate_comp(void); - asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t" - : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); -} - -static inline void fpu_xsave(struct fpu *fpu) -{ - /* This, however, we can work around by forcing the compiler to select - an addressing mode that doesn't require extended registers. */ - alternative_input( - ".byte " REX_PREFIX "0x0f,0xae,0x27", - ".byte " REX_PREFIX "0x0f,0xae,0x37", - X86_FEATURE_XSAVEOPT, - [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) : - "memory"); -} #endif diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild index 09409c44f9a..3dec769cadf 100644 --- a/arch/x86/include/uapi/asm/Kbuild +++ b/arch/x86/include/uapi/asm/Kbuild @@ -22,6 +22,7 @@ header-y += ipcbuf.h header-y += ist.h header-y += kvm.h header-y += kvm_para.h +header-y += kvm_perf.h header-y += ldt.h header-y += mce.h header-y += mman.h diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index bbae0247070..d993e33f523 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h @@ -21,11 +21,6 @@ * this size. */ -/* - * Odd: 'make headers_check' complains about numa.h if I try - * to collapse the next two #ifdef lines to a single line: - * #if defined(__KERNEL__) && defined(CONFIG_EFI) - */ #ifndef __KERNEL__ #define E820_X_MAX E820MAX #endif diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index d3a87780c70..d7dcef58aef 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -23,7 +23,10 @@ #define GP_VECTOR 13 #define PF_VECTOR 14 #define MF_VECTOR 16 +#define AC_VECTOR 17 #define MC_VECTOR 18 +#define XM_VECTOR 19 +#define VE_VECTOR 20 /* Select x86 specific features in <linux/kvm.h> */ #define __KVM_HAVE_PIT diff --git a/arch/x86/include/uapi/asm/kvm_perf.h b/arch/x86/include/uapi/asm/kvm_perf.h new file mode 100644 index 00000000000..3bb964f88aa --- /dev/null +++ b/arch/x86/include/uapi/asm/kvm_perf.h @@ -0,0 +1,16 @@ +#ifndef _ASM_X86_KVM_PERF_H +#define _ASM_X86_KVM_PERF_H + +#include <asm/svm.h> +#include <asm/vmx.h> +#include <asm/kvm.h> + +#define DECODE_STR_LEN 20 + +#define VCPU_ID "vcpu_id" + +#define KVM_ENTRY_TRACE "kvm:kvm_entry" +#define KVM_EXIT_TRACE "kvm:kvm_exit" +#define KVM_EXIT_REASON "exit_reason" + +#endif /* _ASM_X86_KVM_PERF_H */ diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index fcf2b3ae1bf..e21331ce368 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -149,6 +149,9 @@ #define MSR_CORE_C1_RES 0x00000660 +#define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668 +#define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669 + #define MSR_AMD64_MC0_MASK 0xc0010044 #define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) @@ -297,6 +300,8 @@ #define MSR_IA32_TSC_ADJUST 0x0000003b #define MSR_IA32_BNDCFGS 0x00000d90 +#define MSR_IA32_XSS 0x00000da0 + #define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) @@ -558,6 +563,7 @@ /* VMX_BASIC bits and bitmasks */ #define VMX_BASIC_VMCS_SIZE_SHIFT 32 +#define VMX_BASIC_TRUE_CTLS (1ULL << 55) #define VMX_BASIC_64 0x0001000000000000LLU #define VMX_BASIC_MEM_TYPE_SHIFT 50 #define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 047f9ff2e36..8f1e77440b2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -39,8 +39,6 @@ obj-y += tsc.o tsc_msr.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o -obj-$(CONFIG_PREEMPT) += preempt.o - obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o @@ -71,6 +69,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o @@ -106,6 +105,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o +obj-$(CONFIG_PMC_ATOM) += pmc_atom.o ### # 64 bit specific files diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 163b2258147..3242e591fa8 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o +obj-$(CONFIG_ACPI_APEI) += apei.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c new file mode 100644 index 00000000000..c280df6b2aa --- /dev/null +++ b/arch/x86/kernel/acpi/apei.c @@ -0,0 +1,62 @@ +/* + * Arch-specific APEI-related functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <acpi/apei.h> + +#include <asm/mce.h> +#include <asm/tlbflush.h> + +int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data) +{ +#ifdef CONFIG_X86_MCE + int i; + struct acpi_hest_ia_corrected *cmc; + struct acpi_hest_ia_error_bank *mc_bank; + + if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) + return 0; + + cmc = (struct acpi_hest_ia_corrected *)hest_hdr; + if (!cmc->enabled) + return 0; + + /* + * We expect HEST to provide a list of MC banks that report errors + * in firmware first mode. Otherwise, return non-zero value to + * indicate that we are done parsing HEST. + */ + if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || + !cmc->num_hardware_banks) + return 1; + + pr_info("HEST: Enabling Firmware First mode for corrected errors.\n"); + + mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1); + for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++) + mce_disable_bank(mc_bank->bank_number); +#endif + return 1; +} + +void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) +{ +#ifdef CONFIG_X86_MCE + apei_mce_report_mem_error(sev, mem_err); +#endif +} + +void arch_apei_flush_tlb_one(unsigned long addr) +{ + __flush_tlb_one(addr); +} diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 86281ffb96d..b436fc735aa 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -31,6 +31,7 @@ #include <linux/module.h> #include <linux/dmi.h> #include <linux/irq.h> +#include <linux/irqdomain.h> #include <linux/slab.h> #include <linux/bootmem.h> #include <linux/ioport.h> @@ -43,6 +44,7 @@ #include <asm/io.h> #include <asm/mpspec.h> #include <asm/smp.h> +#include <asm/i8259.h> #include "sleep.h" /* To include x86_acpi_suspend_lowlevel */ static int __initdata acpi_force = 0; @@ -74,10 +76,6 @@ int acpi_fix_pin2_polarity __initdata; static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #endif -#ifndef __HAVE_ARCH_CMPXCHG -#warning ACPI uses CMPXCHG, i486 and later hardware -#endif - /* -------------------------------------------------------------------------- Boot-time Configuration -------------------------------------------------------------------------- */ @@ -97,44 +95,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; -static unsigned int gsi_to_irq(unsigned int gsi) -{ - unsigned int irq = gsi + NR_IRQS_LEGACY; - unsigned int i; - - for (i = 0; i < NR_IRQS_LEGACY; i++) { - if (isa_irq_to_gsi[i] == gsi) { - return i; - } - } - - /* Provide an identity mapping of gsi == irq - * except on truly weird platforms that have - * non isa irqs in the first 16 gsis. - */ - if (gsi >= NR_IRQS_LEGACY) - irq = gsi; - else - irq = gsi_top + gsi; - - return irq; -} - -static u32 irq_to_gsi(int irq) -{ - unsigned int gsi; - - if (irq < NR_IRQS_LEGACY) - gsi = isa_irq_to_gsi[irq]; - else if (irq < gsi_top) - gsi = irq; - else if (irq < (gsi_top + NR_IRQS_LEGACY)) - gsi = irq - gsi_top; - else - gsi = 0xffffffff; - - return gsi; -} +#define ACPI_INVALID_GSI INT_MIN /* * This is just a simple wrapper around early_ioremap(), @@ -345,11 +306,145 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e #endif /*CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC +#define MP_ISA_BUS 0 + +static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, + u32 gsi) +{ + int ioapic; + int pin; + struct mpc_intsrc mp_irq; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; + pin = mp_find_ioapic_pin(ioapic, gsi); + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger << 2) | polarity; + mp_irq.srcbus = MP_ISA_BUS; + mp_irq.srcbusirq = bus_irq; /* IRQ */ + mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */ + mp_irq.dstirq = pin; /* INTIN# */ + + mp_save_irq(&mp_irq); + + /* + * Reset default identity mapping if gsi is also an legacy IRQ, + * otherwise there will be more than one entry with the same GSI + * and acpi_isa_irq_to_gsi() may give wrong result. + */ + if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi) + isa_irq_to_gsi[gsi] = ACPI_INVALID_GSI; + isa_irq_to_gsi[bus_irq] = gsi; +} + +static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, + int polarity) +{ +#ifdef CONFIG_X86_MPPARSE + struct mpc_intsrc mp_irq; + struct pci_dev *pdev; + unsigned char number; + unsigned int devfn; + int ioapic; + u8 pin; + + if (!acpi_ioapic) + return 0; + if (!dev || !dev_is_pci(dev)) + return 0; + + pdev = to_pci_dev(dev); + number = pdev->bus->number; + devfn = pdev->devfn; + pin = pdev->pin; + /* print the entry should happen on mptable identically */ + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); + mp_irq.srcbus = number; + mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); + ioapic = mp_find_ioapic(gsi); + mp_irq.dstapic = mpc_ioapic_id(ioapic); + mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); + + mp_save_irq(&mp_irq); +#endif + return 0; +} + +static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, + int polarity) +{ + int irq, node; + + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) + return gsi; + + /* Don't set up the ACPI SCI because it's already set up */ + if (acpi_gbl_FADT.sci_interrupt == gsi) + return gsi; + + trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; + polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; + node = dev ? dev_to_node(dev) : NUMA_NO_NODE; + if (mp_set_gsi_attr(gsi, trigger, polarity, node)) { + pr_warn("Failed to set pin attr for GSI%d\n", gsi); + return -1; + } + + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC); + if (irq < 0) + return irq; + + if (enable_update_mptable) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); + + return irq; +} + +static void mp_unregister_gsi(u32 gsi) +{ + int irq; + + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) + return; + + if (acpi_gbl_FADT.sci_interrupt == gsi) + return; + + irq = mp_map_gsi_to_irq(gsi, 0); + if (irq > 0) + mp_unmap_irq(irq); +} + +static struct irq_domain_ops acpi_irqdomain_ops = { + .map = mp_irqdomain_map, + .unmap = mp_irqdomain_unmap, +}; static int __init acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_madt_io_apic *ioapic = NULL; + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_DYNAMIC, + .ops = &acpi_irqdomain_ops, + }; ioapic = (struct acpi_madt_io_apic *)header; @@ -358,8 +453,12 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) acpi_table_print_madt_entry(header); - mp_register_ioapic(ioapic->id, - ioapic->address, ioapic->global_irq_base); + /* Statically assign IRQ numbers for IOAPICs hosting legacy IRQs */ + if (ioapic->global_irq_base < nr_legacy_irqs()) + cfg.type = IOAPIC_DOMAIN_LEGACY; + + mp_register_ioapic(ioapic->id, ioapic->address, ioapic->global_irq_base, + &cfg); return 0; } @@ -382,11 +481,6 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK) polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; - /* - * mp_config_acpi_legacy_irqs() already setup IRQs < 16 - * If GSI is < 16, this will update its flags, - * else it will create a new mp_irqs[] entry. - */ mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); /* @@ -508,25 +602,28 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) outb(new >> 8, 0x4d1); } -int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) +int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) { - *irq = gsi_to_irq(gsi); + int irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK); -#ifdef CONFIG_X86_IO_APIC - if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) - setup_IO_APIC_irq_extra(gsi); -#endif + if (irq >= 0) { + *irqp = irq; + return 0; + } - return 0; + return -1; } EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) { - if (isa_irq >= 16) - return -1; - *gsi = irq_to_gsi(isa_irq); - return 0; + if (isa_irq < nr_legacy_irqs() && + isa_irq_to_gsi[isa_irq] != ACPI_INVALID_GSI) { + *gsi = isa_irq_to_gsi[isa_irq]; + return 0; + } + + return -1; } static int acpi_register_gsi_pic(struct device *dev, u32 gsi, @@ -546,15 +643,25 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi, static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, int trigger, int polarity) { + int irq = gsi; + #ifdef CONFIG_X86_IO_APIC - gsi = mp_register_gsi(dev, gsi, trigger, polarity); + irq = mp_register_gsi(dev, gsi, trigger, polarity); #endif - return gsi; + return irq; +} + +static void acpi_unregister_gsi_ioapic(u32 gsi) +{ +#ifdef CONFIG_X86_IO_APIC + mp_unregister_gsi(gsi); +#endif } int (*__acpi_register_gsi)(struct device *dev, u32 gsi, int trigger, int polarity) = acpi_register_gsi_pic; +void (*__acpi_unregister_gsi)(u32 gsi) = NULL; #ifdef CONFIG_ACPI_SLEEP int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel; @@ -568,32 +675,22 @@ int (*acpi_suspend_lowlevel)(void); */ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { - unsigned int irq; - unsigned int plat_gsi = gsi; - - plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity); - irq = gsi_to_irq(plat_gsi); - - return irq; + return __acpi_register_gsi(dev, gsi, trigger, polarity); } EXPORT_SYMBOL_GPL(acpi_register_gsi); void acpi_unregister_gsi(u32 gsi) { + if (__acpi_unregister_gsi) + __acpi_unregister_gsi(gsi); } EXPORT_SYMBOL_GPL(acpi_unregister_gsi); -void __init acpi_set_irq_model_pic(void) -{ - acpi_irq_model = ACPI_IRQ_MODEL_PIC; - __acpi_register_gsi = acpi_register_gsi_pic; - acpi_ioapic = 0; -} - -void __init acpi_set_irq_model_ioapic(void) +static void __init acpi_set_irq_model_ioapic(void) { acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; __acpi_register_gsi = acpi_register_gsi_ioapic; + __acpi_unregister_gsi = acpi_unregister_gsi_ioapic; acpi_ioapic = 1; } @@ -829,9 +926,8 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). */ - count = - acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, - acpi_parse_lapic_addr_ovr, 0); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, + acpi_parse_lapic_addr_ovr, 0); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing LAPIC address override entry\n"); @@ -856,9 +952,8 @@ static int __init acpi_parse_madt_lapic_entries(void) * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). */ - count = - acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, - acpi_parse_lapic_addr_ovr, 0); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, + acpi_parse_lapic_addr_ovr, 0); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing LAPIC address override entry\n"); @@ -886,11 +981,10 @@ static int __init acpi_parse_madt_lapic_entries(void) return count; } - x2count = - acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI, - acpi_parse_x2apic_nmi, 0); - count = - acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0); + x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI, + acpi_parse_x2apic_nmi, 0); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, + acpi_parse_lapic_nmi, 0); if (count < 0 || x2count < 0) { printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); /* TBD: Cleanup to allow fallback to MPS */ @@ -901,44 +995,7 @@ static int __init acpi_parse_madt_lapic_entries(void) #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC -#define MP_ISA_BUS 0 - -void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) -{ - int ioapic; - int pin; - struct mpc_intsrc mp_irq; - - /* - * Convert 'gsi' to 'ioapic.pin'. - */ - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return; - pin = mp_find_ioapic_pin(ioapic, gsi); - - /* - * TBD: This check is for faulty timer entries, where the override - * erroneously sets the trigger to level, resulting in a HUGE - * increase of timer interrupts! - */ - if ((bus_irq == 0) && (trigger == 3)) - trigger = 1; - - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (trigger << 2) | polarity; - mp_irq.srcbus = MP_ISA_BUS; - mp_irq.srcbusirq = bus_irq; /* IRQ */ - mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */ - mp_irq.dstirq = pin; /* INTIN# */ - - mp_save_irq(&mp_irq); - - isa_irq_to_gsi[bus_irq] = gsi; -} - -void __init mp_config_acpi_legacy_irqs(void) +static void __init mp_config_acpi_legacy_irqs(void) { int i; struct mpc_intsrc mp_irq; @@ -956,7 +1013,7 @@ void __init mp_config_acpi_legacy_irqs(void) * Use the default configuration for the IRQs 0-15. Unless * overridden by (MADT) interrupt source override entries. */ - for (i = 0; i < 16; i++) { + for (i = 0; i < nr_legacy_irqs(); i++) { int ioapic, pin; unsigned int dstapic; int idx; @@ -1004,84 +1061,6 @@ void __init mp_config_acpi_legacy_irqs(void) } } -static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, - int polarity) -{ -#ifdef CONFIG_X86_MPPARSE - struct mpc_intsrc mp_irq; - struct pci_dev *pdev; - unsigned char number; - unsigned int devfn; - int ioapic; - u8 pin; - - if (!acpi_ioapic) - return 0; - if (!dev || !dev_is_pci(dev)) - return 0; - - pdev = to_pci_dev(dev); - number = pdev->bus->number; - devfn = pdev->devfn; - pin = pdev->pin; - /* print the entry should happen on mptable identically */ - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | - (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); - mp_irq.srcbus = number; - mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); - ioapic = mp_find_ioapic(gsi); - mp_irq.dstapic = mpc_ioapic_id(ioapic); - mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); - - mp_save_irq(&mp_irq); -#endif - return 0; -} - -int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) -{ - int ioapic; - int ioapic_pin; - struct io_apic_irq_attr irq_attr; - int ret; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return gsi; - - /* Don't set up the ACPI SCI because it's already set up */ - if (acpi_gbl_FADT.sci_interrupt == gsi) - return gsi; - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) { - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); - return gsi; - } - - ioapic_pin = mp_find_ioapic_pin(ioapic, gsi); - - if (ioapic_pin > MP_MAX_IOAPIC_PIN) { - printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mpc_ioapic_id(ioapic), - ioapic_pin); - return gsi; - } - - if (enable_update_mptable) - mp_config_acpi_gsi(dev, gsi, trigger, polarity); - - set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, - trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, - polarity == ACPI_ACTIVE_HIGH ? 0 : 1); - ret = io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr); - if (ret < 0) - gsi = INT_MIN; - - return gsi; -} - /* * Parse IOAPIC related entries in MADT * returns 0 on success, < 0 on error @@ -1111,9 +1090,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) return -ENODEV; } - count = - acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic, - MAX_IO_APICS); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic, + MAX_IO_APICS); if (!count) { printk(KERN_ERR PREFIX "No IOAPIC entries present\n"); return -ENODEV; @@ -1122,9 +1100,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) return count; } - count = - acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, - nr_irqs); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, + acpi_parse_int_src_ovr, nr_irqs); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n"); @@ -1143,9 +1120,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) /* Fill in identity legacy mappings where no override */ mp_config_acpi_legacy_irqs(); - count = - acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, - nr_irqs); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, + acpi_parse_nmi_src, nr_irqs); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index af5b08ab3b7..5972b108f15 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -146,7 +146,7 @@ static inline int is_apbt_capable(void) static int __init apbt_clockevent_register(void) { struct sfi_timer_table_entry *mtmr; - struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev); + struct apbt_dev *adev = this_cpu_ptr(&cpu_apbt_dev); mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); if (mtmr == NULL) { @@ -200,7 +200,7 @@ void apbt_setup_secondary_clock(void) if (!cpu) return; - adev = &__get_cpu_var(cpu_apbt_dev); + adev = this_cpu_ptr(&cpu_apbt_dev); if (!adev->timer) { adev->timer = dw_apb_clockevent_init(cpu, adev->name, APBT_CLOCKEVENT_RATING, adev_virt_addr(adev), diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ad28db7e6bd..00853b254ab 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -67,7 +67,7 @@ EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); /* * The highest APIC ID seen during enumeration. */ -unsigned int max_physical_apicid; +static unsigned int max_physical_apicid; /* * Bitmask of physically existing CPUs: @@ -561,7 +561,7 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events); */ static void setup_APIC_timer(void) { - struct clock_event_device *levt = &__get_cpu_var(lapic_events); + struct clock_event_device *levt = this_cpu_ptr(&lapic_events); if (this_cpu_has(X86_FEATURE_ARAT)) { lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; @@ -696,7 +696,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) static int __init calibrate_APIC_clock(void) { - struct clock_event_device *levt = &__get_cpu_var(lapic_events); + struct clock_event_device *levt = this_cpu_ptr(&lapic_events); void (*real_handler)(struct clock_event_device *dev); unsigned long deltaj; long delta, deltatsc; @@ -1342,17 +1342,6 @@ void setup_local_APIC(void) /* always use the value from LDR */ early_per_cpu(x86_cpu_to_logical_apicid, cpu) = logical_smp_processor_id(); - - /* - * Some NUMA implementations (NUMAQ) don't initialize apicid to - * node mapping during NUMA init. Now that logical apicid is - * guaranteed to be known, give it another chance. This is already - * a bit too late - percpu allocation has already happened without - * proper NUMA affinity. - */ - if (apic->x86_32_numa_cpu_node) - set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu), - apic->x86_32_numa_cpu_node(cpu)); #endif /* @@ -2053,8 +2042,6 @@ void __init connect_bsp_APIC(void) imcr_pic_to_apic(); } #endif - if (apic->enable_apic_mode) - apic->enable_apic_mode(); } /** @@ -2451,51 +2438,6 @@ static void apic_pm_activate(void) { } #ifdef CONFIG_X86_64 -static int apic_cluster_num(void) -{ - int i, clusters, zeros; - unsigned id; - u16 *bios_cpu_apicid; - DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - - bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); - bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - - for (i = 0; i < nr_cpu_ids; i++) { - /* are we being called early in kernel startup? */ - if (bios_cpu_apicid) { - id = bios_cpu_apicid[i]; - } else if (i < nr_cpu_ids) { - if (cpu_present(i)) - id = per_cpu(x86_bios_cpu_apicid, i); - else - continue; - } else - break; - - if (id != BAD_APICID) - __set_bit(APIC_CLUSTERID(id), clustermap); - } - - /* Problem: Partially populated chassis may not have CPUs in some of - * the APIC clusters they have been allocated. Only present CPUs have - * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. - * Since clusters are allocated sequentially, count zeros only if - * they are bounded by ones. - */ - clusters = 0; - zeros = 0; - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (test_bit(i, clustermap)) { - clusters += 1 + zeros; - zeros = 0; - } else - ++zeros; - } - - return clusters; -} - static int multi_checked; static int multi; @@ -2540,20 +2482,7 @@ static void dmi_check_multi(void) int apic_is_clustered_box(void) { dmi_check_multi(); - if (multi) - return 1; - - if (!is_vsmp_box()) - return 0; - - /* - * ScaleMP vSMPowered boxes have one cluster per board and TSCs are - * not guaranteed to be synced between boards - */ - if (apic_cluster_num() > 1) - return 1; - - return 0; + return multi; } #endif diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 7c1b2947951..de918c410ea 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -168,21 +168,16 @@ static struct apic apic_flat = { .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, - .check_apicid_present = NULL, .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = flat_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, @@ -196,10 +191,7 @@ static struct apic apic_flat = { .send_IPI_all = flat_send_IPI_all, .send_IPI_self = apic_send_IPI_self, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, .wait_for_init_deassert = false, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, @@ -283,7 +275,6 @@ static struct apic apic_physflat = { .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, - .check_apicid_present = NULL, .vector_allocation_domain = default_vector_allocation_domain, /* not needed, but shouldn't hurt: */ @@ -291,14 +282,10 @@ static struct apic apic_physflat = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = flat_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, @@ -312,10 +299,7 @@ static struct apic apic_physflat = { .send_IPI_all = physflat_send_IPI_all, .send_IPI_self = apic_send_IPI_self, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, .wait_for_init_deassert = false, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 8c7c98249c2..b205cdbdbe6 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -89,16 +89,6 @@ static const struct cpumask *noop_target_cpus(void) return cpumask_of(0); } -static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid) -{ - return physid_isset(apicid, *map); -} - -static unsigned long noop_check_apicid_present(int bit) -{ - return physid_isset(bit, phys_cpu_present_map); -} - static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, const struct cpumask *mask) { @@ -133,27 +123,21 @@ struct apic apic_noop = { .target_cpus = noop_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = noop_check_apicid_used, - .check_apicid_present = noop_check_apicid_present, + .check_apicid_used = default_check_apicid_used, .vector_allocation_domain = noop_vector_allocation_domain, .init_apic_ldr = noop_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = noop_phys_pkg_id, - .mps_oem_check = NULL, - .get_apic_id = noop_get_apic_id, .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, @@ -168,12 +152,7 @@ struct apic apic_noop = { .wakeup_secondary_cpu = noop_wakeup_secondary_cpu, - /* should be safe */ - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = false, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, .read = noop_apic_read, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index a5b45df8bc8..4128b5fcb55 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -32,7 +32,7 @@ static int numachip_system __read_mostly; -static const struct apic apic_numachip __read_mostly; +static const struct apic apic_numachip; static unsigned int get_apic_id(unsigned long x) { @@ -217,21 +217,16 @@ static const struct apic apic_numachip __refconst = { .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, - .check_apicid_present = NULL, .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = numachip_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, @@ -246,10 +241,7 @@ static const struct apic apic_numachip __refconst = { .send_IPI_self = numachip_send_IPI_self, .wakeup_secondary_cpu = numachip_wakeup_secondary, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, .wait_for_init_deassert = false, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, /* REMRD not supported */ .read = native_apic_mem_read, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index e4840aa7a25..c4a8d63f822 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -31,11 +31,6 @@ static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) return 0; } -static unsigned long bigsmp_check_apicid_present(int bit) -{ - return 1; -} - static int bigsmp_early_logical_apicid(int cpu) { /* on bigsmp, logical apicid is the same as physical */ @@ -168,21 +163,16 @@ static struct apic apic_bigsmp = { .disable_esr = 1, .dest_logical = 0, .check_apicid_used = bigsmp_check_apicid_used, - .check_apicid_present = bigsmp_check_apicid_present, .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = bigsmp_init_apic_ldr, .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, .setup_apic_routing = bigsmp_setup_apic_routing, - .multi_timer_check = NULL, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, - .setup_portio_remap = NULL, .check_phys_apicid_present = bigsmp_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = bigsmp_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = bigsmp_get_apic_id, .set_apic_id = NULL, @@ -196,11 +186,7 @@ static struct apic apic_bigsmp = { .send_IPI_all = bigsmp_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = true, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 81e08eff05e..1183d545da1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -31,6 +31,7 @@ #include <linux/acpi.h> #include <linux/module.h> #include <linux/syscore_ops.h> +#include <linux/irqdomain.h> #include <linux/msi.h> #include <linux/htirq.h> #include <linux/freezer.h> @@ -62,6 +63,16 @@ #define __apicdebuginit(type) static type __init +#define for_each_ioapic(idx) \ + for ((idx) = 0; (idx) < nr_ioapics; (idx)++) +#define for_each_ioapic_reverse(idx) \ + for ((idx) = nr_ioapics - 1; (idx) >= 0; (idx)--) +#define for_each_pin(idx, pin) \ + for ((pin) = 0; (pin) < ioapics[(idx)].nr_registers; (pin)++) +#define for_each_ioapic_pin(idx, pin) \ + for_each_ioapic((idx)) \ + for_each_pin((idx), (pin)) + #define for_each_irq_pin(entry, head) \ for (entry = head; entry; entry = entry->next) @@ -73,6 +84,17 @@ int sis_apic_bug = -1; static DEFINE_RAW_SPINLOCK(ioapic_lock); static DEFINE_RAW_SPINLOCK(vector_lock); +static DEFINE_MUTEX(ioapic_mutex); +static unsigned int ioapic_dynirq_base; +static int ioapic_initialized; + +struct mp_pin_info { + int trigger; + int polarity; + int node; + int set; + u32 count; +}; static struct ioapic { /* @@ -87,7 +109,9 @@ static struct ioapic { struct mpc_ioapic mp_config; /* IO APIC gsi routing info */ struct mp_ioapic_gsi gsi_config; - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); + struct ioapic_domain_cfg irqdomain_cfg; + struct irq_domain *irqdomain; + struct mp_pin_info *pin_info; } ioapics[MAX_IO_APICS]; #define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver @@ -107,6 +131,41 @@ struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) return &ioapics[ioapic_idx].gsi_config; } +static inline int mp_ioapic_pin_count(int ioapic) +{ + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); + + return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; +} + +u32 mp_pin_to_gsi(int ioapic, int pin) +{ + return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin; +} + +/* + * Initialize all legacy IRQs and all pins on the first IOAPIC + * if we have legacy interrupt controller. Kernel boot option "pirq=" + * may rely on non-legacy pins on the first IOAPIC. + */ +static inline int mp_init_irq_at_boot(int ioapic, int irq) +{ + if (!nr_legacy_irqs()) + return 0; + + return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs()); +} + +static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin) +{ + return ioapics[ioapic_idx].pin_info + pin; +} + +static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic) +{ + return ioapics[ioapic].irqdomain; +} + int nr_ioapics; /* The one past the highest gsi number used */ @@ -118,9 +177,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; -/* GSI interrupts */ -static int nr_irqs_gsi = NR_IRQS_LEGACY; - #ifdef CONFIG_EISA int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif @@ -149,8 +205,7 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -static int io_apic_setup_irq_pin(unsigned int irq, int node, - struct io_apic_irq_attr *attr); +static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node); /* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ void mp_save_irq(struct mpc_intsrc *m) @@ -182,19 +237,15 @@ static struct irq_pin_list *alloc_irq_pin_list(int node) return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); } - -/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ -static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; - int __init arch_early_irq_init(void) { struct irq_cfg *cfg; - int count, node, i; + int i, node = cpu_to_node(0); - if (!legacy_pic->nr_legacy_irqs) + if (!nr_legacy_irqs()) io_apic_irqs = ~0UL; - for (i = 0; i < nr_ioapics; i++) { + for_each_ioapic(i) { ioapics[i].saved_registers = kzalloc(sizeof(struct IO_APIC_route_entry) * ioapics[i].nr_registers, GFP_KERNEL); @@ -202,28 +253,20 @@ int __init arch_early_irq_init(void) pr_err("IOAPIC %d: suspend/resume impossible!\n", i); } - cfg = irq_cfgx; - count = ARRAY_SIZE(irq_cfgx); - node = cpu_to_node(0); - - for (i = 0; i < count; i++) { - irq_set_chip_data(i, &cfg[i]); - zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); - zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); - /* - * For legacy IRQ's, start with assigning irq0 to irq15 to - * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. - */ - if (i < legacy_pic->nr_legacy_irqs) { - cfg[i].vector = IRQ0_VECTOR + i; - cpumask_setall(cfg[i].domain); - } + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. + */ + for (i = 0; i < nr_legacy_irqs(); i++) { + cfg = alloc_irq_and_cfg_at(i, node); + cfg->vector = IRQ0_VECTOR + i; + cpumask_setall(cfg->domain); } return 0; } -static struct irq_cfg *irq_cfg(unsigned int irq) +static inline struct irq_cfg *irq_cfg(unsigned int irq) { return irq_get_chip_data(irq); } @@ -265,7 +308,7 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) if (res < 0) { if (res != -EEXIST) return NULL; - cfg = irq_get_chip_data(at); + cfg = irq_cfg(at); if (cfg) return cfg; } @@ -425,6 +468,21 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi return 0; } +static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin) +{ + struct irq_pin_list **last, *entry; + + last = &cfg->irq_2_pin; + for_each_irq_pin(entry, cfg->irq_2_pin) + if (entry->apic == apic && entry->pin == pin) { + *last = entry->next; + kfree(entry); + return; + } else { + last = &entry->next; + } +} + static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { if (__add_pin_to_irq_node(cfg, node, apic, pin)) @@ -627,9 +685,8 @@ static void clear_IO_APIC (void) { int apic, pin; - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < ioapics[apic].nr_registers; pin++) - clear_IO_APIC_pin(apic, pin); + for_each_ioapic_pin(apic, pin) + clear_IO_APIC_pin(apic, pin); } #ifdef CONFIG_X86_32 @@ -678,13 +735,13 @@ int save_ioapic_entries(void) int apic, pin; int err = 0; - for (apic = 0; apic < nr_ioapics; apic++) { + for_each_ioapic(apic) { if (!ioapics[apic].saved_registers) { err = -ENOMEM; continue; } - for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + for_each_pin(apic, pin) ioapics[apic].saved_registers[pin] = ioapic_read_entry(apic, pin); } @@ -699,11 +756,11 @@ void mask_ioapic_entries(void) { int apic, pin; - for (apic = 0; apic < nr_ioapics; apic++) { + for_each_ioapic(apic) { if (!ioapics[apic].saved_registers) continue; - for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { + for_each_pin(apic, pin) { struct IO_APIC_route_entry entry; entry = ioapics[apic].saved_registers[pin]; @@ -722,11 +779,11 @@ int restore_ioapic_entries(void) { int apic, pin; - for (apic = 0; apic < nr_ioapics; apic++) { + for_each_ioapic(apic) { if (!ioapics[apic].saved_registers) continue; - for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + for_each_pin(apic, pin) ioapic_write_entry(apic, pin, ioapics[apic].saved_registers[pin]); } @@ -785,7 +842,7 @@ static int __init find_isa_irq_apic(int irq, int type) if (i < mp_irq_entries) { int ioapic_idx; - for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + for_each_ioapic(ioapic_idx) if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic) return ioapic_idx; } @@ -799,7 +856,7 @@ static int __init find_isa_irq_apic(int irq, int type) */ static int EISA_ELCR(unsigned int irq) { - if (irq < legacy_pic->nr_legacy_irqs) { + if (irq < nr_legacy_irqs()) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -939,29 +996,106 @@ static int irq_trigger(int idx) return trigger; } -static int pin_2_irq(int idx, int apic, int pin) +static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin) +{ + int irq = -1; + int ioapic = (int)(long)domain->host_data; + int type = ioapics[ioapic].irqdomain_cfg.type; + + switch (type) { + case IOAPIC_DOMAIN_LEGACY: + /* + * Dynamically allocate IRQ number for non-ISA IRQs in the first 16 + * GSIs on some weird platforms. + */ + if (gsi < nr_legacy_irqs()) + irq = irq_create_mapping(domain, pin); + else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) + irq = gsi; + break; + case IOAPIC_DOMAIN_STRICT: + if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) + irq = gsi; + break; + case IOAPIC_DOMAIN_DYNAMIC: + irq = irq_create_mapping(domain, pin); + break; + default: + WARN(1, "ioapic: unknown irqdomain type %d\n", type); + break; + } + + return irq > 0 ? irq : -1; +} + +static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, + unsigned int flags) { int irq; - int bus = mp_irqs[idx].srcbus; - struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic); + struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + struct mp_pin_info *info = mp_pin_info(ioapic, pin); + + if (!domain) + return -1; + + mutex_lock(&ioapic_mutex); /* - * Debugging check, we are in big trouble if this message pops up! + * Don't use irqdomain to manage ISA IRQs because there may be + * multiple IOAPIC pins sharing the same ISA IRQ number and + * irqdomain only supports 1:1 mapping between IOAPIC pin and + * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used + * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). + * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are + * available, and some BIOSes may use MP Interrupt Source records + * to override IRQ numbers for PIRQs instead of reprogramming + * the interrupt routing logic. Thus there may be multiple pins + * sharing the same legacy IRQ number when ACPI is disabled. */ - if (mp_irqs[idx].dstirq != pin) - pr_err("broken BIOS or MPTABLE parser, ayiee!!\n"); - - if (test_bit(bus, mp_bus_not_pci)) { + if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; + if (flags & IOAPIC_MAP_ALLOC) { + if (info->count == 0 && + mp_irqdomain_map(domain, irq, pin) != 0) + irq = -1; + + /* special handling for timer IRQ0 */ + if (irq == 0) + info->count++; + } } else { - u32 gsi = gsi_cfg->gsi_base + pin; + irq = irq_find_mapping(domain, pin); + if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC)) + irq = alloc_irq_from_domain(domain, gsi, pin); + } - if (gsi >= NR_IRQS_LEGACY) - irq = gsi; - else - irq = gsi_top + gsi; + if (flags & IOAPIC_MAP_ALLOC) { + /* special handling for legacy IRQs */ + if (irq < nr_legacy_irqs() && info->count == 1 && + mp_irqdomain_map(domain, irq, pin) != 0) + irq = -1; + + if (irq > 0) + info->count++; + else if (info->count == 0) + info->set = 0; } + mutex_unlock(&ioapic_mutex); + + return irq > 0 ? irq : -1; +} + +static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) +{ + u32 gsi = mp_pin_to_gsi(ioapic, pin); + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].dstirq != pin) + pr_err("broken BIOS or MPTABLE parser, ayiee!!\n"); + #ifdef CONFIG_X86_32 /* * PCI IRQ command line redirection. Yes, limits are hardcoded. @@ -972,16 +1106,58 @@ static int pin_2_irq(int idx, int apic, int pin) apic_printk(APIC_VERBOSE, KERN_DEBUG "disabling PIRQ%d\n", pin-16); } else { - irq = pirq_entries[pin-16]; + int irq = pirq_entries[pin-16]; apic_printk(APIC_VERBOSE, KERN_DEBUG "using PIRQ%d -> IRQ %d\n", pin-16, irq); + return irq; } } } #endif - return irq; + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); +} + +int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) +{ + int ioapic, pin, idx; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -1; + + pin = mp_find_ioapic_pin(ioapic, gsi); + idx = find_irq_entry(ioapic, pin, mp_INT); + if ((flags & IOAPIC_MAP_CHECK) && idx < 0) + return -1; + + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); +} + +void mp_unmap_irq(int irq) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct mp_pin_info *info; + int ioapic, pin; + + if (!data || !data->domain) + return; + + ioapic = (int)(long)data->domain->host_data; + pin = (int)data->hwirq; + info = mp_pin_info(ioapic, pin); + + mutex_lock(&ioapic_mutex); + if (--info->count == 0) { + info->set = 0; + if (irq < nr_legacy_irqs() && + ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY) + mp_irqdomain_unmap(data->domain, irq); + else + irq_dispose_mapping(irq); + } + mutex_unlock(&ioapic_mutex); } /* @@ -991,7 +1167,7 @@ static int pin_2_irq(int idx, int apic, int pin) int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, struct io_apic_irq_attr *irq_attr) { - int ioapic_idx, i, best_guess = -1; + int irq, i, best_ioapic = -1, best_idx = -1; apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", @@ -1001,44 +1177,56 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); return -1; } + for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].srcbus; + int ioapic_idx, found = 0; - for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + if (bus != lbus || mp_irqs[i].irqtype != mp_INT || + slot != ((mp_irqs[i].srcbusirq >> 2) & 0x1f)) + continue; + + for_each_ioapic(ioapic_idx) if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic || - mp_irqs[i].dstapic == MP_APIC_ALL) + mp_irqs[i].dstapic == MP_APIC_ALL) { + found = 1; break; + } + if (!found) + continue; - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq); + /* Skip ISA IRQs */ + irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq, 0); + if (irq > 0 && !IO_APIC_IRQ(irq)) + continue; - if (!(ioapic_idx || IO_APIC_IRQ(irq))) - continue; + if (pin == (mp_irqs[i].srcbusirq & 3)) { + best_idx = i; + best_ioapic = ioapic_idx; + goto out; + } - if (pin == (mp_irqs[i].srcbusirq & 3)) { - set_io_apic_irq_attr(irq_attr, ioapic_idx, - mp_irqs[i].dstirq, - irq_trigger(i), - irq_polarity(i)); - return irq; - } - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) { - set_io_apic_irq_attr(irq_attr, ioapic_idx, - mp_irqs[i].dstirq, - irq_trigger(i), - irq_polarity(i)); - best_guess = irq; - } + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_idx < 0) { + best_idx = i; + best_ioapic = ioapic_idx; } } - return best_guess; + if (best_idx < 0) + return -1; + +out: + irq = pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, + IOAPIC_MAP_ALLOC); + if (irq > 0) + set_io_apic_irq_attr(irq_attr, best_ioapic, + mp_irqs[best_idx].dstirq, + irq_trigger(best_idx), + irq_polarity(best_idx)); + return irq; } EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); @@ -1198,7 +1386,7 @@ void __setup_vector_irq(int cpu) raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_active_irq(irq) { - cfg = irq_get_chip_data(irq); + cfg = irq_cfg(irq); if (!cfg) continue; @@ -1227,12 +1415,10 @@ static inline int IO_APIC_irq_trigger(int irq) { int apic, idx, pin; - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } + for_each_ioapic_pin(apic, pin) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0))) + return irq_trigger(idx); } /* * nonexistent IRQs are edge default @@ -1330,95 +1516,29 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, } ioapic_register_intr(irq, cfg, attr->trigger); - if (irq < legacy_pic->nr_legacy_irqs) + if (irq < nr_legacy_irqs()) legacy_pic->mask(irq); ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry); } -static bool __init io_apic_pin_not_connected(int idx, int ioapic_idx, int pin) -{ - if (idx != -1) - return false; - - apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n", - mpc_ioapic_id(ioapic_idx), pin); - return true; -} - -static void __init __io_apic_setup_irqs(unsigned int ioapic_idx) -{ - int idx, node = cpu_to_node(0); - struct io_apic_irq_attr attr; - unsigned int pin, irq; - - for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) { - idx = find_irq_entry(ioapic_idx, pin, mp_INT); - if (io_apic_pin_not_connected(idx, ioapic_idx, pin)) - continue; - - irq = pin_2_irq(idx, ioapic_idx, pin); - - if ((ioapic_idx > 0) && (irq > 16)) - continue; - - /* - * Skip the timer IRQ if there's a quirk handler - * installed and if it returns 1: - */ - if (apic->multi_timer_check && - apic->multi_timer_check(ioapic_idx, irq)) - continue; - - set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx), - irq_polarity(idx)); - - io_apic_setup_irq_pin(irq, node, &attr); - } -} - static void __init setup_IO_APIC_irqs(void) { - unsigned int ioapic_idx; + unsigned int ioapic, pin; + int idx; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) - __io_apic_setup_irqs(ioapic_idx); -} - -/* - * for the gsit that is not in first ioapic - * but could not use acpi_register_gsi() - * like some special sci in IBM x3330 - */ -void setup_IO_APIC_irq_extra(u32 gsi) -{ - int ioapic_idx = 0, pin, idx, irq, node = cpu_to_node(0); - struct io_apic_irq_attr attr; - - /* - * Convert 'gsi' to 'ioapic.pin'. - */ - ioapic_idx = mp_find_ioapic(gsi); - if (ioapic_idx < 0) - return; - - pin = mp_find_ioapic_pin(ioapic_idx, gsi); - idx = find_irq_entry(ioapic_idx, pin, mp_INT); - if (idx == -1) - return; - - irq = pin_2_irq(idx, ioapic_idx, pin); - - /* Only handle the non legacy irqs on secondary ioapics */ - if (ioapic_idx == 0 || irq < NR_IRQS_LEGACY) - return; - - set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx), - irq_polarity(idx)); - - io_apic_setup_irq_pin_once(irq, node, &attr); + for_each_ioapic_pin(ioapic, pin) { + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + apic_printk(APIC_VERBOSE, + KERN_DEBUG " apic %d pin %d not connected\n", + mpc_ioapic_id(ioapic), pin); + else + pin_2_irq(idx, ioapic, pin, + ioapic ? 0 : IOAPIC_MAP_ALLOC); + } } /* @@ -1586,7 +1706,7 @@ __apicdebuginit(void) print_IO_APICs(void) struct irq_chip *chip; printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + for_each_ioapic(ioapic_idx) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", mpc_ioapic_id(ioapic_idx), ioapics[ioapic_idx].nr_registers); @@ -1597,7 +1717,7 @@ __apicdebuginit(void) print_IO_APICs(void) */ printk(KERN_INFO "testing the IO APIC.......................\n"); - for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + for_each_ioapic(ioapic_idx) print_IO_APIC(ioapic_idx); printk(KERN_DEBUG "IRQ to pin mappings:\n"); @@ -1608,7 +1728,7 @@ __apicdebuginit(void) print_IO_APICs(void) if (chip != &ioapic_chip) continue; - cfg = irq_get_chip_data(irq); + cfg = irq_cfg(irq); if (!cfg) continue; entry = cfg->irq_2_pin; @@ -1758,7 +1878,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (!legacy_pic->nr_legacy_irqs) + if (!nr_legacy_irqs()) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1828,26 +1948,22 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; void __init enable_IO_APIC(void) { int i8259_apic, i8259_pin; - int apic; + int apic, pin; - if (!legacy_pic->nr_legacy_irqs) + if (!nr_legacy_irqs()) return; - for(apic = 0; apic < nr_ioapics; apic++) { - int pin; + for_each_ioapic_pin(apic, pin) { /* See if any of the pins is in ExtINT mode */ - for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { - struct IO_APIC_route_entry entry; - entry = ioapic_read_entry(apic, pin); + struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin); - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. - */ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { - ioapic_i8259.apic = apic; - ioapic_i8259.pin = pin; - goto found_i8259; - } + /* If the interrupt line is enabled and in ExtInt mode + * I have found the pin where the i8259 is connected. + */ + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + ioapic_i8259.apic = apic; + ioapic_i8259.pin = pin; + goto found_i8259; } } found_i8259: @@ -1919,7 +2035,7 @@ void disable_IO_APIC(void) */ clear_IO_APIC(); - if (!legacy_pic->nr_legacy_irqs) + if (!nr_legacy_irqs()) return; x86_io_apic_ops.disable(); @@ -1950,7 +2066,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) /* * Set the IOAPIC ID to the value stored in the MPC table. */ - for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) { + for_each_ioapic(ioapic_idx) { /* Read the register 0 value */ raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic_idx, 0); @@ -2123,7 +2239,7 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - if (irq < legacy_pic->nr_legacy_irqs) { + if (irq < nr_legacy_irqs()) { legacy_pic->mask(irq); if (legacy_pic->irq_pending(irq)) was_pending = 1; @@ -2225,7 +2341,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); goto unlock; } - __this_cpu_write(vector_irq[vector], -1); + __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); unlock: raw_spin_unlock(&desc->lock); } @@ -2253,7 +2369,7 @@ static void irq_complete_move(struct irq_cfg *cfg) void irq_force_complete_move(int irq) { - struct irq_cfg *cfg = irq_get_chip_data(irq); + struct irq_cfg *cfg = irq_cfg(irq); if (!cfg) return; @@ -2507,6 +2623,7 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_eoi = ack_apic_level, .irq_set_affinity = native_ioapic_set_affinity, .irq_retrigger = ioapic_retrigger_irq, + .flags = IRQCHIP_SKIP_SET_WAKE, }; static inline void init_IO_APIC_traps(void) @@ -2514,26 +2631,15 @@ static inline void init_IO_APIC_traps(void) struct irq_cfg *cfg; unsigned int irq; - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ for_each_active_irq(irq) { - cfg = irq_get_chip_data(irq); + cfg = irq_cfg(irq); if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < legacy_pic->nr_legacy_irqs) + if (irq < nr_legacy_irqs()) legacy_pic->make_irq(irq); else /* Strange. Oh, well.. */ @@ -2649,8 +2755,6 @@ static int __init disable_timer_pin_setup(char *arg) } early_param("disable_timer_pin_1", disable_timer_pin_setup); -int timer_through_8259 __initdata; - /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -2661,7 +2765,7 @@ int timer_through_8259 __initdata; */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_get_chip_data(0); + struct irq_cfg *cfg = irq_cfg(0); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -2755,7 +2859,6 @@ static inline void __init check_timer(void) legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); - timer_through_8259 = 1; goto out; } /* @@ -2827,15 +2930,54 @@ out: */ #define PIC_IRQS (1UL << PIC_CASCADE_IR) +static int mp_irqdomain_create(int ioapic) +{ + size_t size; + int hwirqs = mp_ioapic_pin_count(ioapic); + struct ioapic *ip = &ioapics[ioapic]; + struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); + + size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic); + ip->pin_info = kzalloc(size, GFP_KERNEL); + if (!ip->pin_info) + return -ENOMEM; + + if (cfg->type == IOAPIC_DOMAIN_INVALID) + return 0; + + ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops, + (void *)(long)ioapic); + if(!ip->irqdomain) { + kfree(ip->pin_info); + ip->pin_info = NULL; + return -ENOMEM; + } + + if (cfg->type == IOAPIC_DOMAIN_LEGACY || + cfg->type == IOAPIC_DOMAIN_STRICT) + ioapic_dynirq_base = max(ioapic_dynirq_base, + gsi_cfg->gsi_end + 1); + + if (gsi_cfg->gsi_base == 0) + irq_set_default_host(ip->irqdomain); + + return 0; +} + void __init setup_IO_APIC(void) { + int ioapic; /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL; + io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + for_each_ioapic(ioapic) + BUG_ON(mp_irqdomain_create(ioapic)); + /* * Set up IO-APIC IRQ routing. */ @@ -2844,8 +2986,10 @@ void __init setup_IO_APIC(void) sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); - if (legacy_pic->nr_legacy_irqs) + if (nr_legacy_irqs()) check_timer(); + + ioapic_initialized = 1; } /* @@ -2880,7 +3024,7 @@ static void ioapic_resume(void) { int ioapic_idx; - for (ioapic_idx = nr_ioapics - 1; ioapic_idx >= 0; ioapic_idx--) + for_each_ioapic_reverse(ioapic_idx) resume_ioapic_id(ioapic_idx); restore_ioapic_entries(); @@ -2926,7 +3070,7 @@ int arch_setup_hwirq(unsigned int irq, int node) void arch_teardown_hwirq(unsigned int irq) { - struct irq_cfg *cfg = irq_get_chip_data(irq); + struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; free_remapped_irq(irq); @@ -3030,6 +3174,7 @@ static struct irq_chip msi_chip = { .irq_ack = ack_apic_edge, .irq_set_affinity = msi_set_affinity, .irq_retrigger = ioapic_retrigger_irq, + .flags = IRQCHIP_SKIP_SET_WAKE, }; int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, @@ -3053,7 +3198,7 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, if (!irq_offset) write_msi_msg(irq, &msg); - setup_remapped_irq(irq, irq_get_chip_data(irq), chip); + setup_remapped_irq(irq, irq_cfg(irq), chip); irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); @@ -3128,6 +3273,7 @@ static struct irq_chip dmar_msi_type = { .irq_ack = ack_apic_edge, .irq_set_affinity = dmar_msi_set_affinity, .irq_retrigger = ioapic_retrigger_irq, + .flags = IRQCHIP_SKIP_SET_WAKE, }; int arch_setup_dmar_msi(unsigned int irq) @@ -3178,6 +3324,7 @@ static struct irq_chip hpet_msi_type = { .irq_ack = ack_apic_edge, .irq_set_affinity = hpet_msi_set_affinity, .irq_retrigger = ioapic_retrigger_irq, + .flags = IRQCHIP_SKIP_SET_WAKE, }; int default_setup_hpet_msi(unsigned int irq, unsigned int id) @@ -3192,7 +3339,7 @@ int default_setup_hpet_msi(unsigned int irq, unsigned int id) hpet_msi_write(irq_get_handler_data(irq), &msg); irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - setup_remapped_irq(irq, irq_get_chip_data(irq), chip); + setup_remapped_irq(irq, irq_cfg(irq), chip); irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); return 0; @@ -3241,6 +3388,7 @@ static struct irq_chip ht_irq_chip = { .irq_ack = ack_apic_edge, .irq_set_affinity = ht_set_affinity, .irq_retrigger = ioapic_retrigger_irq, + .flags = IRQCHIP_SKIP_SET_WAKE, }; int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) @@ -3303,27 +3451,6 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) return ret; } -int io_apic_setup_irq_pin_once(unsigned int irq, int node, - struct io_apic_irq_attr *attr) -{ - unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin; - int ret; - struct IO_APIC_route_entry orig_entry; - - /* Avoid redundant programming */ - if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", mpc_ioapic_id(ioapic_idx), pin); - orig_entry = ioapic_read_entry(attr->ioapic, pin); - if (attr->trigger == orig_entry.trigger && attr->polarity == orig_entry.polarity) - return 0; - return -EBUSY; - } - ret = io_apic_setup_irq_pin(irq, node, attr); - if (!ret) - set_bit(pin, ioapics[ioapic_idx].pin_programmed); - return ret; -} - static int __init io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; @@ -3340,20 +3467,13 @@ static int __init io_apic_get_redir_entries(int ioapic) return reg_01.bits.entries + 1; } -static void __init probe_nr_irqs_gsi(void) -{ - int nr; - - nr = gsi_top + NR_IRQS_LEGACY; - if (nr > nr_irqs_gsi) - nr_irqs_gsi = nr; - - printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); -} - unsigned int arch_dynirq_lower_bound(unsigned int from) { - return from < nr_irqs_gsi ? nr_irqs_gsi : from; + /* + * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use + * gsi_top if ioapic_dynirq_base hasn't been initialized yet. + */ + return ioapic_initialized ? ioapic_dynirq_base : gsi_top; } int __init arch_probe_nr_irqs(void) @@ -3363,33 +3483,17 @@ int __init arch_probe_nr_irqs(void) if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) nr_irqs = NR_VECTORS * nr_cpu_ids; - nr = nr_irqs_gsi + 8 * nr_cpu_ids; + nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids; #if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) /* * for MSI and HT dyn irq */ - nr += nr_irqs_gsi * 16; + nr += gsi_top * 16; #endif if (nr < nr_irqs) nr_irqs = nr; - return NR_IRQS_LEGACY; -} - -int io_apic_set_pci_routing(struct device *dev, int irq, - struct io_apic_irq_attr *irq_attr) -{ - int node; - - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - irq_attr->ioapic); - return -EINVAL; - } - - node = dev ? dev_to_node(dev) : cpu_to_node(0); - - return io_apic_setup_irq_pin_once(irq, node, irq_attr); + return 0; } #ifdef CONFIG_X86_32 @@ -3483,9 +3587,8 @@ static u8 __init io_apic_unique_id(u8 id) DECLARE_BITMAP(used, 256); bitmap_zero(used, 256); - for (i = 0; i < nr_ioapics; i++) { + for_each_ioapic(i) __set_bit(mpc_ioapic_id(i), used); - } if (!test_bit(id, used)) return id; return find_first_zero_bit(used, 256); @@ -3543,14 +3646,13 @@ void __init setup_ioapic_dest(void) if (skip_ioapic_setup == 1) return; - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) - for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) { + for_each_ioapic_pin(ioapic, pin) { irq_entry = find_irq_entry(ioapic, pin, mp_INT); if (irq_entry == -1) continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - if ((ioapic > 0) && (irq > 16)) + irq = pin_2_irq(irq_entry, ioapic, pin, 0); + if (irq < 0 || !mp_init_irq_at_boot(ioapic, irq)) continue; idata = irq_get_irq_data(irq); @@ -3573,29 +3675,33 @@ void __init setup_ioapic_dest(void) static struct resource *ioapic_resources; -static struct resource * __init ioapic_setup_resources(int nr_ioapics) +static struct resource * __init ioapic_setup_resources(void) { unsigned long n; struct resource *res; char *mem; - int i; + int i, num = 0; - if (nr_ioapics <= 0) + for_each_ioapic(i) + num++; + if (num == 0) return NULL; n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); - n *= nr_ioapics; + n *= num; mem = alloc_bootmem(n); res = (void *)mem; - mem += sizeof(struct resource) * nr_ioapics; + mem += sizeof(struct resource) * num; - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + num = 0; + for_each_ioapic(i) { + res[num].name = mem; + res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY; snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; + num++; } ioapic_resources = res; @@ -3609,8 +3715,8 @@ void __init native_io_apic_init_mappings(void) struct resource *ioapic_res; int i; - ioapic_res = ioapic_setup_resources(nr_ioapics); - for (i = 0; i < nr_ioapics; i++) { + ioapic_res = ioapic_setup_resources(); + for_each_ioapic(i) { if (smp_found_config) { ioapic_phys = mpc_ioapic_addr(i); #ifdef CONFIG_X86_32 @@ -3641,8 +3747,6 @@ fake_ioapic_page: ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; ioapic_res++; } - - probe_nr_irqs_gsi(); } void __init ioapic_insert_resources(void) @@ -3657,7 +3761,7 @@ void __init ioapic_insert_resources(void) return; } - for (i = 0; i < nr_ioapics; i++) { + for_each_ioapic(i) { insert_resource(&iomem_resource, r); r++; } @@ -3665,16 +3769,15 @@ void __init ioapic_insert_resources(void) int mp_find_ioapic(u32 gsi) { - int i = 0; + int i; if (nr_ioapics == 0) return -1; /* Find the IOAPIC that manages this GSI. */ - for (i = 0; i < nr_ioapics; i++) { + for_each_ioapic(i) { struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i); - if ((gsi >= gsi_cfg->gsi_base) - && (gsi <= gsi_cfg->gsi_end)) + if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end) return i; } @@ -3686,7 +3789,7 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi) { struct mp_ioapic_gsi *gsi_cfg; - if (WARN_ON(ioapic == -1)) + if (WARN_ON(ioapic < 0)) return -1; gsi_cfg = mp_ioapic_gsi_routing(ioapic); @@ -3729,7 +3832,8 @@ static __init int bad_ioapic_register(int idx) return 0; } -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base, + struct ioapic_domain_cfg *cfg) { int idx = 0; int entries; @@ -3743,6 +3847,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) ioapics[idx].mp_config.type = MP_IOAPIC; ioapics[idx].mp_config.flags = MPC_APIC_USABLE; ioapics[idx].mp_config.apicaddr = address; + ioapics[idx].irqdomain = NULL; + ioapics[idx].irqdomain_cfg = *cfg; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); @@ -3779,6 +3885,97 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) nr_ioapics++; } +int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) +{ + int ioapic = (int)(long)domain->host_data; + struct mp_pin_info *info = mp_pin_info(ioapic, hwirq); + struct io_apic_irq_attr attr; + + /* Get default attribute if not set by caller yet */ + if (!info->set) { + u32 gsi = mp_pin_to_gsi(ioapic, hwirq); + + if (acpi_get_override_irq(gsi, &info->trigger, + &info->polarity) < 0) { + /* + * PCI interrupts are always polarity one level + * triggered. + */ + info->trigger = 1; + info->polarity = 1; + } + info->node = NUMA_NO_NODE; + + /* + * setup_IO_APIC_irqs() programs all legacy IRQs with default + * trigger and polarity attributes. Don't set the flag for that + * case so the first legacy IRQ user could reprogram the pin + * with real trigger and polarity attributes. + */ + if (virq >= nr_legacy_irqs() || info->count) + info->set = 1; + } + set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger, + info->polarity); + + return io_apic_setup_irq_pin(virq, info->node, &attr); +} + +void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq) +{ + struct irq_data *data = irq_get_irq_data(virq); + struct irq_cfg *cfg = irq_cfg(virq); + int ioapic = (int)(long)domain->host_data; + int pin = (int)data->hwirq; + + ioapic_mask_entry(ioapic, pin); + __remove_pin_from_irq(cfg, ioapic, pin); + WARN_ON(cfg->irq_2_pin != NULL); + arch_teardown_hwirq(virq); +} + +int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) +{ + int ret = 0; + int ioapic, pin; + struct mp_pin_info *info; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -ENODEV; + + pin = mp_find_ioapic_pin(ioapic, gsi); + info = mp_pin_info(ioapic, pin); + trigger = trigger ? 1 : 0; + polarity = polarity ? 1 : 0; + + mutex_lock(&ioapic_mutex); + if (!info->set) { + info->trigger = trigger; + info->polarity = polarity; + info->node = node; + info->set = 1; + } else if (info->trigger != trigger || info->polarity != polarity) { + ret = -EBUSY; + } + mutex_unlock(&ioapic_mutex); + + return ret; +} + +bool mp_should_keep_irq(struct device *dev) +{ + if (dev->power.is_prepared) + return true; +#ifdef CONFIG_PM_RUNTIME + if (dev->power.runtime_status == RPM_SUSPENDING) + return true; +#endif + + return false; +} + /* Enable IOAPIC early just for system timer */ void __init pre_init_apic_IRQ0(void) { diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index cceb352c968..bda488680db 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -88,21 +88,16 @@ static struct apic apic_default = { .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = default_check_apicid_used, - .check_apicid_present = default_check_apicid_present, .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = setup_apic_flat_routing, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = default_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = default_get_apic_id, .set_apic_id = NULL, @@ -116,11 +111,7 @@ static struct apic apic_default = { .send_IPI_all = default_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = true, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, @@ -214,29 +205,7 @@ void __init generic_apic_probe(void) printk(KERN_INFO "Using APIC driver %s\n", apic->name); } -/* These functions can switch the APIC even after the initial ->probe() */ - -int __init -generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) -{ - struct apic **drv; - - for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { - if (!((*drv)->mps_oem_check)) - continue; - if (!(*drv)->mps_oem_check(mpc, oem, productid)) - continue; - - if (!cmdline_apic) { - apic = *drv; - printk(KERN_INFO "Switched to APIC driver `%s'.\n", - apic->name); - } - return 1; - } - return 0; -} - +/* This function can switch the APIC even after the initial ->probe() */ int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { struct apic **drv; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index e66766bf164..e658f21681c 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -42,7 +42,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) * We are to modify mask, so we need an own copy * and be sure it's manipulated with irq off. */ - ipi_mask_ptr = __raw_get_cpu_var(ipi_mask); + ipi_mask_ptr = this_cpu_cpumask_var_ptr(ipi_mask); cpumask_copy(ipi_mask_ptr, mask); /* @@ -249,21 +249,16 @@ static struct apic apic_x2apic_cluster = { .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, - .check_apicid_present = NULL, .vector_allocation_domain = cluster_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = x2apic_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, @@ -277,10 +272,7 @@ static struct apic apic_x2apic_cluster = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_self = x2apic_send_IPI_self, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, .wait_for_init_deassert = false, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, .read = native_apic_msr_read, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 6d600ebf6c1..6fae733e919 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -103,21 +103,16 @@ static struct apic apic_x2apic_phys = { .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, - .check_apicid_present = NULL, .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = x2apic_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, @@ -131,10 +126,7 @@ static struct apic apic_x2apic_phys = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_self = x2apic_send_IPI_self, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, .wait_for_init_deassert = false, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, .read = native_apic_msr_read, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 293b41df54e..8e9dcfd630e 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -204,7 +204,6 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second); static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) { -#ifdef CONFIG_SMP unsigned long val; int pnode; @@ -223,7 +222,6 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) uv_write_global_mmr64(pnode, UVH_IPI_INT, val); atomic_set(&init_deasserted, 1); -#endif return 0; } @@ -365,21 +363,16 @@ static struct apic __refdata apic_x2apic_uv_x = { .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, - .check_apicid_present = NULL, .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = uv_init_apic_ldr, .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = uv_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = x2apic_get_apic_id, .set_apic_id = set_apic_id, @@ -394,10 +387,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .send_IPI_self = uv_send_IPI_self, .wakeup_secondary_cpu = uv_wakeup_secondary, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, .wait_for_init_deassert = false, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, .read = native_apic_msr_read, diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 7fd54f09b01..01d5453b550 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -13,10 +13,13 @@ nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) obj-y := intel_cacheinfo.o scattered.o topology.o -obj-y += proc.o capflags.o powerflags.o common.o +obj-y += common.o obj-y += rdrand.o obj-y += match.o +obj-$(CONFIG_PROC_FS) += proc.o +obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o + obj-$(CONFIG_X86_32) += bugs.o obj-$(CONFIG_X86_64) += bugs_64.o @@ -36,7 +39,9 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_uncore_snb.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore_snbep.o perf_event_intel_uncore_nhmex.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o endif @@ -48,6 +53,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o +ifdef CONFIG_X86_FEATURE_NAMES quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@ @@ -56,3 +62,4 @@ cpufeature = $(src)/../../include/asm/cpufeature.h targets += capflags.c $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE $(call if_changed,mkcapflags) +endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index ce8b8ff0e0e..813d29d00a1 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -8,6 +8,7 @@ #include <asm/processor.h> #include <asm/apic.h> #include <asm/cpu.h> +#include <asm/smp.h> #include <asm/pci-direct.h> #ifdef CONFIG_X86_64 @@ -50,7 +51,6 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) return wrmsr_safe_regs(gprs); } -#ifdef CONFIG_X86_32 /* * B step AMD K6 before B 9730xxxx have hardware bugs that can cause * misexecution of code under Linux. Owners of such processors should @@ -70,6 +70,7 @@ __asm__(".globl vide\n\t.align 4\nvide: ret"); static void init_amd_k5(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_32 /* * General Systems BIOSen alias the cpu frequency registers * of the Elan at 0x000df000. Unfortuantly, one of the Linux @@ -83,11 +84,12 @@ static void init_amd_k5(struct cpuinfo_x86 *c) if (inl(CBAR) & CBAR_ENB) outl(0 | CBAR_KEY, CBAR); } +#endif } - static void init_amd_k6(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_32 u32 l, h; int mbytes = get_num_physpages() >> (20-PAGE_SHIFT); @@ -176,10 +178,44 @@ static void init_amd_k6(struct cpuinfo_x86 *c) /* placeholder for any needed mods */ return; } +#endif } -static void amd_k7_smp_check(struct cpuinfo_x86 *c) +static void init_amd_k7(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_32 + u32 l, h; + + /* + * Bit 15 of Athlon specific MSR 15, needs to be 0 + * to enable SSE on Palomino/Morgan/Barton CPU's. + * If the BIOS didn't enable it already, enable it here. + */ + if (c->x86_model >= 6 && c->x86_model <= 10) { + if (!cpu_has(c, X86_FEATURE_XMM)) { + printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); + msr_clear_bit(MSR_K7_HWCR, 15); + set_cpu_cap(c, X86_FEATURE_XMM); + } + } + + /* + * It's been determined by AMD that Athlons since model 8 stepping 1 + * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx + * As per AMD technical note 27212 0.2 + */ + if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { + rdmsr(MSR_K7_CLK_CTL, l, h); + if ((l & 0xfff00000) != 0x20000000) { + printk(KERN_INFO + "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", + l, ((l & 0x000fffff)|0x20000000)); + wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); + } + } + + set_cpu_cap(c, X86_FEATURE_K7); + /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -207,7 +243,7 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c) if (((c->x86_model == 6) && (c->x86_mask >= 2)) || ((c->x86_model == 7) && (c->x86_mask >= 1)) || (c->x86_model > 7)) - if (cpu_has_mp) + if (cpu_has(c, X86_FEATURE_MP)) return; /* If we get here, not a certified SMP capable AMD system. */ @@ -219,45 +255,8 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c) WARN_ONCE(1, "WARNING: This combination of AMD" " processors is not suitable for SMP.\n"); add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); -} - -static void init_amd_k7(struct cpuinfo_x86 *c) -{ - u32 l, h; - - /* - * Bit 15 of Athlon specific MSR 15, needs to be 0 - * to enable SSE on Palomino/Morgan/Barton CPU's. - * If the BIOS didn't enable it already, enable it here. - */ - if (c->x86_model >= 6 && c->x86_model <= 10) { - if (!cpu_has(c, X86_FEATURE_XMM)) { - printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); - msr_clear_bit(MSR_K7_HWCR, 15); - set_cpu_cap(c, X86_FEATURE_XMM); - } - } - - /* - * It's been determined by AMD that Athlons since model 8 stepping 1 - * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx - * As per AMD technical note 27212 0.2 - */ - if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { - rdmsr(MSR_K7_CLK_CTL, l, h); - if ((l & 0xfff00000) != 0x20000000) { - printk(KERN_INFO - "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", - l, ((l & 0x000fffff)|0x20000000)); - wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); - } - } - - set_cpu_cap(c, X86_FEATURE_K7); - - amd_k7_smp_check(c); -} #endif +} #ifdef CONFIG_NUMA /* @@ -446,6 +445,26 @@ static void early_init_amd_mc(struct cpuinfo_x86 *c) static void bsp_init_amd(struct cpuinfo_x86 *c) { + +#ifdef CONFIG_X86_64 + if (c->x86 >= 0xf) { + unsigned long long tseg; + + /* + * Split up direct mapping around the TSEG SMM area. + * Don't do it for gbpages because there seems very little + * benefit in doing so. + */ + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + unsigned long pfn = tseg >> PAGE_SHIFT; + + printk(KERN_DEBUG "tseg: %010llx\n", tseg); + if (pfn_range_is_mapped(pfn, pfn + 1)) + set_memory_4k((unsigned long)__va(tseg), 1); + } + } +#endif + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { if (c->x86 > 0x10 || @@ -506,6 +525,13 @@ static void early_init_amd(struct cpuinfo_x86 *c) } #endif + /* + * This is only needed to tell the kernel whether to use VMCALL + * and VMMCALL. VMMCALL is never executed except under virt, so + * we can set it unconditionally. + */ + set_cpu_cap(c, X86_FEATURE_VMMCALL); + /* F16h erratum 793, CVE-2013-6885 */ if (c->x86 == 0x16 && c->x86_model <= 0xf) msr_set_bit(MSR_AMD64_LS_CFG, 15); @@ -515,101 +541,74 @@ static const int amd_erratum_383[]; static const int amd_erratum_400[]; static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); -static void init_amd(struct cpuinfo_x86 *c) +static void init_amd_k8(struct cpuinfo_x86 *c) { - u32 dummy; - unsigned long long value; + u32 level; + u64 value; -#ifdef CONFIG_SMP - /* - * Disable TLB flush filter by setting HWCR.FFDIS on K8 - * bit 6 of msr C001_0015 - * - * Errata 63 for SH-B3 steppings - * Errata 122 for all steppings (F+ have it disabled by default) - */ - if (c->x86 == 0xf) - msr_set_bit(MSR_K7_HWCR, 6); -#endif - - early_init_amd(c); + /* On C+ stepping K8 rep microcode works well for copy/memset */ + level = cpuid_eax(1); + if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* - * Bit 31 in normal CPUID used for nonstandard 3DNow ID; - * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + * Some BIOSes incorrectly force this feature, but only K8 revision D + * (model = 0x14) and later actually support it. + * (AMD Erratum #110, docId: 25759). */ - clear_cpu_cap(c, 0*32+31); - -#ifdef CONFIG_X86_64 - /* On C+ stepping K8 rep microcode works well for copy/memset */ - if (c->x86 == 0xf) { - u32 level; - - level = cpuid_eax(1); - if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - - /* - * Some BIOSes incorrectly force this feature, but only K8 - * revision D (model = 0x14) and later actually support it. - * (AMD Erratum #110, docId: 25759). - */ - if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { - clear_cpu_cap(c, X86_FEATURE_LAHF_LM); - if (!rdmsrl_amd_safe(0xc001100d, &value)) { - value &= ~(1ULL << 32); - wrmsrl_amd_safe(0xc001100d, value); - } + if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); + if (!rdmsrl_amd_safe(0xc001100d, &value)) { + value &= ~BIT_64(32); + wrmsrl_amd_safe(0xc001100d, value); } - } - if (c->x86 >= 0x10) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - /* get apicid instead of initial apic id from cpuid */ - c->apicid = hard_smp_processor_id(); -#else + if (!c->x86_model_id[0]) + strcpy(c->x86_model_id, "Hammer"); +} + +static void init_amd_gh(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + /* do this for boot cpu */ + if (c == &boot_cpu_data) + check_enable_amd_mmconf_dmi(); + + fam10h_check_enable_mmcfg(); +#endif /* - * FIXME: We should handle the K5 here. Set up the write - * range and also turn on MSR 83 bits 4 and 31 (write alloc, - * no bus pipeline) + * Disable GART TLB Walk Errors on Fam10h. We do this here because this + * is always needed when GART is enabled, even in a kernel which has no + * MCE support built in. BIOS should disable GartTlbWlk Errors already. + * If it doesn't, we do it here as suggested by the BKDG. + * + * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 */ + msr_set_bit(MSR_AMD64_MCx_MASK(4), 10); - switch (c->x86) { - case 4: - init_amd_k5(c); - break; - case 5: - init_amd_k6(c); - break; - case 6: /* An Athlon/Duron */ - init_amd_k7(c); - break; - } + /* + * On family 10h BIOS may not have properly enabled WC+ support, causing + * it to be converted to CD memtype. This may result in performance + * degradation for certain nested-paging guests. Prevent this conversion + * by clearing bit 24 in MSR_AMD64_BU_CFG2. + * + * NOTE: we want to use the _safe accessors so as not to #GP kvm + * guests on older kvm hosts. + */ + msr_clear_bit(MSR_AMD64_BU_CFG2, 24); - /* K6s reports MCEs but don't actually have all the MSRs */ - if (c->x86 < 6) - clear_cpu_cap(c, X86_FEATURE_MCE); -#endif + if (cpu_has_amd_erratum(c, amd_erratum_383)) + set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); +} - /* Enable workaround for FXSAVE leak */ - if (c->x86 >= 6) - set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - - if (!c->x86_model_id[0]) { - switch (c->x86) { - case 0xf: - /* Should distinguish Models here, but this is only - a fallback anyways. */ - strcpy(c->x86_model_id, "Hammer"); - break; - } - } +static void init_amd_bd(struct cpuinfo_x86 *c) +{ + u64 value; /* re-enable TopologyExtensions if switched off by BIOS */ - if ((c->x86 == 0x15) && - (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && + if ((c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && !cpu_has(c, X86_FEATURE_TOPOEXT)) { if (msr_set_bit(0xc0011005, 54) > 0) { @@ -625,14 +624,60 @@ static void init_amd(struct cpuinfo_x86 *c) * The way access filter has a performance penalty on some workloads. * Disable it on the affected CPUs. */ - if ((c->x86 == 0x15) && - (c->x86_model >= 0x02) && (c->x86_model < 0x20)) { - + if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) { if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) { value |= 0x1E; wrmsrl_safe(0xc0011021, value); } } +} + +static void init_amd(struct cpuinfo_x86 *c) +{ + u32 dummy; + +#ifdef CONFIG_SMP + /* + * Disable TLB flush filter by setting HWCR.FFDIS on K8 + * bit 6 of msr C001_0015 + * + * Errata 63 for SH-B3 steppings + * Errata 122 for all steppings (F+ have it disabled by default) + */ + if (c->x86 == 0xf) + msr_set_bit(MSR_K7_HWCR, 6); +#endif + + early_init_amd(c); + + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + clear_cpu_cap(c, 0*32+31); + + if (c->x86 >= 0x10) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* get apicid instead of initial apic id from cpuid */ + c->apicid = hard_smp_processor_id(); + + /* K6s reports MCEs but don't actually have all the MSRs */ + if (c->x86 < 6) + clear_cpu_cap(c, X86_FEATURE_MCE); + + switch (c->x86) { + case 4: init_amd_k5(c); break; + case 5: init_amd_k6(c); break; + case 6: init_amd_k7(c); break; + case 0xf: init_amd_k8(c); break; + case 0x10: init_amd_gh(c); break; + case 0x15: init_amd_bd(c); break; + } + + /* Enable workaround for FXSAVE leak */ + if (c->x86 >= 6) + set_cpu_bug(c, X86_BUG_FXSAVE_LEAK); cpu_detect_cache_sizes(c); @@ -656,33 +701,6 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); } -#ifdef CONFIG_X86_64 - if (c->x86 == 0x10) { - /* do this for boot cpu */ - if (c == &boot_cpu_data) - check_enable_amd_mmconf_dmi(); - - fam10h_check_enable_mmcfg(); - } - - if (c == &boot_cpu_data && c->x86 >= 0xf) { - unsigned long long tseg; - - /* - * Split up direct mapping around the TSEG SMM area. - * Don't do it for gbpages because there seems very little - * benefit in doing so. - */ - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - unsigned long pfn = tseg >> PAGE_SHIFT; - - printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if (pfn_range_is_mapped(pfn, pfn + 1)) - set_memory_4k((unsigned long)__va(tseg), 1); - } - } -#endif - /* * Family 0x12 and above processors have APIC timer * running in deep C states. @@ -690,34 +708,6 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 > 0x11) set_cpu_cap(c, X86_FEATURE_ARAT); - if (c->x86 == 0x10) { - /* - * Disable GART TLB Walk Errors on Fam10h. We do this here - * because this is always needed when GART is enabled, even in a - * kernel which has no MCE support built in. - * BIOS should disable GartTlbWlk Errors already. If - * it doesn't, do it here as suggested by the BKDG. - * - * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 - */ - msr_set_bit(MSR_AMD64_MCx_MASK(4), 10); - - /* - * On family 10h BIOS may not have properly enabled WC+ support, - * causing it to be converted to CD memtype. This may result in - * performance degradation for certain nested-paging guests. - * Prevent this conversion by clearing bit 24 in - * MSR_AMD64_BU_CFG2. - * - * NOTE: we want to use the _safe accessors so as not to #GP kvm - * guests on older kvm hosts. - */ - msr_clear_bit(MSR_AMD64_BU_CFG2, 24); - - if (cpu_has_amd_erratum(c, amd_erratum_383)) - set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); - } - if (cpu_has_amd_erratum(c, amd_erratum_400)) set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); @@ -741,11 +731,6 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) } #endif -static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) -{ - tlb_flushall_shift = 6; -} - static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) { u32 ebx, eax, ecx, edx; @@ -793,8 +778,6 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) tlb_lli_2m[ENTRIES] = eax & mask; tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; - - cpu_set_tlb_flushall_shift(c); } static const struct cpu_dev amd_cpu_dev = { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ef1b93f18ed..4b4f78c9ba1 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -148,6 +148,7 @@ static int __init x86_xsave_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + setup_clear_cpu_cap(X86_FEATURE_XSAVES); setup_clear_cpu_cap(X86_FEATURE_AVX); setup_clear_cpu_cap(X86_FEATURE_AVX2); return 1; @@ -161,6 +162,13 @@ static int __init x86_xsaveopt_setup(char *s) } __setup("noxsaveopt", x86_xsaveopt_setup); +static int __init x86_xsaves_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + return 1; +} +__setup("noxsaves", x86_xsaves_setup); + #ifdef CONFIG_X86_32 static int cachesize_override = -1; static int disable_x86_serial_nr = 1; @@ -338,8 +346,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) continue; printk(KERN_WARNING - "CPU: CPU feature %s disabled, no CPUID level 0x%x\n", - x86_cap_flags[df->feature], df->level); + "CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", + x86_cap_flag(df->feature), df->level); } } @@ -481,26 +489,17 @@ u16 __read_mostly tlb_lld_2m[NR_INFO]; u16 __read_mostly tlb_lld_4m[NR_INFO]; u16 __read_mostly tlb_lld_1g[NR_INFO]; -/* - * tlb_flushall_shift shows the balance point in replacing cr3 write - * with multiple 'invlpg'. It will do this replacement when - * flush_tlb_lines <= active_lines/2^tlb_flushall_shift. - * If tlb_flushall_shift is -1, means the replacement will be disabled. - */ -s8 __read_mostly tlb_flushall_shift = -1; - void cpu_detect_tlb(struct cpuinfo_x86 *c) { if (this_cpu->c_detect_tlb) this_cpu->c_detect_tlb(c); printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" - "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n" - "tlb_flushall_shift: %d\n", + "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n", tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], - tlb_lld_1g[ENTRIES], tlb_flushall_shift); + tlb_lld_1g[ENTRIES]); } void detect_ht(struct cpuinfo_x86 *c) @@ -634,6 +633,15 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[9] = ebx; } + /* Extended state features: level 0x0000000d */ + if (c->cpuid_level >= 0x0000000d) { + u32 eax, ebx, ecx, edx; + + cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx); + + c->x86_capability[10] = eax; + } + /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; @@ -956,6 +964,7 @@ static void vgetcpu_set_mode(void) vgetcpu_mode = VGETCPU_LSL; } +#ifdef CONFIG_IA32_EMULATION /* May not be __init: called during resume */ static void syscall32_cpu_init(void) { @@ -967,7 +976,8 @@ static void syscall32_cpu_init(void) wrmsrl(MSR_CSTAR, ia32_cstar_target); } -#endif +#endif /* CONFIG_IA32_EMULATION */ +#endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_32 void enable_sep_cpu(void) @@ -1176,7 +1186,7 @@ void syscall_init(void) /* Flags to clear on syscall */ wrmsrl(MSR_SYSCALL_MASK, X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| - X86_EFLAGS_IOPL|X86_EFLAGS_AC); + X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); } /* @@ -1190,9 +1200,9 @@ DEFINE_PER_CPU(int, debug_stack_usage); int is_debug_stack(unsigned long addr) { - return __get_cpu_var(debug_stack_usage) || - (addr <= __get_cpu_var(debug_stack_addr) && - addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); + return __this_cpu_read(debug_stack_usage) || + (addr <= __this_cpu_read(debug_stack_addr) && + addr > (__this_cpu_read(debug_stack_addr) - DEBUG_STKSZ)); } NOKPROBE_SYMBOL(is_debug_stack); @@ -1258,6 +1268,19 @@ static void dbg_restore_debug_regs(void) #define dbg_restore_debug_regs() #endif /* ! CONFIG_KGDB */ +static void wait_for_master_cpu(int cpu) +{ +#ifdef CONFIG_SMP + /* + * wait for ACK from master CPU before continuing + * with AP initialization + */ + WARN_ON(cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)); + while (!cpumask_test_cpu(cpu, cpu_callout_mask)) + cpu_relax(); +#endif +} + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -1273,16 +1296,17 @@ void cpu_init(void) struct task_struct *me; struct tss_struct *t; unsigned long v; - int cpu; + int cpu = stack_smp_processor_id(); int i; + wait_for_master_cpu(cpu); + /* * Load microcode on this cpu if a valid microcode is available. * This is early microcode loading procedure. */ load_ucode_ap(); - cpu = stack_smp_processor_id(); t = &per_cpu(init_tss, cpu); oist = &per_cpu(orig_ist, cpu); @@ -1294,9 +1318,6 @@ void cpu_init(void) me = current; - if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) - panic("CPU#%d already initialized!\n", cpu); - pr_debug("Initializing CPU#%d\n", cpu); clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); @@ -1373,17 +1394,13 @@ void cpu_init(void) struct tss_struct *t = &per_cpu(init_tss, cpu); struct thread_struct *thread = &curr->thread; - show_ucode_info_early(); + wait_for_master_cpu(cpu); - if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { - printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); - for (;;) - local_irq_enable(); - } + show_ucode_info_early(); printk(KERN_INFO "Initializing CPU#%d\n", cpu); - if (cpu_has_vme || cpu_has_tsc || cpu_has_de) + if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de) clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); load_current_idt(); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index f9e4fdd3b87..1ef45627317 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -144,6 +144,21 @@ static void early_init_intel(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_ERMS); } } + + /* + * Intel Quark Core DevMan_001.pdf section 6.4.11 + * "The operating system also is required to invalidate (i.e., flush) + * the TLB when any changes are made to any of the page table entries. + * The operating system must reload CR3 to cause the TLB to be flushed" + * + * As a result cpu_has_pge() in arch/x86/include/asm/tlbflush.h should + * be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE + * to be modified + */ + if (c->x86 == 5 && c->x86_model == 9) { + pr_info("Disabling PGE capability bit\n"); + setup_clear_cpu_cap(X86_FEATURE_PGE); + } } #ifdef CONFIG_X86_32 @@ -253,7 +268,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) */ if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && (c->x86_mask < 0x6 || c->x86_mask == 0xb)) - set_cpu_cap(c, X86_FEATURE_11AP); + set_cpu_bug(c, X86_BUG_11AP); #ifdef CONFIG_X86_INTEL_USERCOPY @@ -382,6 +397,13 @@ static void init_intel(struct cpuinfo_x86 *c) } l2 = init_intel_cacheinfo(c); + + /* Detect legacy cache sizes if init_intel_cacheinfo did not */ + if (l2 == 0) { + cpu_detect_cache_sizes(c); + l2 = c->x86_cache_size; + } + if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); /* Check for version and the number of counters */ @@ -402,7 +424,7 @@ static void init_intel(struct cpuinfo_x86 *c) if (c->x86 == 6 && cpu_has_clflush && (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) - set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR); + set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR); #ifdef CONFIG_X86_64 if (c->x86 == 15) @@ -485,6 +507,13 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) */ if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0)) size = 256; + + /* + * Intel Quark SoC X1000 contains a 4-way set associative + * 16K cache with a 16 byte cache line and 256 lines per tag + */ + if ((c->x86 == 5) && (c->x86_model == 9)) + size = 16; return size; } #endif @@ -634,31 +663,6 @@ static void intel_tlb_lookup(const unsigned char desc) } } -static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) -{ - switch ((c->x86 << 8) + c->x86_model) { - case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ - case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ - case 0x61d: /* six-core 45 nm xeon "Dunnington" */ - tlb_flushall_shift = -1; - break; - case 0x63a: /* Ivybridge */ - tlb_flushall_shift = 2; - break; - case 0x61a: /* 45 nm nehalem, "Bloomfield" */ - case 0x61e: /* 45 nm nehalem, "Lynnfield" */ - case 0x625: /* 32 nm nehalem, "Clarkdale" */ - case 0x62c: /* 32 nm nehalem, "Gulftown" */ - case 0x62e: /* 45 nm nehalem-ex, "Beckton" */ - case 0x62f: /* 32 nm Xeon E7 */ - case 0x62a: /* SandyBridge */ - case 0x62d: /* SandyBridge, "Romely-EP" */ - default: - tlb_flushall_shift = 6; - } -} - static void intel_detect_tlb(struct cpuinfo_x86 *c) { int i, j, n; @@ -683,7 +687,6 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c) for (j = 1 ; j < 16 ; j++) intel_tlb_lookup(desc[j]); } - intel_tlb_flushall_shift_set(c); } static const struct cpu_dev intel_cpu_dev = { @@ -712,7 +715,8 @@ static const struct cpu_dev intel_cpu_dev = { [3] = "OverDrive PODP5V83", [4] = "Pentium MMX", [7] = "Mobile Pentium 75 - 200", - [8] = "Mobile Pentium MMX" + [8] = "Mobile Pentium MMX", + [9] = "Quark SoC X1000", } }, { .family = 6, .model_names = diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 9c8f7394c61..c7035073dfc 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -461,7 +461,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - if (strict_strtoul(buf, 10, &val) < 0) + if (kstrtoul(buf, 10, &val) < 0) return -EINVAL; err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); @@ -511,7 +511,7 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return -EINVAL; - if (strict_strtoul(buf, 16, &val) < 0) + if (kstrtoul(buf, 16, &val) < 0) return -EINVAL; if (amd_set_subcaches(cpu, val)) diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 5ac2d1fb28b..4cfba4371a7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -83,7 +83,7 @@ static DEFINE_MUTEX(mce_inject_mutex); static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) { int cpu = smp_processor_id(); - struct mce *m = &__get_cpu_var(injectm); + struct mce *m = this_cpu_ptr(&injectm); if (!cpumask_test_cpu(cpu, mce_inject_cpumask)) return NMI_DONE; cpumask_clear_cpu(cpu, mce_inject_cpumask); @@ -97,7 +97,7 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) static void mce_irq_ipi(void *info) { int cpu = smp_processor_id(); - struct mce *m = &__get_cpu_var(injectm); + struct mce *m = this_cpu_ptr(&injectm); if (cpumask_test_cpu(cpu, mce_inject_cpumask) && m->inject_flags & MCJ_EXCEPTION) { @@ -109,7 +109,7 @@ static void mce_irq_ipi(void *info) /* Inject mce on current CPU */ static int raise_local(void) { - struct mce *m = &__get_cpu_var(injectm); + struct mce *m = this_cpu_ptr(&injectm); int context = MCJ_CTX(m->inject_flags); int ret = 0; int cpu = m->extcpu; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9a79c8dbd8e..61a9668cebf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -400,7 +400,7 @@ static u64 mce_rdmsrl(u32 msr) if (offset < 0) return 0; - return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); + return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); } if (rdmsrl_safe(msr, &v)) { @@ -422,7 +422,7 @@ static void mce_wrmsrl(u32 msr, u64 v) int offset = msr_to_offset(msr); if (offset >= 0) - *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; + *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v; return; } wrmsrl(msr, v); @@ -478,7 +478,7 @@ static DEFINE_PER_CPU(struct mce_ring, mce_ring); /* Runs with CPU affinity in workqueue */ static int mce_ring_empty(void) { - struct mce_ring *r = &__get_cpu_var(mce_ring); + struct mce_ring *r = this_cpu_ptr(&mce_ring); return r->start == r->end; } @@ -490,7 +490,7 @@ static int mce_ring_get(unsigned long *pfn) *pfn = 0; get_cpu(); - r = &__get_cpu_var(mce_ring); + r = this_cpu_ptr(&mce_ring); if (r->start == r->end) goto out; *pfn = r->ring[r->start]; @@ -504,7 +504,7 @@ out: /* Always runs in MCE context with preempt off */ static int mce_ring_add(unsigned long pfn) { - struct mce_ring *r = &__get_cpu_var(mce_ring); + struct mce_ring *r = this_cpu_ptr(&mce_ring); unsigned next; next = (r->end + 1) % MCE_RING_SIZE; @@ -526,7 +526,7 @@ int mce_available(struct cpuinfo_x86 *c) static void mce_schedule_work(void) { if (!mce_ring_empty()) - schedule_work(&__get_cpu_var(mce_work)); + schedule_work(this_cpu_ptr(&mce_work)); } DEFINE_PER_CPU(struct irq_work, mce_irq_work); @@ -551,7 +551,7 @@ static void mce_report_event(struct pt_regs *regs) return; } - irq_work_queue(&__get_cpu_var(mce_irq_work)); + irq_work_queue(this_cpu_ptr(&mce_irq_work)); } /* @@ -1045,7 +1045,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_gather_info(&m, regs); - final = &__get_cpu_var(mces_seen); + final = this_cpu_ptr(&mces_seen); *final = m; memset(valid_banks, 0, sizeof(valid_banks)); @@ -1278,22 +1278,22 @@ static unsigned long (*mce_adjust_timer)(unsigned long interval) = static int cmc_error_seen(void) { - unsigned long *v = &__get_cpu_var(mce_polled_error); + unsigned long *v = this_cpu_ptr(&mce_polled_error); return test_and_clear_bit(0, v); } static void mce_timer_fn(unsigned long data) { - struct timer_list *t = &__get_cpu_var(mce_timer); + struct timer_list *t = this_cpu_ptr(&mce_timer); unsigned long iv; int notify; WARN_ON(smp_processor_id() != data); - if (mce_available(__this_cpu_ptr(&cpu_info))) { + if (mce_available(this_cpu_ptr(&cpu_info))) { machine_check_poll(MCP_TIMESTAMP, - &__get_cpu_var(mce_poll_banks)); + this_cpu_ptr(&mce_poll_banks)); mce_intel_cmci_poll(); } @@ -1323,7 +1323,7 @@ static void mce_timer_fn(unsigned long data) */ void mce_timer_kick(unsigned long interval) { - struct timer_list *t = &__get_cpu_var(mce_timer); + struct timer_list *t = this_cpu_ptr(&mce_timer); unsigned long when = jiffies + interval; unsigned long iv = __this_cpu_read(mce_next_interval); @@ -1659,7 +1659,7 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t) static void __mcheck_cpu_init_timer(void) { - struct timer_list *t = &__get_cpu_var(mce_timer); + struct timer_list *t = this_cpu_ptr(&mce_timer); unsigned int cpu = smp_processor_id(); setup_timer(t, mce_timer_fn, cpu); @@ -1702,8 +1702,8 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_timer(); - INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); - init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb); + INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work); + init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb); } /* @@ -1955,7 +1955,7 @@ static struct miscdevice mce_chrdev_device = { static void __mce_disable_bank(void *arg) { int bank = *((int *)arg); - __clear_bit(bank, __get_cpu_var(mce_poll_banks)); + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); cmci_disable_bank(bank); } @@ -2065,7 +2065,7 @@ static void mce_syscore_shutdown(void) static void mce_syscore_resume(void) { __mcheck_cpu_init_generic(); - __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); + __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); } static struct syscore_ops mce_syscore_ops = { @@ -2080,7 +2080,7 @@ static struct syscore_ops mce_syscore_ops = { static void mce_cpu_restart(void *data) { - if (!mce_available(__this_cpu_ptr(&cpu_info))) + if (!mce_available(raw_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); __mcheck_cpu_init_timer(); @@ -2096,14 +2096,14 @@ static void mce_restart(void) /* Toggle features for corrected errors */ static void mce_disable_cmci(void *data) { - if (!mce_available(__this_cpu_ptr(&cpu_info))) + if (!mce_available(raw_cpu_ptr(&cpu_info))) return; cmci_clear(); } static void mce_enable_ce(void *all) { - if (!mce_available(__this_cpu_ptr(&cpu_info))) + if (!mce_available(raw_cpu_ptr(&cpu_info))) return; cmci_reenable(); cmci_recheck(); @@ -2136,7 +2136,7 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, { u64 new; - if (strict_strtoull(buf, 0, &new) < 0) + if (kstrtou64(buf, 0, &new) < 0) return -EINVAL; attr_to_bank(attr)->ctl = new; @@ -2174,7 +2174,7 @@ static ssize_t set_ignore_ce(struct device *s, { u64 new; - if (strict_strtoull(buf, 0, &new) < 0) + if (kstrtou64(buf, 0, &new) < 0) return -EINVAL; if (mca_cfg.ignore_ce ^ !!new) { @@ -2198,7 +2198,7 @@ static ssize_t set_cmci_disabled(struct device *s, { u64 new; - if (strict_strtoull(buf, 0, &new) < 0) + if (kstrtou64(buf, 0, &new) < 0) return -EINVAL; if (mca_cfg.cmci_disabled ^ !!new) { @@ -2336,7 +2336,7 @@ static void mce_disable_cpu(void *h) unsigned long action = *(unsigned long *)h; int i; - if (!mce_available(__this_cpu_ptr(&cpu_info))) + if (!mce_available(raw_cpu_ptr(&cpu_info))) return; if (!(action & CPU_TASKS_FROZEN)) @@ -2354,7 +2354,7 @@ static void mce_reenable_cpu(void *h) unsigned long action = *(unsigned long *)h; int i; - if (!mce_available(__this_cpu_ptr(&cpu_info))) + if (!mce_available(raw_cpu_ptr(&cpu_info))) return; if (!(action & CPU_TASKS_FROZEN)) @@ -2385,6 +2385,10 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) threshold_cpu_callback(action, cpu); mce_device_remove(cpu); mce_intel_hcpu_update(cpu); + + /* intentionally ignoring frozen here */ + if (!(action & CPU_TASKS_FROZEN)) + cmci_rediscover(); break; case CPU_DOWN_PREPARE: smp_call_function_single(cpu, mce_disable_cpu, &action, 1); @@ -2396,11 +2400,6 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; } - if (action == CPU_POST_DEAD) { - /* intentionally ignoring frozen here */ - cmci_rediscover(); - } - return NOTIFY_OK; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 603df4f7464..5d4999f95ae 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -310,7 +310,7 @@ static void amd_threshold_interrupt(void) * event. */ machine_check_poll(MCP_TIMESTAMP, - &__get_cpu_var(mce_poll_banks)); + this_cpu_ptr(&mce_poll_banks)); if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); @@ -353,7 +353,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) if (!b->interrupt_capable) return -EINVAL; - if (strict_strtoul(buf, 0, &new) < 0) + if (kstrtoul(buf, 0, &new) < 0) return -EINVAL; b->interrupt_enable = !!new; @@ -372,7 +372,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) struct thresh_restart tr; unsigned long new; - if (strict_strtoul(buf, 0, &new) < 0) + if (kstrtoul(buf, 0, &new) < 0) return -EINVAL; if (new > THRESHOLD_MAX) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 9a316b21df8..b3c97bafc12 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -42,7 +42,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. */ -static DEFINE_SPINLOCK(cmci_discover_lock); +static DEFINE_RAW_SPINLOCK(cmci_discover_lock); #define CMCI_THRESHOLD 1 #define CMCI_POLL_INTERVAL (30 * HZ) @@ -86,7 +86,7 @@ void mce_intel_cmci_poll(void) { if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) return; - machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); } void mce_intel_hcpu_update(unsigned long cpu) @@ -144,14 +144,14 @@ static void cmci_storm_disable_banks(void) int bank; u64 val; - spin_lock_irqsave(&cmci_discover_lock, flags); - owned = __get_cpu_var(mce_banks_owned); + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + owned = this_cpu_ptr(mce_banks_owned); for_each_set_bit(bank, owned, MAX_NR_BANKS) { rdmsrl(MSR_IA32_MCx_CTL2(bank), val); val &= ~MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(bank), val); } - spin_unlock_irqrestore(&cmci_discover_lock, flags); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } static bool cmci_storm_detect(void) @@ -195,7 +195,7 @@ static void intel_threshold_interrupt(void) { if (cmci_storm_detect()) return; - machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); mce_notify_irq(); } @@ -206,12 +206,12 @@ static void intel_threshold_interrupt(void) */ static void cmci_discover(int banks) { - unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); + unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned); unsigned long flags; int i; int bios_wrong_thresh = 0; - spin_lock_irqsave(&cmci_discover_lock, flags); + raw_spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; int bios_zero_thresh = 0; @@ -228,7 +228,7 @@ static void cmci_discover(int banks) /* Already owned by someone else? */ if (val & MCI_CTL2_CMCI_EN) { clear_bit(i, owned); - __clear_bit(i, __get_cpu_var(mce_poll_banks)); + __clear_bit(i, this_cpu_ptr(mce_poll_banks)); continue; } @@ -252,7 +252,7 @@ static void cmci_discover(int banks) /* Did the enable bit stick? -- the bank supports CMCI */ if (val & MCI_CTL2_CMCI_EN) { set_bit(i, owned); - __clear_bit(i, __get_cpu_var(mce_poll_banks)); + __clear_bit(i, this_cpu_ptr(mce_poll_banks)); /* * We are able to set thresholds for some banks that * had a threshold of 0. This means the BIOS has not @@ -263,10 +263,10 @@ static void cmci_discover(int banks) (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) bios_wrong_thresh = 1; } else { - WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); + WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks))); } } - spin_unlock_irqrestore(&cmci_discover_lock, flags); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { pr_info_once( "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); @@ -284,10 +284,10 @@ void cmci_recheck(void) unsigned long flags; int banks; - if (!mce_available(__this_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) + if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) return; local_irq_save(flags); - machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); local_irq_restore(flags); } @@ -296,12 +296,12 @@ static void __cmci_disable_bank(int bank) { u64 val; - if (!test_bit(bank, __get_cpu_var(mce_banks_owned))) + if (!test_bit(bank, this_cpu_ptr(mce_banks_owned))) return; rdmsrl(MSR_IA32_MCx_CTL2(bank), val); val &= ~MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(bank), val); - __clear_bit(bank, __get_cpu_var(mce_banks_owned)); + __clear_bit(bank, this_cpu_ptr(mce_banks_owned)); } /* @@ -316,10 +316,10 @@ void cmci_clear(void) if (!cmci_supported(&banks)) return; - spin_lock_irqsave(&cmci_discover_lock, flags); + raw_spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) __cmci_disable_bank(i); - spin_unlock_irqrestore(&cmci_discover_lock, flags); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void cmci_rediscover_work_func(void *arg) @@ -360,9 +360,9 @@ void cmci_disable_bank(int bank) if (!cmci_supported(&banks)) return; - spin_lock_irqsave(&cmci_discover_lock, flags); + raw_spin_lock_irqsave(&cmci_discover_lock, flags); __cmci_disable_bank(bank); - spin_unlock_irqrestore(&cmci_discover_lock, flags); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void intel_init_cmci(void) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 36a1bb6d1ee..1af51b1586d 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -498,8 +498,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c) if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG - "CPU%d: Thermal monitoring handled by SMI\n", cpu); + if (system_state == SYSTEM_BOOTING) + printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); return; } diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index 617a9e28424..7aa1acc7978 100644 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -27,7 +27,7 @@ static u32 ucode_new_rev; u8 amd_ucode_patch[PATCH_MAX_SIZE]; static u16 this_equiv_id; -struct cpio_data ucode_cpio; +static struct cpio_data ucode_cpio; /* * Microcode patch container file is prepended to the initrd in cpio format. diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index a276fa75d9b..c6826d1e808 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -127,7 +127,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) return get_matching_microcode(csig, cpf, mc_intel, crev); } -int apply_microcode(int cpu) +static int apply_microcode_intel(int cpu) { struct microcode_intel *mc_intel; struct ucode_cpu_info *uci; @@ -314,7 +314,7 @@ static struct microcode_ops microcode_intel_ops = { .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, - .apply_microcode = apply_microcode, + .apply_microcode = apply_microcode_intel, .microcode_fini_cpu = microcode_fini_cpu, }; diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 18f739129e7..b88343f7a3b 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -28,8 +28,8 @@ #include <asm/tlbflush.h> #include <asm/setup.h> -unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; -struct mc_saved_data { +static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +static struct mc_saved_data { unsigned int mc_saved_count; struct microcode_intel **mc_saved; } mc_saved_data; @@ -415,7 +415,7 @@ static void __ref show_saved_mc(void) struct ucode_cpu_info uci; if (mc_saved_data.mc_saved_count == 0) { - pr_debug("no micorcode data saved.\n"); + pr_debug("no microcode data saved.\n"); return; } pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); @@ -506,7 +506,7 @@ int save_mc_for_early(u8 *mc) if (mc_saved && mc_saved_count) memcpy(mc_saved_tmp, mc_saved, - mc_saved_count * sizeof(struct mirocode_intel *)); + mc_saved_count * sizeof(struct microcode_intel *)); /* * Save the microcode patch mc in mc_save_tmp structure if it's a newer * version. @@ -526,7 +526,7 @@ int save_mc_for_early(u8 *mc) show_saved_mc(); /* - * Free old saved microcod data. + * Free old saved microcode data. */ if (mc_saved) { for (i = 0; i < mc_saved_count_init; i++) diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index 2bf61650549..e2b22df964c 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -1,23 +1,25 @@ #!/bin/sh # -# Generate the x86_cap_flags[] array from include/asm/cpufeature.h +# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h # IN=$1 OUT=$2 -TABS="$(printf '\t\t\t\t\t')" -trap 'rm "$OUT"' EXIT +function dump_array() +{ + ARRAY=$1 + SIZE=$2 + PFX=$3 + POSTFIX=$4 -( - echo "#ifndef _ASM_X86_CPUFEATURE_H" - echo "#include <asm/cpufeature.h>" - echo "#endif" - echo "" - echo "const char * const x86_cap_flags[NCAPINTS*32] = {" + PFX_SZ=$(echo $PFX | wc -c) + TABS="$(printf '\t\t\t\t\t')" + + echo "const char * const $ARRAY[$SIZE] = {" - # Iterate through any input lines starting with #define X86_FEATURE_ - sed -n -e 's/\t/ /g' -e 's/^ *# *define *X86_FEATURE_//p' $IN | + # Iterate through any input lines starting with #define $PFX + sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN | while read i do # Name is everything up to the first whitespace @@ -31,11 +33,32 @@ trap 'rm "$OUT"' EXIT # Name is uppercase, VALUE is all lowercase VALUE="$(echo "$VALUE" | tr A-Z a-z)" - TABCOUNT=$(( ( 5*8 - 14 - $(echo "$NAME" | wc -c) ) / 8 )) - printf "\t[%s]%.*s = %s,\n" \ - "X86_FEATURE_$NAME" "$TABCOUNT" "$TABS" "$VALUE" + if [ -n "$POSTFIX" ]; then + T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 )) + TABS="$(printf '\t\t\t\t\t\t')" + TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 )) + printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE" + else + TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 )) + printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE" + fi done echo "};" +} + +trap 'rm "$OUT"' EXIT + +( + echo "#ifndef _ASM_X86_CPUFEATURE_H" + echo "#include <asm/cpufeature.h>" + echo "#endif" + echo "" + + dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" "" + echo "" + + dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32" + ) > $OUT trap - EXIT diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index f961de9964c..ea5f363a194 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -707,7 +707,7 @@ void __init mtrr_bp_init(void) } else { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - if (cpu_has_k6_mtrr) { + if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) { /* Pre-Athlon (K6) AMD CPU MTRRs */ mtrr_if = mtrr_ops[X86_VENDOR_AMD]; size_or_mask = SIZE_OR_MASK_BITS(32); @@ -715,14 +715,14 @@ void __init mtrr_bp_init(void) } break; case X86_VENDOR_CENTAUR: - if (cpu_has_centaur_mcr) { + if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) { mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR]; size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; } break; case X86_VENDOR_CYRIX: - if (cpu_has_cyrix_arr) { + if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) { mtrr_if = mtrr_ops[X86_VENDOR_CYRIX]; size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2879ecdaac4..1b8299dd3d9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -243,7 +243,8 @@ static bool check_hw_exists(void) msr_fail: printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); - printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); + printk(boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR + "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); return false; } @@ -387,7 +388,7 @@ int x86_pmu_hw_config(struct perf_event *event) precise++; /* Support for IP fixup */ - if (x86_pmu.lbr_nr) + if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; } @@ -443,6 +444,12 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.type == PERF_TYPE_RAW) event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; + if (event->attr.sample_period && x86_pmu.limit_period) { + if (x86_pmu.limit_period(event, event->attr.sample_period) > + event->attr.sample_period) + return -EINVAL; + } + return x86_setup_perfctr(event); } @@ -487,7 +494,7 @@ static int __x86_pmu_event_init(struct perf_event *event) void x86_pmu_disable_all(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -505,7 +512,7 @@ void x86_pmu_disable_all(void) static void x86_pmu_disable(struct pmu *pmu) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (!x86_pmu_initialized()) return; @@ -522,7 +529,7 @@ static void x86_pmu_disable(struct pmu *pmu) void x86_pmu_enable_all(int added) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -869,7 +876,7 @@ static void x86_pmu_start(struct perf_event *event, int flags); static void x86_pmu_enable(struct pmu *pmu) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *event; struct hw_perf_event *hwc; int i, added = cpuc->n_added; @@ -980,6 +987,9 @@ int x86_perf_event_set_period(struct perf_event *event) if (left > x86_pmu.max_period) left = x86_pmu.max_period; + if (x86_pmu.limit_period) + left = x86_pmu.limit_period(event, left); + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* @@ -1020,7 +1030,7 @@ void x86_pmu_enable_event(struct perf_event *event) */ static int x86_pmu_add(struct perf_event *event, int flags) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc; int assign[X86_PMC_IDX_MAX]; int n, n0, ret; @@ -1071,7 +1081,7 @@ out: static void x86_pmu_start(struct perf_event *event, int flags) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx = event->hw.idx; if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) @@ -1150,7 +1160,7 @@ void perf_event_print_debug(void) void x86_pmu_stop(struct perf_event *event, int flags) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { @@ -1172,7 +1182,7 @@ void x86_pmu_stop(struct perf_event *event, int flags) static void x86_pmu_del(struct perf_event *event, int flags) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int i; /* @@ -1227,7 +1237,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - cpuc = &__get_cpu_var(cpu_hw_events); + cpuc = this_cpu_ptr(&cpu_hw_events); /* * Some chipsets need to unmask the LVTPC in a particular spot @@ -1636,7 +1646,7 @@ static void x86_pmu_cancel_txn(struct pmu *pmu) */ static int x86_pmu_commit_txn(struct pmu *pmu) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int assign[X86_PMC_IDX_MAX]; int n, ret; @@ -1995,7 +2005,7 @@ static unsigned long get_segment_base(unsigned int segment) if (idx > GDT_ENTRIES) return 0; - desc = __this_cpu_ptr(&gdt_page.gdt[0]); + desc = raw_cpu_ptr(gdt_page.gdt); } return get_desc_base(desc + idx); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 8ade93111e0..d98a34d435d 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -67,8 +67,10 @@ struct event_constraint { */ #define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */ #define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style st data sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style datala, store */ #define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */ +#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */ struct amd_nb { int nb_id; /* NorthBridge id */ @@ -252,18 +254,52 @@ struct cpu_hw_events { EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) #define INTEL_PLD_CONSTRAINT(c, n) \ - __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) #define INTEL_PST_CONSTRAINT(c, n) \ - __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) -/* DataLA version of store sampling without extra enable bit. */ -#define INTEL_PST_HSW_CONSTRAINT(c, n) \ - __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ +/* Event constraint, but match on all event flags too. */ +#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) + +/* Check only flags, but allow all event/umask */ +#define INTEL_ALL_EVENT_CONSTRAINT(code, n) \ + EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS) + +/* Check flags and event code, and set the HSW store flag */ +#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) + +/* Check flags and event code, and set the HSW load flag */ +#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) + +/* Check flags and event code/umask, and set the HSW store flag */ +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) +/* Check flags and event code/umask, and set the HSW load flag */ +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) + +/* Check flags and event code/umask, and set the HSW N/A flag */ +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW) + + /* * We define the end marker as having a weight of -1 * to enable blacklisting of events using a counter bitmask @@ -409,6 +445,7 @@ struct x86_pmu { struct x86_pmu_quirk *quirks; int perfctr_second_write; bool late_ack; + unsigned (*limit_period)(struct perf_event *event, unsigned l); /* * sysfs attrs diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index beeb7cc0704..28926311aac 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -699,7 +699,7 @@ __init int amd_pmu_init(void) void amd_pmu_enable_virt(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); cpuc->perf_ctr_virt_mask = 0; @@ -711,7 +711,7 @@ EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); void amd_pmu_disable_virt(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); /* * We only mask out the Host-only bit so that host-only counting works diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index 3bbdf4cd38b..30790d798e6 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -294,31 +294,41 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) cpu_to_node(cpu)); } -static void amd_uncore_cpu_up_prepare(unsigned int cpu) +static int amd_uncore_cpu_up_prepare(unsigned int cpu) { - struct amd_uncore *uncore; + struct amd_uncore *uncore_nb = NULL, *uncore_l2; if (amd_uncore_nb) { - uncore = amd_uncore_alloc(cpu); - uncore->cpu = cpu; - uncore->num_counters = NUM_COUNTERS_NB; - uncore->rdpmc_base = RDPMC_BASE_NB; - uncore->msr_base = MSR_F15H_NB_PERF_CTL; - uncore->active_mask = &amd_nb_active_mask; - uncore->pmu = &amd_nb_pmu; - *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; + uncore_nb = amd_uncore_alloc(cpu); + if (!uncore_nb) + goto fail; + uncore_nb->cpu = cpu; + uncore_nb->num_counters = NUM_COUNTERS_NB; + uncore_nb->rdpmc_base = RDPMC_BASE_NB; + uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; + uncore_nb->active_mask = &amd_nb_active_mask; + uncore_nb->pmu = &amd_nb_pmu; + *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; } if (amd_uncore_l2) { - uncore = amd_uncore_alloc(cpu); - uncore->cpu = cpu; - uncore->num_counters = NUM_COUNTERS_L2; - uncore->rdpmc_base = RDPMC_BASE_L2; - uncore->msr_base = MSR_F16H_L2I_PERF_CTL; - uncore->active_mask = &amd_l2_active_mask; - uncore->pmu = &amd_l2_pmu; - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; + uncore_l2 = amd_uncore_alloc(cpu); + if (!uncore_l2) + goto fail; + uncore_l2->cpu = cpu; + uncore_l2->num_counters = NUM_COUNTERS_L2; + uncore_l2->rdpmc_base = RDPMC_BASE_L2; + uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL; + uncore_l2->active_mask = &amd_l2_active_mask; + uncore_l2->pmu = &amd_l2_pmu; + *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2; } + + return 0; + +fail: + kfree(uncore_nb); + return -ENOMEM; } static struct amd_uncore * @@ -441,7 +451,7 @@ static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) if (!--uncore->refcnt) kfree(uncore); - *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; + *per_cpu_ptr(uncores, cpu) = NULL; } static void amd_uncore_cpu_dead(unsigned int cpu) @@ -461,7 +471,8 @@ amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action, switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - amd_uncore_cpu_up_prepare(cpu); + if (amd_uncore_cpu_up_prepare(cpu)) + return notifier_from_errno(-ENOMEM); break; case CPU_STARTING: @@ -501,20 +512,33 @@ static void __init init_cpu_already_online(void *dummy) amd_uncore_cpu_online(cpu); } +static void cleanup_cpu_online(void *dummy) +{ + unsigned int cpu = smp_processor_id(); + + amd_uncore_cpu_dead(cpu); +} + static int __init amd_uncore_init(void) { - unsigned int cpu; + unsigned int cpu, cpu2; int ret = -ENODEV; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) - return -ENODEV; + goto fail_nodev; if (!cpu_has_topoext) - return -ENODEV; + goto fail_nodev; if (cpu_has_perfctr_nb) { amd_uncore_nb = alloc_percpu(struct amd_uncore *); - perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1); + if (!amd_uncore_nb) { + ret = -ENOMEM; + goto fail_nb; + } + ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1); + if (ret) + goto fail_nb; printk(KERN_INFO "perf: AMD NB counters detected\n"); ret = 0; @@ -522,20 +546,28 @@ static int __init amd_uncore_init(void) if (cpu_has_perfctr_l2) { amd_uncore_l2 = alloc_percpu(struct amd_uncore *); - perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1); + if (!amd_uncore_l2) { + ret = -ENOMEM; + goto fail_l2; + } + ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1); + if (ret) + goto fail_l2; printk(KERN_INFO "perf: AMD L2I counters detected\n"); ret = 0; } if (ret) - return -ENODEV; + goto fail_nodev; cpu_notifier_register_begin(); /* init cpus already online before registering for hotplug notifier */ for_each_online_cpu(cpu) { - amd_uncore_cpu_up_prepare(cpu); + ret = amd_uncore_cpu_up_prepare(cpu); + if (ret) + goto fail_online; smp_call_function_single(cpu, init_cpu_already_online, NULL, 1); } @@ -543,5 +575,30 @@ static int __init amd_uncore_init(void) cpu_notifier_register_done(); return 0; + + +fail_online: + for_each_online_cpu(cpu2) { + if (cpu2 == cpu) + break; + smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1); + } + cpu_notifier_register_done(); + + /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */ + amd_uncore_nb = amd_uncore_l2 = NULL; + if (cpu_has_perfctr_l2) + perf_pmu_unregister(&amd_l2_pmu); +fail_l2: + if (cpu_has_perfctr_nb) + perf_pmu_unregister(&amd_nb_pmu); + if (amd_uncore_l2) + free_percpu(amd_uncore_l2); +fail_nb: + if (amd_uncore_nb) + free_percpu(amd_uncore_nb); + +fail_nodev: + return ret; } device_initcall(amd_uncore_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 2502d0d9d24..a73947c53b6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -220,6 +220,15 @@ static struct event_constraint intel_hsw_event_constraints[] = { EVENT_CONSTRAINT_END }; +static struct event_constraint intel_bdw_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */ + EVENT_CONSTRAINT_END +}; + static u64 intel_pmu_event_map(int hw_event) { return intel_perfmon_event_map[hw_event]; @@ -415,6 +424,126 @@ static __initconst const u64 snb_hw_cache_event_ids }; +static __initconst const u64 hsw_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE:ALL_DATA_RD|ALL_CODE_RD */ + [ C(RESULT_ACCESS) ] = 0x1b7, + /* OFFCORE_RESPONSE:ALL_DATA_RD|ALL_CODE_RD|SUPPLIER_NONE| + L3_MISS|ANY_SNOOP */ + [ C(RESULT_MISS) ] = 0x1b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE:ALL_RFO */ + /* OFFCORE_RESPONSE:ALL_RFO|SUPPLIER_NONE|L3_MISS|ANY_SNOOP */ + [ C(RESULT_MISS) ] = 0x1b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst const u64 hsw_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE:ALL_DATA_RD|ALL_CODE_RD */ + [ C(RESULT_ACCESS) ] = 0x2d5, + /* OFFCORE_RESPONSE:ALL_DATA_RD|ALL_CODE_RD|SUPPLIER_NONE| + L3_MISS|ANY_SNOOP */ + [ C(RESULT_MISS) ] = 0x3fbc0202d5ull, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x122, /* OFFCORE_RESPONSE:ALL_RFO */ + /* OFFCORE_RESPONSE:ALL_RFO|SUPPLIER_NONE|L3_MISS|ANY_SNOOP */ + [ C(RESULT_MISS) ] = 0x3fbc020122ull, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + static __initconst const u64 westmere_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -1045,7 +1174,7 @@ static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) static void intel_pmu_disable_all(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); @@ -1058,7 +1187,7 @@ static void intel_pmu_disable_all(void) static void intel_pmu_enable_all(int added) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); intel_pmu_pebs_enable_all(); intel_pmu_lbr_enable_all(); @@ -1092,7 +1221,7 @@ static void intel_pmu_enable_all(int added) */ static void intel_pmu_nhm_workaround(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); static const unsigned long nhm_magic[4] = { 0x4300B5, 0x4300D2, @@ -1191,7 +1320,7 @@ static inline bool event_is_checkpointed(struct perf_event *event) static void intel_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); @@ -1255,7 +1384,7 @@ static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) static void intel_pmu_enable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { if (!__this_cpu_read(cpu_hw_events.enabled)) @@ -1349,7 +1478,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) u64 status; int handled; - cpuc = &__get_cpu_var(cpu_hw_events); + cpuc = this_cpu_ptr(&cpu_hw_events); /* * No known reason to not always do late ACK, @@ -1781,7 +1910,7 @@ EXPORT_SYMBOL_GPL(perf_guest_get_msrs); static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; @@ -1802,7 +1931,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; int idx; @@ -1836,7 +1965,7 @@ static void core_pmu_enable_event(struct perf_event *event) static void core_pmu_enable_all(int added) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -1905,6 +2034,24 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) return c; } +/* + * Broadwell: + * The INST_RETIRED.ALL period always needs to have lowest + * 6bits cleared (BDM57). It shall not use a period smaller + * than 100 (BDM11). We combine the two to enforce + * a min-period of 128. + */ +static unsigned bdw_limit_period(struct perf_event *event, unsigned left) +{ + if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == + X86_CONFIG(.event=0xc0, .umask=0x01)) { + if (left < 128) + left = 128; + left &= ~0x3fu; + } + return left; +} + PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, "config:18" ); @@ -2367,15 +2514,15 @@ __init int intel_pmu_init(void) * Install the hw-cache-events table: */ switch (boot_cpu_data.x86_model) { - case 14: /* 65 nm core solo/duo, "Yonah" */ + case 14: /* 65nm Core "Yonah" */ pr_cont("Core events, "); break; - case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 15: /* 65nm Core2 "Merom" */ x86_add_quirk(intel_clovertown_quirk); - case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ - case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ - case 29: /* six-core 45 nm xeon "Dunnington" */ + case 22: /* 65nm Core2 "Merom-L" */ + case 23: /* 45nm Core2 "Penryn" */ + case 29: /* 45nm Core2 "Dunnington (MP) */ memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -2386,9 +2533,9 @@ __init int intel_pmu_init(void) pr_cont("Core2 events, "); break; - case 26: /* 45 nm nehalem, "Bloomfield" */ - case 30: /* 45 nm nehalem, "Lynnfield" */ - case 46: /* 45 nm nehalem-ex, "Beckton" */ + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, @@ -2415,11 +2562,11 @@ __init int intel_pmu_init(void) pr_cont("Nehalem events, "); break; - case 28: /* Atom */ - case 38: /* Lincroft */ - case 39: /* Penwell */ - case 53: /* Cloverview */ - case 54: /* Cedarview */ + case 28: /* 45nm Atom "Pineview" */ + case 38: /* 45nm Atom "Lincroft" */ + case 39: /* 32nm Atom "Penwell" */ + case 53: /* 32nm Atom "Cloverview" */ + case 54: /* 32nm Atom "Cedarview" */ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -2430,8 +2577,8 @@ __init int intel_pmu_init(void) pr_cont("Atom events, "); break; - case 55: /* Atom 22nm "Silvermont" */ - case 77: /* Avoton "Silvermont" */ + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, @@ -2446,9 +2593,9 @@ __init int intel_pmu_init(void) pr_cont("Silvermont events, "); break; - case 37: /* 32 nm nehalem, "Clarkdale" */ - case 44: /* 32 nm nehalem, "Gulftown" */ - case 47: /* 32 nm Xeon E7 */ + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, @@ -2474,8 +2621,8 @@ __init int intel_pmu_init(void) pr_cont("Westmere events, "); break; - case 42: /* SandyBridge */ - case 45: /* SandyBridge, "Romely-EP" */ + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ x86_add_quirk(intel_sandybridge_quirk); memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -2506,8 +2653,9 @@ __init int intel_pmu_init(void) pr_cont("SandyBridge events, "); break; - case 58: /* IvyBridge */ - case 62: /* IvyBridge EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); /* dTLB-load-misses on IVB is different than SNB */ @@ -2539,20 +2687,19 @@ __init int intel_pmu_init(void) break; - case 60: /* Haswell Client */ - case 70: - case 71: - case 63: - case 69: + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ x86_pmu.late_ack = true; - memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); intel_pmu_lbr_init_snb(); x86_pmu.event_constraints = intel_hsw_event_constraints; x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; - x86_pmu.extra_regs = intel_snb_extra_regs; + x86_pmu.extra_regs = intel_snbep_extra_regs; x86_pmu.pebs_aliases = intel_pebs_aliases_snb; /* all extra regs are per-cpu when HT is on */ x86_pmu.er_flags |= ERF_HAS_RSP_1; @@ -2565,6 +2712,28 @@ __init int intel_pmu_init(void) pr_cont("Haswell events, "); break; + case 61: /* 14nm Broadwell Core-M */ + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_bdw_event_constraints; + x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; + x86_pmu.extra_regs = intel_snbep_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.cpu_events = hsw_events_attrs; + x86_pmu.limit_period = bdw_limit_period; + pr_cont("Broadwell events, "); + break; + default: switch (x86_pmu.version) { case 1: diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 696ade311de..46211bcc813 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -108,14 +108,16 @@ static u64 precise_store_data(u64 status) return val; } -static u64 precise_store_data_hsw(struct perf_event *event, u64 status) +static u64 precise_datala_hsw(struct perf_event *event, u64 status) { union perf_mem_data_src dse; - u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK; - dse.val = 0; - dse.mem_op = PERF_MEM_OP_STORE; - dse.mem_lvl = PERF_MEM_LVL_NA; + dse.val = PERF_MEM_NA; + + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) + dse.mem_op = PERF_MEM_OP_STORE; + else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW) + dse.mem_op = PERF_MEM_OP_LOAD; /* * L1 info only valid for following events: @@ -125,15 +127,12 @@ static u64 precise_store_data_hsw(struct perf_event *event, u64 status) * MEM_UOPS_RETIRED.SPLIT_STORES * MEM_UOPS_RETIRED.ALL_STORES */ - if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0) - return dse.mem_lvl; - - if (status & 1) - dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; - else - dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; - - /* Nothing else supported. Sorry. */ + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) { + if (status & 1) + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + else + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; + } return dse.val; } @@ -475,7 +474,7 @@ void intel_pmu_enable_bts(u64 config) void intel_pmu_disable_bts(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); unsigned long debugctlmsr; if (!cpuc->ds) @@ -492,7 +491,7 @@ void intel_pmu_disable_bts(void) int intel_pmu_drain_bts_buffer(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct debug_store *ds = cpuc->ds; struct bts_record { u64 from; @@ -569,28 +568,10 @@ struct event_constraint intel_atom_pebs_event_constraints[] = { }; struct event_constraint intel_slm_pebs_event_constraints[] = { - INTEL_UEVENT_CONSTRAINT(0x0103, 0x1), /* REHABQ.LD_BLOCK_ST_FORWARD_PS */ - INTEL_UEVENT_CONSTRAINT(0x0803, 0x1), /* REHABQ.LD_SPLITS_PS */ - INTEL_UEVENT_CONSTRAINT(0x0204, 0x1), /* MEM_UOPS_RETIRED.L2_HIT_LOADS_PS */ - INTEL_UEVENT_CONSTRAINT(0x0404, 0x1), /* MEM_UOPS_RETIRED.L2_MISS_LOADS_PS */ - INTEL_UEVENT_CONSTRAINT(0x0804, 0x1), /* MEM_UOPS_RETIRED.DTLB_MISS_LOADS_PS */ - INTEL_UEVENT_CONSTRAINT(0x2004, 0x1), /* MEM_UOPS_RETIRED.HITM_PS */ - INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY_PS */ - INTEL_UEVENT_CONSTRAINT(0x00c4, 0x1), /* BR_INST_RETIRED.ALL_BRANCHES_PS */ - INTEL_UEVENT_CONSTRAINT(0x7ec4, 0x1), /* BR_INST_RETIRED.JCC_PS */ - INTEL_UEVENT_CONSTRAINT(0xbfc4, 0x1), /* BR_INST_RETIRED.FAR_BRANCH_PS */ - INTEL_UEVENT_CONSTRAINT(0xebc4, 0x1), /* BR_INST_RETIRED.NON_RETURN_IND_PS */ - INTEL_UEVENT_CONSTRAINT(0xf7c4, 0x1), /* BR_INST_RETIRED.RETURN_PS */ - INTEL_UEVENT_CONSTRAINT(0xf9c4, 0x1), /* BR_INST_RETIRED.CALL_PS */ - INTEL_UEVENT_CONSTRAINT(0xfbc4, 0x1), /* BR_INST_RETIRED.IND_CALL_PS */ - INTEL_UEVENT_CONSTRAINT(0xfdc4, 0x1), /* BR_INST_RETIRED.REL_CALL_PS */ - INTEL_UEVENT_CONSTRAINT(0xfec4, 0x1), /* BR_INST_RETIRED.TAKEN_JCC_PS */ - INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_MISP_RETIRED.ALL_BRANCHES_PS */ - INTEL_UEVENT_CONSTRAINT(0x7ec5, 0x1), /* BR_INST_MISP_RETIRED.JCC_PS */ - INTEL_UEVENT_CONSTRAINT(0xebc5, 0x1), /* BR_INST_MISP_RETIRED.NON_RETURN_IND_PS */ - INTEL_UEVENT_CONSTRAINT(0xf7c5, 0x1), /* BR_INST_MISP_RETIRED.RETURN_PS */ - INTEL_UEVENT_CONSTRAINT(0xfbc5, 0x1), /* BR_INST_MISP_RETIRED.IND_CALL_PS */ - INTEL_UEVENT_CONSTRAINT(0xfec5, 0x1), /* BR_INST_MISP_RETIRED.TAKEN_JCC_PS */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), EVENT_CONSTRAINT_END }; @@ -626,68 +607,44 @@ struct event_constraint intel_westmere_pebs_event_constraints[] = { struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ - INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ - INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ - INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ - INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), EVENT_CONSTRAINT_END }; struct event_constraint intel_ivb_pebs_event_constraints[] = { INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ - INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ - INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ - INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ - INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), EVENT_CONSTRAINT_END }; struct event_constraint intel_hsw_pebs_event_constraints[] = { INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ - INTEL_PST_HSW_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ - INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ - INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x01c5, 0xf), /* BR_MISP_RETIRED.CONDITIONAL */ - INTEL_UEVENT_CONSTRAINT(0x04c5, 0xf), /* BR_MISP_RETIRED.ALL_BRANCHES */ - INTEL_UEVENT_CONSTRAINT(0x20c5, 0xf), /* BR_MISP_RETIRED.NEAR_TAKEN */ - INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.* */ - /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), - /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ - INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), - INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ - /* MEM_UOPS_RETIRED.SPLIT_STORES */ - INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), - INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ - INTEL_PST_HSW_CONSTRAINT(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ - INTEL_UEVENT_CONSTRAINT(0x01d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L1_HIT */ - INTEL_UEVENT_CONSTRAINT(0x02d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L2_HIT */ - INTEL_UEVENT_CONSTRAINT(0x04d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L3_HIT */ - /* MEM_LOAD_UOPS_RETIRED.HIT_LFB */ - INTEL_UEVENT_CONSTRAINT(0x40d1, 0xf), - /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS */ - INTEL_UEVENT_CONSTRAINT(0x01d2, 0xf), - /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT */ - INTEL_UEVENT_CONSTRAINT(0x02d2, 0xf), - /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM */ - INTEL_UEVENT_CONSTRAINT(0x01d3, 0xf), - INTEL_UEVENT_CONSTRAINT(0x04c8, 0xf), /* HLE_RETIRED.Abort */ - INTEL_UEVENT_CONSTRAINT(0x04c9, 0xf), /* RTM_RETIRED.Abort */ - + INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), EVENT_CONSTRAINT_END }; @@ -712,7 +669,7 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) void intel_pmu_pebs_enable(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; @@ -727,7 +684,7 @@ void intel_pmu_pebs_enable(struct perf_event *event) void intel_pmu_pebs_disable(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; cpuc->pebs_enabled &= ~(1ULL << hwc->idx); @@ -745,7 +702,7 @@ void intel_pmu_pebs_disable(struct perf_event *event) void intel_pmu_pebs_enable_all(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (cpuc->pebs_enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); @@ -753,7 +710,7 @@ void intel_pmu_pebs_enable_all(void) void intel_pmu_pebs_disable_all(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (cpuc->pebs_enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, 0); @@ -761,7 +718,7 @@ void intel_pmu_pebs_disable_all(void) static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); unsigned long from = cpuc->lbr_entries[0].from; unsigned long old_to, to = cpuc->lbr_entries[0].to; unsigned long ip = regs->ip; @@ -864,51 +821,53 @@ static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs) static void __intel_pmu_pebs_event(struct perf_event *event, struct pt_regs *iregs, void *__pebs) { +#define PERF_X86_EVENT_PEBS_HSW_PREC \ + (PERF_X86_EVENT_PEBS_ST_HSW | \ + PERF_X86_EVENT_PEBS_LD_HSW | \ + PERF_X86_EVENT_PEBS_NA_HSW) /* * We cast to the biggest pebs_record but are careful not to * unconditionally access the 'extra' entries. */ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct pebs_record_hsw *pebs = __pebs; struct perf_sample_data data; struct pt_regs regs; u64 sample_type; - int fll, fst; + int fll, fst, dsrc; + int fl = event->hw.flags; if (!intel_pmu_save_and_restart(event)) return; - fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT; - fst = event->hw.flags & (PERF_X86_EVENT_PEBS_ST | - PERF_X86_EVENT_PEBS_ST_HSW); + sample_type = event->attr.sample_type; + dsrc = sample_type & PERF_SAMPLE_DATA_SRC; + + fll = fl & PERF_X86_EVENT_PEBS_LDLAT; + fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); perf_sample_data_init(&data, 0, event->hw.last_period); data.period = event->hw.last_period; - sample_type = event->attr.sample_type; /* - * if PEBS-LL or PreciseStore + * Use latency for weight (only avail with PEBS-LL) */ - if (fll || fst) { - /* - * Use latency for weight (only avail with PEBS-LL) - */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) - data.weight = pebs->lat; - - /* - * data.data_src encodes the data source - */ - if (sample_type & PERF_SAMPLE_DATA_SRC) { - if (fll) - data.data_src.val = load_latency_data(pebs->dse); - else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) - data.data_src.val = - precise_store_data_hsw(event, pebs->dse); - else - data.data_src.val = precise_store_data(pebs->dse); - } + if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) + data.weight = pebs->lat; + + /* + * data.data_src encodes the data source + */ + if (dsrc) { + u64 val = PERF_MEM_NA; + if (fll) + val = load_latency_data(pebs->dse); + else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) + val = precise_datala_hsw(event, pebs->dse); + else if (fst) + val = precise_store_data(pebs->dse); + data.data_src.val = val; } /* @@ -935,16 +894,16 @@ static void __intel_pmu_pebs_event(struct perf_event *event, else regs.flags &= ~PERF_EFLAGS_EXACT; - if ((event->attr.sample_type & PERF_SAMPLE_ADDR) && + if ((sample_type & PERF_SAMPLE_ADDR) && x86_pmu.intel_cap.pebs_format >= 1) data.addr = pebs->dla; if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ - if ((event->attr.sample_type & PERF_SAMPLE_WEIGHT) && !fll) + if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) data.weight = intel_hsw_weight(pebs); - if (event->attr.sample_type & PERF_SAMPLE_TRANSACTION) + if (sample_type & PERF_SAMPLE_TRANSACTION) data.txn = intel_hsw_transaction(pebs); } @@ -957,7 +916,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct debug_store *ds = cpuc->ds; struct perf_event *event = cpuc->events[0]; /* PMC0 only */ struct pebs_record_core *at, *top; @@ -998,7 +957,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct debug_store *ds = cpuc->ds; struct perf_event *event = NULL; void *at, *top; @@ -1055,7 +1014,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * BTS, PEBS probe and setup */ -void intel_ds_init(void) +void __init intel_ds_init(void) { /* * No support for 32bit formats diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 9dd2459a4c7..45fa730a528 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -133,7 +133,7 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); static void __intel_pmu_lbr_enable(void) { u64 debugctl; - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (cpuc->lbr_sel) wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config); @@ -183,7 +183,7 @@ void intel_pmu_lbr_reset(void) void intel_pmu_lbr_enable(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (!x86_pmu.lbr_nr) return; @@ -203,7 +203,7 @@ void intel_pmu_lbr_enable(struct perf_event *event) void intel_pmu_lbr_disable(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (!x86_pmu.lbr_nr) return; @@ -220,7 +220,7 @@ void intel_pmu_lbr_disable(struct perf_event *event) void intel_pmu_lbr_enable_all(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (cpuc->lbr_users) __intel_pmu_lbr_enable(); @@ -228,7 +228,7 @@ void intel_pmu_lbr_enable_all(void) void intel_pmu_lbr_disable_all(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (cpuc->lbr_users) __intel_pmu_lbr_disable(); @@ -332,7 +332,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) void intel_pmu_lbr_read(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (!cpuc->lbr_users) return; @@ -697,7 +697,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { }; /* core */ -void intel_pmu_lbr_init_core(void) +void __init intel_pmu_lbr_init_core(void) { x86_pmu.lbr_nr = 4; x86_pmu.lbr_tos = MSR_LBR_TOS; @@ -712,7 +712,7 @@ void intel_pmu_lbr_init_core(void) } /* nehalem/westmere */ -void intel_pmu_lbr_init_nhm(void) +void __init intel_pmu_lbr_init_nhm(void) { x86_pmu.lbr_nr = 16; x86_pmu.lbr_tos = MSR_LBR_TOS; @@ -733,7 +733,7 @@ void intel_pmu_lbr_init_nhm(void) } /* sandy bridge */ -void intel_pmu_lbr_init_snb(void) +void __init intel_pmu_lbr_init_snb(void) { x86_pmu.lbr_nr = 16; x86_pmu.lbr_tos = MSR_LBR_TOS; @@ -753,7 +753,7 @@ void intel_pmu_lbr_init_snb(void) } /* atom */ -void intel_pmu_lbr_init_atom(void) +void __init intel_pmu_lbr_init_atom(void) { /* * only models starting at stepping 10 seems diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 619f7699487..d64f275fe27 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -135,7 +135,7 @@ static inline u64 rapl_scale(u64 v) * or use ldexp(count, -32). * Watts = Joules/Time delta */ - return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit); + return v << (32 - __this_cpu_read(rapl_pmu->hw_unit)); } static u64 rapl_event_update(struct perf_event *event) @@ -187,7 +187,7 @@ static void rapl_stop_hrtimer(struct rapl_pmu *pmu) static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) { - struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); struct perf_event *event; unsigned long flags; @@ -234,7 +234,7 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu, static void rapl_pmu_event_start(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); unsigned long flags; spin_lock_irqsave(&pmu->lock, flags); @@ -244,7 +244,7 @@ static void rapl_pmu_event_start(struct perf_event *event, int mode) static void rapl_pmu_event_stop(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); struct hw_perf_event *hwc = &event->hw; unsigned long flags; @@ -278,7 +278,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) static int rapl_pmu_event_add(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); struct hw_perf_event *hwc = &event->hw; unsigned long flags; @@ -696,7 +696,7 @@ static int __init rapl_pmu_init(void) return -1; } - pmu = __get_cpu_var(rapl_pmu); + pmu = __this_cpu_read(rapl_pmu); pr_info("RAPL PMU detected, hw unit 2^-%d Joules," " API unit is 2^-32 Joules," diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index ae6552a0701..9762dbd9f3f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -1,83 +1,39 @@ #include "perf_event_intel_uncore.h" static struct intel_uncore_type *empty_uncore[] = { NULL, }; -static struct intel_uncore_type **msr_uncores = empty_uncore; -static struct intel_uncore_type **pci_uncores = empty_uncore; -/* pci bus to socket mapping */ -static int pcibus_to_physid[256] = { [0 ... 255] = -1, }; +struct intel_uncore_type **uncore_msr_uncores = empty_uncore; +struct intel_uncore_type **uncore_pci_uncores = empty_uncore; -static struct pci_dev *extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; +static bool pcidrv_registered; +struct pci_driver *uncore_pci_driver; +/* pci bus to socket mapping */ +int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, }; +struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; static DEFINE_RAW_SPINLOCK(uncore_box_lock); - /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; /* constraint for the fixed counter */ -static struct event_constraint constraint_fixed = +static struct event_constraint uncore_constraint_fixed = EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); -static struct event_constraint constraint_empty = +struct event_constraint uncore_constraint_empty = EVENT_CONSTRAINT(0, 0, 0); -#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ - ((1ULL << (n)) - 1))) - -DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); -DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); -DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); -DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); -DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); -DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); -DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); -DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); -DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); -DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); -DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); -DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); -DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); -DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); -DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); -DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); -DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); -DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22"); -DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); -DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); -DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51"); -DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35"); -DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31"); -DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17"); -DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12"); -DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8"); -DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4"); -DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63"); -DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51"); -DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35"); -DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31"); -DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17"); -DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12"); -DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8"); -DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4"); -DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); - -static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); -static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); -static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event); -static void uncore_pmu_event_read(struct perf_event *event); - -static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +ssize_t uncore_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uncore_event_desc *event = + container_of(attr, struct uncore_event_desc, attr); + return sprintf(buf, "%s", event->config); +} + +struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct intel_uncore_pmu, pmu); } -static struct intel_uncore_box * -uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { struct intel_uncore_box *box; @@ -86,6 +42,9 @@ uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) return box; raw_spin_lock(&uncore_box_lock); + /* Recheck in lock to handle races. */ + if (*per_cpu_ptr(pmu->box, cpu)) + goto out; list_for_each_entry(box, &pmu->box_list, list) { if (box->phys_id == topology_physical_package_id(cpu)) { atomic_inc(&box->refcnt); @@ -93,12 +52,13 @@ uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) break; } } +out: raw_spin_unlock(&uncore_box_lock); return *per_cpu_ptr(pmu->box, cpu); } -static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) { /* * perf core schedules event on the basis of cpu, uncore events are @@ -107,7 +67,7 @@ static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); } -static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) +u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) { u64 count; @@ -119,7 +79,7 @@ static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_eve /* * generic get constraint function for shared match/mask registers. */ -static struct event_constraint * +struct event_constraint * uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event) { struct intel_uncore_extra_reg *er; @@ -154,10 +114,10 @@ uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event) return NULL; } - return &constraint_empty; + return &uncore_constraint_empty; } -static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event) { struct intel_uncore_extra_reg *er; struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; @@ -178,7 +138,7 @@ static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_even reg1->alloc = 0; } -static u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx) +u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx) { struct intel_uncore_extra_reg *er; unsigned long flags; @@ -193,2939 +153,6 @@ static u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx) return config; } -/* Sandy Bridge-EP uncore support */ -static struct intel_uncore_type snbep_uncore_cbox; -static struct intel_uncore_type snbep_uncore_pcu; - -static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int box_ctl = uncore_pci_box_ctl(box); - u32 config = 0; - - if (!pci_read_config_dword(pdev, box_ctl, &config)) { - config |= SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); - } -} - -static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int box_ctl = uncore_pci_box_ctl(box); - u32 config = 0; - - if (!pci_read_config_dword(pdev, box_ctl, &config)) { - config &= ~SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); - } -} - -static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, hwc->config_base, hwc->config); -} - -static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - u64 count = 0; - - pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); - pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); - - return count; -} - -static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - - pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT); -} - -static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) -{ - u64 config; - unsigned msr; - - msr = uncore_msr_box_ctl(box); - if (msr) { - rdmsrl(msr, config); - config |= SNBEP_PMON_BOX_CTL_FRZ; - wrmsrl(msr, config); - } -} - -static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) -{ - u64 config; - unsigned msr; - - msr = uncore_msr_box_ctl(box); - if (msr) { - rdmsrl(msr, config); - config &= ~SNBEP_PMON_BOX_CTL_FRZ; - wrmsrl(msr, config); - } -} - -static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - - if (reg1->idx != EXTRA_REG_NONE) - wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0)); - - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - wrmsrl(hwc->config_base, hwc->config); -} - -static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - - if (msr) - wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); -} - -static struct attribute *snbep_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute *snbep_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - NULL, -}; - -static struct attribute *snbep_uncore_cbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - &format_attr_filter_tid.attr, - &format_attr_filter_nid.attr, - &format_attr_filter_state.attr, - &format_attr_filter_opc.attr, - NULL, -}; - -static struct attribute *snbep_uncore_pcu_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_occ_sel.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - &format_attr_occ_invert.attr, - &format_attr_occ_edge.attr, - &format_attr_filter_band0.attr, - &format_attr_filter_band1.attr, - &format_attr_filter_band2.attr, - &format_attr_filter_band3.attr, - NULL, -}; - -static struct attribute *snbep_uncore_qpi_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - &format_attr_match_rds.attr, - &format_attr_match_rnid30.attr, - &format_attr_match_rnid4.attr, - &format_attr_match_dnid.attr, - &format_attr_match_mc.attr, - &format_attr_match_opc.attr, - &format_attr_match_vnw.attr, - &format_attr_match0.attr, - &format_attr_match1.attr, - &format_attr_mask_rds.attr, - &format_attr_mask_rnid30.attr, - &format_attr_mask_rnid4.attr, - &format_attr_mask_dnid.attr, - &format_attr_mask_mc.attr, - &format_attr_mask_opc.attr, - &format_attr_mask_vnw.attr, - &format_attr_mask0.attr, - &format_attr_mask1.attr, - NULL, -}; - -static struct uncore_event_desc snbep_uncore_imc_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), - INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), - INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), - { /* end: all zeroes */ }, -}; - -static struct uncore_event_desc snbep_uncore_qpi_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), - INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), - INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"), - INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"), - { /* end: all zeroes */ }, -}; - -static struct attribute_group snbep_uncore_format_group = { - .name = "format", - .attrs = snbep_uncore_formats_attr, -}; - -static struct attribute_group snbep_uncore_ubox_format_group = { - .name = "format", - .attrs = snbep_uncore_ubox_formats_attr, -}; - -static struct attribute_group snbep_uncore_cbox_format_group = { - .name = "format", - .attrs = snbep_uncore_cbox_formats_attr, -}; - -static struct attribute_group snbep_uncore_pcu_format_group = { - .name = "format", - .attrs = snbep_uncore_pcu_formats_attr, -}; - -static struct attribute_group snbep_uncore_qpi_format_group = { - .name = "format", - .attrs = snbep_uncore_qpi_formats_attr, -}; - -#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ - .init_box = snbep_uncore_msr_init_box, \ - .disable_box = snbep_uncore_msr_disable_box, \ - .enable_box = snbep_uncore_msr_enable_box, \ - .disable_event = snbep_uncore_msr_disable_event, \ - .enable_event = snbep_uncore_msr_enable_event, \ - .read_counter = uncore_msr_read_counter - -static struct intel_uncore_ops snbep_uncore_msr_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), -}; - -#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \ - .init_box = snbep_uncore_pci_init_box, \ - .disable_box = snbep_uncore_pci_disable_box, \ - .enable_box = snbep_uncore_pci_enable_box, \ - .disable_event = snbep_uncore_pci_disable_event, \ - .read_counter = snbep_uncore_pci_read_counter - -static struct intel_uncore_ops snbep_uncore_pci_ops = { - SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), - .enable_event = snbep_uncore_pci_enable_event, \ -}; - -static struct event_constraint snbep_uncore_cbox_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x01, 0x1), - UNCORE_EVENT_CONSTRAINT(0x02, 0x3), - UNCORE_EVENT_CONSTRAINT(0x04, 0x3), - UNCORE_EVENT_CONSTRAINT(0x05, 0x3), - UNCORE_EVENT_CONSTRAINT(0x07, 0x3), - UNCORE_EVENT_CONSTRAINT(0x09, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x1), - UNCORE_EVENT_CONSTRAINT(0x12, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x3), - UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), - UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), - UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), - UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), - EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x31, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x35, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x1), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), - UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct event_constraint snbep_uncore_r2pcie_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x12, 0x1), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x24, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct event_constraint snbep_uncore_r3qpi_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x12, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x20, 0x3), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x22, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x24, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2a, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x30, 0x3), - UNCORE_EVENT_CONSTRAINT(0x31, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x3), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type snbep_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 44, - .fixed_ctr_bits = 48, - .perf_ctr = SNBEP_U_MSR_PMON_CTR0, - .event_ctl = SNBEP_U_MSR_PMON_CTL0, - .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, - .ops = &snbep_uncore_msr_ops, - .format_group = &snbep_uncore_ubox_format_group, -}; - -static struct extra_reg snbep_uncore_cbox_extra_regs[] = { - SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, - SNBEP_CBO_PMON_CTL_TID_EN, 0x1), - SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2), - EVENT_EXTRA_END -}; - -static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - int i; - - if (uncore_box_is_fake(box)) - return; - - for (i = 0; i < 5; i++) { - if (reg1->alloc & (0x1 << i)) - atomic_sub(1 << (i * 6), &er->ref); - } - reg1->alloc = 0; -} - -static struct event_constraint * -__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event, - u64 (*cbox_filter_mask)(int fields)) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - int i, alloc = 0; - unsigned long flags; - u64 mask; - - if (reg1->idx == EXTRA_REG_NONE) - return NULL; - - raw_spin_lock_irqsave(&er->lock, flags); - for (i = 0; i < 5; i++) { - if (!(reg1->idx & (0x1 << i))) - continue; - if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) - continue; - - mask = cbox_filter_mask(0x1 << i); - if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) || - !((reg1->config ^ er->config) & mask)) { - atomic_add(1 << (i * 6), &er->ref); - er->config &= ~mask; - er->config |= reg1->config & mask; - alloc |= (0x1 << i); - } else { - break; - } - } - raw_spin_unlock_irqrestore(&er->lock, flags); - if (i < 5) - goto fail; - - if (!uncore_box_is_fake(box)) - reg1->alloc |= alloc; - - return NULL; -fail: - for (; i >= 0; i--) { - if (alloc & (0x1 << i)) - atomic_sub(1 << (i * 6), &er->ref); - } - return &constraint_empty; -} - -static u64 snbep_cbox_filter_mask(int fields) -{ - u64 mask = 0; - - if (fields & 0x1) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID; - if (fields & 0x2) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID; - if (fields & 0x4) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE; - if (fields & 0x8) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC; - - return mask; -} - -static struct event_constraint * -snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask); -} - -static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct extra_reg *er; - int idx = 0; - - for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - idx |= er->idx; - } - - if (idx) { - reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + - SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; - reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx); - reg1->idx = idx; - } - return 0; -} - -static struct intel_uncore_ops snbep_uncore_cbox_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = snbep_cbox_hw_config, - .get_constraint = snbep_cbox_get_constraint, - .put_constraint = snbep_cbox_put_constraint, -}; - -static struct intel_uncore_type snbep_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 8, - .perf_ctr_bits = 44, - .event_ctl = SNBEP_C0_MSR_PMON_CTL0, - .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, - .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = SNBEP_CBO_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = snbep_uncore_cbox_constraints, - .ops = &snbep_uncore_cbox_ops, - .format_group = &snbep_uncore_cbox_format_group, -}; - -static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - u64 config = reg1->config; - - if (new_idx > reg1->idx) - config <<= 8 * (new_idx - reg1->idx); - else - config >>= 8 * (reg1->idx - new_idx); - - if (modify) { - hwc->config += new_idx - reg1->idx; - reg1->config = config; - reg1->idx = new_idx; - } - return config; -} - -static struct event_constraint * -snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - unsigned long flags; - int idx = reg1->idx; - u64 mask, config1 = reg1->config; - bool ok = false; - - if (reg1->idx == EXTRA_REG_NONE || - (!uncore_box_is_fake(box) && reg1->alloc)) - return NULL; -again: - mask = 0xffULL << (idx * 8); - raw_spin_lock_irqsave(&er->lock, flags); - if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) || - !((config1 ^ er->config) & mask)) { - atomic_add(1 << (idx * 8), &er->ref); - er->config &= ~mask; - er->config |= config1 & mask; - ok = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - if (!ok) { - idx = (idx + 1) % 4; - if (idx != reg1->idx) { - config1 = snbep_pcu_alter_er(event, idx, false); - goto again; - } - return &constraint_empty; - } - - if (!uncore_box_is_fake(box)) { - if (idx != reg1->idx) - snbep_pcu_alter_er(event, idx, true); - reg1->alloc = 1; - } - return NULL; -} - -static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - - if (uncore_box_is_fake(box) || !reg1->alloc) - return; - - atomic_sub(1 << (reg1->idx * 8), &er->ref); - reg1->alloc = 0; -} - -static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; - - if (ev_sel >= 0xb && ev_sel <= 0xe) { - reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; - reg1->idx = ev_sel - 0xb; - reg1->config = event->attr.config1 & (0xff << reg1->idx); - } - return 0; -} - -static struct intel_uncore_ops snbep_uncore_pcu_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = snbep_pcu_hw_config, - .get_constraint = snbep_pcu_get_constraint, - .put_constraint = snbep_pcu_put_constraint, -}; - -static struct intel_uncore_type snbep_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, - .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, - .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &snbep_uncore_pcu_ops, - .format_group = &snbep_uncore_pcu_format_group, -}; - -static struct intel_uncore_type *snbep_msr_uncores[] = { - &snbep_uncore_ubox, - &snbep_uncore_cbox, - &snbep_uncore_pcu, - NULL, -}; - -enum { - SNBEP_PCI_QPI_PORT0_FILTER, - SNBEP_PCI_QPI_PORT1_FILTER, -}; - -static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) { - reg1->idx = 0; - reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0; - reg1->config = event->attr.config1; - reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0; - reg2->config = event->attr.config2; - } - return 0; -} - -static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; - struct pci_dev *filter_pdev = extra_pci_dev[box->phys_id][idx]; - WARN_ON_ONCE(!filter_pdev); - if (filter_pdev) { - pci_write_config_dword(filter_pdev, reg1->reg, - (u32)reg1->config); - pci_write_config_dword(filter_pdev, reg1->reg + 4, - (u32)(reg1->config >> 32)); - pci_write_config_dword(filter_pdev, reg2->reg, - (u32)reg2->config); - pci_write_config_dword(filter_pdev, reg2->reg + 4, - (u32)(reg2->config >> 32)); - } - } - - pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static struct intel_uncore_ops snbep_uncore_qpi_ops = { - SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), - .enable_event = snbep_qpi_enable_event, - .hw_config = snbep_qpi_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -#define SNBEP_UNCORE_PCI_COMMON_INIT() \ - .perf_ctr = SNBEP_PCI_PMON_CTR0, \ - .event_ctl = SNBEP_PCI_PMON_CTL0, \ - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ - .ops = &snbep_uncore_pci_ops, \ - .format_group = &snbep_uncore_format_group - -static struct intel_uncore_type snbep_uncore_ha = { - .name = "ha", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type snbep_uncore_imc = { - .name = "imc", - .num_counters = 4, - .num_boxes = 4, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, - .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, - .event_descs = snbep_uncore_imc_events, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type snbep_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &snbep_uncore_qpi_ops, - .event_descs = snbep_uncore_qpi_events, - .format_group = &snbep_uncore_qpi_format_group, -}; - - -static struct intel_uncore_type snbep_uncore_r2pcie = { - .name = "r2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r2pcie_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type snbep_uncore_r3qpi = { - .name = "r3qpi", - .num_counters = 3, - .num_boxes = 2, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r3qpi_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -enum { - SNBEP_PCI_UNCORE_HA, - SNBEP_PCI_UNCORE_IMC, - SNBEP_PCI_UNCORE_QPI, - SNBEP_PCI_UNCORE_R2PCIE, - SNBEP_PCI_UNCORE_R3QPI, -}; - -static struct intel_uncore_type *snbep_pci_uncores[] = { - [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha, - [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc, - [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi, - [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie, - [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi, - NULL, -}; - -static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { - { /* Home Agent */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0), - }, - { /* MC Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0), - }, - { /* MC Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1), - }, - { /* MC Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2), - }, - { /* MC Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3), - }, - { /* QPI Port 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0), - }, - { /* QPI Port 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1), - }, - { /* R2PCIe */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0), - }, - { /* R3QPI Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0), - }, - { /* R3QPI Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT0_FILTER), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT1_FILTER), - }, - { /* end: all zeroes */ } -}; - -static struct pci_driver snbep_uncore_pci_driver = { - .name = "snbep_uncore", - .id_table = snbep_uncore_pci_ids, -}; - -/* - * build pci bus to socket mapping - */ -static int snbep_pci2phy_map_init(int devid) -{ - struct pci_dev *ubox_dev = NULL; - int i, bus, nodeid; - int err = 0; - u32 config = 0; - - while (1) { - /* find the UBOX device */ - ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev); - if (!ubox_dev) - break; - bus = ubox_dev->bus->number; - /* get the Node ID of the local register */ - err = pci_read_config_dword(ubox_dev, 0x40, &config); - if (err) - break; - nodeid = config; - /* get the Node ID mapping */ - err = pci_read_config_dword(ubox_dev, 0x54, &config); - if (err) - break; - /* - * every three bits in the Node ID mapping register maps - * to a particular node. - */ - for (i = 0; i < 8; i++) { - if (nodeid == ((config >> (3 * i)) & 0x7)) { - pcibus_to_physid[bus] = i; - break; - } - } - } - - if (!err) { - /* - * For PCI bus with no UBOX device, find the next bus - * that has UBOX device and use its mapping. - */ - i = -1; - for (bus = 255; bus >= 0; bus--) { - if (pcibus_to_physid[bus] >= 0) - i = pcibus_to_physid[bus]; - else - pcibus_to_physid[bus] = i; - } - } - - if (ubox_dev) - pci_dev_put(ubox_dev); - - return err ? pcibios_err_to_errno(err) : 0; -} -/* end of Sandy Bridge-EP uncore support */ - -/* IvyTown uncore support */ -static void ivt_uncore_msr_init_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - if (msr) - wrmsrl(msr, IVT_PMON_BOX_CTL_INT); -} - -static void ivt_uncore_pci_init_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - - pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVT_PMON_BOX_CTL_INT); -} - -#define IVT_UNCORE_MSR_OPS_COMMON_INIT() \ - .init_box = ivt_uncore_msr_init_box, \ - .disable_box = snbep_uncore_msr_disable_box, \ - .enable_box = snbep_uncore_msr_enable_box, \ - .disable_event = snbep_uncore_msr_disable_event, \ - .enable_event = snbep_uncore_msr_enable_event, \ - .read_counter = uncore_msr_read_counter - -static struct intel_uncore_ops ivt_uncore_msr_ops = { - IVT_UNCORE_MSR_OPS_COMMON_INIT(), -}; - -static struct intel_uncore_ops ivt_uncore_pci_ops = { - .init_box = ivt_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = snbep_uncore_pci_disable_event, - .enable_event = snbep_uncore_pci_enable_event, - .read_counter = snbep_uncore_pci_read_counter, -}; - -#define IVT_UNCORE_PCI_COMMON_INIT() \ - .perf_ctr = SNBEP_PCI_PMON_CTR0, \ - .event_ctl = SNBEP_PCI_PMON_CTL0, \ - .event_mask = IVT_PMON_RAW_EVENT_MASK, \ - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ - .ops = &ivt_uncore_pci_ops, \ - .format_group = &ivt_uncore_format_group - -static struct attribute *ivt_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute *ivt_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - NULL, -}; - -static struct attribute *ivt_uncore_cbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_thresh8.attr, - &format_attr_filter_tid.attr, - &format_attr_filter_link.attr, - &format_attr_filter_state2.attr, - &format_attr_filter_nid2.attr, - &format_attr_filter_opc2.attr, - NULL, -}; - -static struct attribute *ivt_uncore_pcu_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_occ_sel.attr, - &format_attr_edge.attr, - &format_attr_thresh5.attr, - &format_attr_occ_invert.attr, - &format_attr_occ_edge.attr, - &format_attr_filter_band0.attr, - &format_attr_filter_band1.attr, - &format_attr_filter_band2.attr, - &format_attr_filter_band3.attr, - NULL, -}; - -static struct attribute *ivt_uncore_qpi_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_thresh8.attr, - &format_attr_match_rds.attr, - &format_attr_match_rnid30.attr, - &format_attr_match_rnid4.attr, - &format_attr_match_dnid.attr, - &format_attr_match_mc.attr, - &format_attr_match_opc.attr, - &format_attr_match_vnw.attr, - &format_attr_match0.attr, - &format_attr_match1.attr, - &format_attr_mask_rds.attr, - &format_attr_mask_rnid30.attr, - &format_attr_mask_rnid4.attr, - &format_attr_mask_dnid.attr, - &format_attr_mask_mc.attr, - &format_attr_mask_opc.attr, - &format_attr_mask_vnw.attr, - &format_attr_mask0.attr, - &format_attr_mask1.attr, - NULL, -}; - -static struct attribute_group ivt_uncore_format_group = { - .name = "format", - .attrs = ivt_uncore_formats_attr, -}; - -static struct attribute_group ivt_uncore_ubox_format_group = { - .name = "format", - .attrs = ivt_uncore_ubox_formats_attr, -}; - -static struct attribute_group ivt_uncore_cbox_format_group = { - .name = "format", - .attrs = ivt_uncore_cbox_formats_attr, -}; - -static struct attribute_group ivt_uncore_pcu_format_group = { - .name = "format", - .attrs = ivt_uncore_pcu_formats_attr, -}; - -static struct attribute_group ivt_uncore_qpi_format_group = { - .name = "format", - .attrs = ivt_uncore_qpi_formats_attr, -}; - -static struct intel_uncore_type ivt_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 44, - .fixed_ctr_bits = 48, - .perf_ctr = SNBEP_U_MSR_PMON_CTR0, - .event_ctl = SNBEP_U_MSR_PMON_CTL0, - .event_mask = IVT_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, - .ops = &ivt_uncore_msr_ops, - .format_group = &ivt_uncore_ubox_format_group, -}; - -static struct extra_reg ivt_uncore_cbox_extra_regs[] = { - SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, - SNBEP_CBO_PMON_CTL_TID_EN, 0x1), - SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2), - - SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8), - EVENT_EXTRA_END -}; - -static u64 ivt_cbox_filter_mask(int fields) -{ - u64 mask = 0; - - if (fields & 0x1) - mask |= IVT_CB0_MSR_PMON_BOX_FILTER_TID; - if (fields & 0x2) - mask |= IVT_CB0_MSR_PMON_BOX_FILTER_LINK; - if (fields & 0x4) - mask |= IVT_CB0_MSR_PMON_BOX_FILTER_STATE; - if (fields & 0x8) - mask |= IVT_CB0_MSR_PMON_BOX_FILTER_NID; - if (fields & 0x10) - mask |= IVT_CB0_MSR_PMON_BOX_FILTER_OPC; - - return mask; -} - -static struct event_constraint * -ivt_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - return __snbep_cbox_get_constraint(box, event, ivt_cbox_filter_mask); -} - -static int ivt_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct extra_reg *er; - int idx = 0; - - for (er = ivt_uncore_cbox_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - idx |= er->idx; - } - - if (idx) { - reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + - SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; - reg1->config = event->attr.config1 & ivt_cbox_filter_mask(idx); - reg1->idx = idx; - } - return 0; -} - -static void ivt_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - u64 filter = uncore_shared_reg_config(box, 0); - wrmsrl(reg1->reg, filter & 0xffffffff); - wrmsrl(reg1->reg + 6, filter >> 32); - } - - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static struct intel_uncore_ops ivt_uncore_cbox_ops = { - .init_box = ivt_uncore_msr_init_box, - .disable_box = snbep_uncore_msr_disable_box, - .enable_box = snbep_uncore_msr_enable_box, - .disable_event = snbep_uncore_msr_disable_event, - .enable_event = ivt_cbox_enable_event, - .read_counter = uncore_msr_read_counter, - .hw_config = ivt_cbox_hw_config, - .get_constraint = ivt_cbox_get_constraint, - .put_constraint = snbep_cbox_put_constraint, -}; - -static struct intel_uncore_type ivt_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 15, - .perf_ctr_bits = 44, - .event_ctl = SNBEP_C0_MSR_PMON_CTL0, - .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, - .event_mask = IVT_CBO_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = SNBEP_CBO_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = snbep_uncore_cbox_constraints, - .ops = &ivt_uncore_cbox_ops, - .format_group = &ivt_uncore_cbox_format_group, -}; - -static struct intel_uncore_ops ivt_uncore_pcu_ops = { - IVT_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = snbep_pcu_hw_config, - .get_constraint = snbep_pcu_get_constraint, - .put_constraint = snbep_pcu_put_constraint, -}; - -static struct intel_uncore_type ivt_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, - .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, - .event_mask = IVT_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &ivt_uncore_pcu_ops, - .format_group = &ivt_uncore_pcu_format_group, -}; - -static struct intel_uncore_type *ivt_msr_uncores[] = { - &ivt_uncore_ubox, - &ivt_uncore_cbox, - &ivt_uncore_pcu, - NULL, -}; - -static struct intel_uncore_type ivt_uncore_ha = { - .name = "ha", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - IVT_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type ivt_uncore_imc = { - .name = "imc", - .num_counters = 4, - .num_boxes = 8, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, - .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, - IVT_UNCORE_PCI_COMMON_INIT(), -}; - -/* registers in IRP boxes are not properly aligned */ -static unsigned ivt_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4}; -static unsigned ivt_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0}; - -static void ivt_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], - hwc->config | SNBEP_PMON_CTL_EN); -} - -static void ivt_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], hwc->config); -} - -static u64 ivt_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - u64 count = 0; - - pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx], (u32 *)&count); - pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); - - return count; -} - -static struct intel_uncore_ops ivt_uncore_irp_ops = { - .init_box = ivt_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = ivt_uncore_irp_disable_event, - .enable_event = ivt_uncore_irp_enable_event, - .read_counter = ivt_uncore_irp_read_counter, -}; - -static struct intel_uncore_type ivt_uncore_irp = { - .name = "irp", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_mask = IVT_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .ops = &ivt_uncore_irp_ops, - .format_group = &ivt_uncore_format_group, -}; - -static struct intel_uncore_ops ivt_uncore_qpi_ops = { - .init_box = ivt_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = snbep_uncore_pci_disable_event, - .enable_event = snbep_qpi_enable_event, - .read_counter = snbep_uncore_pci_read_counter, - .hw_config = snbep_qpi_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -static struct intel_uncore_type ivt_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 3, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = IVT_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &ivt_uncore_qpi_ops, - .format_group = &ivt_uncore_qpi_format_group, -}; - -static struct intel_uncore_type ivt_uncore_r2pcie = { - .name = "r2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r2pcie_constraints, - IVT_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type ivt_uncore_r3qpi = { - .name = "r3qpi", - .num_counters = 3, - .num_boxes = 2, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r3qpi_constraints, - IVT_UNCORE_PCI_COMMON_INIT(), -}; - -enum { - IVT_PCI_UNCORE_HA, - IVT_PCI_UNCORE_IMC, - IVT_PCI_UNCORE_IRP, - IVT_PCI_UNCORE_QPI, - IVT_PCI_UNCORE_R2PCIE, - IVT_PCI_UNCORE_R3QPI, -}; - -static struct intel_uncore_type *ivt_pci_uncores[] = { - [IVT_PCI_UNCORE_HA] = &ivt_uncore_ha, - [IVT_PCI_UNCORE_IMC] = &ivt_uncore_imc, - [IVT_PCI_UNCORE_IRP] = &ivt_uncore_irp, - [IVT_PCI_UNCORE_QPI] = &ivt_uncore_qpi, - [IVT_PCI_UNCORE_R2PCIE] = &ivt_uncore_r2pcie, - [IVT_PCI_UNCORE_R3QPI] = &ivt_uncore_r3qpi, - NULL, -}; - -static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = { - { /* Home Agent 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 0), - }, - { /* Home Agent 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 1), - }, - { /* MC0 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 0), - }, - { /* MC0 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 1), - }, - { /* MC0 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 2), - }, - { /* MC0 Channel 4 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 3), - }, - { /* MC1 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 4), - }, - { /* MC1 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 5), - }, - { /* MC1 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 6), - }, - { /* MC1 Channel 4 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7), - }, - { /* IRP */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IRP, 0), - }, - { /* QPI0 Port 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0), - }, - { /* QPI0 Port 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 1), - }, - { /* QPI1 Port 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 2), - }, - { /* R2PCIe */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R2PCIE, 0), - }, - { /* R3QPI0 Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 0), - }, - { /* R3QPI0 Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 1), - }, - { /* R3QPI1 Link 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e), - .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT0_FILTER), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT1_FILTER), - }, - { /* end: all zeroes */ } -}; - -static struct pci_driver ivt_uncore_pci_driver = { - .name = "ivt_uncore", - .id_table = ivt_uncore_pci_ids, -}; -/* end of IvyTown uncore support */ - -/* Sandy Bridge uncore support */ -static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (hwc->idx < UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); - else - wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); -} - -static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - wrmsrl(event->hw.config_base, 0); -} - -static void snb_uncore_msr_init_box(struct intel_uncore_box *box) -{ - if (box->pmu->pmu_idx == 0) { - wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, - SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); - } -} - -static struct uncore_event_desc snb_uncore_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), - { /* end: all zeroes */ }, -}; - -static struct attribute *snb_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask5.attr, - NULL, -}; - -static struct attribute_group snb_uncore_format_group = { - .name = "format", - .attrs = snb_uncore_formats_attr, -}; - -static struct intel_uncore_ops snb_uncore_msr_ops = { - .init_box = snb_uncore_msr_init_box, - .disable_event = snb_uncore_msr_disable_event, - .enable_event = snb_uncore_msr_enable_event, - .read_counter = uncore_msr_read_counter, -}; - -static struct event_constraint snb_uncore_cbox_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x80, 0x1), - UNCORE_EVENT_CONSTRAINT(0x83, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type snb_uncore_cbox = { - .name = "cbox", - .num_counters = 2, - .num_boxes = 4, - .perf_ctr_bits = 44, - .fixed_ctr_bits = 48, - .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, - .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, - .fixed_ctr = SNB_UNC_FIXED_CTR, - .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, - .single_fixed = 1, - .event_mask = SNB_UNC_RAW_EVENT_MASK, - .msr_offset = SNB_UNC_CBO_MSR_OFFSET, - .constraints = snb_uncore_cbox_constraints, - .ops = &snb_uncore_msr_ops, - .format_group = &snb_uncore_format_group, - .event_descs = snb_uncore_events, -}; - -static struct intel_uncore_type *snb_msr_uncores[] = { - &snb_uncore_cbox, - NULL, -}; - -enum { - SNB_PCI_UNCORE_IMC, -}; - -static struct uncore_event_desc snb_uncore_imc_events[] = { - INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"), - INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"), - - INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"), - INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"), - - { /* end: all zeroes */ }, -}; - -#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff -#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48 - -/* page size multiple covering all config regs */ -#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000 - -#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1 -#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050 -#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2 -#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054 -#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE - -static struct attribute *snb_uncore_imc_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group snb_uncore_imc_format_group = { - .name = "format", - .attrs = snb_uncore_imc_formats_attr, -}; - -static void snb_uncore_imc_init_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; - resource_size_t addr; - u32 pci_dword; - - pci_read_config_dword(pdev, where, &pci_dword); - addr = pci_dword; - -#ifdef CONFIG_PHYS_ADDR_T_64BIT - pci_read_config_dword(pdev, where + 4, &pci_dword); - addr |= ((resource_size_t)pci_dword << 32); -#endif - - addr &= ~(PAGE_SIZE - 1); - - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); - box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; -} - -static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) -{} - -static void snb_uncore_imc_disable_box(struct intel_uncore_box *box) -{} - -static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{} - -static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{} - -static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - return (u64)*(unsigned int *)(box->io_addr + hwc->event_base); -} - -/* - * custom event_init() function because we define our own fixed, free - * running counters, so we do not want to conflict with generic uncore - * logic. Also simplifies processing - */ -static int snb_uncore_imc_event_init(struct perf_event *event) -{ - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - struct hw_perf_event *hwc = &event->hw; - u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK; - int idx, base; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - pmu = uncore_event_to_pmu(event); - /* no device found for this pmu */ - if (pmu->func_id < 0) - return -ENOENT; - - /* Sampling not supported yet */ - if (hwc->sample_period) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - /* - * Place all uncore events for a particular physical package - * onto a single cpu - */ - if (event->cpu < 0) - return -EINVAL; - - /* check only supported bits are set */ - if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK) - return -EINVAL; - - box = uncore_pmu_to_box(pmu, event->cpu); - if (!box || box->cpu < 0) - return -EINVAL; - - event->cpu = box->cpu; - - event->hw.idx = -1; - event->hw.last_tag = ~0ULL; - event->hw.extra_reg.idx = EXTRA_REG_NONE; - event->hw.branch_reg.idx = EXTRA_REG_NONE; - /* - * check event is known (whitelist, determines counter) - */ - switch (cfg) { - case SNB_UNCORE_PCI_IMC_DATA_READS: - base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE; - idx = UNCORE_PMC_IDX_FIXED; - break; - case SNB_UNCORE_PCI_IMC_DATA_WRITES: - base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; - idx = UNCORE_PMC_IDX_FIXED + 1; - break; - default: - return -EINVAL; - } - - /* must be done before validate_group */ - event->hw.event_base = base; - event->hw.config = cfg; - event->hw.idx = idx; - - /* no group validation needed, we have free running counters */ - - return 0; -} - -static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - return 0; -} - -static void snb_uncore_imc_event_start(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - u64 count; - - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - - event->hw.state = 0; - box->n_active++; - - list_add_tail(&event->active_entry, &box->active_list); - - count = snb_uncore_imc_read_counter(box, event); - local64_set(&event->hw.prev_count, count); - - if (box->n_active == 1) - uncore_pmu_start_hrtimer(box); -} - -static void snb_uncore_imc_event_stop(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - - if (!(hwc->state & PERF_HES_STOPPED)) { - box->n_active--; - - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - - list_del(&event->active_entry); - - if (box->n_active == 0) - uncore_pmu_cancel_hrtimer(box); - } - - if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - uncore_perf_event_update(box, event); - hwc->state |= PERF_HES_UPTODATE; - } -} - -static int snb_uncore_imc_event_add(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - - if (!box) - return -ENODEV; - - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - if (!(flags & PERF_EF_START)) - hwc->state |= PERF_HES_ARCH; - - snb_uncore_imc_event_start(event, 0); - - box->n_events++; - - return 0; -} - -static void snb_uncore_imc_event_del(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - int i; - - snb_uncore_imc_event_stop(event, PERF_EF_UPDATE); - - for (i = 0; i < box->n_events; i++) { - if (event == box->event_list[i]) { - --box->n_events; - break; - } - } -} - -static int snb_pci2phy_map_init(int devid) -{ - struct pci_dev *dev = NULL; - int bus; - - dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); - if (!dev) - return -ENOTTY; - - bus = dev->bus->number; - - pcibus_to_physid[bus] = 0; - - pci_dev_put(dev); - - return 0; -} - -static struct pmu snb_uncore_imc_pmu = { - .task_ctx_nr = perf_invalid_context, - .event_init = snb_uncore_imc_event_init, - .add = snb_uncore_imc_event_add, - .del = snb_uncore_imc_event_del, - .start = snb_uncore_imc_event_start, - .stop = snb_uncore_imc_event_stop, - .read = uncore_pmu_event_read, -}; - -static struct intel_uncore_ops snb_uncore_imc_ops = { - .init_box = snb_uncore_imc_init_box, - .enable_box = snb_uncore_imc_enable_box, - .disable_box = snb_uncore_imc_disable_box, - .disable_event = snb_uncore_imc_disable_event, - .enable_event = snb_uncore_imc_enable_event, - .hw_config = snb_uncore_imc_hw_config, - .read_counter = snb_uncore_imc_read_counter, -}; - -static struct intel_uncore_type snb_uncore_imc = { - .name = "imc", - .num_counters = 2, - .num_boxes = 1, - .fixed_ctr_bits = 32, - .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE, - .event_descs = snb_uncore_imc_events, - .format_group = &snb_uncore_imc_format_group, - .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE, - .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK, - .ops = &snb_uncore_imc_ops, - .pmu = &snb_uncore_imc_pmu, -}; - -static struct intel_uncore_type *snb_pci_uncores[] = { - [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc, - NULL, -}; - -static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static DEFINE_PCI_DEVICE_TABLE(hsw_uncore_pci_ids) = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static struct pci_driver snb_uncore_pci_driver = { - .name = "snb_uncore", - .id_table = snb_uncore_pci_ids, -}; - -static struct pci_driver ivb_uncore_pci_driver = { - .name = "ivb_uncore", - .id_table = ivb_uncore_pci_ids, -}; - -static struct pci_driver hsw_uncore_pci_driver = { - .name = "hsw_uncore", - .id_table = hsw_uncore_pci_ids, -}; - -/* end of Sandy Bridge uncore support */ - -/* Nehalem uncore support */ -static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) -{ - wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); -} - -static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) -{ - wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); -} - -static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (hwc->idx < UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); - else - wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); -} - -static struct attribute *nhm_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask8.attr, - NULL, -}; - -static struct attribute_group nhm_uncore_format_group = { - .name = "format", - .attrs = nhm_uncore_formats_attr, -}; - -static struct uncore_event_desc nhm_uncore_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), - INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), - INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), - INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), - INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), - INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), - INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), - INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), - INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_ops nhm_uncore_msr_ops = { - .disable_box = nhm_uncore_msr_disable_box, - .enable_box = nhm_uncore_msr_enable_box, - .disable_event = snb_uncore_msr_disable_event, - .enable_event = nhm_uncore_msr_enable_event, - .read_counter = uncore_msr_read_counter, -}; - -static struct intel_uncore_type nhm_uncore = { - .name = "", - .num_counters = 8, - .num_boxes = 1, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .event_ctl = NHM_UNC_PERFEVTSEL0, - .perf_ctr = NHM_UNC_UNCORE_PMC0, - .fixed_ctr = NHM_UNC_FIXED_CTR, - .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, - .event_mask = NHM_UNC_RAW_EVENT_MASK, - .event_descs = nhm_uncore_events, - .ops = &nhm_uncore_msr_ops, - .format_group = &nhm_uncore_format_group, -}; - -static struct intel_uncore_type *nhm_msr_uncores[] = { - &nhm_uncore, - NULL, -}; -/* end of Nehalem uncore support */ - -/* Nehalem-EX uncore support */ -DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5"); -DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7"); -DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63"); -DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); - -static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) -{ - wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); -} - -static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - u64 config; - - if (msr) { - rdmsrl(msr, config); - config &= ~((1ULL << uncore_num_counters(box)) - 1); - /* WBox has a fixed counter */ - if (uncore_msr_fixed_ctl(box)) - config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN; - wrmsrl(msr, config); - } -} - -static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - u64 config; - - if (msr) { - rdmsrl(msr, config); - config |= (1ULL << uncore_num_counters(box)) - 1; - /* WBox has a fixed counter */ - if (uncore_msr_fixed_ctl(box)) - config |= NHMEX_W_PMON_GLOBAL_FIXED_EN; - wrmsrl(msr, config); - } -} - -static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - wrmsrl(event->hw.config_base, 0); -} - -static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (hwc->idx >= UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0); - else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0) - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); - else - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); -} - -#define NHMEX_UNCORE_OPS_COMMON_INIT() \ - .init_box = nhmex_uncore_msr_init_box, \ - .disable_box = nhmex_uncore_msr_disable_box, \ - .enable_box = nhmex_uncore_msr_enable_box, \ - .disable_event = nhmex_uncore_msr_disable_event, \ - .read_counter = uncore_msr_read_counter - -static struct intel_uncore_ops nhmex_uncore_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_uncore_msr_enable_event, -}; - -static struct attribute *nhmex_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_edge.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_ubox_format_group = { - .name = "format", - .attrs = nhmex_uncore_ubox_formats_attr, -}; - -static struct intel_uncore_type nhmex_uncore_ubox = { - .name = "ubox", - .num_counters = 1, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_U_MSR_PMON_EV_SEL, - .perf_ctr = NHMEX_U_MSR_PMON_CTR, - .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL, - .ops = &nhmex_uncore_ops, - .format_group = &nhmex_uncore_ubox_format_group -}; - -static struct attribute *nhmex_uncore_cbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_cbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_cbox_formats_attr, -}; - -/* msr offset for each instance of cbox */ -static unsigned nhmex_cbox_msr_offsets[] = { - 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0, -}; - -static struct intel_uncore_type nhmex_uncore_cbox = { - .name = "cbox", - .num_counters = 6, - .num_boxes = 10, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0, - .perf_ctr = NHMEX_C0_MSR_PMON_CTR0, - .event_mask = NHMEX_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL, - .msr_offsets = nhmex_cbox_msr_offsets, - .pair_ctr_ctl = 1, - .ops = &nhmex_uncore_ops, - .format_group = &nhmex_uncore_cbox_format_group -}; - -static struct uncore_event_desc nhmex_uncore_wbox_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_type nhmex_uncore_wbox = { - .name = "wbox", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_W_MSR_PMON_CNT0, - .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0, - .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR, - .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL, - .event_mask = NHMEX_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_W_MSR_GLOBAL_CTL, - .pair_ctr_ctl = 1, - .event_descs = nhmex_uncore_wbox_events, - .ops = &nhmex_uncore_ops, - .format_group = &nhmex_uncore_cbox_format_group -}; - -static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - int ctr, ev_sel; - - ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >> - NHMEX_B_PMON_CTR_SHIFT; - ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >> - NHMEX_B_PMON_CTL_EV_SEL_SHIFT; - - /* events that do not use the match/mask registers */ - if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) || - (ctr == 2 && ev_sel != 0x4) || ctr == 3) - return 0; - - if (box->pmu->pmu_idx == 0) - reg1->reg = NHMEX_B0_MSR_MATCH; - else - reg1->reg = NHMEX_B1_MSR_MATCH; - reg1->idx = 0; - reg1->config = event->attr.config1; - reg2->config = event->attr.config2; - return 0; -} - -static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - wrmsrl(reg1->reg, reg1->config); - wrmsrl(reg1->reg + 1, reg2->config); - } - wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | - (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK)); -} - -/* - * The Bbox has 4 counters, but each counter monitors different events. - * Use bits 6-7 in the event config to select counter. - */ -static struct event_constraint nhmex_uncore_bbox_constraints[] = { - EVENT_CONSTRAINT(0 , 1, 0xc0), - EVENT_CONSTRAINT(0x40, 2, 0xc0), - EVENT_CONSTRAINT(0x80, 4, 0xc0), - EVENT_CONSTRAINT(0xc0, 8, 0xc0), - EVENT_CONSTRAINT_END, -}; - -static struct attribute *nhmex_uncore_bbox_formats_attr[] = { - &format_attr_event5.attr, - &format_attr_counter.attr, - &format_attr_match.attr, - &format_attr_mask.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_bbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_bbox_formats_attr, -}; - -static struct intel_uncore_ops nhmex_uncore_bbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_bbox_msr_enable_event, - .hw_config = nhmex_bbox_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_bbox = { - .name = "bbox", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_B0_MSR_PMON_CTL0, - .perf_ctr = NHMEX_B0_MSR_PMON_CTR0, - .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL, - .msr_offset = NHMEX_B_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 1, - .constraints = nhmex_uncore_bbox_constraints, - .ops = &nhmex_uncore_bbox_ops, - .format_group = &nhmex_uncore_bbox_format_group -}; - -static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - /* only TO_R_PROG_EV event uses the match/mask register */ - if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) != - NHMEX_S_EVENT_TO_R_PROG_EV) - return 0; - - if (box->pmu->pmu_idx == 0) - reg1->reg = NHMEX_S0_MSR_MM_CFG; - else - reg1->reg = NHMEX_S1_MSR_MM_CFG; - reg1->idx = 0; - reg1->config = event->attr.config1; - reg2->config = event->attr.config2; - return 0; -} - -static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - wrmsrl(reg1->reg, 0); - wrmsrl(reg1->reg + 1, reg1->config); - wrmsrl(reg1->reg + 2, reg2->config); - wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); - } - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); -} - -static struct attribute *nhmex_uncore_sbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - &format_attr_match.attr, - &format_attr_mask.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_sbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_sbox_formats_attr, -}; - -static struct intel_uncore_ops nhmex_uncore_sbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_sbox_msr_enable_event, - .hw_config = nhmex_sbox_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_sbox = { - .name = "sbox", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_S0_MSR_PMON_CTL0, - .perf_ctr = NHMEX_S0_MSR_PMON_CTR0, - .event_mask = NHMEX_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL, - .msr_offset = NHMEX_S_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 1, - .ops = &nhmex_uncore_sbox_ops, - .format_group = &nhmex_uncore_sbox_format_group -}; - -enum { - EXTRA_REG_NHMEX_M_FILTER, - EXTRA_REG_NHMEX_M_DSP, - EXTRA_REG_NHMEX_M_ISS, - EXTRA_REG_NHMEX_M_MAP, - EXTRA_REG_NHMEX_M_MSC_THR, - EXTRA_REG_NHMEX_M_PGT, - EXTRA_REG_NHMEX_M_PLD, - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC, -}; - -static struct extra_reg nhmex_uncore_mbox_extra_regs[] = { - MBOX_INC_SEL_EXTAR_REG(0x0, DSP), - MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR), - MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR), - MBOX_INC_SEL_EXTAR_REG(0x9, ISS), - /* event 0xa uses two extra registers */ - MBOX_INC_SEL_EXTAR_REG(0xa, ISS), - MBOX_INC_SEL_EXTAR_REG(0xa, PLD), - MBOX_INC_SEL_EXTAR_REG(0xb, PLD), - /* events 0xd ~ 0x10 use the same extra register */ - MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0x16, PGT), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP), - EVENT_EXTRA_END -}; - -/* Nehalem-EX or Westmere-EX ? */ -static bool uncore_nhmex; - -static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config) -{ - struct intel_uncore_extra_reg *er; - unsigned long flags; - bool ret = false; - u64 mask; - - if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { - er = &box->shared_regs[idx]; - raw_spin_lock_irqsave(&er->lock, flags); - if (!atomic_read(&er->ref) || er->config == config) { - atomic_inc(&er->ref); - er->config = config; - ret = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - return ret; - } - /* - * The ZDP_CTL_FVC MSR has 4 fields which are used to control - * events 0xd ~ 0x10. Besides these 4 fields, there are additional - * fields which are shared. - */ - idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - if (WARN_ON_ONCE(idx >= 4)) - return false; - - /* mask of the shared fields */ - if (uncore_nhmex) - mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; - else - mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK; - er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; - - raw_spin_lock_irqsave(&er->lock, flags); - /* add mask of the non-shared field if it's in use */ - if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) { - if (uncore_nhmex) - mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - else - mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - } - - if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) { - atomic_add(1 << (idx * 8), &er->ref); - if (uncore_nhmex) - mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | - NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - else - mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK | - WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - er->config &= ~mask; - er->config |= (config & mask); - ret = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - return ret; -} - -static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx) -{ - struct intel_uncore_extra_reg *er; - - if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { - er = &box->shared_regs[idx]; - atomic_dec(&er->ref); - return; - } - - idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; - atomic_sub(1 << (idx * 8), &er->ref); -} - -static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8); - u64 config = reg1->config; - - /* get the non-shared control bits and shift them */ - idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - if (uncore_nhmex) - config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - else - config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - if (new_idx > orig_idx) { - idx = new_idx - orig_idx; - config <<= 3 * idx; - } else { - idx = orig_idx - new_idx; - config >>= 3 * idx; - } - - /* add the shared control bits back */ - if (uncore_nhmex) - config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; - else - config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; - config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; - if (modify) { - /* adjust the main event selector */ - if (new_idx > orig_idx) - hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; - else - hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; - reg1->config = config; - reg1->idx = ~0xff | new_idx; - } - return config; -} - -static struct event_constraint * -nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - int i, idx[2], alloc = 0; - u64 config1 = reg1->config; - - idx[0] = __BITS_VALUE(reg1->idx, 0, 8); - idx[1] = __BITS_VALUE(reg1->idx, 1, 8); -again: - for (i = 0; i < 2; i++) { - if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) - idx[i] = 0xff; - - if (idx[i] == 0xff) - continue; - - if (!nhmex_mbox_get_shared_reg(box, idx[i], - __BITS_VALUE(config1, i, 32))) - goto fail; - alloc |= (0x1 << i); - } - - /* for the match/mask registers */ - if (reg2->idx != EXTRA_REG_NONE && - (uncore_box_is_fake(box) || !reg2->alloc) && - !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config)) - goto fail; - - /* - * If it's a fake box -- as per validate_{group,event}() we - * shouldn't touch event state and we can avoid doing so - * since both will only call get_event_constraints() once - * on each event, this avoids the need for reg->alloc. - */ - if (!uncore_box_is_fake(box)) { - if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) - nhmex_mbox_alter_er(event, idx[0], true); - reg1->alloc |= alloc; - if (reg2->idx != EXTRA_REG_NONE) - reg2->alloc = 1; - } - return NULL; -fail: - if (idx[0] != 0xff && !(alloc & 0x1) && - idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { - /* - * events 0xd ~ 0x10 are functional identical, but are - * controlled by different fields in the ZDP_CTL_FVC - * register. If we failed to take one field, try the - * rest 3 choices. - */ - BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff); - idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - idx[0] = (idx[0] + 1) % 4; - idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) { - config1 = nhmex_mbox_alter_er(event, idx[0], false); - goto again; - } - } - - if (alloc & 0x1) - nhmex_mbox_put_shared_reg(box, idx[0]); - if (alloc & 0x2) - nhmex_mbox_put_shared_reg(box, idx[1]); - return &constraint_empty; -} - -static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - - if (uncore_box_is_fake(box)) - return; - - if (reg1->alloc & 0x1) - nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8)); - if (reg1->alloc & 0x2) - nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8)); - reg1->alloc = 0; - - if (reg2->alloc) { - nhmex_mbox_put_shared_reg(box, reg2->idx); - reg2->alloc = 0; - } -} - -static int nhmex_mbox_extra_reg_idx(struct extra_reg *er) -{ - if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) - return er->idx; - return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd; -} - -static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_type *type = box->pmu->type; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - struct extra_reg *er; - unsigned msr; - int reg_idx = 0; - /* - * The mbox events may require 2 extra MSRs at the most. But only - * the lower 32 bits in these MSRs are significant, so we can use - * config1 to pass two MSRs' config. - */ - for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - if (event->attr.config1 & ~er->valid_mask) - return -EINVAL; - - msr = er->msr + type->msr_offset * box->pmu->pmu_idx; - if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff)) - return -EINVAL; - - /* always use the 32~63 bits to pass the PLD config */ - if (er->idx == EXTRA_REG_NHMEX_M_PLD) - reg_idx = 1; - else if (WARN_ON_ONCE(reg_idx > 0)) - return -EINVAL; - - reg1->idx &= ~(0xff << (reg_idx * 8)); - reg1->reg &= ~(0xffff << (reg_idx * 16)); - reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8); - reg1->reg |= msr << (reg_idx * 16); - reg1->config = event->attr.config1; - reg_idx++; - } - /* - * The mbox only provides ability to perform address matching - * for the PLD events. - */ - if (reg_idx == 2) { - reg2->idx = EXTRA_REG_NHMEX_M_FILTER; - if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) - reg2->config = event->attr.config2; - else - reg2->config = ~0ULL; - if (box->pmu->pmu_idx == 0) - reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; - else - reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; - } - return 0; -} - -static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx) -{ - struct intel_uncore_extra_reg *er; - unsigned long flags; - u64 config; - - if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) - return box->shared_regs[idx].config; - - er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; - raw_spin_lock_irqsave(&er->lock, flags); - config = er->config; - raw_spin_unlock_irqrestore(&er->lock, flags); - return config; -} - -static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - int idx; - - idx = __BITS_VALUE(reg1->idx, 0, 8); - if (idx != 0xff) - wrmsrl(__BITS_VALUE(reg1->reg, 0, 16), - nhmex_mbox_shared_reg_config(box, idx)); - idx = __BITS_VALUE(reg1->idx, 1, 8); - if (idx != 0xff) - wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), - nhmex_mbox_shared_reg_config(box, idx)); - - if (reg2->idx != EXTRA_REG_NONE) { - wrmsrl(reg2->reg, 0); - if (reg2->config != ~0ULL) { - wrmsrl(reg2->reg + 1, - reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); - wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & - (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); - wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); - } - } - - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); -} - -DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); -DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); -DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); -DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); -DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); -DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); -DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63"); -DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); -DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); -DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); - -static struct attribute *nhmex_uncore_mbox_formats_attr[] = { - &format_attr_count_mode.attr, - &format_attr_storage_mode.attr, - &format_attr_wrap_mode.attr, - &format_attr_flag_mode.attr, - &format_attr_inc_sel.attr, - &format_attr_set_flag_sel.attr, - &format_attr_filter_cfg_en.attr, - &format_attr_filter_match.attr, - &format_attr_filter_mask.attr, - &format_attr_dsp.attr, - &format_attr_thr.attr, - &format_attr_fvc.attr, - &format_attr_pgt.attr, - &format_attr_map.attr, - &format_attr_iss.attr, - &format_attr_pld.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_mbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_mbox_formats_attr, -}; - -static struct uncore_event_desc nhmex_uncore_mbox_events[] = { - INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"), - INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"), - { /* end: all zeroes */ }, -}; - -static struct uncore_event_desc wsmex_uncore_mbox_events[] = { - INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"), - INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_ops nhmex_uncore_mbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_mbox_msr_enable_event, - .hw_config = nhmex_mbox_hw_config, - .get_constraint = nhmex_mbox_get_constraint, - .put_constraint = nhmex_mbox_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_mbox = { - .name = "mbox", - .num_counters = 6, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_M0_MSR_PMU_CTL0, - .perf_ctr = NHMEX_M0_MSR_PMU_CNT0, - .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL, - .msr_offset = NHMEX_M_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 8, - .event_descs = nhmex_uncore_mbox_events, - .ops = &nhmex_uncore_mbox_ops, - .format_group = &nhmex_uncore_mbox_format_group, -}; - -static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - - /* adjust the main event selector and extra register index */ - if (reg1->idx % 2) { - reg1->idx--; - hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; - } else { - reg1->idx++; - hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; - } - - /* adjust extra register config */ - switch (reg1->idx % 6) { - case 2: - /* shift the 8~15 bits to the 0~7 bits */ - reg1->config >>= 8; - break; - case 3: - /* shift the 0~7 bits to the 8~15 bits */ - reg1->config <<= 8; - break; - }; -} - -/* - * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7. - * An event set consists of 6 events, the 3rd and 4th events in - * an event set use the same extra register. So an event set uses - * 5 extra registers. - */ -static struct event_constraint * -nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - struct intel_uncore_extra_reg *er; - unsigned long flags; - int idx, er_idx; - u64 config1; - bool ok = false; - - if (!uncore_box_is_fake(box) && reg1->alloc) - return NULL; - - idx = reg1->idx % 6; - config1 = reg1->config; -again: - er_idx = idx; - /* the 3rd and 4th events use the same extra register */ - if (er_idx > 2) - er_idx--; - er_idx += (reg1->idx / 6) * 5; - - er = &box->shared_regs[er_idx]; - raw_spin_lock_irqsave(&er->lock, flags); - if (idx < 2) { - if (!atomic_read(&er->ref) || er->config == reg1->config) { - atomic_inc(&er->ref); - er->config = reg1->config; - ok = true; - } - } else if (idx == 2 || idx == 3) { - /* - * these two events use different fields in a extra register, - * the 0~7 bits and the 8~15 bits respectively. - */ - u64 mask = 0xff << ((idx - 2) * 8); - if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) || - !((er->config ^ config1) & mask)) { - atomic_add(1 << ((idx - 2) * 8), &er->ref); - er->config &= ~mask; - er->config |= config1 & mask; - ok = true; - } - } else { - if (!atomic_read(&er->ref) || - (er->config == (hwc->config >> 32) && - er->config1 == reg1->config && - er->config2 == reg2->config)) { - atomic_inc(&er->ref); - er->config = (hwc->config >> 32); - er->config1 = reg1->config; - er->config2 = reg2->config; - ok = true; - } - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - if (!ok) { - /* - * The Rbox events are always in pairs. The paired - * events are functional identical, but use different - * extra registers. If we failed to take an extra - * register, try the alternative. - */ - if (idx % 2) - idx--; - else - idx++; - if (idx != reg1->idx % 6) { - if (idx == 2) - config1 >>= 8; - else if (idx == 3) - config1 <<= 8; - goto again; - } - } else { - if (!uncore_box_is_fake(box)) { - if (idx != reg1->idx % 6) - nhmex_rbox_alter_er(box, event); - reg1->alloc = 1; - } - return NULL; - } - return &constraint_empty; -} - -static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_extra_reg *er; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - int idx, er_idx; - - if (uncore_box_is_fake(box) || !reg1->alloc) - return; - - idx = reg1->idx % 6; - er_idx = idx; - if (er_idx > 2) - er_idx--; - er_idx += (reg1->idx / 6) * 5; - - er = &box->shared_regs[er_idx]; - if (idx == 2 || idx == 3) - atomic_sub(1 << ((idx - 2) * 8), &er->ref); - else - atomic_dec(&er->ref); - - reg1->alloc = 0; -} - -static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - int idx; - - idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >> - NHMEX_R_PMON_CTL_EV_SEL_SHIFT; - if (idx >= 0x18) - return -EINVAL; - - reg1->idx = idx; - reg1->config = event->attr.config1; - - switch (idx % 6) { - case 4: - case 5: - hwc->config |= event->attr.config & (~0ULL << 32); - reg2->config = event->attr.config2; - break; - }; - return 0; -} - -static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - int idx, port; - - idx = reg1->idx; - port = idx / 6 + box->pmu->pmu_idx * 4; - - switch (idx % 6) { - case 0: - wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config); - break; - case 1: - wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config); - break; - case 2: - case 3: - wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port), - uncore_shared_reg_config(box, 2 + (idx / 6) * 5)); - break; - case 4: - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port), - hwc->config >> 32); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config); - break; - case 5: - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port), - hwc->config >> 32); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config); - break; - }; - - wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | - (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); -} - -DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63"); -DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63"); -DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63"); -DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15"); -DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31"); - -static struct attribute *nhmex_uncore_rbox_formats_attr[] = { - &format_attr_event5.attr, - &format_attr_xbr_mm_cfg.attr, - &format_attr_xbr_match.attr, - &format_attr_xbr_mask.attr, - &format_attr_qlx_cfg.attr, - &format_attr_iperf_cfg.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_rbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_rbox_formats_attr, -}; - -static struct uncore_event_desc nhmex_uncore_rbox_events[] = { - INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"), - INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"), - INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"), - INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"), - INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"), - INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_ops nhmex_uncore_rbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_rbox_msr_enable_event, - .hw_config = nhmex_rbox_hw_config, - .get_constraint = nhmex_rbox_get_constraint, - .put_constraint = nhmex_rbox_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_rbox = { - .name = "rbox", - .num_counters = 8, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_R_MSR_PMON_CTL0, - .perf_ctr = NHMEX_R_MSR_PMON_CNT0, - .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_R_MSR_GLOBAL_CTL, - .msr_offset = NHMEX_R_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 20, - .event_descs = nhmex_uncore_rbox_events, - .ops = &nhmex_uncore_rbox_ops, - .format_group = &nhmex_uncore_rbox_format_group -}; - -static struct intel_uncore_type *nhmex_msr_uncores[] = { - &nhmex_uncore_ubox, - &nhmex_uncore_cbox, - &nhmex_uncore_bbox, - &nhmex_uncore_sbox, - &nhmex_uncore_mbox, - &nhmex_uncore_rbox, - &nhmex_uncore_wbox, - NULL, -}; -/* end of Nehalem-EX uncore support */ - static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) { struct hw_perf_event *hwc = &event->hw; @@ -3143,7 +170,7 @@ static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_eve hwc->event_base = uncore_perf_ctr(box, hwc->idx); } -static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event) +void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event) { u64 prev_count, new_count, delta; int shift; @@ -3204,14 +231,14 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) return HRTIMER_RESTART; } -static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) +void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) { __hrtimer_start_range_ns(&box->hrtimer, ns_to_ktime(box->hrtimer_duration), 0, HRTIMER_MODE_REL_PINNED, 0); } -static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) +void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) { hrtimer_cancel(&box->hrtimer); } @@ -3294,7 +321,7 @@ uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *eve } if (event->attr.config == UNCORE_FIXED_EVENT) - return &constraint_fixed; + return &uncore_constraint_fixed; if (type->constraints) { for_each_event_constraint(c, type->constraints) { @@ -3499,7 +526,7 @@ static void uncore_pmu_event_del(struct perf_event *event, int flags) event->hw.last_tag = ~0ULL; } -static void uncore_pmu_event_read(struct perf_event *event) +void uncore_pmu_event_read(struct perf_event *event) { struct intel_uncore_box *box = uncore_event_to_box(event); uncore_perf_event_update(box, event); @@ -3638,7 +665,7 @@ static struct attribute_group uncore_pmu_attr_group = { .attrs = uncore_pmu_attrs, }; -static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) +static int uncore_pmu_register(struct intel_uncore_pmu *pmu) { int ret; @@ -3761,9 +788,6 @@ fail: return ret; } -static struct pci_driver *uncore_pci_driver; -static bool pcidrv_registered; - /* * add a pci uncore device */ @@ -3773,18 +797,20 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id struct intel_uncore_box *box; struct intel_uncore_type *type; int phys_id; + bool first_box = false; - phys_id = pcibus_to_physid[pdev->bus->number]; + phys_id = uncore_pcibus_to_physid[pdev->bus->number]; if (phys_id < 0) return -ENODEV; if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { - extra_pci_dev[phys_id][UNCORE_PCI_DEV_IDX(id->driver_data)] = pdev; + int idx = UNCORE_PCI_DEV_IDX(id->driver_data); + uncore_extra_pci_dev[phys_id][idx] = pdev; pci_set_drvdata(pdev, NULL); return 0; } - type = pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; box = uncore_alloc_box(type, NUMA_NO_NODE); if (!box) return -ENOMEM; @@ -3806,9 +832,13 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id pci_set_drvdata(pdev, box); raw_spin_lock(&uncore_box_lock); + if (list_empty(&pmu->box_list)) + first_box = true; list_add_tail(&box->list, &pmu->box_list); raw_spin_unlock(&uncore_box_lock); + if (first_box) + uncore_pmu_register(pmu); return 0; } @@ -3816,13 +846,14 @@ static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box = pci_get_drvdata(pdev); struct intel_uncore_pmu *pmu; - int i, cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number]; + bool last_box = false; box = pci_get_drvdata(pdev); if (!box) { for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { - if (extra_pci_dev[phys_id][i] == pdev) { - extra_pci_dev[phys_id][i] = NULL; + if (uncore_extra_pci_dev[phys_id][i] == pdev) { + uncore_extra_pci_dev[phys_id][i] = NULL; break; } } @@ -3838,6 +869,8 @@ static void uncore_pci_remove(struct pci_dev *pdev) raw_spin_lock(&uncore_box_lock); list_del(&box->list); + if (list_empty(&pmu->box_list)) + last_box = true; raw_spin_unlock(&uncore_box_lock); for_each_possible_cpu(cpu) { @@ -3849,6 +882,9 @@ static void uncore_pci_remove(struct pci_dev *pdev) WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); kfree(box); + + if (last_box) + perf_pmu_unregister(&pmu->pmu); } static int __init uncore_pci_init(void) @@ -3857,46 +893,32 @@ static int __init uncore_pci_init(void) switch (boot_cpu_data.x86_model) { case 45: /* Sandy Bridge-EP */ - ret = snbep_pci2phy_map_init(0x3ce0); - if (ret) - return ret; - pci_uncores = snbep_pci_uncores; - uncore_pci_driver = &snbep_uncore_pci_driver; + ret = snbep_uncore_pci_init(); break; - case 62: /* IvyTown */ - ret = snbep_pci2phy_map_init(0x0e1e); - if (ret) - return ret; - pci_uncores = ivt_pci_uncores; - uncore_pci_driver = &ivt_uncore_pci_driver; + case 62: /* Ivy Bridge-EP */ + ret = ivbep_uncore_pci_init(); + break; + case 63: /* Haswell-EP */ + ret = hswep_uncore_pci_init(); break; case 42: /* Sandy Bridge */ - ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC); - if (ret) - return ret; - pci_uncores = snb_pci_uncores; - uncore_pci_driver = &snb_uncore_pci_driver; + ret = snb_uncore_pci_init(); break; case 58: /* Ivy Bridge */ - ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC); - if (ret) - return ret; - pci_uncores = snb_pci_uncores; - uncore_pci_driver = &ivb_uncore_pci_driver; + ret = ivb_uncore_pci_init(); break; case 60: /* Haswell */ case 69: /* Haswell Celeron */ - ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC); - if (ret) - return ret; - pci_uncores = snb_pci_uncores; - uncore_pci_driver = &hsw_uncore_pci_driver; + ret = hsw_uncore_pci_init(); break; default: return 0; } - ret = uncore_types_init(pci_uncores); + if (ret) + return ret; + + ret = uncore_types_init(uncore_pci_uncores); if (ret) return ret; @@ -3907,7 +929,7 @@ static int __init uncore_pci_init(void) if (ret == 0) pcidrv_registered = true; else - uncore_types_exit(pci_uncores); + uncore_types_exit(uncore_pci_uncores); return ret; } @@ -3917,7 +939,7 @@ static void __init uncore_pci_exit(void) if (pcidrv_registered) { pcidrv_registered = false; pci_unregister_driver(uncore_pci_driver); - uncore_types_exit(pci_uncores); + uncore_types_exit(uncore_pci_uncores); } } @@ -3943,8 +965,8 @@ static void uncore_cpu_dying(int cpu) struct intel_uncore_box *box; int i, j; - for (i = 0; msr_uncores[i]; i++) { - type = msr_uncores[i]; + for (i = 0; uncore_msr_uncores[i]; i++) { + type = uncore_msr_uncores[i]; for (j = 0; j < type->num_boxes; j++) { pmu = &type->pmus[j]; box = *per_cpu_ptr(pmu->box, cpu); @@ -3964,8 +986,8 @@ static int uncore_cpu_starting(int cpu) phys_id = topology_physical_package_id(cpu); - for (i = 0; msr_uncores[i]; i++) { - type = msr_uncores[i]; + for (i = 0; uncore_msr_uncores[i]; i++) { + type = uncore_msr_uncores[i]; for (j = 0; j < type->num_boxes; j++) { pmu = &type->pmus[j]; box = *per_cpu_ptr(pmu->box, cpu); @@ -4005,8 +1027,8 @@ static int uncore_cpu_prepare(int cpu, int phys_id) struct intel_uncore_box *box; int i, j; - for (i = 0; msr_uncores[i]; i++) { - type = msr_uncores[i]; + for (i = 0; uncore_msr_uncores[i]; i++) { + type = uncore_msr_uncores[i]; for (j = 0; j < type->num_boxes; j++) { pmu = &type->pmus[j]; if (pmu->func_id < 0) @@ -4086,8 +1108,8 @@ static void uncore_event_exit_cpu(int cpu) if (target >= 0) cpumask_set_cpu(target, &uncore_cpu_mask); - uncore_change_context(msr_uncores, cpu, target); - uncore_change_context(pci_uncores, cpu, target); + uncore_change_context(uncore_msr_uncores, cpu, target); + uncore_change_context(uncore_pci_uncores, cpu, target); } static void uncore_event_init_cpu(int cpu) @@ -4102,8 +1124,8 @@ static void uncore_event_init_cpu(int cpu) cpumask_set_cpu(cpu, &uncore_cpu_mask); - uncore_change_context(msr_uncores, -1, cpu); - uncore_change_context(pci_uncores, -1, cpu); + uncore_change_context(uncore_msr_uncores, -1, cpu); + uncore_change_context(uncore_pci_uncores, -1, cpu); } static int uncore_cpu_notifier(struct notifier_block *self, @@ -4163,47 +1185,37 @@ static void __init uncore_cpu_setup(void *dummy) static int __init uncore_cpu_init(void) { - int ret, max_cores; + int ret; - max_cores = boot_cpu_data.x86_max_cores; switch (boot_cpu_data.x86_model) { case 26: /* Nehalem */ case 30: case 37: /* Westmere */ case 44: - msr_uncores = nhm_msr_uncores; + nhm_uncore_cpu_init(); break; case 42: /* Sandy Bridge */ case 58: /* Ivy Bridge */ - if (snb_uncore_cbox.num_boxes > max_cores) - snb_uncore_cbox.num_boxes = max_cores; - msr_uncores = snb_msr_uncores; + snb_uncore_cpu_init(); break; case 45: /* Sandy Bridge-EP */ - if (snbep_uncore_cbox.num_boxes > max_cores) - snbep_uncore_cbox.num_boxes = max_cores; - msr_uncores = snbep_msr_uncores; + snbep_uncore_cpu_init(); break; case 46: /* Nehalem-EX */ - uncore_nhmex = true; case 47: /* Westmere-EX aka. Xeon E7 */ - if (!uncore_nhmex) - nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; - if (nhmex_uncore_cbox.num_boxes > max_cores) - nhmex_uncore_cbox.num_boxes = max_cores; - msr_uncores = nhmex_msr_uncores; + nhmex_uncore_cpu_init(); break; - case 62: /* IvyTown */ - if (ivt_uncore_cbox.num_boxes > max_cores) - ivt_uncore_cbox.num_boxes = max_cores; - msr_uncores = ivt_msr_uncores; + case 62: /* Ivy Bridge-EP */ + ivbep_uncore_cpu_init(); + break; + case 63: /* Haswell-EP */ + hswep_uncore_cpu_init(); break; - default: return 0; } - ret = uncore_types_init(msr_uncores); + ret = uncore_types_init(uncore_msr_uncores); if (ret) return ret; @@ -4216,16 +1228,8 @@ static int __init uncore_pmus_register(void) struct intel_uncore_type *type; int i, j; - for (i = 0; msr_uncores[i]; i++) { - type = msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - uncore_pmu_register(pmu); - } - } - - for (i = 0; pci_uncores[i]; i++) { - type = pci_uncores[i]; + for (i = 0; uncore_msr_uncores[i]; i++) { + type = uncore_msr_uncores[i]; for (j = 0; j < type->num_boxes; j++) { pmu = &type->pmus[j]; uncore_pmu_register(pmu); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 90236f0c94a..18eb78bbdd1 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -24,395 +24,6 @@ #define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) -/* SNB event control */ -#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff -#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 -#define SNB_UNC_CTL_EDGE_DET (1 << 18) -#define SNB_UNC_CTL_EN (1 << 22) -#define SNB_UNC_CTL_INVERT (1 << 23) -#define SNB_UNC_CTL_CMASK_MASK 0x1f000000 -#define NHM_UNC_CTL_CMASK_MASK 0xff000000 -#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0) - -#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ - SNB_UNC_CTL_UMASK_MASK | \ - SNB_UNC_CTL_EDGE_DET | \ - SNB_UNC_CTL_INVERT | \ - SNB_UNC_CTL_CMASK_MASK) - -#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ - SNB_UNC_CTL_UMASK_MASK | \ - SNB_UNC_CTL_EDGE_DET | \ - SNB_UNC_CTL_INVERT | \ - NHM_UNC_CTL_CMASK_MASK) - -/* SNB global control register */ -#define SNB_UNC_PERF_GLOBAL_CTL 0x391 -#define SNB_UNC_FIXED_CTR_CTRL 0x394 -#define SNB_UNC_FIXED_CTR 0x395 - -/* SNB uncore global control */ -#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1) -#define SNB_UNC_GLOBAL_CTL_EN (1 << 29) - -/* SNB Cbo register */ -#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700 -#define SNB_UNC_CBO_0_PER_CTR0 0x706 -#define SNB_UNC_CBO_MSR_OFFSET 0x10 - -/* NHM global control register */ -#define NHM_UNC_PERF_GLOBAL_CTL 0x391 -#define NHM_UNC_FIXED_CTR 0x394 -#define NHM_UNC_FIXED_CTR_CTRL 0x395 - -/* NHM uncore global control */ -#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1) -#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32) - -/* NHM uncore register */ -#define NHM_UNC_PERFEVTSEL0 0x3c0 -#define NHM_UNC_UNCORE_PMC0 0x3b0 - -/* SNB-EP Box level control */ -#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) -#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) -#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8) -#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16) -#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ - SNBEP_PMON_BOX_CTL_RST_CTRS | \ - SNBEP_PMON_BOX_CTL_FRZ_EN) -/* SNB-EP event control */ -#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff -#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00 -#define SNBEP_PMON_CTL_RST (1 << 17) -#define SNBEP_PMON_CTL_EDGE_DET (1 << 18) -#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) -#define SNBEP_PMON_CTL_EN (1 << 22) -#define SNBEP_PMON_CTL_INVERT (1 << 23) -#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000 -#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_INVERT | \ - SNBEP_PMON_CTL_TRESH_MASK) - -/* SNB-EP Ubox event control */ -#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000 -#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_INVERT | \ - SNBEP_U_MSR_PMON_CTL_TRESH_MASK) - -#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) -#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_CBO_PMON_CTL_TID_EN) - -/* SNB-EP PCU event control */ -#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 -#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 -#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30) -#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31) -#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ - SNBEP_PMON_CTL_INVERT | \ - SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) - -#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_PMON_CTL_EV_SEL_EXT) - -/* SNB-EP pci control register */ -#define SNBEP_PCI_PMON_BOX_CTL 0xf4 -#define SNBEP_PCI_PMON_CTL0 0xd8 -/* SNB-EP pci counter register */ -#define SNBEP_PCI_PMON_CTR0 0xa0 - -/* SNB-EP home agent register */ -#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40 -#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44 -#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48 -/* SNB-EP memory controller register */ -#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0 -#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0 -/* SNB-EP QPI register */ -#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228 -#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c -#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238 -#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c - -/* SNB-EP Ubox register */ -#define SNBEP_U_MSR_PMON_CTR0 0xc16 -#define SNBEP_U_MSR_PMON_CTL0 0xc10 - -#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08 -#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09 - -/* SNB-EP Cbo register */ -#define SNBEP_C0_MSR_PMON_CTR0 0xd16 -#define SNBEP_C0_MSR_PMON_CTL0 0xd10 -#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 -#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 -#define SNBEP_CBO_MSR_OFFSET 0x20 - -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00 -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000 -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000 - -#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \ - .event = (e), \ - .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \ - .config_mask = (m), \ - .idx = (i) \ -} - -/* SNB-EP PCU register */ -#define SNBEP_PCU_MSR_PMON_CTR0 0xc36 -#define SNBEP_PCU_MSR_PMON_CTL0 0xc30 -#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 -#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 -#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff -#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc -#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd - -/* IVT event control */ -#define IVT_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ - SNBEP_PMON_BOX_CTL_RST_CTRS) -#define IVT_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_TRESH_MASK) -/* IVT Ubox */ -#define IVT_U_MSR_PMON_GLOBAL_CTL 0xc00 -#define IVT_U_PMON_GLOBAL_FRZ_ALL (1 << 31) -#define IVT_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29) - -#define IVT_U_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_U_MSR_PMON_CTL_TRESH_MASK) -/* IVT Cbo */ -#define IVT_CBO_MSR_PMON_RAW_EVENT_MASK (IVT_PMON_RAW_EVENT_MASK | \ - SNBEP_CBO_PMON_CTL_TID_EN) - -#define IVT_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0) -#define IVT_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5) -#define IVT_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17) -#define IVT_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32) -#define IVT_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52) -#define IVT_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) -#define IVT_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) -#define IVT_CB0_MSR_PMON_BOX_FILTER_IOSC (0x1ULL << 63) - -/* IVT home agent */ -#define IVT_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16) -#define IVT_HA_PCI_PMON_RAW_EVENT_MASK \ - (IVT_PMON_RAW_EVENT_MASK | \ - IVT_HA_PCI_PMON_CTL_Q_OCC_RST) -/* IVT PCU */ -#define IVT_PCU_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) -/* IVT QPI */ -#define IVT_QPI_PCI_PMON_RAW_EVENT_MASK \ - (IVT_PMON_RAW_EVENT_MASK | \ - SNBEP_PMON_CTL_EV_SEL_EXT) - -/* NHM-EX event control */ -#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff -#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00 -#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0) -#define NHMEX_PMON_CTL_EDGE_DET (1 << 18) -#define NHMEX_PMON_CTL_PMI_EN (1 << 20) -#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22) -#define NHMEX_PMON_CTL_INVERT (1 << 23) -#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000 -#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \ - NHMEX_PMON_CTL_UMASK_MASK | \ - NHMEX_PMON_CTL_EDGE_DET | \ - NHMEX_PMON_CTL_INVERT | \ - NHMEX_PMON_CTL_TRESH_MASK) - -/* NHM-EX Ubox */ -#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00 -#define NHMEX_U_MSR_PMON_CTR 0xc11 -#define NHMEX_U_MSR_PMON_EV_SEL 0xc10 - -#define NHMEX_U_PMON_GLOBAL_EN (1 << 0) -#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e -#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28) -#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29) -#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31) - -#define NHMEX_U_PMON_RAW_EVENT_MASK \ - (NHMEX_PMON_CTL_EV_SEL_MASK | \ - NHMEX_PMON_CTL_EDGE_DET) - -/* NHM-EX Cbox */ -#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00 -#define NHMEX_C0_MSR_PMON_CTR0 0xd11 -#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10 -#define NHMEX_C_MSR_OFFSET 0x20 - -/* NHM-EX Bbox */ -#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20 -#define NHMEX_B0_MSR_PMON_CTR0 0xc31 -#define NHMEX_B0_MSR_PMON_CTL0 0xc30 -#define NHMEX_B_MSR_OFFSET 0x40 -#define NHMEX_B0_MSR_MATCH 0xe45 -#define NHMEX_B0_MSR_MASK 0xe46 -#define NHMEX_B1_MSR_MATCH 0xe4d -#define NHMEX_B1_MSR_MASK 0xe4e - -#define NHMEX_B_PMON_CTL_EN (1 << 0) -#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1 -#define NHMEX_B_PMON_CTL_EV_SEL_MASK \ - (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT) -#define NHMEX_B_PMON_CTR_SHIFT 6 -#define NHMEX_B_PMON_CTR_MASK \ - (0x3 << NHMEX_B_PMON_CTR_SHIFT) -#define NHMEX_B_PMON_RAW_EVENT_MASK \ - (NHMEX_B_PMON_CTL_EV_SEL_MASK | \ - NHMEX_B_PMON_CTR_MASK) - -/* NHM-EX Sbox */ -#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40 -#define NHMEX_S0_MSR_PMON_CTR0 0xc51 -#define NHMEX_S0_MSR_PMON_CTL0 0xc50 -#define NHMEX_S_MSR_OFFSET 0x80 -#define NHMEX_S0_MSR_MM_CFG 0xe48 -#define NHMEX_S0_MSR_MATCH 0xe49 -#define NHMEX_S0_MSR_MASK 0xe4a -#define NHMEX_S1_MSR_MM_CFG 0xe58 -#define NHMEX_S1_MSR_MATCH 0xe59 -#define NHMEX_S1_MSR_MASK 0xe5a - -#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) -#define NHMEX_S_EVENT_TO_R_PROG_EV 0 - -/* NHM-EX Mbox */ -#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 -#define NHMEX_M0_MSR_PMU_DSP 0xca5 -#define NHMEX_M0_MSR_PMU_ISS 0xca6 -#define NHMEX_M0_MSR_PMU_MAP 0xca7 -#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8 -#define NHMEX_M0_MSR_PMU_PGT 0xca9 -#define NHMEX_M0_MSR_PMU_PLD 0xcaa -#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab -#define NHMEX_M0_MSR_PMU_CTL0 0xcb0 -#define NHMEX_M0_MSR_PMU_CNT0 0xcb1 -#define NHMEX_M_MSR_OFFSET 0x40 -#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54 -#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c - -#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63) -#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL -#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL -#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34 - -#define NHMEX_M_PMON_CTL_EN (1 << 0) -#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1) -#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2 -#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \ - (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT) -#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4 -#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \ - (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT) -#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6) -#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7) -#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9 -#define NHMEX_M_PMON_CTL_INC_SEL_MASK \ - (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) -#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19 -#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \ - (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) -#define NHMEX_M_PMON_RAW_EVENT_MASK \ - (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \ - NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \ - NHMEX_M_PMON_CTL_WRAP_MODE | \ - NHMEX_M_PMON_CTL_FLAG_MODE | \ - NHMEX_M_PMON_CTL_INC_SEL_MASK | \ - NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) - -#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23)) -#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n))) - -#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24)) -#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n))) - -/* - * use the 9~13 bits to select event If the 7th bit is not set, - * otherwise use the 19~21 bits to select event. - */ -#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) -#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \ - NHMEX_M_PMON_CTL_FLAG_MODE) -#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \ - NHMEX_M_PMON_CTL_FLAG_MODE) -#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \ - NHMEX_M_PMON_CTL_FLAG_MODE) -#define MBOX_INC_SEL_EXTAR_REG(c, r) \ - EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \ - MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r) -#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \ - EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \ - MBOX_SET_FLAG_SEL_MASK, \ - (u64)-1, NHMEX_M_##r) - -/* NHM-EX Rbox */ -#define NHMEX_R_MSR_GLOBAL_CTL 0xe00 -#define NHMEX_R_MSR_PMON_CTL0 0xe10 -#define NHMEX_R_MSR_PMON_CNT0 0xe11 -#define NHMEX_R_MSR_OFFSET 0x20 - -#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \ - ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4)) -#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n)) -#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n)) -#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \ - (((n) < 4 ? 0 : 0x10) + (n) * 4) -#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \ - (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) -#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1) -#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2) -#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \ - (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) -#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1) -#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2) - -#define NHMEX_R_PMON_CTL_EN (1 << 0) -#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1 -#define NHMEX_R_PMON_CTL_EV_SEL_MASK \ - (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT) -#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6) -#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK - -/* NHM-EX Wbox */ -#define NHMEX_W_MSR_GLOBAL_CTL 0xc80 -#define NHMEX_W_MSR_PMON_CNT0 0xc90 -#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91 -#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394 -#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395 - -#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31) - struct intel_uncore_ops; struct intel_uncore_pmu; struct intel_uncore_box; @@ -505,6 +116,9 @@ struct uncore_event_desc { const char *config; }; +ssize_t uncore_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf); + #define INTEL_UNCORE_EVENT_DESC(_name, _config) \ { \ .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ @@ -522,15 +136,6 @@ static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ static struct kobj_attribute format_attr_##_var = \ __ATTR(_name, 0444, __uncore_##_var##_show, NULL) - -static ssize_t uncore_event_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct uncore_event_desc *event = - container_of(attr, struct uncore_event_desc, attr); - return sprintf(buf, "%s", event->config); -} - static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) { return box->pmu->type->box_ctl; @@ -694,3 +299,41 @@ static inline bool uncore_box_is_fake(struct intel_uncore_box *box) { return (box->phys_id < 0); } + +struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event); +struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu); +struct intel_uncore_box *uncore_event_to_box(struct perf_event *event); +u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event); +void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); +void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); +void uncore_pmu_event_read(struct perf_event *event); +void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event); +struct event_constraint * +uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event); +void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event); +u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); + +extern struct intel_uncore_type **uncore_msr_uncores; +extern struct intel_uncore_type **uncore_pci_uncores; +extern struct pci_driver *uncore_pci_driver; +extern int uncore_pcibus_to_physid[256]; +extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; +extern struct event_constraint uncore_constraint_empty; + +/* perf_event_intel_uncore_snb.c */ +int snb_uncore_pci_init(void); +int ivb_uncore_pci_init(void); +int hsw_uncore_pci_init(void); +void snb_uncore_cpu_init(void); +void nhm_uncore_cpu_init(void); + +/* perf_event_intel_uncore_snbep.c */ +int snbep_uncore_pci_init(void); +void snbep_uncore_cpu_init(void); +int ivbep_uncore_pci_init(void); +void ivbep_uncore_cpu_init(void); +int hswep_uncore_pci_init(void); +void hswep_uncore_cpu_init(void); + +/* perf_event_intel_uncore_nhmex.c */ +void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c new file mode 100644 index 00000000000..2749965afed --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c @@ -0,0 +1,1221 @@ +/* Nehalem-EX/Westmere-EX uncore support */ +#include "perf_event_intel_uncore.h" + +/* NHM-EX event control */ +#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff +#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00 +#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0) +#define NHMEX_PMON_CTL_EDGE_DET (1 << 18) +#define NHMEX_PMON_CTL_PMI_EN (1 << 20) +#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22) +#define NHMEX_PMON_CTL_INVERT (1 << 23) +#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000 +#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_UMASK_MASK | \ + NHMEX_PMON_CTL_EDGE_DET | \ + NHMEX_PMON_CTL_INVERT | \ + NHMEX_PMON_CTL_TRESH_MASK) + +/* NHM-EX Ubox */ +#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00 +#define NHMEX_U_MSR_PMON_CTR 0xc11 +#define NHMEX_U_MSR_PMON_EV_SEL 0xc10 + +#define NHMEX_U_PMON_GLOBAL_EN (1 << 0) +#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e +#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28) +#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29) +#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31) + +#define NHMEX_U_PMON_RAW_EVENT_MASK \ + (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_EDGE_DET) + +/* NHM-EX Cbox */ +#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00 +#define NHMEX_C0_MSR_PMON_CTR0 0xd11 +#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10 +#define NHMEX_C_MSR_OFFSET 0x20 + +/* NHM-EX Bbox */ +#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20 +#define NHMEX_B0_MSR_PMON_CTR0 0xc31 +#define NHMEX_B0_MSR_PMON_CTL0 0xc30 +#define NHMEX_B_MSR_OFFSET 0x40 +#define NHMEX_B0_MSR_MATCH 0xe45 +#define NHMEX_B0_MSR_MASK 0xe46 +#define NHMEX_B1_MSR_MATCH 0xe4d +#define NHMEX_B1_MSR_MASK 0xe4e + +#define NHMEX_B_PMON_CTL_EN (1 << 0) +#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_B_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_B_PMON_CTR_SHIFT 6 +#define NHMEX_B_PMON_CTR_MASK \ + (0x3 << NHMEX_B_PMON_CTR_SHIFT) +#define NHMEX_B_PMON_RAW_EVENT_MASK \ + (NHMEX_B_PMON_CTL_EV_SEL_MASK | \ + NHMEX_B_PMON_CTR_MASK) + +/* NHM-EX Sbox */ +#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40 +#define NHMEX_S0_MSR_PMON_CTR0 0xc51 +#define NHMEX_S0_MSR_PMON_CTL0 0xc50 +#define NHMEX_S_MSR_OFFSET 0x80 +#define NHMEX_S0_MSR_MM_CFG 0xe48 +#define NHMEX_S0_MSR_MATCH 0xe49 +#define NHMEX_S0_MSR_MASK 0xe4a +#define NHMEX_S1_MSR_MM_CFG 0xe58 +#define NHMEX_S1_MSR_MATCH 0xe59 +#define NHMEX_S1_MSR_MASK 0xe5a + +#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) +#define NHMEX_S_EVENT_TO_R_PROG_EV 0 + +/* NHM-EX Mbox */ +#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 +#define NHMEX_M0_MSR_PMU_DSP 0xca5 +#define NHMEX_M0_MSR_PMU_ISS 0xca6 +#define NHMEX_M0_MSR_PMU_MAP 0xca7 +#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8 +#define NHMEX_M0_MSR_PMU_PGT 0xca9 +#define NHMEX_M0_MSR_PMU_PLD 0xcaa +#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab +#define NHMEX_M0_MSR_PMU_CTL0 0xcb0 +#define NHMEX_M0_MSR_PMU_CNT0 0xcb1 +#define NHMEX_M_MSR_OFFSET 0x40 +#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54 +#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c + +#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63) +#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL +#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL +#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34 + +#define NHMEX_M_PMON_CTL_EN (1 << 0) +#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1) +#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2 +#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4 +#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6) +#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7) +#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9 +#define NHMEX_M_PMON_CTL_INC_SEL_MASK \ + (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19 +#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \ + (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) +#define NHMEX_M_PMON_RAW_EVENT_MASK \ + (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \ + NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \ + NHMEX_M_PMON_CTL_WRAP_MODE | \ + NHMEX_M_PMON_CTL_FLAG_MODE | \ + NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) + +#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23)) +#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n))) + +#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24)) +#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n))) + +/* + * use the 9~13 bits to select event If the 7th bit is not set, + * otherwise use the 19~21 bits to select event. + */ +#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_EXTAR_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r) +#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_SET_FLAG_SEL_MASK, \ + (u64)-1, NHMEX_M_##r) + +/* NHM-EX Rbox */ +#define NHMEX_R_MSR_GLOBAL_CTL 0xe00 +#define NHMEX_R_MSR_PMON_CTL0 0xe10 +#define NHMEX_R_MSR_PMON_CNT0 0xe11 +#define NHMEX_R_MSR_OFFSET 0x20 + +#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \ + ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n)) +#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \ + (((n) < 4 ? 0 : 0x10) + (n) * 4) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \ + (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \ + (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2) + +#define NHMEX_R_PMON_CTL_EN (1 << 0) +#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_R_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6) +#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK + +/* NHM-EX Wbox */ +#define NHMEX_W_MSR_GLOBAL_CTL 0xc80 +#define NHMEX_W_MSR_PMON_CNT0 0xc90 +#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91 +#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394 +#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395 + +#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31) + +#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ + ((1ULL << (n)) - 1))) + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7"); +DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); + +static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) +{ + wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); +} + +static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config &= ~((1ULL << uncore_num_counters(box)) - 1); + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config |= (1ULL << uncore_num_counters(box)) - 1; + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config |= NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx >= UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0); + else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0) + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); + else + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +#define NHMEX_UNCORE_OPS_COMMON_INIT() \ + .init_box = nhmex_uncore_msr_init_box, \ + .disable_box = nhmex_uncore_msr_disable_box, \ + .enable_box = nhmex_uncore_msr_enable_box, \ + .disable_event = nhmex_uncore_msr_disable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops nhmex_uncore_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_uncore_msr_enable_event, +}; + +static struct attribute *nhmex_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_edge.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_ubox_format_group = { + .name = "format", + .attrs = nhmex_uncore_ubox_formats_attr, +}; + +static struct intel_uncore_type nhmex_uncore_ubox = { + .name = "ubox", + .num_counters = 1, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_U_MSR_PMON_EV_SEL, + .perf_ctr = NHMEX_U_MSR_PMON_CTR, + .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_ubox_format_group +}; + +static struct attribute *nhmex_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_cbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_cbox_formats_attr, +}; + +/* msr offset for each instance of cbox */ +static unsigned nhmex_cbox_msr_offsets[] = { + 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0, +}; + +static struct intel_uncore_type nhmex_uncore_cbox = { + .name = "cbox", + .num_counters = 6, + .num_boxes = 10, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0, + .perf_ctr = NHMEX_C0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL, + .msr_offsets = nhmex_cbox_msr_offsets, + .pair_ctr_ctl = 1, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static struct uncore_event_desc nhmex_uncore_wbox_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type nhmex_uncore_wbox = { + .name = "wbox", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_W_MSR_PMON_CNT0, + .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0, + .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR, + .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_W_MSR_GLOBAL_CTL, + .pair_ctr_ctl = 1, + .event_descs = nhmex_uncore_wbox_events, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int ctr, ev_sel; + + ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >> + NHMEX_B_PMON_CTR_SHIFT; + ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >> + NHMEX_B_PMON_CTL_EV_SEL_SHIFT; + + /* events that do not use the match/mask registers */ + if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) || + (ctr == 2 && ev_sel != 0x4) || ctr == 3) + return 0; + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_B0_MSR_MATCH; + else + reg1->reg = NHMEX_B1_MSR_MATCH; + reg1->idx = 0; + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + return 0; +} + +static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, reg1->config); + wrmsrl(reg1->reg + 1, reg2->config); + } + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK)); +} + +/* + * The Bbox has 4 counters, but each counter monitors different events. + * Use bits 6-7 in the event config to select counter. + */ +static struct event_constraint nhmex_uncore_bbox_constraints[] = { + EVENT_CONSTRAINT(0 , 1, 0xc0), + EVENT_CONSTRAINT(0x40, 2, 0xc0), + EVENT_CONSTRAINT(0x80, 4, 0xc0), + EVENT_CONSTRAINT(0xc0, 8, 0xc0), + EVENT_CONSTRAINT_END, +}; + +static struct attribute *nhmex_uncore_bbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_counter.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_bbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_bbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_bbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_bbox_msr_enable_event, + .hw_config = nhmex_bbox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_bbox = { + .name = "bbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_B0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_B0_MSR_PMON_CTR0, + .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_B_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .constraints = nhmex_uncore_bbox_constraints, + .ops = &nhmex_uncore_bbox_ops, + .format_group = &nhmex_uncore_bbox_format_group +}; + +static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + /* only TO_R_PROG_EV event uses the match/mask register */ + if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) != + NHMEX_S_EVENT_TO_R_PROG_EV) + return 0; + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_S0_MSR_MM_CFG; + else + reg1->reg = NHMEX_S1_MSR_MM_CFG; + reg1->idx = 0; + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + return 0; +} + +static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, 0); + wrmsrl(reg1->reg + 1, reg1->config); + wrmsrl(reg1->reg + 2, reg2->config); + wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); + } + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); +} + +static struct attribute *nhmex_uncore_sbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_sbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_sbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_sbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_sbox_msr_enable_event, + .hw_config = nhmex_sbox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_sbox = { + .name = "sbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_S0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_S0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_S_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .ops = &nhmex_uncore_sbox_ops, + .format_group = &nhmex_uncore_sbox_format_group +}; + +enum { + EXTRA_REG_NHMEX_M_FILTER, + EXTRA_REG_NHMEX_M_DSP, + EXTRA_REG_NHMEX_M_ISS, + EXTRA_REG_NHMEX_M_MAP, + EXTRA_REG_NHMEX_M_MSC_THR, + EXTRA_REG_NHMEX_M_PGT, + EXTRA_REG_NHMEX_M_PLD, + EXTRA_REG_NHMEX_M_ZDP_CTL_FVC, +}; + +static struct extra_reg nhmex_uncore_mbox_extra_regs[] = { + MBOX_INC_SEL_EXTAR_REG(0x0, DSP), + MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x9, ISS), + /* event 0xa uses two extra registers */ + MBOX_INC_SEL_EXTAR_REG(0xa, ISS), + MBOX_INC_SEL_EXTAR_REG(0xa, PLD), + MBOX_INC_SEL_EXTAR_REG(0xb, PLD), + /* events 0xd ~ 0x10 use the same extra register */ + MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x16, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP), + EVENT_EXTRA_END +}; + +/* Nehalem-EX or Westmere-EX ? */ +static bool uncore_nhmex; + +static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + bool ret = false; + u64 mask; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + er = &box->shared_regs[idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || er->config == config) { + atomic_inc(&er->ref); + er->config = config; + ret = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + return ret; + } + /* + * The ZDP_CTL_FVC MSR has 4 fields which are used to control + * events 0xd ~ 0x10. Besides these 4 fields, there are additional + * fields which are shared. + */ + idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (WARN_ON_ONCE(idx >= 4)) + return false; + + /* mask of the shared fields */ + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK; + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + + raw_spin_lock_irqsave(&er->lock, flags); + /* add mask of the non-shared field if it's in use */ + if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) { + if (uncore_nhmex) + mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + } + + if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) { + atomic_add(1 << (idx * 8), &er->ref); + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | + NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK | + WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + er->config &= ~mask; + er->config |= (config & mask); + ret = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + return ret; +} + +static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + er = &box->shared_regs[idx]; + atomic_dec(&er->ref); + return; + } + + idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + atomic_sub(1 << (idx * 8), &er->ref); +} + +static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8); + u64 config = reg1->config; + + /* get the non-shared control bits and shift them */ + idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (uncore_nhmex) + config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (new_idx > orig_idx) { + idx = new_idx - orig_idx; + config <<= 3 * idx; + } else { + idx = orig_idx - new_idx; + config >>= 3 * idx; + } + + /* add the shared control bits back */ + if (uncore_nhmex) + config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + else + config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + if (modify) { + /* adjust the main event selector */ + if (new_idx > orig_idx) + hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; + else + hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; + reg1->config = config; + reg1->idx = ~0xff | new_idx; + } + return config; +} + +static struct event_constraint * +nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + int i, idx[2], alloc = 0; + u64 config1 = reg1->config; + + idx[0] = __BITS_VALUE(reg1->idx, 0, 8); + idx[1] = __BITS_VALUE(reg1->idx, 1, 8); +again: + for (i = 0; i < 2; i++) { + if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) + idx[i] = 0xff; + + if (idx[i] == 0xff) + continue; + + if (!nhmex_mbox_get_shared_reg(box, idx[i], + __BITS_VALUE(config1, i, 32))) + goto fail; + alloc |= (0x1 << i); + } + + /* for the match/mask registers */ + if (reg2->idx != EXTRA_REG_NONE && + (uncore_box_is_fake(box) || !reg2->alloc) && + !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config)) + goto fail; + + /* + * If it's a fake box -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + */ + if (!uncore_box_is_fake(box)) { + if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) + nhmex_mbox_alter_er(event, idx[0], true); + reg1->alloc |= alloc; + if (reg2->idx != EXTRA_REG_NONE) + reg2->alloc = 1; + } + return NULL; +fail: + if (idx[0] != 0xff && !(alloc & 0x1) && + idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + /* + * events 0xd ~ 0x10 are functional identical, but are + * controlled by different fields in the ZDP_CTL_FVC + * register. If we failed to take one field, try the + * rest 3 choices. + */ + BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff); + idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + idx[0] = (idx[0] + 1) % 4; + idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) { + config1 = nhmex_mbox_alter_er(event, idx[0], false); + goto again; + } + } + + if (alloc & 0x1) + nhmex_mbox_put_shared_reg(box, idx[0]); + if (alloc & 0x2) + nhmex_mbox_put_shared_reg(box, idx[1]); + return &uncore_constraint_empty; +} + +static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + + if (uncore_box_is_fake(box)) + return; + + if (reg1->alloc & 0x1) + nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8)); + if (reg1->alloc & 0x2) + nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8)); + reg1->alloc = 0; + + if (reg2->alloc) { + nhmex_mbox_put_shared_reg(box, reg2->idx); + reg2->alloc = 0; + } +} + +static int nhmex_mbox_extra_reg_idx(struct extra_reg *er) +{ + if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) + return er->idx; + return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd; +} + +static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_type *type = box->pmu->type; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + struct extra_reg *er; + unsigned msr; + int reg_idx = 0; + /* + * The mbox events may require 2 extra MSRs at the most. But only + * the lower 32 bits in these MSRs are significant, so we can use + * config1 to pass two MSRs' config. + */ + for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + if (event->attr.config1 & ~er->valid_mask) + return -EINVAL; + + msr = er->msr + type->msr_offset * box->pmu->pmu_idx; + if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff)) + return -EINVAL; + + /* always use the 32~63 bits to pass the PLD config */ + if (er->idx == EXTRA_REG_NHMEX_M_PLD) + reg_idx = 1; + else if (WARN_ON_ONCE(reg_idx > 0)) + return -EINVAL; + + reg1->idx &= ~(0xff << (reg_idx * 8)); + reg1->reg &= ~(0xffff << (reg_idx * 16)); + reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8); + reg1->reg |= msr << (reg_idx * 16); + reg1->config = event->attr.config1; + reg_idx++; + } + /* + * The mbox only provides ability to perform address matching + * for the PLD events. + */ + if (reg_idx == 2) { + reg2->idx = EXTRA_REG_NHMEX_M_FILTER; + if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) + reg2->config = event->attr.config2; + else + reg2->config = ~0ULL; + if (box->pmu->pmu_idx == 0) + reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; + else + reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; + } + return 0; +} + +static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + u64 config; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) + return box->shared_regs[idx].config; + + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + raw_spin_lock_irqsave(&er->lock, flags); + config = er->config; + raw_spin_unlock_irqrestore(&er->lock, flags); + return config; +} + +static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx; + + idx = __BITS_VALUE(reg1->idx, 0, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 0, 16), + nhmex_mbox_shared_reg_config(box, idx)); + idx = __BITS_VALUE(reg1->idx, 1, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), + nhmex_mbox_shared_reg_config(box, idx)); + + if (reg2->idx != EXTRA_REG_NONE) { + wrmsrl(reg2->reg, 0); + if (reg2->config != ~0ULL) { + wrmsrl(reg2->reg + 1, + reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); + wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & + (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); + wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); + } + } + + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); +DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); +DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); +DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); +DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); +DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); +DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63"); +DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); +DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); + +static struct attribute *nhmex_uncore_mbox_formats_attr[] = { + &format_attr_count_mode.attr, + &format_attr_storage_mode.attr, + &format_attr_wrap_mode.attr, + &format_attr_flag_mode.attr, + &format_attr_inc_sel.attr, + &format_attr_set_flag_sel.attr, + &format_attr_filter_cfg_en.attr, + &format_attr_filter_match.attr, + &format_attr_filter_mask.attr, + &format_attr_dsp.attr, + &format_attr_thr.attr, + &format_attr_fvc.attr, + &format_attr_pgt.attr, + &format_attr_map.attr, + &format_attr_iss.attr, + &format_attr_pld.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_mbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_mbox_formats_attr, +}; + +static struct uncore_event_desc nhmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc wsmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhmex_uncore_mbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_mbox_msr_enable_event, + .hw_config = nhmex_mbox_hw_config, + .get_constraint = nhmex_mbox_get_constraint, + .put_constraint = nhmex_mbox_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_mbox = { + .name = "mbox", + .num_counters = 6, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_M0_MSR_PMU_CTL0, + .perf_ctr = NHMEX_M0_MSR_PMU_CNT0, + .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_M_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 8, + .event_descs = nhmex_uncore_mbox_events, + .ops = &nhmex_uncore_mbox_ops, + .format_group = &nhmex_uncore_mbox_format_group, +}; + +static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + /* adjust the main event selector and extra register index */ + if (reg1->idx % 2) { + reg1->idx--; + hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } else { + reg1->idx++; + hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } + + /* adjust extra register config */ + switch (reg1->idx % 6) { + case 2: + /* shift the 8~15 bits to the 0~7 bits */ + reg1->config >>= 8; + break; + case 3: + /* shift the 0~7 bits to the 8~15 bits */ + reg1->config <<= 8; + break; + } +} + +/* + * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7. + * An event set consists of 6 events, the 3rd and 4th events in + * an event set use the same extra register. So an event set uses + * 5 extra registers. + */ +static struct event_constraint * +nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + struct intel_uncore_extra_reg *er; + unsigned long flags; + int idx, er_idx; + u64 config1; + bool ok = false; + + if (!uncore_box_is_fake(box) && reg1->alloc) + return NULL; + + idx = reg1->idx % 6; + config1 = reg1->config; +again: + er_idx = idx; + /* the 3rd and 4th events use the same extra register */ + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (idx < 2) { + if (!atomic_read(&er->ref) || er->config == reg1->config) { + atomic_inc(&er->ref); + er->config = reg1->config; + ok = true; + } + } else if (idx == 2 || idx == 3) { + /* + * these two events use different fields in a extra register, + * the 0~7 bits and the 8~15 bits respectively. + */ + u64 mask = 0xff << ((idx - 2) * 8); + if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) || + !((er->config ^ config1) & mask)) { + atomic_add(1 << ((idx - 2) * 8), &er->ref); + er->config &= ~mask; + er->config |= config1 & mask; + ok = true; + } + } else { + if (!atomic_read(&er->ref) || + (er->config == (hwc->config >> 32) && + er->config1 == reg1->config && + er->config2 == reg2->config)) { + atomic_inc(&er->ref); + er->config = (hwc->config >> 32); + er->config1 = reg1->config; + er->config2 = reg2->config; + ok = true; + } + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (!ok) { + /* + * The Rbox events are always in pairs. The paired + * events are functional identical, but use different + * extra registers. If we failed to take an extra + * register, try the alternative. + */ + idx ^= 1; + if (idx != reg1->idx % 6) { + if (idx == 2) + config1 >>= 8; + else if (idx == 3) + config1 <<= 8; + goto again; + } + } else { + if (!uncore_box_is_fake(box)) { + if (idx != reg1->idx % 6) + nhmex_rbox_alter_er(box, event); + reg1->alloc = 1; + } + return NULL; + } + return &uncore_constraint_empty; +} + +static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + int idx, er_idx; + + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + idx = reg1->idx % 6; + er_idx = idx; + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + if (idx == 2 || idx == 3) + atomic_sub(1 << ((idx - 2) * 8), &er->ref); + else + atomic_dec(&er->ref); + + reg1->alloc = 0; +} + +static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + int idx; + + idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >> + NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + if (idx >= 0x18) + return -EINVAL; + + reg1->idx = idx; + reg1->config = event->attr.config1; + + switch (idx % 6) { + case 4: + case 5: + hwc->config |= event->attr.config & (~0ULL << 32); + reg2->config = event->attr.config2; + break; + } + return 0; +} + +static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx, port; + + idx = reg1->idx; + port = idx / 6 + box->pmu->pmu_idx * 4; + + switch (idx % 6) { + case 0: + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config); + break; + case 1: + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config); + break; + case 2: + case 3: + wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port), + uncore_shared_reg_config(box, 2 + (idx / 6) * 5)); + break; + case 4: + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config); + break; + case 5: + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config); + break; + } + + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); +} + +DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15"); +DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31"); + +static struct attribute *nhmex_uncore_rbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_xbr_mm_cfg.attr, + &format_attr_xbr_match.attr, + &format_attr_xbr_mask.attr, + &format_attr_qlx_cfg.attr, + &format_attr_iperf_cfg.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_rbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_rbox_formats_attr, +}; + +static struct uncore_event_desc nhmex_uncore_rbox_events[] = { + INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"), + INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhmex_uncore_rbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_rbox_msr_enable_event, + .hw_config = nhmex_rbox_hw_config, + .get_constraint = nhmex_rbox_get_constraint, + .put_constraint = nhmex_rbox_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_rbox = { + .name = "rbox", + .num_counters = 8, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_R_MSR_PMON_CTL0, + .perf_ctr = NHMEX_R_MSR_PMON_CNT0, + .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_R_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_R_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 20, + .event_descs = nhmex_uncore_rbox_events, + .ops = &nhmex_uncore_rbox_ops, + .format_group = &nhmex_uncore_rbox_format_group +}; + +static struct intel_uncore_type *nhmex_msr_uncores[] = { + &nhmex_uncore_ubox, + &nhmex_uncore_cbox, + &nhmex_uncore_bbox, + &nhmex_uncore_sbox, + &nhmex_uncore_mbox, + &nhmex_uncore_rbox, + &nhmex_uncore_wbox, + NULL, +}; + +void nhmex_uncore_cpu_init(void) +{ + if (boot_cpu_data.x86_model == 46) + uncore_nhmex = true; + else + nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; + if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = nhmex_msr_uncores; +} +/* end of Nehalem-EX uncore support */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c new file mode 100644 index 00000000000..3001015b755 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -0,0 +1,636 @@ +/* Nehalem/SandBridge/Haswell uncore support */ +#include "perf_event_intel_uncore.h" + +/* SNB event control */ +#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff +#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 +#define SNB_UNC_CTL_EDGE_DET (1 << 18) +#define SNB_UNC_CTL_EN (1 << 22) +#define SNB_UNC_CTL_INVERT (1 << 23) +#define SNB_UNC_CTL_CMASK_MASK 0x1f000000 +#define NHM_UNC_CTL_CMASK_MASK 0xff000000 +#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0) + +#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + SNB_UNC_CTL_CMASK_MASK) + +#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + NHM_UNC_CTL_CMASK_MASK) + +/* SNB global control register */ +#define SNB_UNC_PERF_GLOBAL_CTL 0x391 +#define SNB_UNC_FIXED_CTR_CTRL 0x394 +#define SNB_UNC_FIXED_CTR 0x395 + +/* SNB uncore global control */ +#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1) +#define SNB_UNC_GLOBAL_CTL_EN (1 << 29) + +/* SNB Cbo register */ +#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700 +#define SNB_UNC_CBO_0_PER_CTR0 0x706 +#define SNB_UNC_CBO_MSR_OFFSET 0x10 + +/* NHM global control register */ +#define NHM_UNC_PERF_GLOBAL_CTL 0x391 +#define NHM_UNC_FIXED_CTR 0x394 +#define NHM_UNC_FIXED_CTR_CTRL 0x395 + +/* NHM uncore global control */ +#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1) +#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32) + +/* NHM uncore register */ +#define NHM_UNC_PERFEVTSEL0 0x3c0 +#define NHM_UNC_UNCORE_PMC0 0x3b0 + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); + +/* Sandy Bridge uncore support */ +static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); +} + +static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static void snb_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) { + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); + } +} + +static struct uncore_event_desc snb_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + { /* end: all zeroes */ }, +}; + +static struct attribute *snb_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask5.attr, + NULL, +}; + +static struct attribute_group snb_uncore_format_group = { + .name = "format", + .attrs = snb_uncore_formats_attr, +}; + +static struct intel_uncore_ops snb_uncore_msr_ops = { + .init_box = snb_uncore_msr_init_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct event_constraint snb_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x80, 0x1), + UNCORE_EVENT_CONSTRAINT(0x83, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snb_uncore_cbox = { + .name = "cbox", + .num_counters = 2, + .num_boxes = 4, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + .fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .constraints = snb_uncore_cbox_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, + .event_descs = snb_uncore_events, +}; + +static struct intel_uncore_type *snb_msr_uncores[] = { + &snb_uncore_cbox, + NULL, +}; + +void snb_uncore_cpu_init(void) +{ + uncore_msr_uncores = snb_msr_uncores; + if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; +} + +enum { + SNB_PCI_UNCORE_IMC, +}; + +static struct uncore_event_desc snb_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"), + INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"), + INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"), + + { /* end: all zeroes */ }, +}; + +#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff +#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48 + +/* page size multiple covering all config regs */ +#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000 + +#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1 +#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054 +#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE + +static struct attribute *snb_uncore_imc_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group snb_uncore_imc_format_group = { + .name = "format", + .attrs = snb_uncore_imc_formats_attr, +}; + +static void snb_uncore_imc_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; + resource_size_t addr; + u32 pci_dword; + + pci_read_config_dword(pdev, where, &pci_dword); + addr = pci_dword; + +#ifdef CONFIG_PHYS_ADDR_T_64BIT + pci_read_config_dword(pdev, where + 4, &pci_dword); + addr |= ((resource_size_t)pci_dword << 32); +#endif + + addr &= ~(PAGE_SIZE - 1); + + box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; +} + +static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_disable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + return (u64)*(unsigned int *)(box->io_addr + hwc->event_base); +} + +/* + * custom event_init() function because we define our own fixed, free + * running counters, so we do not want to conflict with generic uncore + * logic. Also simplifies processing + */ +static int snb_uncore_imc_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK; + int idx, base; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + + /* check only supported bits are set */ + if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK) + return -EINVAL; + + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + + event->cpu = box->cpu; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + /* + * check event is known (whitelist, determines counter) + */ + switch (cfg) { + case SNB_UNCORE_PCI_IMC_DATA_READS: + base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE; + idx = UNCORE_PMC_IDX_FIXED; + break; + case SNB_UNCORE_PCI_IMC_DATA_WRITES: + base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; + idx = UNCORE_PMC_IDX_FIXED + 1; + break; + default: + return -EINVAL; + } + + /* must be done before validate_group */ + event->hw.event_base = base; + event->hw.config = cfg; + event->hw.idx = idx; + + /* no group validation needed, we have free running counters */ + + return 0; +} + +static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + return 0; +} + +static void snb_uncore_imc_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + u64 count; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + box->n_active++; + + list_add_tail(&event->active_entry, &box->active_list); + + count = snb_uncore_imc_read_counter(box, event); + local64_set(&event->hw.prev_count, count); + + if (box->n_active == 1) + uncore_pmu_start_hrtimer(box); +} + +static void snb_uncore_imc_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (!(hwc->state & PERF_HES_STOPPED)) { + box->n_active--; + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + list_del(&event->active_entry); + + if (box->n_active == 0) + uncore_pmu_cancel_hrtimer(box); + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int snb_uncore_imc_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (!box) + return -ENODEV; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + snb_uncore_imc_event_start(event, 0); + + box->n_events++; + + return 0; +} + +static void snb_uncore_imc_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + snb_uncore_imc_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + --box->n_events; + break; + } + } +} + +static int snb_pci2phy_map_init(int devid) +{ + struct pci_dev *dev = NULL; + int bus; + + dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); + if (!dev) + return -ENOTTY; + + bus = dev->bus->number; + + uncore_pcibus_to_physid[bus] = 0; + + pci_dev_put(dev); + + return 0; +} + +static struct pmu snb_uncore_imc_pmu = { + .task_ctx_nr = perf_invalid_context, + .event_init = snb_uncore_imc_event_init, + .add = snb_uncore_imc_event_add, + .del = snb_uncore_imc_event_del, + .start = snb_uncore_imc_event_start, + .stop = snb_uncore_imc_event_stop, + .read = uncore_pmu_event_read, +}; + +static struct intel_uncore_ops snb_uncore_imc_ops = { + .init_box = snb_uncore_imc_init_box, + .enable_box = snb_uncore_imc_enable_box, + .disable_box = snb_uncore_imc_disable_box, + .disable_event = snb_uncore_imc_disable_event, + .enable_event = snb_uncore_imc_enable_event, + .hw_config = snb_uncore_imc_hw_config, + .read_counter = snb_uncore_imc_read_counter, +}; + +static struct intel_uncore_type snb_uncore_imc = { + .name = "imc", + .num_counters = 2, + .num_boxes = 1, + .fixed_ctr_bits = 32, + .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE, + .event_descs = snb_uncore_imc_events, + .format_group = &snb_uncore_imc_format_group, + .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE, + .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK, + .ops = &snb_uncore_imc_ops, + .pmu = &snb_uncore_imc_pmu, +}; + +static struct intel_uncore_type *snb_pci_uncores[] = { + [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc, + NULL, +}; + +static const struct pci_device_id snb_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static const struct pci_device_id ivb_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static const struct pci_device_id hsw_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static struct pci_driver snb_uncore_pci_driver = { + .name = "snb_uncore", + .id_table = snb_uncore_pci_ids, +}; + +static struct pci_driver ivb_uncore_pci_driver = { + .name = "ivb_uncore", + .id_table = ivb_uncore_pci_ids, +}; + +static struct pci_driver hsw_uncore_pci_driver = { + .name = "hsw_uncore", + .id_table = hsw_uncore_pci_ids, +}; + +struct imc_uncore_pci_dev { + __u32 pci_id; + struct pci_driver *driver; +}; +#define IMC_DEV(a, d) \ + { .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) } + +static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { + IMC_DEV(SNB_IMC, &snb_uncore_pci_driver), + IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver), /* 3rd Gen Core processor */ + IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */ + IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ + { /* end marker */ } +}; + + +#define for_each_imc_pci_id(x, t) \ + for (x = (t); (x)->pci_id; x++) + +static struct pci_driver *imc_uncore_find_dev(void) +{ + const struct imc_uncore_pci_dev *p; + int ret; + + for_each_imc_pci_id(p, desktop_imc_pci_ids) { + ret = snb_pci2phy_map_init(p->pci_id); + if (ret == 0) + return p->driver; + } + return NULL; +} + +static int imc_uncore_pci_init(void) +{ + struct pci_driver *imc_drv = imc_uncore_find_dev(); + + if (!imc_drv) + return -ENODEV; + + uncore_pci_uncores = snb_pci_uncores; + uncore_pci_driver = imc_drv; + + return 0; +} + +int snb_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} + +int ivb_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} +int hsw_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} + +/* end of Sandy Bridge uncore support */ + +/* Nehalem uncore support */ +static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); +} + +static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); +} + +static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); +} + +static struct attribute *nhm_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask8.attr, + NULL, +}; + +static struct attribute_group nhm_uncore_format_group = { + .name = "format", + .attrs = nhm_uncore_formats_attr, +}; + +static struct uncore_event_desc nhm_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhm_uncore_msr_ops = { + .disable_box = nhm_uncore_msr_disable_box, + .enable_box = nhm_uncore_msr_enable_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = nhm_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct intel_uncore_type nhm_uncore = { + .name = "", + .num_counters = 8, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .event_ctl = NHM_UNC_PERFEVTSEL0, + .perf_ctr = NHM_UNC_UNCORE_PMC0, + .fixed_ctr = NHM_UNC_FIXED_CTR, + .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, + .event_mask = NHM_UNC_RAW_EVENT_MASK, + .event_descs = nhm_uncore_events, + .ops = &nhm_uncore_msr_ops, + .format_group = &nhm_uncore_format_group, +}; + +static struct intel_uncore_type *nhm_msr_uncores[] = { + &nhm_uncore, + NULL, +}; + +void nhm_uncore_cpu_init(void) +{ + uncore_msr_uncores = nhm_msr_uncores; +} + +/* end of Nehalem uncore support */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c new file mode 100644 index 00000000000..adf138eac85 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -0,0 +1,2258 @@ +/* SandyBridge-EP/IvyTown uncore support */ +#include "perf_event_intel_uncore.h" + + +/* SNB-EP Box level control */ +#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) +#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) +#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8) +#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16) +#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ + SNBEP_PMON_BOX_CTL_RST_CTRS | \ + SNBEP_PMON_BOX_CTL_FRZ_EN) +/* SNB-EP event control */ +#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff +#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00 +#define SNBEP_PMON_CTL_RST (1 << 17) +#define SNBEP_PMON_CTL_EDGE_DET (1 << 18) +#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) +#define SNBEP_PMON_CTL_EN (1 << 22) +#define SNBEP_PMON_CTL_INVERT (1 << 23) +#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000 +#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PMON_CTL_TRESH_MASK) + +/* SNB-EP Ubox event control */ +#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_U_MSR_PMON_CTL_TRESH_MASK) + +#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) +#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +/* SNB-EP PCU event control */ +#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 +#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30) +#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31) +#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) + +#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT) + +/* SNB-EP pci control register */ +#define SNBEP_PCI_PMON_BOX_CTL 0xf4 +#define SNBEP_PCI_PMON_CTL0 0xd8 +/* SNB-EP pci counter register */ +#define SNBEP_PCI_PMON_CTR0 0xa0 + +/* SNB-EP home agent register */ +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40 +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44 +#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48 +/* SNB-EP memory controller register */ +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0 +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0 +/* SNB-EP QPI register */ +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228 +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238 +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c + +/* SNB-EP Ubox register */ +#define SNBEP_U_MSR_PMON_CTR0 0xc16 +#define SNBEP_U_MSR_PMON_CTL0 0xc10 + +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08 +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09 + +/* SNB-EP Cbo register */ +#define SNBEP_C0_MSR_PMON_CTR0 0xd16 +#define SNBEP_C0_MSR_PMON_CTL0 0xd10 +#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 +#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 +#define SNBEP_CBO_MSR_OFFSET 0x20 + +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000 + +#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \ + .event = (e), \ + .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \ + .config_mask = (m), \ + .idx = (i) \ +} + +/* SNB-EP PCU register */ +#define SNBEP_PCU_MSR_PMON_CTR0 0xc36 +#define SNBEP_PCU_MSR_PMON_CTL0 0xc30 +#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff +#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc +#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd + +/* IVBEP event control */ +#define IVBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ + SNBEP_PMON_BOX_CTL_RST_CTRS) +#define IVBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_TRESH_MASK) +/* IVBEP Ubox */ +#define IVBEP_U_MSR_PMON_GLOBAL_CTL 0xc00 +#define IVBEP_U_PMON_GLOBAL_FRZ_ALL (1 << 31) +#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29) + +#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_U_MSR_PMON_CTL_TRESH_MASK) +/* IVBEP Cbo */ +#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK (IVBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63) + +/* IVBEP home agent */ +#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16) +#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK \ + (IVBEP_PMON_RAW_EVENT_MASK | \ + IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST) +/* IVBEP PCU */ +#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) +/* IVBEP QPI */ +#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ + (IVBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT) + +#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ + ((1ULL << (n)) - 1))) + +/* Haswell-EP Ubox */ +#define HSWEP_U_MSR_PMON_CTR0 0x705 +#define HSWEP_U_MSR_PMON_CTL0 0x709 +#define HSWEP_U_MSR_PMON_FILTER 0x707 + +#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL 0x703 +#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTR 0x704 + +#define HSWEP_U_MSR_PMON_BOX_FILTER_TID (0x1 << 0) +#define HSWEP_U_MSR_PMON_BOX_FILTER_CID (0x1fULL << 1) +#define HSWEP_U_MSR_PMON_BOX_FILTER_MASK \ + (HSWEP_U_MSR_PMON_BOX_FILTER_TID | \ + HSWEP_U_MSR_PMON_BOX_FILTER_CID) + +/* Haswell-EP CBo */ +#define HSWEP_C0_MSR_PMON_CTR0 0xe08 +#define HSWEP_C0_MSR_PMON_CTL0 0xe01 +#define HSWEP_C0_MSR_PMON_BOX_CTL 0xe00 +#define HSWEP_C0_MSR_PMON_BOX_FILTER0 0xe05 +#define HSWEP_CBO_MSR_OFFSET 0x10 + + +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_TID (0x3fULL << 0) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 6) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x7fULL << 17) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63) + + +/* Haswell-EP Sbox */ +#define HSWEP_S0_MSR_PMON_CTR0 0x726 +#define HSWEP_S0_MSR_PMON_CTL0 0x721 +#define HSWEP_S0_MSR_PMON_BOX_CTL 0x720 +#define HSWEP_SBOX_MSR_OFFSET 0xa +#define HSWEP_S_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +/* Haswell-EP PCU */ +#define HSWEP_PCU_MSR_PMON_CTR0 0x717 +#define HSWEP_PCU_MSR_PMON_CTL0 0x711 +#define HSWEP_PCU_MSR_PMON_BOX_CTL 0x710 +#define HSWEP_PCU_MSR_PMON_BOX_FILTER 0x715 + + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); +DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5"); +DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5"); +DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62"); +DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61"); +DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31"); +DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31"); +DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); + +static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config = 0; + + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } +} + +static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config = 0; + + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } +} + +static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); + pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + + return count; +} + +static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + + pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT); +} + +static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + } +} + +static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + } +} + +static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0)); + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + + if (msr) + wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); +} + +static struct attribute *snbep_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *snbep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute *snbep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_nid.attr, + &format_attr_filter_state.attr, + &format_attr_filter_opc.attr, + NULL, +}; + +static struct attribute *snbep_uncore_pcu_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + &format_attr_filter_band3.attr, + NULL, +}; + +static struct attribute *snbep_uncore_qpi_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_match_rds.attr, + &format_attr_match_rnid30.attr, + &format_attr_match_rnid4.attr, + &format_attr_match_dnid.attr, + &format_attr_match_mc.attr, + &format_attr_match_opc.attr, + &format_attr_match_vnw.attr, + &format_attr_match0.attr, + &format_attr_match1.attr, + &format_attr_mask_rds.attr, + &format_attr_mask_rnid30.attr, + &format_attr_mask_rnid4.attr, + &format_attr_mask_dnid.attr, + &format_attr_mask_mc.attr, + &format_attr_mask_opc.attr, + &format_attr_mask_vnw.attr, + &format_attr_mask0.attr, + &format_attr_mask1.attr, + NULL, +}; + +static struct uncore_event_desc snbep_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc snbep_uncore_qpi_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), + INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), + INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"), + { /* end: all zeroes */ }, +}; + +static struct attribute_group snbep_uncore_format_group = { + .name = "format", + .attrs = snbep_uncore_formats_attr, +}; + +static struct attribute_group snbep_uncore_ubox_format_group = { + .name = "format", + .attrs = snbep_uncore_ubox_formats_attr, +}; + +static struct attribute_group snbep_uncore_cbox_format_group = { + .name = "format", + .attrs = snbep_uncore_cbox_formats_attr, +}; + +static struct attribute_group snbep_uncore_pcu_format_group = { + .name = "format", + .attrs = snbep_uncore_pcu_formats_attr, +}; + +static struct attribute_group snbep_uncore_qpi_format_group = { + .name = "format", + .attrs = snbep_uncore_qpi_formats_attr, +}; + +#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ + .init_box = snbep_uncore_msr_init_box, \ + .disable_box = snbep_uncore_msr_disable_box, \ + .enable_box = snbep_uncore_msr_enable_box, \ + .disable_event = snbep_uncore_msr_disable_event, \ + .enable_event = snbep_uncore_msr_enable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops snbep_uncore_msr_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), +}; + +#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \ + .init_box = snbep_uncore_pci_init_box, \ + .disable_box = snbep_uncore_pci_disable_box, \ + .enable_box = snbep_uncore_pci_enable_box, \ + .disable_event = snbep_uncore_pci_disable_event, \ + .read_counter = snbep_uncore_pci_read_counter + +static struct intel_uncore_ops snbep_uncore_pci_ops = { + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_uncore_pci_enable_event, \ +}; + +static struct event_constraint snbep_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x1), + UNCORE_EVENT_CONSTRAINT(0x02, 0x3), + UNCORE_EVENT_CONSTRAINT(0x04, 0x3), + UNCORE_EVENT_CONSTRAINT(0x05, 0x3), + UNCORE_EVENT_CONSTRAINT(0x07, 0x3), + UNCORE_EVENT_CONSTRAINT(0x09, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), + EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2a, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x30, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snbep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + .event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_ubox_format_group, +}; + +static struct extra_reg snbep_uncore_cbox_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2), + EVENT_EXTRA_END +}; + +static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + int i; + + if (uncore_box_is_fake(box)) + return; + + for (i = 0; i < 5; i++) { + if (reg1->alloc & (0x1 << i)) + atomic_sub(1 << (i * 6), &er->ref); + } + reg1->alloc = 0; +} + +static struct event_constraint * +__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event, + u64 (*cbox_filter_mask)(int fields)) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + int i, alloc = 0; + unsigned long flags; + u64 mask; + + if (reg1->idx == EXTRA_REG_NONE) + return NULL; + + raw_spin_lock_irqsave(&er->lock, flags); + for (i = 0; i < 5; i++) { + if (!(reg1->idx & (0x1 << i))) + continue; + if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) + continue; + + mask = cbox_filter_mask(0x1 << i); + if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) || + !((reg1->config ^ er->config) & mask)) { + atomic_add(1 << (i * 6), &er->ref); + er->config &= ~mask; + er->config |= reg1->config & mask; + alloc |= (0x1 << i); + } else { + break; + } + } + raw_spin_unlock_irqrestore(&er->lock, flags); + if (i < 5) + goto fail; + + if (!uncore_box_is_fake(box)) + reg1->alloc |= alloc; + + return NULL; +fail: + for (; i >= 0; i--) { + if (alloc & (0x1 << i)) + atomic_sub(1 << (i * 6), &er->ref); + } + return &uncore_constraint_empty; +} + +static u64 snbep_cbox_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID; + if (fields & 0x4) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC; + + return mask; +} + +static struct event_constraint * +snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask); +} + +static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static struct intel_uncore_ops snbep_uncore_cbox_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snbep_cbox_hw_config, + .get_constraint = snbep_cbox_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type snbep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &snbep_uncore_cbox_ops, + .format_group = &snbep_uncore_cbox_format_group, +}; + +static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + u64 config = reg1->config; + + if (new_idx > reg1->idx) + config <<= 8 * (new_idx - reg1->idx); + else + config >>= 8 * (reg1->idx - new_idx); + + if (modify) { + hwc->config += new_idx - reg1->idx; + reg1->config = config; + reg1->idx = new_idx; + } + return config; +} + +static struct event_constraint * +snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + unsigned long flags; + int idx = reg1->idx; + u64 mask, config1 = reg1->config; + bool ok = false; + + if (reg1->idx == EXTRA_REG_NONE || + (!uncore_box_is_fake(box) && reg1->alloc)) + return NULL; +again: + mask = 0xffULL << (idx * 8); + raw_spin_lock_irqsave(&er->lock, flags); + if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) || + !((config1 ^ er->config) & mask)) { + atomic_add(1 << (idx * 8), &er->ref); + er->config &= ~mask; + er->config |= config1 & mask; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (!ok) { + idx = (idx + 1) % 4; + if (idx != reg1->idx) { + config1 = snbep_pcu_alter_er(event, idx, false); + goto again; + } + return &uncore_constraint_empty; + } + + if (!uncore_box_is_fake(box)) { + if (idx != reg1->idx) + snbep_pcu_alter_er(event, idx, true); + reg1->alloc = 1; + } + return NULL; +} + +static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + atomic_sub(1 << (reg1->idx * 8), &er->ref); + reg1->alloc = 0; +} + +static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; + + if (ev_sel >= 0xb && ev_sel <= 0xe) { + reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; + reg1->idx = ev_sel - 0xb; + reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8)); + } + return 0; +} + +static struct intel_uncore_ops snbep_uncore_pcu_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snbep_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type snbep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_pcu_ops, + .format_group = &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *snbep_msr_uncores[] = { + &snbep_uncore_ubox, + &snbep_uncore_cbox, + &snbep_uncore_pcu, + NULL, +}; + +void snbep_uncore_cpu_init(void) +{ + if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = snbep_msr_uncores; +} + +enum { + SNBEP_PCI_QPI_PORT0_FILTER, + SNBEP_PCI_QPI_PORT1_FILTER, +}; + +static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) { + reg1->idx = 0; + reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0; + reg1->config = event->attr.config1; + reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0; + reg2->config = event->attr.config2; + } + return 0; +} + +static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; + struct pci_dev *filter_pdev = uncore_extra_pci_dev[box->phys_id][idx]; + if (filter_pdev) { + pci_write_config_dword(filter_pdev, reg1->reg, + (u32)reg1->config); + pci_write_config_dword(filter_pdev, reg1->reg + 4, + (u32)(reg1->config >> 32)); + pci_write_config_dword(filter_pdev, reg2->reg, + (u32)reg2->config); + pci_write_config_dword(filter_pdev, reg2->reg + 4, + (u32)(reg2->config >> 32)); + } + } + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops snbep_uncore_qpi_ops = { + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_qpi_enable_event, + .hw_config = snbep_qpi_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +#define SNBEP_UNCORE_PCI_COMMON_INIT() \ + .perf_ctr = SNBEP_PCI_PMON_CTR0, \ + .event_ctl = SNBEP_PCI_PMON_CTL0, \ + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ + .ops = &snbep_uncore_pci_ops, \ + .format_group = &snbep_uncore_format_group + +static struct intel_uncore_type snbep_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = snbep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .event_descs = snbep_uncore_qpi_events, + .format_group = &snbep_uncore_qpi_format_group, +}; + + +static struct intel_uncore_type snbep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + SNBEP_PCI_UNCORE_HA, + SNBEP_PCI_UNCORE_IMC, + SNBEP_PCI_UNCORE_QPI, + SNBEP_PCI_UNCORE_R2PCIE, + SNBEP_PCI_UNCORE_R3QPI, +}; + +static struct intel_uncore_type *snbep_pci_uncores[] = { + [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha, + [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc, + [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi, + [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie, + [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi, + NULL, +}; + +static const struct pci_device_id snbep_uncore_pci_ids[] = { + { /* Home Agent */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0), + }, + { /* MC Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0), + }, + { /* MC Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1), + }, + { /* MC Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2), + }, + { /* MC Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3), + }, + { /* QPI Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0), + }, + { /* QPI Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0), + }, + { /* R3QPI Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snbep_uncore_pci_driver = { + .name = "snbep_uncore", + .id_table = snbep_uncore_pci_ids, +}; + +/* + * build pci bus to socket mapping + */ +static int snbep_pci2phy_map_init(int devid) +{ + struct pci_dev *ubox_dev = NULL; + int i, bus, nodeid; + int err = 0; + u32 config = 0; + + while (1) { + /* find the UBOX device */ + ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev); + if (!ubox_dev) + break; + bus = ubox_dev->bus->number; + /* get the Node ID of the local register */ + err = pci_read_config_dword(ubox_dev, 0x40, &config); + if (err) + break; + nodeid = config; + /* get the Node ID mapping */ + err = pci_read_config_dword(ubox_dev, 0x54, &config); + if (err) + break; + /* + * every three bits in the Node ID mapping register maps + * to a particular node. + */ + for (i = 0; i < 8; i++) { + if (nodeid == ((config >> (3 * i)) & 0x7)) { + uncore_pcibus_to_physid[bus] = i; + break; + } + } + } + + if (!err) { + /* + * For PCI bus with no UBOX device, find the next bus + * that has UBOX device and use its mapping. + */ + i = -1; + for (bus = 255; bus >= 0; bus--) { + if (uncore_pcibus_to_physid[bus] >= 0) + i = uncore_pcibus_to_physid[bus]; + else + uncore_pcibus_to_physid[bus] = i; + } + } + + if (ubox_dev) + pci_dev_put(ubox_dev); + + return err ? pcibios_err_to_errno(err) : 0; +} + +int snbep_uncore_pci_init(void) +{ + int ret = snbep_pci2phy_map_init(0x3ce0); + if (ret) + return ret; + uncore_pci_uncores = snbep_pci_uncores; + uncore_pci_driver = &snbep_uncore_pci_driver; + return 0; +} +/* end of Sandy Bridge-EP uncore support */ + +/* IvyTown uncore support */ +static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + if (msr) + wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT); +} + +static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + + pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT); +} + +#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT() \ + .init_box = ivbep_uncore_msr_init_box, \ + .disable_box = snbep_uncore_msr_disable_box, \ + .enable_box = snbep_uncore_msr_enable_box, \ + .disable_event = snbep_uncore_msr_disable_event, \ + .enable_event = snbep_uncore_msr_enable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops ivbep_uncore_msr_ops = { + IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), +}; + +static struct intel_uncore_ops ivbep_uncore_pci_ops = { + .init_box = ivbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +#define IVBEP_UNCORE_PCI_COMMON_INIT() \ + .perf_ctr = SNBEP_PCI_PMON_CTR0, \ + .event_ctl = SNBEP_PCI_PMON_CTL0, \ + .event_mask = IVBEP_PMON_RAW_EVENT_MASK, \ + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ + .ops = &ivbep_uncore_pci_ops, \ + .format_group = &ivbep_uncore_format_group + +static struct attribute *ivbep_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *ivbep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute *ivbep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_link.attr, + &format_attr_filter_state2.attr, + &format_attr_filter_nid2.attr, + &format_attr_filter_opc2.attr, + &format_attr_filter_nc.attr, + &format_attr_filter_c6.attr, + &format_attr_filter_isoc.attr, + NULL, +}; + +static struct attribute *ivbep_uncore_pcu_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + &format_attr_filter_band3.attr, + NULL, +}; + +static struct attribute *ivbep_uncore_qpi_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_thresh8.attr, + &format_attr_match_rds.attr, + &format_attr_match_rnid30.attr, + &format_attr_match_rnid4.attr, + &format_attr_match_dnid.attr, + &format_attr_match_mc.attr, + &format_attr_match_opc.attr, + &format_attr_match_vnw.attr, + &format_attr_match0.attr, + &format_attr_match1.attr, + &format_attr_mask_rds.attr, + &format_attr_mask_rnid30.attr, + &format_attr_mask_rnid4.attr, + &format_attr_mask_dnid.attr, + &format_attr_mask_mc.attr, + &format_attr_mask_opc.attr, + &format_attr_mask_vnw.attr, + &format_attr_mask0.attr, + &format_attr_mask1.attr, + NULL, +}; + +static struct attribute_group ivbep_uncore_format_group = { + .name = "format", + .attrs = ivbep_uncore_formats_attr, +}; + +static struct attribute_group ivbep_uncore_ubox_format_group = { + .name = "format", + .attrs = ivbep_uncore_ubox_formats_attr, +}; + +static struct attribute_group ivbep_uncore_cbox_format_group = { + .name = "format", + .attrs = ivbep_uncore_cbox_formats_attr, +}; + +static struct attribute_group ivbep_uncore_pcu_format_group = { + .name = "format", + .attrs = ivbep_uncore_pcu_formats_attr, +}; + +static struct attribute_group ivbep_uncore_qpi_format_group = { + .name = "format", + .attrs = ivbep_uncore_qpi_formats_attr, +}; + +static struct intel_uncore_type ivbep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + .event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = IVBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_ubox_format_group, +}; + +static struct extra_reg ivbep_uncore_cbox_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8), + EVENT_EXTRA_END +}; + +static u64 ivbep_cbox_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK; + if (fields & 0x4) + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID; + if (fields & 0x10) { + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC; + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NC; + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_C6; + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC; + } + + return mask; +} + +static struct event_constraint * +ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask); +} + +static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + u64 filter = uncore_shared_reg_config(box, 0); + wrmsrl(reg1->reg, filter & 0xffffffff); + wrmsrl(reg1->reg + 6, filter >> 32); + } + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops ivbep_uncore_cbox_ops = { + .init_box = ivbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = ivbep_cbox_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = ivbep_cbox_hw_config, + .get_constraint = ivbep_cbox_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type ivbep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 15, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &ivbep_uncore_cbox_ops, + .format_group = &ivbep_uncore_cbox_format_group, +}; + +static struct intel_uncore_ops ivbep_uncore_pcu_ops = { + IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snbep_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type ivbep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &ivbep_uncore_pcu_ops, + .format_group = &ivbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *ivbep_msr_uncores[] = { + &ivbep_uncore_ubox, + &ivbep_uncore_cbox, + &ivbep_uncore_pcu, + NULL, +}; + +void ivbep_uncore_cpu_init(void) +{ + if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = ivbep_msr_uncores; +} + +static struct intel_uncore_type ivbep_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + IVBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type ivbep_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = snbep_uncore_imc_events, + IVBEP_UNCORE_PCI_COMMON_INIT(), +}; + +/* registers in IRP boxes are not properly aligned */ +static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4}; +static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0}; + +static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], + hwc->config | SNBEP_PMON_CTL_EN); +} + +static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config); +} + +static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count); + pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); + + return count; +} + +static struct intel_uncore_ops ivbep_uncore_irp_ops = { + .init_box = ivbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = ivbep_uncore_irp_disable_event, + .enable_event = ivbep_uncore_irp_enable_event, + .read_counter = ivbep_uncore_irp_read_counter, +}; + +static struct intel_uncore_type ivbep_uncore_irp = { + .name = "irp", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_mask = IVBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &ivbep_uncore_irp_ops, + .format_group = &ivbep_uncore_format_group, +}; + +static struct intel_uncore_ops ivbep_uncore_qpi_ops = { + .init_box = ivbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_qpi_enable_event, + .read_counter = snbep_uncore_pci_read_counter, + .hw_config = snbep_qpi_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type ivbep_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &ivbep_uncore_qpi_ops, + .format_group = &ivbep_uncore_qpi_format_group, +}; + +static struct intel_uncore_type ivbep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r2pcie_constraints, + IVBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type ivbep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + IVBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + IVBEP_PCI_UNCORE_HA, + IVBEP_PCI_UNCORE_IMC, + IVBEP_PCI_UNCORE_IRP, + IVBEP_PCI_UNCORE_QPI, + IVBEP_PCI_UNCORE_R2PCIE, + IVBEP_PCI_UNCORE_R3QPI, +}; + +static struct intel_uncore_type *ivbep_pci_uncores[] = { + [IVBEP_PCI_UNCORE_HA] = &ivbep_uncore_ha, + [IVBEP_PCI_UNCORE_IMC] = &ivbep_uncore_imc, + [IVBEP_PCI_UNCORE_IRP] = &ivbep_uncore_irp, + [IVBEP_PCI_UNCORE_QPI] = &ivbep_uncore_qpi, + [IVBEP_PCI_UNCORE_R2PCIE] = &ivbep_uncore_r2pcie, + [IVBEP_PCI_UNCORE_R3QPI] = &ivbep_uncore_r3qpi, + NULL, +}; + +static const struct pci_device_id ivbep_uncore_pci_ids[] = { + { /* Home Agent 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0), + }, + { /* Home Agent 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1), + }, + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1), + }, + { /* MC0 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2), + }, + { /* MC0 Channel 4 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3), + }, + { /* MC1 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4), + }, + { /* MC1 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5), + }, + { /* MC1 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6), + }, + { /* MC1 Channel 4 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0), + }, + { /* QPI0 Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0), + }, + { /* QPI0 Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1), + }, + { /* QPI1 Port 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0), + }, + { /* R3QPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1), + }, + { /* R3QPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver ivbep_uncore_pci_driver = { + .name = "ivbep_uncore", + .id_table = ivbep_uncore_pci_ids, +}; + +int ivbep_uncore_pci_init(void) +{ + int ret = snbep_pci2phy_map_init(0x0e1e); + if (ret) + return ret; + uncore_pci_uncores = ivbep_pci_uncores; + uncore_pci_driver = &ivbep_uncore_pci_driver; + return 0; +} +/* end of IvyTown uncore support */ + +/* Haswell-EP uncore support */ +static struct attribute *hswep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + &format_attr_filter_tid2.attr, + &format_attr_filter_cid.attr, + NULL, +}; + +static struct attribute_group hswep_uncore_ubox_format_group = { + .name = "format", + .attrs = hswep_uncore_ubox_formats_attr, +}; + +static int hswep_ubox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + reg1->reg = HSWEP_U_MSR_PMON_FILTER; + reg1->config = event->attr.config1 & HSWEP_U_MSR_PMON_BOX_FILTER_MASK; + reg1->idx = 0; + return 0; +} + +static struct intel_uncore_ops hswep_uncore_ubox_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = hswep_ubox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type hswep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = HSWEP_U_MSR_PMON_CTR0, + .event_ctl = HSWEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, + .num_shared_regs = 1, + .ops = &hswep_uncore_ubox_ops, + .format_group = &hswep_uncore_ubox_format_group, +}; + +static struct attribute *hswep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid3.attr, + &format_attr_filter_link2.attr, + &format_attr_filter_state3.attr, + &format_attr_filter_nid2.attr, + &format_attr_filter_opc2.attr, + &format_attr_filter_nc.attr, + &format_attr_filter_c6.attr, + &format_attr_filter_isoc.attr, + NULL, +}; + +static struct attribute_group hswep_uncore_cbox_format_group = { + .name = "format", + .attrs = hswep_uncore_cbox_formats_attr, +}; + +static struct event_constraint hswep_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x1), + UNCORE_EVENT_CONSTRAINT(0x09, 0x1), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), + UNCORE_EVENT_CONSTRAINT(0x3e, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct extra_reg hswep_uncore_cbox_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4028, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4032, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4029, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4033, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x402A, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x12), + SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8), + EVENT_EXTRA_END +}; + +static u64 hswep_cbox_filter_mask(int fields) +{ + u64 mask = 0; + if (fields & 0x1) + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK; + if (fields & 0x4) + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NID; + if (fields & 0x10) { + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC; + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NC; + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_C6; + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC; + } + return mask; +} + +static struct event_constraint * +hswep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, hswep_cbox_filter_mask); +} + +static int hswep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = hswep_uncore_cbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + + HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & hswep_cbox_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static void hswep_cbox_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + u64 filter = uncore_shared_reg_config(box, 0); + wrmsrl(reg1->reg, filter & 0xffffffff); + wrmsrl(reg1->reg + 1, filter >> 32); + } + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops hswep_uncore_cbox_ops = { + .init_box = snbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = hswep_cbox_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = hswep_cbox_hw_config, + .get_constraint = hswep_cbox_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type hswep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 18, + .perf_ctr_bits = 44, + .event_ctl = HSWEP_C0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = hswep_uncore_cbox_constraints, + .ops = &hswep_uncore_cbox_ops, + .format_group = &hswep_uncore_cbox_format_group, +}; + +static struct attribute *hswep_uncore_sbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group hswep_uncore_sbox_format_group = { + .name = "format", + .attrs = hswep_uncore_sbox_formats_attr, +}; + +static struct intel_uncore_type hswep_uncore_sbox = { + .name = "sbox", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 44, + .event_ctl = HSWEP_S0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, + .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_SBOX_MSR_OFFSET, + .ops = &snbep_uncore_msr_ops, + .format_group = &hswep_uncore_sbox_format_group, +}; + +static int hswep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; + + if (ev_sel >= 0xb && ev_sel <= 0xe) { + reg1->reg = HSWEP_PCU_MSR_PMON_BOX_FILTER; + reg1->idx = ev_sel - 0xb; + reg1->config = event->attr.config1 & (0xff << reg1->idx); + } + return 0; +} + +static struct intel_uncore_ops hswep_uncore_pcu_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = hswep_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type hswep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0, + .event_ctl = HSWEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &hswep_uncore_pcu_ops, + .format_group = &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *hswep_msr_uncores[] = { + &hswep_uncore_ubox, + &hswep_uncore_cbox, + &hswep_uncore_sbox, + &hswep_uncore_pcu, + NULL, +}; + +void hswep_uncore_cpu_init(void) +{ + if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = hswep_msr_uncores; +} + +static struct intel_uncore_type hswep_uncore_ha = { + .name = "ha", + .num_counters = 5, + .num_boxes = 2, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct uncore_event_desc hswep_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type hswep_uncore_imc = { + .name = "imc", + .num_counters = 5, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = hswep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_ops hswep_uncore_irp_ops = { + .init_box = snbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = ivbep_uncore_irp_disable_event, + .enable_event = ivbep_uncore_irp_enable_event, + .read_counter = ivbep_uncore_irp_read_counter, +}; + +static struct intel_uncore_type hswep_uncore_irp = { + .name = "irp", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &hswep_uncore_irp_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type hswep_uncore_qpi = { + .name = "qpi", + .num_counters = 5, + .num_boxes = 3, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .format_group = &snbep_uncore_qpi_format_group, +}; + +static struct event_constraint hswep_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x1), + UNCORE_EVENT_CONSTRAINT(0x24, 0x1), + UNCORE_EVENT_CONSTRAINT(0x25, 0x1), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x27, 0x1), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2a, 0x1), + UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type hswep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .constraints = hswep_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct event_constraint hswep_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x3), + UNCORE_EVENT_CONSTRAINT(0x07, 0x7), + UNCORE_EVENT_CONSTRAINT(0x08, 0x7), + UNCORE_EVENT_CONSTRAINT(0x09, 0x7), + UNCORE_EVENT_CONSTRAINT(0x0a, 0x7), + UNCORE_EVENT_CONSTRAINT(0x0e, 0x7), + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x14, 0x3), + UNCORE_EVENT_CONSTRAINT(0x15, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type hswep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 44, + .constraints = hswep_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + HSWEP_PCI_UNCORE_HA, + HSWEP_PCI_UNCORE_IMC, + HSWEP_PCI_UNCORE_IRP, + HSWEP_PCI_UNCORE_QPI, + HSWEP_PCI_UNCORE_R2PCIE, + HSWEP_PCI_UNCORE_R3QPI, +}; + +static struct intel_uncore_type *hswep_pci_uncores[] = { + [HSWEP_PCI_UNCORE_HA] = &hswep_uncore_ha, + [HSWEP_PCI_UNCORE_IMC] = &hswep_uncore_imc, + [HSWEP_PCI_UNCORE_IRP] = &hswep_uncore_irp, + [HSWEP_PCI_UNCORE_QPI] = &hswep_uncore_qpi, + [HSWEP_PCI_UNCORE_R2PCIE] = &hswep_uncore_r2pcie, + [HSWEP_PCI_UNCORE_R3QPI] = &hswep_uncore_r3qpi, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(hswep_uncore_pci_ids) = { + { /* Home Agent 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0), + }, + { /* Home Agent 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f38), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 1), + }, + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb0), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb1), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 1), + }, + { /* MC0 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb4), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 2), + }, + { /* MC0 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb5), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 3), + }, + { /* MC1 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd0), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 4), + }, + { /* MC1 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd1), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 5), + }, + { /* MC1 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd4), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 6), + }, + { /* MC1 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd5), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 7), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f39), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IRP, 0), + }, + { /* QPI0 Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f32), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 0), + }, + { /* QPI0 Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f33), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 1), + }, + { /* QPI1 Port 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3a), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 2), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f34), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R2PCIE, 0), + }, + { /* R3QPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f36), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f37), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 1), + }, + { /* R3QPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3e), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 2), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 1 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver hswep_uncore_pci_driver = { + .name = "hswep_uncore", + .id_table = hswep_uncore_pci_ids, +}; + +int hswep_uncore_pci_init(void) +{ + int ret = snbep_pci2phy_map_init(0x2f1e); + if (ret) + return ret; + uncore_pci_uncores = hswep_pci_uncores; + uncore_pci_driver = &hswep_uncore_pci_driver; + return 0; +} +/* end of Haswell-EP uncore support */ diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c index 838fa8772c6..5b0c232d1ee 100644 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ b/arch/x86/kernel/cpu/perf_event_knc.c @@ -217,7 +217,7 @@ static int knc_pmu_handle_irq(struct pt_regs *regs) int bit, loops; u64 status; - cpuc = &__get_cpu_var(cpu_hw_events); + cpuc = this_cpu_ptr(&cpu_hw_events); knc_pmu_disable_all(); diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 5d466b7d860..f2e56783af3 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -915,7 +915,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event) static void p4_pmu_disable_all(void) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -984,7 +984,7 @@ static void p4_pmu_enable_event(struct perf_event *event) static void p4_pmu_enable_all(int added) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -1004,7 +1004,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - cpuc = &__get_cpu_var(cpu_hw_events); + cpuc = this_cpu_ptr(&cpu_hw_events); for (idx = 0; idx < x86_pmu.num_counters; idx++) { int overflow; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 06fe3ed8b85..5433658e598 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -97,6 +97,14 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (cpu_has(c, i) && x86_cap_flags[i] != NULL) seq_printf(m, " %s", x86_cap_flags[i]); + seq_printf(m, "\nbugs\t\t:"); + for (i = 0; i < 32*NBUGINTS; i++) { + unsigned int bug_bit = 32*NCAPINTS + i; + + if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i]) + seq_printf(m, " %s", x86_bug_flags[i]); + } + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), (c->loops_per_jiffy/(5000/HZ)) % 100); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index b6f794aa169..4a8013d5594 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -38,7 +38,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 }, diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 507de806659..f5ab56d1428 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -4,9 +4,14 @@ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) * * Copyright (C) IBM Corporation, 2004. All rights reserved. + * Copyright (C) Red Hat Inc., 2014. All rights reserved. + * Authors: + * Vivek Goyal <vgoyal@redhat.com> * */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/smp.h> @@ -16,6 +21,7 @@ #include <linux/elf.h> #include <linux/elfcore.h> #include <linux/module.h> +#include <linux/slab.h> #include <asm/processor.h> #include <asm/hardirq.h> @@ -28,6 +34,45 @@ #include <asm/reboot.h> #include <asm/virtext.h> +/* Alignment required for elf header segment */ +#define ELF_CORE_HEADER_ALIGN 4096 + +/* This primarily represents number of split ranges due to exclusion */ +#define CRASH_MAX_RANGES 16 + +struct crash_mem_range { + u64 start, end; +}; + +struct crash_mem { + unsigned int nr_ranges; + struct crash_mem_range ranges[CRASH_MAX_RANGES]; +}; + +/* Misc data about ram ranges needed to prepare elf headers */ +struct crash_elf_data { + struct kimage *image; + /* + * Total number of ram ranges we have after various adjustments for + * GART, crash reserved region etc. + */ + unsigned int max_nr_ranges; + unsigned long gart_start, gart_end; + + /* Pointer to elf header */ + void *ehdr; + /* Pointer to next phdr */ + void *bufp; + struct crash_mem mem; +}; + +/* Used while preparing memory map entries for second kernel */ +struct crash_memmap_data { + struct boot_params *params; + /* Type of memory */ + unsigned int type; +}; + int in_crash_kexec; /* @@ -39,6 +84,7 @@ int in_crash_kexec; */ crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); +unsigned long crash_zero_bytes; static inline void cpu_crash_vmclear_loaded_vmcss(void) { @@ -135,3 +181,520 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #endif crash_save_cpu(regs, safe_smp_processor_id()); } + +#ifdef CONFIG_KEXEC_FILE +static int get_nr_ram_ranges_callback(unsigned long start_pfn, + unsigned long nr_pfn, void *arg) +{ + int *nr_ranges = arg; + + (*nr_ranges)++; + return 0; +} + +static int get_gart_ranges_callback(u64 start, u64 end, void *arg) +{ + struct crash_elf_data *ced = arg; + + ced->gart_start = start; + ced->gart_end = end; + + /* Not expecting more than 1 gart aperture */ + return 1; +} + + +/* Gather all the required information to prepare elf headers for ram regions */ +static void fill_up_crash_elf_data(struct crash_elf_data *ced, + struct kimage *image) +{ + unsigned int nr_ranges = 0; + + ced->image = image; + + walk_system_ram_range(0, -1, &nr_ranges, + get_nr_ram_ranges_callback); + + ced->max_nr_ranges = nr_ranges; + + /* + * We don't create ELF headers for GART aperture as an attempt + * to dump this memory in second kernel leads to hang/crash. + * If gart aperture is present, one needs to exclude that region + * and that could lead to need of extra phdr. + */ + walk_iomem_res("GART", IORESOURCE_MEM, 0, -1, + ced, get_gart_ranges_callback); + + /* + * If we have gart region, excluding that could potentially split + * a memory range, resulting in extra header. Account for that. + */ + if (ced->gart_end) + ced->max_nr_ranges++; + + /* Exclusion of crash region could split memory ranges */ + ced->max_nr_ranges++; + + /* If crashk_low_res is not 0, another range split possible */ + if (crashk_low_res.end) + ced->max_nr_ranges++; +} + +static int exclude_mem_range(struct crash_mem *mem, + unsigned long long mstart, unsigned long long mend) +{ + int i, j; + unsigned long long start, end; + struct crash_mem_range temp_range = {0, 0}; + + for (i = 0; i < mem->nr_ranges; i++) { + start = mem->ranges[i].start; + end = mem->ranges[i].end; + + if (mstart > end || mend < start) + continue; + + /* Truncate any area outside of range */ + if (mstart < start) + mstart = start; + if (mend > end) + mend = end; + + /* Found completely overlapping range */ + if (mstart == start && mend == end) { + mem->ranges[i].start = 0; + mem->ranges[i].end = 0; + if (i < mem->nr_ranges - 1) { + /* Shift rest of the ranges to left */ + for (j = i; j < mem->nr_ranges - 1; j++) { + mem->ranges[j].start = + mem->ranges[j+1].start; + mem->ranges[j].end = + mem->ranges[j+1].end; + } + } + mem->nr_ranges--; + return 0; + } + + if (mstart > start && mend < end) { + /* Split original range */ + mem->ranges[i].end = mstart - 1; + temp_range.start = mend + 1; + temp_range.end = end; + } else if (mstart != start) + mem->ranges[i].end = mstart - 1; + else + mem->ranges[i].start = mend + 1; + break; + } + + /* If a split happend, add the split to array */ + if (!temp_range.end) + return 0; + + /* Split happened */ + if (i == CRASH_MAX_RANGES - 1) { + pr_err("Too many crash ranges after split\n"); + return -ENOMEM; + } + + /* Location where new range should go */ + j = i + 1; + if (j < mem->nr_ranges) { + /* Move over all ranges one slot towards the end */ + for (i = mem->nr_ranges - 1; i >= j; i--) + mem->ranges[i + 1] = mem->ranges[i]; + } + + mem->ranges[j].start = temp_range.start; + mem->ranges[j].end = temp_range.end; + mem->nr_ranges++; + return 0; +} + +/* + * Look for any unwanted ranges between mstart, mend and remove them. This + * might lead to split and split ranges are put in ced->mem.ranges[] array + */ +static int elf_header_exclude_ranges(struct crash_elf_data *ced, + unsigned long long mstart, unsigned long long mend) +{ + struct crash_mem *cmem = &ced->mem; + int ret = 0; + + memset(cmem->ranges, 0, sizeof(cmem->ranges)); + + cmem->ranges[0].start = mstart; + cmem->ranges[0].end = mend; + cmem->nr_ranges = 1; + + /* Exclude crashkernel region */ + ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + if (ret) + return ret; + + if (crashk_low_res.end) { + ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); + if (ret) + return ret; + } + + /* Exclude GART region */ + if (ced->gart_end) { + ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end); + if (ret) + return ret; + } + + return ret; +} + +static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) +{ + struct crash_elf_data *ced = arg; + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long mstart, mend; + struct kimage *image = ced->image; + struct crash_mem *cmem; + int ret, i; + + ehdr = ced->ehdr; + + /* Exclude unwanted mem ranges */ + ret = elf_header_exclude_ranges(ced, start, end); + if (ret) + return ret; + + /* Go through all the ranges in ced->mem.ranges[] and prepare phdr */ + cmem = &ced->mem; + + for (i = 0; i < cmem->nr_ranges; i++) { + mstart = cmem->ranges[i].start; + mend = cmem->ranges[i].end; + + phdr = ced->bufp; + ced->bufp += sizeof(Elf64_Phdr); + + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mstart; + + /* + * If a range matches backup region, adjust offset to backup + * segment. + */ + if (mstart == image->arch.backup_src_start && + (mend - mstart + 1) == image->arch.backup_src_sz) + phdr->p_offset = image->arch.backup_load_addr; + + phdr->p_paddr = mstart; + phdr->p_vaddr = (unsigned long long) __va(mstart); + phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; + phdr->p_align = 0; + ehdr->e_phnum++; + pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, + ehdr->e_phnum, phdr->p_offset); + } + + return ret; +} + +static int prepare_elf64_headers(struct crash_elf_data *ced, + void **addr, unsigned long *sz) +{ + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; + unsigned char *buf, *bufp; + unsigned int cpu; + unsigned long long notes_addr; + int ret; + + /* extra phdr for vmcoreinfo elf note */ + nr_phdr = nr_cpus + 1; + nr_phdr += ced->max_nr_ranges; + + /* + * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping + * area on x86_64 (ffffffff80000000 - ffffffffa0000000). + * I think this is required by tools like gdb. So same physical + * memory will be mapped in two elf headers. One will contain kernel + * text virtual addresses and other will have __va(physical) addresses. + */ + + nr_phdr++; + elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); + elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); + + buf = vzalloc(elf_sz); + if (!buf) + return -ENOMEM; + + bufp = buf; + ehdr = (Elf64_Ehdr *)bufp; + bufp += sizeof(Elf64_Ehdr); + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELF_OSABI; + memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_phoff = sizeof(Elf64_Ehdr); + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_phentsize = sizeof(Elf64_Phdr); + + /* Prepare one phdr of type PT_NOTE for each present cpu */ + for_each_present_cpu(cpu) { + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_NOTE; + notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); + phdr->p_offset = phdr->p_paddr = notes_addr; + phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); + (ehdr->e_phnum)++; + } + + /* Prepare one PT_NOTE header for vmcoreinfo */ + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_NOTE; + phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); + phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note); + (ehdr->e_phnum)++; + +#ifdef CONFIG_X86_64 + /* Prepare PT_LOAD type program header for kernel text region */ + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_vaddr = (Elf64_Addr)_text; + phdr->p_filesz = phdr->p_memsz = _end - _text; + phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); + (ehdr->e_phnum)++; +#endif + + /* Prepare PT_LOAD headers for system ram chunks. */ + ced->ehdr = ehdr; + ced->bufp = bufp; + ret = walk_system_ram_res(0, -1, ced, + prepare_elf64_ram_headers_callback); + if (ret < 0) + return ret; + + *addr = buf; + *sz = elf_sz; + return 0; +} + +/* Prepare elf headers. Return addr and size */ +static int prepare_elf_headers(struct kimage *image, void **addr, + unsigned long *sz) +{ + struct crash_elf_data *ced; + int ret; + + ced = kzalloc(sizeof(*ced), GFP_KERNEL); + if (!ced) + return -ENOMEM; + + fill_up_crash_elf_data(ced, image); + + /* By default prepare 64bit headers */ + ret = prepare_elf64_headers(ced, addr, sz); + kfree(ced); + return ret; +} + +static int add_e820_entry(struct boot_params *params, struct e820entry *entry) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = params->e820_entries; + if (nr_e820_entries >= E820MAX) + return 1; + + memcpy(¶ms->e820_map[nr_e820_entries], entry, + sizeof(struct e820entry)); + params->e820_entries++; + return 0; +} + +static int memmap_entry_callback(u64 start, u64 end, void *arg) +{ + struct crash_memmap_data *cmd = arg; + struct boot_params *params = cmd->params; + struct e820entry ei; + + ei.addr = start; + ei.size = end - start + 1; + ei.type = cmd->type; + add_e820_entry(params, &ei); + + return 0; +} + +static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, + unsigned long long mstart, + unsigned long long mend) +{ + unsigned long start, end; + int ret = 0; + + cmem->ranges[0].start = mstart; + cmem->ranges[0].end = mend; + cmem->nr_ranges = 1; + + /* Exclude Backup region */ + start = image->arch.backup_load_addr; + end = start + image->arch.backup_src_sz - 1; + ret = exclude_mem_range(cmem, start, end); + if (ret) + return ret; + + /* Exclude elf header region */ + start = image->arch.elf_load_addr; + end = start + image->arch.elf_headers_sz - 1; + return exclude_mem_range(cmem, start, end); +} + +/* Prepare memory map for crash dump kernel */ +int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) +{ + int i, ret = 0; + unsigned long flags; + struct e820entry ei; + struct crash_memmap_data cmd; + struct crash_mem *cmem; + + cmem = vzalloc(sizeof(struct crash_mem)); + if (!cmem) + return -ENOMEM; + + memset(&cmd, 0, sizeof(struct crash_memmap_data)); + cmd.params = params; + + /* Add first 640K segment */ + ei.addr = image->arch.backup_src_start; + ei.size = image->arch.backup_src_sz; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + + /* Add ACPI tables */ + cmd.type = E820_ACPI; + flags = IORESOURCE_MEM | IORESOURCE_BUSY; + walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add ACPI Non-volatile Storage */ + cmd.type = E820_NVS; + walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add crashk_low_res region */ + if (crashk_low_res.end) { + ei.addr = crashk_low_res.start; + ei.size = crashk_low_res.end - crashk_low_res.start + 1; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + } + + /* Exclude some ranges from crashk_res and add rest to memmap */ + ret = memmap_exclude_ranges(image, cmem, crashk_res.start, + crashk_res.end); + if (ret) + goto out; + + for (i = 0; i < cmem->nr_ranges; i++) { + ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1; + + /* If entry is less than a page, skip it */ + if (ei.size < PAGE_SIZE) + continue; + ei.addr = cmem->ranges[i].start; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + } + +out: + vfree(cmem); + return ret; +} + +static int determine_backup_region(u64 start, u64 end, void *arg) +{ + struct kimage *image = arg; + + image->arch.backup_src_start = start; + image->arch.backup_src_sz = end - start + 1; + + /* Expecting only one range for backup region */ + return 1; +} + +int crash_load_segments(struct kimage *image) +{ + unsigned long src_start, src_sz, elf_sz; + void *elf_addr; + int ret; + + /* + * Determine and load a segment for backup area. First 640K RAM + * region is backup source + */ + + ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END, + image, determine_backup_region); + + /* Zero or postive return values are ok */ + if (ret < 0) + return ret; + + src_start = image->arch.backup_src_start; + src_sz = image->arch.backup_src_sz; + + /* Add backup segment. */ + if (src_sz) { + /* + * Ideally there is no source for backup segment. This is + * copied in purgatory after crash. Just add a zero filled + * segment for now to make sure checksum logic works fine. + */ + ret = kexec_add_buffer(image, (char *)&crash_zero_bytes, + sizeof(crash_zero_bytes), src_sz, + PAGE_SIZE, 0, -1, 0, + &image->arch.backup_load_addr); + if (ret) + return ret; + pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n", + image->arch.backup_load_addr, src_start, src_sz); + } + + /* Prepare elf headers and add a segment */ + ret = prepare_elf_headers(image, &elf_addr, &elf_sz); + if (ret) + return ret; + + image->arch.elf_headers = elf_addr; + image->arch.elf_headers_sz = elf_sz; + + ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz, + ELF_CORE_HEADER_ALIGN, 0, -1, 0, + &image->arch.elf_load_addr); + if (ret) { + vfree((void *)image->arch.elf_headers); + return ret; + } + pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->arch.elf_load_addr, elf_sz, elf_sz); + + return ret; +} +#endif /* CONFIG_KEXEC_FILE */ diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 7db54b5d5f8..3d350335124 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -21,6 +21,7 @@ #include <asm/apic.h> #include <asm/pci_x86.h> #include <asm/setup.h> +#include <asm/i8259.h> __initdata u64 initial_dtb; char __initdata cmd_line[COMMAND_LINE_SIZE]; @@ -165,82 +166,6 @@ static void __init dtb_lapic_setup(void) #ifdef CONFIG_X86_IO_APIC static unsigned int ioapic_id; -static void __init dtb_add_ioapic(struct device_node *dn) -{ - struct resource r; - int ret; - - ret = of_address_to_resource(dn, 0, &r); - if (ret) { - printk(KERN_ERR "Can't obtain address from node %s.\n", - dn->full_name); - return; - } - mp_register_ioapic(++ioapic_id, r.start, gsi_top); -} - -static void __init dtb_ioapic_setup(void) -{ - struct device_node *dn; - - for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic") - dtb_add_ioapic(dn); - - if (nr_ioapics) { - of_ioapic = 1; - return; - } - printk(KERN_ERR "Error: No information about IO-APIC in OF.\n"); -} -#else -static void __init dtb_ioapic_setup(void) {} -#endif - -static void __init dtb_apic_setup(void) -{ - dtb_lapic_setup(); - dtb_ioapic_setup(); -} - -#ifdef CONFIG_OF_FLATTREE -static void __init x86_flattree_get_config(void) -{ - u32 size, map_len; - void *dt; - - if (!initial_dtb) - return; - - map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); - - initial_boot_params = dt = early_memremap(initial_dtb, map_len); - size = of_get_flat_dt_size(); - if (map_len < size) { - early_iounmap(dt, map_len); - initial_boot_params = dt = early_memremap(initial_dtb, size); - map_len = size; - } - - unflatten_and_copy_device_tree(); - early_iounmap(dt, map_len); -} -#else -static inline void x86_flattree_get_config(void) { } -#endif - -void __init x86_dtb_init(void) -{ - x86_flattree_get_config(); - - if (!of_have_populated_dt()) - return; - - dtb_setup_hpet(); - dtb_apic_setup(); -} - -#ifdef CONFIG_X86_IO_APIC - struct of_ioapic_type { u32 out_type; u32 trigger; @@ -276,10 +201,8 @@ static int ioapic_xlate(struct irq_domain *domain, const u32 *intspec, u32 intsize, irq_hw_number_t *out_hwirq, u32 *out_type) { - struct io_apic_irq_attr attr; struct of_ioapic_type *it; - u32 line, idx; - int rc; + u32 line, idx, gsi; if (WARN_ON(intsize < 2)) return -EINVAL; @@ -291,13 +214,10 @@ static int ioapic_xlate(struct irq_domain *domain, it = &of_ioapic_type[intspec[1]]; - idx = (u32) domain->host_data; - set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); - - rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line), - cpu_to_node(0), &attr); - if (rc) - return rc; + idx = (u32)(long)domain->host_data; + gsi = mp_pin_to_gsi(idx, line); + if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0))) + return -EBUSY; *out_hwirq = line; *out_type = it->out_type; @@ -305,81 +225,86 @@ static int ioapic_xlate(struct irq_domain *domain, } const struct irq_domain_ops ioapic_irq_domain_ops = { + .map = mp_irqdomain_map, + .unmap = mp_irqdomain_unmap, .xlate = ioapic_xlate, }; -static void dt_add_ioapic_domain(unsigned int ioapic_num, - struct device_node *np) +static void __init dtb_add_ioapic(struct device_node *dn) { - struct irq_domain *id; - struct mp_ioapic_gsi *gsi_cfg; + struct resource r; int ret; - int num; - - gsi_cfg = mp_ioapic_gsi_routing(ioapic_num); - num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; - - id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops, - (void *)ioapic_num); - BUG_ON(!id); - if (gsi_cfg->gsi_base == 0) { - /* - * The first NR_IRQS_LEGACY irq descs are allocated in - * early_irq_init() and need just a mapping. The - * remaining irqs need both. All of them are preallocated - * and assigned so we can keep the 1:1 mapping which the ioapic - * is having. - */ - irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY); - - if (num > NR_IRQS_LEGACY) { - ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY, - NR_IRQS_LEGACY, num - NR_IRQS_LEGACY); - if (ret) - pr_err("Error creating mapping for the " - "remaining IRQs: %d\n", ret); - } - irq_set_default_host(id); - } else { - ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num); - if (ret) - pr_err("Error creating IRQ mapping: %d\n", ret); + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_DYNAMIC, + .ops = &ioapic_irq_domain_ops, + .dev = dn, + }; + + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + printk(KERN_ERR "Can't obtain address from node %s.\n", + dn->full_name); + return; } + mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg); } -static void __init ioapic_add_ofnode(struct device_node *np) +static void __init dtb_ioapic_setup(void) { - struct resource r; - int i, ret; + struct device_node *dn; - ret = of_address_to_resource(np, 0, &r); - if (ret) { - printk(KERN_ERR "Failed to obtain address for %s\n", - np->full_name); + for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic") + dtb_add_ioapic(dn); + + if (nr_ioapics) { + of_ioapic = 1; return; } + printk(KERN_ERR "Error: No information about IO-APIC in OF.\n"); +} +#else +static void __init dtb_ioapic_setup(void) {} +#endif - for (i = 0; i < nr_ioapics; i++) { - if (r.start == mpc_ioapic_addr(i)) { - dt_add_ioapic_domain(i, np); - return; - } - } - printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name); +static void __init dtb_apic_setup(void) +{ + dtb_lapic_setup(); + dtb_ioapic_setup(); } -void __init x86_add_irq_domains(void) +#ifdef CONFIG_OF_FLATTREE +static void __init x86_flattree_get_config(void) { - struct device_node *dp; + u32 size, map_len; + void *dt; - if (!of_have_populated_dt()) + if (!initial_dtb) return; - for_each_node_with_property(dp, "interrupt-controller") { - if (of_device_is_compatible(dp, "intel,ce4100-ioapic")) - ioapic_add_ofnode(dp); + map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); + + initial_boot_params = dt = early_memremap(initial_dtb, map_len); + size = of_get_flat_dt_size(); + if (map_len < size) { + early_iounmap(dt, map_len); + initial_boot_params = dt = early_memremap(initial_dtb, size); + map_len = size; } + + unflatten_and_copy_device_tree(); + early_iounmap(dt, map_len); } #else -void __init x86_add_irq_domains(void) { } +static inline void x86_flattree_get_config(void) { } #endif + +void __init x86_dtb_init(void) +{ + x86_flattree_get_config(); + + if (!of_have_populated_dt()) + return; + + dtb_setup_hpet(); + dtb_apic_setup(); +} diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 988c00a1f60..49f88648161 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -682,15 +682,14 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len) * hibernation (32 bit) or software suspend and suspend to RAM (64 bit). * * This function requires the e820 map to be sorted and without any - * overlapping entries and assumes the first e820 area to be RAM. + * overlapping entries. */ void __init e820_mark_nosave_regions(unsigned long limit_pfn) { int i; - unsigned long pfn; + unsigned long pfn = 0; - pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); - for (i = 1; i < e820.nr_map; i++) { + for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; if (pfn < PFN_UP(ei->addr)) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f9e3fabc871..b553ed89e5f 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -682,7 +682,7 @@ END(syscall_badsys) sysenter_badsys: movl $-ENOSYS,%eax jmp sysenter_after_call -END(syscall_badsys) +END(sysenter_badsys) CFI_ENDPROC .macro FIXUP_ESPFIX_STACK @@ -1058,9 +1058,6 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) - cmpl $0, function_trace_stop - jne ftrace_stub - pushl %eax pushl %ecx pushl %edx @@ -1092,8 +1089,6 @@ END(ftrace_caller) ENTRY(ftrace_regs_caller) pushf /* push flags before compare (in cs location) */ - cmpl $0, function_trace_stop - jne ftrace_restore_flags /* * i386 does not save SS and ESP when coming from kernel. @@ -1152,7 +1147,6 @@ GLOBAL(ftrace_regs_call) popf /* Pop flags at end (no addl to corrupt flags) */ jmp ftrace_ret -ftrace_restore_flags: popf jmp ftrace_stub #else /* ! CONFIG_DYNAMIC_FTRACE */ @@ -1161,9 +1155,6 @@ ENTRY(mcount) cmpl $__PAGE_OFFSET, %esp jb ftrace_stub /* Paging not enabled yet? */ - cmpl $0, function_trace_stop - jne ftrace_stub - cmpl $ftrace_stub, ftrace_trace_function jnz trace #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 5e8cb2ad9fb..df088bb03fb 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -207,7 +207,6 @@ ENDPROC(native_usergs_sysret64) */ .macro XCPT_FRAME start=1 offset=0 INTR_FRAME \start, RIP+\offset-ORIG_RAX - /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/ .endm /* @@ -287,21 +286,21 @@ ENDPROC(native_usergs_sysret64) ENTRY(save_paranoid) XCPT_FRAME 1 RDI+8 cld - movq_cfi rdi, RDI+8 - movq_cfi rsi, RSI+8 + movq %rdi, RDI+8(%rsp) + movq %rsi, RSI+8(%rsp) movq_cfi rdx, RDX+8 movq_cfi rcx, RCX+8 movq_cfi rax, RAX+8 - movq_cfi r8, R8+8 - movq_cfi r9, R9+8 - movq_cfi r10, R10+8 - movq_cfi r11, R11+8 + movq %r8, R8+8(%rsp) + movq %r9, R9+8(%rsp) + movq %r10, R10+8(%rsp) + movq %r11, R11+8(%rsp) movq_cfi rbx, RBX+8 - movq_cfi rbp, RBP+8 - movq_cfi r12, R12+8 - movq_cfi r13, R13+8 - movq_cfi r14, R14+8 - movq_cfi r15, R15+8 + movq %rbp, RBP+8(%rsp) + movq %r12, R12+8(%rsp) + movq %r13, R13+8(%rsp) + movq %r14, R14+8(%rsp) + movq %r15, R15+8(%rsp) movl $1,%ebx movl $MSR_GS_BASE,%ecx rdmsr @@ -405,8 +404,8 @@ GLOBAL(system_call_after_swapgs) * and short: */ ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_ARGS 8,0 - movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + SAVE_ARGS 8, 0, rax_enosys=1 + movq_cfi rax,(ORIG_RAX-ARGOFFSET) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) @@ -418,7 +417,7 @@ system_call_fastpath: andl $__SYSCALL_MASK,%eax cmpl $__NR_syscall_max,%eax #endif - ja badsys + ja ret_from_sys_call /* and return regs->ax */ movq %r10,%rcx call *sys_call_table(,%rax,8) # XXX: rip relative movq %rax,RAX-ARGOFFSET(%rsp) @@ -477,27 +476,8 @@ sysret_signal: FIXUP_TOP_OF_STACK %r11, -ARGOFFSET jmp int_check_syscall_exit_work -badsys: - movq $-ENOSYS,RAX-ARGOFFSET(%rsp) - jmp ret_from_sys_call - #ifdef CONFIG_AUDITSYSCALL /* - * Fast path for syscall audit without full syscall trace. - * We just call __audit_syscall_entry() directly, and then - * jump back to the normal fast path. - */ -auditsys: - movq %r10,%r8 /* 5th arg: 4th syscall arg */ - movq %rdx,%rcx /* 4th arg: 3rd syscall arg */ - movq %rsi,%rdx /* 3rd arg: 2nd syscall arg */ - movq %rdi,%rsi /* 2nd arg: 1st syscall arg */ - movq %rax,%rdi /* 1st arg: syscall number */ - call __audit_syscall_entry - LOAD_ARGS 0 /* reload call-clobbered registers */ - jmp system_call_fastpath - - /* * Return fast path for syscall audit. Call __audit_syscall_exit() * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT * masked off. @@ -514,18 +494,25 @@ sysret_audit: /* Do syscall tracing */ tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) - jz auditsys -#endif + leaq -REST_SKIP(%rsp), %rdi + movq $AUDIT_ARCH_X86_64, %rsi + call syscall_trace_enter_phase1 + test %rax, %rax + jnz tracesys_phase2 /* if needed, run the slow path */ + LOAD_ARGS 0 /* else restore clobbered regs */ + jmp system_call_fastpath /* and return to the fast path */ + +tracesys_phase2: SAVE_REST - movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ FIXUP_TOP_OF_STACK %rdi - movq %rsp,%rdi - call syscall_trace_enter + movq %rsp, %rdi + movq $AUDIT_ARCH_X86_64, %rsi + movq %rax,%rdx + call syscall_trace_enter_phase2 + /* * Reload arg registers from stack in case ptrace changed them. - * We don't reload %rax because syscall_trace_enter() returned + * We don't reload %rax because syscall_trace_entry_phase2() returned * the value it wants us to use in the table lookup. */ LOAD_ARGS ARGOFFSET, 1 @@ -536,7 +523,7 @@ tracesys: andl $__SYSCALL_MASK,%eax cmpl $__NR_syscall_max,%eax #endif - ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ + ja int_ret_from_sys_call /* RAX(%rsp) is already set */ movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) @@ -1386,21 +1373,21 @@ ENTRY(error_entry) CFI_ADJUST_CFA_OFFSET 15*8 /* oldrax contains error code */ cld - movq_cfi rdi, RDI+8 - movq_cfi rsi, RSI+8 - movq_cfi rdx, RDX+8 - movq_cfi rcx, RCX+8 - movq_cfi rax, RAX+8 - movq_cfi r8, R8+8 - movq_cfi r9, R9+8 - movq_cfi r10, R10+8 - movq_cfi r11, R11+8 + movq %rdi, RDI+8(%rsp) + movq %rsi, RSI+8(%rsp) + movq %rdx, RDX+8(%rsp) + movq %rcx, RCX+8(%rsp) + movq %rax, RAX+8(%rsp) + movq %r8, R8+8(%rsp) + movq %r9, R9+8(%rsp) + movq %r10, R10+8(%rsp) + movq %r11, R11+8(%rsp) movq_cfi rbx, RBX+8 - movq_cfi rbp, RBP+8 - movq_cfi r12, R12+8 - movq_cfi r13, R13+8 - movq_cfi r14, R14+8 - movq_cfi r15, R15+8 + movq %rbp, RBP+8(%rsp) + movq %r12, R12+8(%rsp) + movq %r13, R13+8(%rsp) + movq %r14, R14+8(%rsp) + movq %r15, R15+8(%rsp) xorl %ebx,%ebx testl $3,CS+8(%rsp) je error_kernelspace @@ -1418,6 +1405,7 @@ error_sti: * compat mode. Check for these here too. */ error_kernelspace: + CFI_REL_OFFSET rcx, RCX+8 incl %ebx leaq native_irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index cbc4a91b131..3386dc9aa33 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -703,6 +703,9 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, unsigned long return_hooker = (unsigned long) &return_to_handler; + if (unlikely(ftrace_graph_is_dead())) + return; + if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 5f9cf20cdb6..3d5fb509bde 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -108,7 +108,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp) int i; for (i = 0; i < HBP_NUM; i++) { - struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); if (!*slot) { *slot = bp; @@ -122,7 +122,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp) set_debugreg(info->address, i); __this_cpu_write(cpu_debugreg[i], info->address); - dr7 = &__get_cpu_var(cpu_dr7); + dr7 = this_cpu_ptr(&cpu_dr7); *dr7 |= encode_dr7(i, info->len, info->type); set_debugreg(*dr7, 7); @@ -146,7 +146,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) int i; for (i = 0; i < HBP_NUM; i++) { - struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); if (*slot == bp) { *slot = NULL; @@ -157,7 +157,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) return; - dr7 = &__get_cpu_var(cpu_dr7); + dr7 = this_cpu_ptr(&cpu_dr7); *dr7 &= ~__encode_dr7(i, info->len, info->type); set_debugreg(*dr7, 7); diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index d5dd8081441..a9a4229f616 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -375,7 +375,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, /* * These bits must be zero. */ - xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + memset(xsave_hdr->reserved, 0, 48); return ret; } diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c index d30acdc1229..82f8d02f0df 100644 --- a/arch/x86/kernel/iosf_mbi.c +++ b/arch/x86/kernel/iosf_mbi.c @@ -22,10 +22,13 @@ #include <linux/init.h> #include <linux/spinlock.h> #include <linux/pci.h> +#include <linux/debugfs.h> +#include <linux/capability.h> #include <asm/iosf_mbi.h> #define PCI_DEVICE_ID_BAYTRAIL 0x0F00 +#define PCI_DEVICE_ID_BRASWELL 0x2280 #define PCI_DEVICE_ID_QUARK_X1000 0x0958 static DEFINE_SPINLOCK(iosf_mbi_lock); @@ -187,6 +190,89 @@ bool iosf_mbi_available(void) } EXPORT_SYMBOL(iosf_mbi_available); +#ifdef CONFIG_IOSF_MBI_DEBUG +static u32 dbg_mdr; +static u32 dbg_mcr; +static u32 dbg_mcrx; + +static int mcr_get(void *data, u64 *val) +{ + *val = *(u32 *)data; + return 0; +} + +static int mcr_set(void *data, u64 val) +{ + u8 command = ((u32)val & 0xFF000000) >> 24, + port = ((u32)val & 0x00FF0000) >> 16, + offset = ((u32)val & 0x0000FF00) >> 8; + int err; + + *(u32 *)data = val; + + if (!capable(CAP_SYS_RAWIO)) + return -EACCES; + + if (command & 1u) + err = iosf_mbi_write(port, + command, + dbg_mcrx | offset, + dbg_mdr); + else + err = iosf_mbi_read(port, + command, + dbg_mcrx | offset, + &dbg_mdr); + + return err; +} +DEFINE_SIMPLE_ATTRIBUTE(iosf_mcr_fops, mcr_get, mcr_set , "%llx\n"); + +static struct dentry *iosf_dbg; + +static void iosf_sideband_debug_init(void) +{ + struct dentry *d; + + iosf_dbg = debugfs_create_dir("iosf_sb", NULL); + if (IS_ERR_OR_NULL(iosf_dbg)) + return; + + /* mdr */ + d = debugfs_create_x32("mdr", 0660, iosf_dbg, &dbg_mdr); + if (IS_ERR_OR_NULL(d)) + goto cleanup; + + /* mcrx */ + debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx); + if (IS_ERR_OR_NULL(d)) + goto cleanup; + + /* mcr - initiates mailbox tranaction */ + debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops); + if (IS_ERR_OR_NULL(d)) + goto cleanup; + + return; + +cleanup: + debugfs_remove_recursive(d); +} + +static void iosf_debugfs_init(void) +{ + iosf_sideband_debug_init(); +} + +static void iosf_debugfs_remove(void) +{ + debugfs_remove_recursive(iosf_dbg); +} +#else +static inline void iosf_debugfs_init(void) { } +static inline void iosf_debugfs_remove(void) { } +#endif /* CONFIG_IOSF_MBI_DEBUG */ + static int iosf_mbi_probe(struct pci_dev *pdev, const struct pci_device_id *unused) { @@ -202,8 +288,9 @@ static int iosf_mbi_probe(struct pci_dev *pdev, return 0; } -static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = { +static const struct pci_device_id iosf_mbi_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BRASWELL) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) }, { 0, }, }; @@ -217,11 +304,15 @@ static struct pci_driver iosf_mbi_pci_driver = { static int __init iosf_mbi_init(void) { + iosf_debugfs_init(); + return pci_register_driver(&iosf_mbi_pci_driver); } static void __exit iosf_mbi_exit(void) { + iosf_debugfs_remove(); + pci_unregister_driver(&iosf_mbi_pci_driver); if (mbi_pdev) { pci_dev_put(mbi_pdev); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 4d1c746892e..e4b503d5558 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -52,13 +52,13 @@ static inline void stack_overflow_check(struct pt_regs *regs) regs->sp <= curbase + THREAD_SIZE) return; - irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) + + irq_stack_top = (u64)this_cpu_ptr(irq_stack_union.irq_stack) + STACK_TOP_MARGIN; - irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr); + irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr); if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) return; - oist = &__get_cpu_var(orig_ist); + oist = this_cpu_ptr(&orig_ist); estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; if (regs->sp >= estack_top && regs->sp <= estack_bottom) diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 1de84e3ab4e..15d741ddfee 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -41,7 +41,7 @@ __visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) void arch_irq_work_raise(void) { #ifdef CONFIG_X86_LOCAL_APIC - if (!cpu_has_apic) + if (!arch_irq_work_has_interrupt()) return; apic->send_IPI_self(IRQ_WORK_VECTOR); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 7f50156542f..44f1ed42fdf 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -78,7 +78,7 @@ void __init init_ISA_irqs(void) #endif legacy_pic->init(0); - for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) + for (i = 0; i < nr_legacy_irqs(); i++) irq_set_chip_and_handler_name(i, chip, handle_level_irq, name); } @@ -87,12 +87,6 @@ void __init init_IRQ(void) int i; /* - * We probably need a better place for this, but it works for - * now ... - */ - x86_add_irq_domains(); - - /* * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. * If these IRQ's are handled by legacy interrupt-controllers like PIC, * then this configuration will likely be static after the boot. If @@ -100,7 +94,7 @@ void __init init_IRQ(void) * then this vector space can be freed and re-used dynamically as the * irq's migrate etc. */ - for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) + for (i = 0; i < nr_legacy_irqs(); i++) per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; x86_init.irqs.intr_init(); @@ -121,7 +115,7 @@ void setup_vector_irq(int cpu) * legacy PIC, for the new cpu that is coming online, setup the static * legacy vector to irq mapping: */ - for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++) + for (irq = 0; irq < nr_legacy_irqs(); irq++) per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; #endif @@ -209,7 +203,7 @@ void __init native_init_IRQ(void) set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); } - if (!acpi_ioapic && !of_ioapic) + if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c new file mode 100644 index 00000000000..ca05f86481a --- /dev/null +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -0,0 +1,554 @@ +/* + * Kexec bzImage loader + * + * Copyright (C) 2014 Red Hat Inc. + * Authors: + * Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#define pr_fmt(fmt) "kexec-bzImage64: " fmt + +#include <linux/string.h> +#include <linux/printk.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/kexec.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/efi.h> +#include <linux/verify_pefile.h> +#include <keys/system_keyring.h> + +#include <asm/bootparam.h> +#include <asm/setup.h> +#include <asm/crash.h> +#include <asm/efi.h> +#include <asm/kexec-bzimage64.h> + +#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */ + +/* + * Defines lowest physical address for various segments. Not sure where + * exactly these limits came from. Current bzimage64 loader in kexec-tools + * uses these so I am retaining it. It can be changed over time as we gain + * more insight. + */ +#define MIN_PURGATORY_ADDR 0x3000 +#define MIN_BOOTPARAM_ADDR 0x3000 +#define MIN_KERNEL_LOAD_ADDR 0x100000 +#define MIN_INITRD_LOAD_ADDR 0x1000000 + +/* + * This is a place holder for all boot loader specific data structure which + * gets allocated in one call but gets freed much later during cleanup + * time. Right now there is only one field but it can grow as need be. + */ +struct bzimage64_data { + /* + * Temporary buffer to hold bootparams buffer. This should be + * freed once the bootparam segment has been loaded. + */ + void *bootparams_buf; +}; + +static int setup_initrd(struct boot_params *params, + unsigned long initrd_load_addr, unsigned long initrd_len) +{ + params->hdr.ramdisk_image = initrd_load_addr & 0xffffffffUL; + params->hdr.ramdisk_size = initrd_len & 0xffffffffUL; + + params->ext_ramdisk_image = initrd_load_addr >> 32; + params->ext_ramdisk_size = initrd_len >> 32; + + return 0; +} + +static int setup_cmdline(struct kimage *image, struct boot_params *params, + unsigned long bootparams_load_addr, + unsigned long cmdline_offset, char *cmdline, + unsigned long cmdline_len) +{ + char *cmdline_ptr = ((char *)params) + cmdline_offset; + unsigned long cmdline_ptr_phys, len; + uint32_t cmdline_low_32, cmdline_ext_32; + + memcpy(cmdline_ptr, cmdline, cmdline_len); + if (image->type == KEXEC_TYPE_CRASH) { + len = sprintf(cmdline_ptr + cmdline_len - 1, + " elfcorehdr=0x%lx", image->arch.elf_load_addr); + cmdline_len += len; + } + cmdline_ptr[cmdline_len - 1] = '\0'; + + pr_debug("Final command line is: %s\n", cmdline_ptr); + cmdline_ptr_phys = bootparams_load_addr + cmdline_offset; + cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL; + cmdline_ext_32 = cmdline_ptr_phys >> 32; + + params->hdr.cmd_line_ptr = cmdline_low_32; + if (cmdline_ext_32) + params->ext_cmd_line_ptr = cmdline_ext_32; + + return 0; +} + +static int setup_e820_entries(struct boot_params *params) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = e820_saved.nr_map; + + /* TODO: Pass entries more than E820MAX in bootparams setup data */ + if (nr_e820_entries > E820MAX) + nr_e820_entries = E820MAX; + + params->e820_entries = nr_e820_entries; + memcpy(¶ms->e820_map, &e820_saved.map, + nr_e820_entries * sizeof(struct e820entry)); + + return 0; +} + +#ifdef CONFIG_EFI +static int setup_efi_info_memmap(struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_map_offset, + unsigned int efi_map_sz) +{ + void *efi_map = (void *)params + efi_map_offset; + unsigned long efi_map_phys_addr = params_load_addr + efi_map_offset; + struct efi_info *ei = ¶ms->efi_info; + + if (!efi_map_sz) + return 0; + + efi_runtime_map_copy(efi_map, efi_map_sz); + + ei->efi_memmap = efi_map_phys_addr & 0xffffffff; + ei->efi_memmap_hi = efi_map_phys_addr >> 32; + ei->efi_memmap_size = efi_map_sz; + + return 0; +} + +static int +prepare_add_efi_setup_data(struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_setup_data_offset) +{ + unsigned long setup_data_phys; + struct setup_data *sd = (void *)params + efi_setup_data_offset; + struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data); + + esd->fw_vendor = efi.fw_vendor; + esd->runtime = efi.runtime; + esd->tables = efi.config_table; + esd->smbios = efi.smbios; + + sd->type = SETUP_EFI; + sd->len = sizeof(struct efi_setup_data); + + /* Add setup data */ + setup_data_phys = params_load_addr + efi_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; + + return 0; +} + +static int +setup_efi_state(struct boot_params *params, unsigned long params_load_addr, + unsigned int efi_map_offset, unsigned int efi_map_sz, + unsigned int efi_setup_data_offset) +{ + struct efi_info *current_ei = &boot_params.efi_info; + struct efi_info *ei = ¶ms->efi_info; + + if (!current_ei->efi_memmap_size) + return 0; + + /* + * If 1:1 mapping is not enabled, second kernel can not setup EFI + * and use EFI run time services. User space will have to pass + * acpi_rsdp=<addr> on kernel command line to make second kernel boot + * without efi. + */ + if (efi_enabled(EFI_OLD_MEMMAP)) + return 0; + + ei->efi_loader_signature = current_ei->efi_loader_signature; + ei->efi_systab = current_ei->efi_systab; + ei->efi_systab_hi = current_ei->efi_systab_hi; + + ei->efi_memdesc_version = current_ei->efi_memdesc_version; + ei->efi_memdesc_size = efi_get_runtime_map_desc_size(); + + setup_efi_info_memmap(params, params_load_addr, efi_map_offset, + efi_map_sz); + prepare_add_efi_setup_data(params, params_load_addr, + efi_setup_data_offset); + return 0; +} +#endif /* CONFIG_EFI */ + +static int +setup_boot_parameters(struct kimage *image, struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_map_offset, unsigned int efi_map_sz, + unsigned int efi_setup_data_offset) +{ + unsigned int nr_e820_entries; + unsigned long long mem_k, start, end; + int i, ret = 0; + + /* Get subarch from existing bootparams */ + params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch; + + /* Copying screen_info will do? */ + memcpy(¶ms->screen_info, &boot_params.screen_info, + sizeof(struct screen_info)); + + /* Fill in memsize later */ + params->screen_info.ext_mem_k = 0; + params->alt_mem_k = 0; + + /* Default APM info */ + memset(¶ms->apm_bios_info, 0, sizeof(params->apm_bios_info)); + + /* Default drive info */ + memset(¶ms->hd0_info, 0, sizeof(params->hd0_info)); + memset(¶ms->hd1_info, 0, sizeof(params->hd1_info)); + + /* Default sysdesc table */ + params->sys_desc_table.length = 0; + + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_setup_memmap_entries(image, params); + if (ret) + return ret; + } else + setup_e820_entries(params); + + nr_e820_entries = params->e820_entries; + + for (i = 0; i < nr_e820_entries; i++) { + if (params->e820_map[i].type != E820_RAM) + continue; + start = params->e820_map[i].addr; + end = params->e820_map[i].addr + params->e820_map[i].size - 1; + + if ((start <= 0x100000) && end > 0x100000) { + mem_k = (end >> 10) - (0x100000 >> 10); + params->screen_info.ext_mem_k = mem_k; + params->alt_mem_k = mem_k; + if (mem_k > 0xfc00) + params->screen_info.ext_mem_k = 0xfc00; /* 64M*/ + if (mem_k > 0xffffffff) + params->alt_mem_k = 0xffffffff; + } + } + +#ifdef CONFIG_EFI + /* Setup EFI state */ + setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz, + efi_setup_data_offset); +#endif + + /* Setup EDD info */ + memcpy(params->eddbuf, boot_params.eddbuf, + EDDMAXNR * sizeof(struct edd_info)); + params->eddbuf_entries = boot_params.eddbuf_entries; + + memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer, + EDD_MBR_SIG_MAX * sizeof(unsigned int)); + + return ret; +} + +static int bzImage64_probe(const char *buf, unsigned long len) +{ + int ret = -ENOEXEC; + struct setup_header *header; + + /* kernel should be atleast two sectors long */ + if (len < 2 * 512) { + pr_err("File is too short to be a bzImage\n"); + return ret; + } + + header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr)); + if (memcmp((char *)&header->header, "HdrS", 4) != 0) { + pr_err("Not a bzImage\n"); + return ret; + } + + if (header->boot_flag != 0xAA55) { + pr_err("No x86 boot sector present\n"); + return ret; + } + + if (header->version < 0x020C) { + pr_err("Must be at least protocol version 2.12\n"); + return ret; + } + + if (!(header->loadflags & LOADED_HIGH)) { + pr_err("zImage not a bzImage\n"); + return ret; + } + + if (!(header->xloadflags & XLF_KERNEL_64)) { + pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n"); + return ret; + } + + if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) { + pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n"); + return ret; + } + + /* + * Can't handle 32bit EFI as it does not allow loading kernel + * above 4G. This should be handled by 32bit bzImage loader + */ + if (efi_enabled(EFI_RUNTIME_SERVICES) && !efi_enabled(EFI_64BIT)) { + pr_debug("EFI is 32 bit. Can't load kernel above 4G.\n"); + return ret; + } + + /* I've got a bzImage */ + pr_debug("It's a relocatable bzImage64\n"); + ret = 0; + + return ret; +} + +static void *bzImage64_load(struct kimage *image, char *kernel, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len) +{ + + struct setup_header *header; + int setup_sects, kern16_size, ret = 0; + unsigned long setup_header_size, params_cmdline_sz, params_misc_sz; + struct boot_params *params; + unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; + unsigned long purgatory_load_addr; + unsigned long kernel_bufsz, kernel_memsz, kernel_align; + char *kernel_buf; + struct bzimage64_data *ldata; + struct kexec_entry64_regs regs64; + void *stack; + unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr); + unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset; + + header = (struct setup_header *)(kernel + setup_hdr_offset); + setup_sects = header->setup_sects; + if (setup_sects == 0) + setup_sects = 4; + + kern16_size = (setup_sects + 1) * 512; + if (kernel_len < kern16_size) { + pr_err("bzImage truncated\n"); + return ERR_PTR(-ENOEXEC); + } + + if (cmdline_len > header->cmdline_size) { + pr_err("Kernel command line too long\n"); + return ERR_PTR(-EINVAL); + } + + /* + * In case of crash dump, we will append elfcorehdr=<addr> to + * command line. Make sure it does not overflow + */ + if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) { + pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n"); + return ERR_PTR(-EINVAL); + } + + /* Allocate and load backup region */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_load_segments(image); + if (ret) + return ERR_PTR(ret); + } + + /* + * Load purgatory. For 64bit entry point, purgatory code can be + * anywhere. + */ + ret = kexec_load_purgatory(image, MIN_PURGATORY_ADDR, ULONG_MAX, 1, + &purgatory_load_addr); + if (ret) { + pr_err("Loading purgatory failed\n"); + return ERR_PTR(ret); + } + + pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr); + + + /* + * Load Bootparams and cmdline and space for efi stuff. + * + * Allocate memory together for multiple data structures so + * that they all can go in single area/segment and we don't + * have to create separate segment for each. Keeps things + * little bit simple + */ + efi_map_sz = efi_get_runtime_map_size(); + efi_map_sz = ALIGN(efi_map_sz, 16); + params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + + MAX_ELFCOREHDR_STR_LEN; + params_cmdline_sz = ALIGN(params_cmdline_sz, 16); + params_misc_sz = params_cmdline_sz + efi_map_sz + + sizeof(struct setup_data) + + sizeof(struct efi_setup_data); + + params = kzalloc(params_misc_sz, GFP_KERNEL); + if (!params) + return ERR_PTR(-ENOMEM); + efi_map_offset = params_cmdline_sz; + efi_setup_data_offset = efi_map_offset + efi_map_sz; + + /* Copy setup header onto bootparams. Documentation/x86/boot.txt */ + setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset; + + /* Is there a limit on setup header size? */ + memcpy(¶ms->hdr, (kernel + setup_hdr_offset), setup_header_size); + + ret = kexec_add_buffer(image, (char *)params, params_misc_sz, + params_misc_sz, 16, MIN_BOOTPARAM_ADDR, + ULONG_MAX, 1, &bootparam_load_addr); + if (ret) + goto out_free_params; + pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + bootparam_load_addr, params_misc_sz, params_misc_sz); + + /* Load kernel */ + kernel_buf = kernel + kern16_size; + kernel_bufsz = kernel_len - kern16_size; + kernel_memsz = PAGE_ALIGN(header->init_size); + kernel_align = header->kernel_alignment; + + ret = kexec_add_buffer(image, kernel_buf, + kernel_bufsz, kernel_memsz, kernel_align, + MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1, + &kernel_load_addr); + if (ret) + goto out_free_params; + + pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kernel_load_addr, kernel_memsz, kernel_memsz); + + /* Load initrd high */ + if (initrd) { + ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len, + PAGE_SIZE, MIN_INITRD_LOAD_ADDR, + ULONG_MAX, 1, &initrd_load_addr); + if (ret) + goto out_free_params; + + pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + initrd_load_addr, initrd_len, initrd_len); + + setup_initrd(params, initrd_load_addr, initrd_len); + } + + setup_cmdline(image, params, bootparam_load_addr, + sizeof(struct boot_params), cmdline, cmdline_len); + + /* bootloader info. Do we need a separate ID for kexec kernel loader? */ + params->hdr.type_of_loader = 0x0D << 4; + params->hdr.loadflags = 0; + + /* Setup purgatory regs for entry */ + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", ®s64, + sizeof(regs64), 1); + if (ret) + goto out_free_params; + + regs64.rbx = 0; /* Bootstrap Processor */ + regs64.rsi = bootparam_load_addr; + regs64.rip = kernel_load_addr + 0x200; + stack = kexec_purgatory_get_symbol_addr(image, "stack_end"); + if (IS_ERR(stack)) { + pr_err("Could not find address of symbol stack_end\n"); + ret = -EINVAL; + goto out_free_params; + } + + regs64.rsp = (unsigned long)stack; + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", ®s64, + sizeof(regs64), 0); + if (ret) + goto out_free_params; + + ret = setup_boot_parameters(image, params, bootparam_load_addr, + efi_map_offset, efi_map_sz, + efi_setup_data_offset); + if (ret) + goto out_free_params; + + /* Allocate loader specific data */ + ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL); + if (!ldata) { + ret = -ENOMEM; + goto out_free_params; + } + + /* + * Store pointer to params so that it could be freed after loading + * params segment has been loaded and contents have been copied + * somewhere else. + */ + ldata->bootparams_buf = params; + return ldata; + +out_free_params: + kfree(params); + return ERR_PTR(ret); +} + +/* This cleanup function is called after various segments have been loaded */ +static int bzImage64_cleanup(void *loader_data) +{ + struct bzimage64_data *ldata = loader_data; + + if (!ldata) + return 0; + + kfree(ldata->bootparams_buf); + ldata->bootparams_buf = NULL; + + return 0; +} + +#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG +static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) +{ + bool trusted; + int ret; + + ret = verify_pefile_signature(kernel, kernel_len, + system_trusted_keyring, &trusted); + if (ret < 0) + return ret; + if (!trusted) + return -EKEYREJECTED; + return 0; +} +#endif + +struct kexec_file_ops kexec_bzImage64_ops = { + .probe = bzImage64_probe, + .load = bzImage64_load, + .cleanup = bzImage64_cleanup, +#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG + .verify_sig = bzImage64_verify_sig, +#endif +}; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index f304773285a..f1314d0bcf0 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -338,8 +338,10 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op) * a relative jump. */ rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; - if (abs(rel) > 0x7fffffff) + if (abs(rel) > 0x7fffffff) { + __arch_remove_optimized_kprobe(op, 0); return -ERANGE; + } buf = (u8 *)op->optinsn.insn; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 3dd8e2c4d74..f6945bef2cd 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -35,6 +35,7 @@ #include <linux/slab.h> #include <linux/kprobes.h> #include <linux/debugfs.h> +#include <linux/nmi.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -243,9 +244,9 @@ u32 kvm_read_and_reset_pf_reason(void) { u32 reason = 0; - if (__get_cpu_var(apf_reason).enabled) { - reason = __get_cpu_var(apf_reason).reason; - __get_cpu_var(apf_reason).reason = 0; + if (__this_cpu_read(apf_reason.enabled)) { + reason = __this_cpu_read(apf_reason.reason); + __this_cpu_write(apf_reason.reason, 0); } return reason; @@ -318,7 +319,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val) * there's no need for lock or memory barriers. * An optimization barrier is implied in apic write. */ - if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) + if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi))) return; apic_write(APIC_EOI, APIC_EOI_ACK); } @@ -329,13 +330,13 @@ void kvm_guest_cpu_init(void) return; if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { - u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason)); + u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); #ifdef CONFIG_PREEMPT pa |= KVM_ASYNC_PF_SEND_ALWAYS; #endif wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); - __get_cpu_var(apf_reason).enabled = 1; + __this_cpu_write(apf_reason.enabled, 1); printk(KERN_INFO"KVM setup async PF for cpu %d\n", smp_processor_id()); } @@ -344,8 +345,8 @@ void kvm_guest_cpu_init(void) unsigned long pa; /* Size alignment is implied but just to make it explicit. */ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); - __get_cpu_var(kvm_apic_eoi) = 0; - pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi)) + __this_cpu_write(kvm_apic_eoi, 0); + pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi)) | KVM_MSR_ENABLED; wrmsrl(MSR_KVM_PV_EOI_EN, pa); } @@ -356,11 +357,11 @@ void kvm_guest_cpu_init(void) static void kvm_pv_disable_apf(void) { - if (!__get_cpu_var(apf_reason).enabled) + if (!__this_cpu_read(apf_reason.enabled)) return; wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); - __get_cpu_var(apf_reason).enabled = 0; + __this_cpu_write(apf_reason.enabled, 0); printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", smp_processor_id()); @@ -499,6 +500,13 @@ void __init kvm_guest_init(void) #else kvm_guest_cpu_init(); #endif + + /* + * Hard lockup detection is enabled by default. Disable it, as guests + * can get false positives too easily, for example if the host is + * overcommitted. + */ + watchdog_enable_hardlockup_detector(false); } static noinline uint32_t __kvm_cpuid_base(void) @@ -716,7 +724,7 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) if (in_nmi()) return; - w = &__get_cpu_var(klock_waiting); + w = this_cpu_ptr(&klock_waiting); cpu = smp_processor_id(); start = spin_time_start(); diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 1667b1de8d5..72e8e310258 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -247,7 +247,8 @@ void machine_kexec(struct kimage *image) /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, (unsigned long)page_list, - image->start, cpu_has_pae, + image->start, + boot_cpu_has(X86_FEATURE_PAE), image->preserve_context); #ifdef CONFIG_KEXEC_JUMP diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 679cef0791c..485981059a4 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/mm.h> #include <linux/kexec.h> #include <linux/string.h> @@ -21,6 +23,13 @@ #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/debugreg.h> +#include <asm/kexec-bzimage64.h> + +#ifdef CONFIG_KEXEC_FILE +static struct kexec_file_ops *kexec_file_loaders[] = { + &kexec_bzImage64_ops, +}; +#endif static void free_transition_pgtable(struct kimage *image) { @@ -171,6 +180,45 @@ static void load_segments(void) ); } +#ifdef CONFIG_KEXEC_FILE +/* Update purgatory as needed after various image segments have been prepared */ +static int arch_update_purgatory(struct kimage *image) +{ + int ret = 0; + + if (!image->file_mode) + return 0; + + /* Setup copying of backup region */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = kexec_purgatory_get_set_symbol(image, "backup_dest", + &image->arch.backup_load_addr, + sizeof(image->arch.backup_load_addr), 0); + if (ret) + return ret; + + ret = kexec_purgatory_get_set_symbol(image, "backup_src", + &image->arch.backup_src_start, + sizeof(image->arch.backup_src_start), 0); + if (ret) + return ret; + + ret = kexec_purgatory_get_set_symbol(image, "backup_sz", + &image->arch.backup_src_sz, + sizeof(image->arch.backup_src_sz), 0); + if (ret) + return ret; + } + + return ret; +} +#else /* !CONFIG_KEXEC_FILE */ +static inline int arch_update_purgatory(struct kimage *image) +{ + return 0; +} +#endif /* CONFIG_KEXEC_FILE */ + int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; @@ -184,6 +232,11 @@ int machine_kexec_prepare(struct kimage *image) if (result) return result; + /* update purgatory as needed */ + result = arch_update_purgatory(image); + if (result) + return result; + return 0; } @@ -283,3 +336,200 @@ void arch_crash_save_vmcoreinfo(void) (unsigned long)&_text - __START_KERNEL); } +/* arch-dependent functionality related to kexec file-based syscall */ + +#ifdef CONFIG_KEXEC_FILE +int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + int i, ret = -ENOEXEC; + struct kexec_file_ops *fops; + + for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) { + fops = kexec_file_loaders[i]; + if (!fops || !fops->probe) + continue; + + ret = fops->probe(buf, buf_len); + if (!ret) { + image->fops = fops; + return ret; + } + } + + return ret; +} + +void *arch_kexec_kernel_image_load(struct kimage *image) +{ + vfree(image->arch.elf_headers); + image->arch.elf_headers = NULL; + + if (!image->fops || !image->fops->load) + return ERR_PTR(-ENOEXEC); + + return image->fops->load(image, image->kernel_buf, + image->kernel_buf_len, image->initrd_buf, + image->initrd_buf_len, image->cmdline_buf, + image->cmdline_buf_len); +} + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + if (!image->fops || !image->fops->cleanup) + return 0; + + return image->fops->cleanup(image->image_loader_data); +} + +int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, + unsigned long kernel_len) +{ + if (!image->fops || !image->fops->verify_sig) { + pr_debug("kernel loader does not support signature verification."); + return -EKEYREJECTED; + } + + return image->fops->verify_sig(kernel, kernel_len); +} + +/* + * Apply purgatory relocations. + * + * ehdr: Pointer to elf headers + * sechdrs: Pointer to section headers. + * relsec: section index of SHT_RELA section. + * + * TODO: Some of the code belongs to generic code. Move that in kexec.c. + */ +int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, + Elf64_Shdr *sechdrs, unsigned int relsec) +{ + unsigned int i; + Elf64_Rela *rel; + Elf64_Sym *sym; + void *location; + Elf64_Shdr *section, *symtabsec; + unsigned long address, sec_base, value; + const char *strtab, *name, *shstrtab; + + /* + * ->sh_offset has been modified to keep the pointer to section + * contents in memory + */ + rel = (void *)sechdrs[relsec].sh_offset; + + /* Section to which relocations apply */ + section = &sechdrs[sechdrs[relsec].sh_info]; + + pr_debug("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + + /* Associated symbol table */ + symtabsec = &sechdrs[sechdrs[relsec].sh_link]; + + /* String table */ + if (symtabsec->sh_link >= ehdr->e_shnum) { + /* Invalid strtab section number */ + pr_err("Invalid string table section index %d\n", + symtabsec->sh_link); + return -ENOEXEC; + } + + strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset; + + /* section header string table */ + shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset; + + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + + /* + * rel[i].r_offset contains byte offset from beginning + * of section to the storage unit affected. + * + * This is location to update (->sh_offset). This is temporary + * buffer where section is currently loaded. This will finally + * be loaded to a different address later, pointed to by + * ->sh_addr. kexec takes care of moving it + * (kexec_load_segment()). + */ + location = (void *)(section->sh_offset + rel[i].r_offset); + + /* Final address of the location */ + address = section->sh_addr + rel[i].r_offset; + + /* + * rel[i].r_info contains information about symbol table index + * w.r.t which relocation must be made and type of relocation + * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get + * these respectively. + */ + sym = (Elf64_Sym *)symtabsec->sh_offset + + ELF64_R_SYM(rel[i].r_info); + + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n", + name, sym->st_info, sym->st_shndx, sym->st_value, + sym->st_size); + + if (sym->st_shndx == SHN_UNDEF) { + pr_err("Undefined symbol: %s\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_COMMON) { + pr_err("symbol '%s' in common section\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_ABS) + sec_base = 0; + else if (sym->st_shndx >= ehdr->e_shnum) { + pr_err("Invalid section %d for symbol %s\n", + sym->st_shndx, name); + return -ENOEXEC; + } else + sec_base = sechdrs[sym->st_shndx].sh_addr; + + value = sym->st_value; + value += sec_base; + value += rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + break; + case R_X86_64_64: + *(u64 *)location = value; + break; + case R_X86_64_32: + *(u32 *)location = value; + if (value != *(u32 *)location) + goto overflow; + break; + case R_X86_64_32S: + *(s32 *)location = value; + if ((s64)value != *(s32 *)location) + goto overflow; + break; + case R_X86_64_PC32: + value -= (u64)address; + *(u32 *)location = value; + break; + default: + pr_err("Unknown rela relocation: %llu\n", + ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; + +overflow: + pr_err("Overflow in relocation type %d value 0x%lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), value); + return -ENOEXEC; +} +#endif /* CONFIG_KEXEC_FILE */ diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index c050a015316..c73aecf10d3 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -46,10 +46,6 @@ END(function_hook) .endm ENTRY(ftrace_caller) - /* Check if tracing was disabled (quick check) */ - cmpl $0, function_trace_stop - jne ftrace_stub - ftrace_caller_setup /* regs go into 4th parameter (but make it NULL) */ movq $0, %rcx @@ -73,10 +69,6 @@ ENTRY(ftrace_regs_caller) /* Save the current flags before compare (in SS location)*/ pushfq - /* Check if tracing was disabled (quick check) */ - cmpl $0, function_trace_stop - jne ftrace_restore_flags - /* skip=8 to skip flags saved in SS */ ftrace_caller_setup 8 @@ -131,7 +123,7 @@ GLOBAL(ftrace_regs_call) popfq jmp ftrace_return -ftrace_restore_flags: + popfq jmp ftrace_stub @@ -141,9 +133,6 @@ END(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(function_hook) - cmpl $0, function_trace_stop - jne ftrace_stub - cmpq $ftrace_stub, ftrace_trace_function jnz trace diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index d2b56489d70..2d2a237f2c7 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -19,6 +19,7 @@ #include <linux/module.h> #include <linux/smp.h> #include <linux/pci.h> +#include <linux/irqdomain.h> #include <asm/mtrr.h> #include <asm/mpspec.h> @@ -67,7 +68,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) boot_cpu_physical_apicid = m->apicid; } - printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu); + pr_info("Processor #%d%s\n", m->apicid, bootup_cpu); generic_processor_info(apicid, m->apicver); } @@ -87,9 +88,8 @@ static void __init MP_bus_info(struct mpc_bus *m) #if MAX_MP_BUSSES < 256 if (m->busid >= MAX_MP_BUSSES) { - printk(KERN_WARNING "MP table busid value (%d) for bustype %s " - " is too large, max. supported is %d\n", - m->busid, str, MAX_MP_BUSSES - 1); + pr_warn("MP table busid value (%d) for bustype %s is too large, max. supported is %d\n", + m->busid, str, MAX_MP_BUSSES - 1); return; } #endif @@ -110,19 +110,29 @@ static void __init MP_bus_info(struct mpc_bus *m) mp_bus_id_to_type[m->busid] = MP_BUS_EISA; #endif } else - printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); + pr_warn("Unknown bustype %s - ignoring\n", str); } +static struct irq_domain_ops mp_ioapic_irqdomain_ops = { + .map = mp_irqdomain_map, + .unmap = mp_irqdomain_unmap, +}; + static void __init MP_ioapic_info(struct mpc_ioapic *m) { + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_LEGACY, + .ops = &mp_ioapic_irqdomain_ops, + }; + if (m->flags & MPC_APIC_USABLE) - mp_register_ioapic(m->apicid, m->apicaddr, gsi_top); + mp_register_ioapic(m->apicid, m->apicaddr, gsi_top, &cfg); } static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) { - apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," - " IRQ %02x, APIC ID %x, APIC INT %02x\n", + apic_printk(APIC_VERBOSE, + "Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n", mp_irq->irqtype, mp_irq->irqflag & 3, (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus, mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); @@ -135,8 +145,8 @@ static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} static void __init MP_lintsrc_info(struct mpc_lintsrc *m) { - apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," - " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + apic_printk(APIC_VERBOSE, + "Lint: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC LINT %02x\n", m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid, m->srcbusirq, m->destapic, m->destapiclint); } @@ -148,34 +158,33 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) { if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) { - printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", + pr_err("MPTABLE: bad signature [%c%c%c%c]!\n", mpc->signature[0], mpc->signature[1], mpc->signature[2], mpc->signature[3]); return 0; } if (mpf_checksum((unsigned char *)mpc, mpc->length)) { - printk(KERN_ERR "MPTABLE: checksum error!\n"); + pr_err("MPTABLE: checksum error!\n"); return 0; } if (mpc->spec != 0x01 && mpc->spec != 0x04) { - printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", - mpc->spec); + pr_err("MPTABLE: bad table version (%d)!!\n", mpc->spec); return 0; } if (!mpc->lapic) { - printk(KERN_ERR "MPTABLE: null local APIC address!\n"); + pr_err("MPTABLE: null local APIC address!\n"); return 0; } memcpy(oem, mpc->oem, 8); oem[8] = 0; - printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem); + pr_info("MPTABLE: OEM ID: %s\n", oem); memcpy(str, mpc->productid, 12); str[12] = 0; - printk(KERN_INFO "MPTABLE: Product ID: %s\n", str); + pr_info("MPTABLE: Product ID: %s\n", str); - printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic); + pr_info("MPTABLE: APIC at: 0x%X\n", mpc->lapic); return 1; } @@ -188,8 +197,8 @@ static void skip_entry(unsigned char **ptr, int *count, int size) static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) { - printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n" - "type %x\n", *mpt); + pr_err("Your mptable is wrong, contact your HW vendor!\n"); + pr_cont("type %x\n", *mpt); print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, 1, mpc, mpc->length, 1); } @@ -207,9 +216,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (!smp_check_mpc(mpc, oem, str)) return 0; -#ifdef CONFIG_X86_32 - generic_mps_oem_check(mpc, oem, str); -#endif /* Initialize the lapic mapping */ if (!acpi_lapic) register_lapic_address(mpc->lapic); @@ -259,7 +265,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) } if (!num_processors) - printk(KERN_ERR "MPTABLE: no processors registered!\n"); + pr_err("MPTABLE: no processors registered!\n"); return num_processors; } @@ -295,16 +301,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) * If it does, we assume it's valid. */ if (mpc_default_type == 5) { - printk(KERN_INFO "ISA/PCI bus type with no IRQ information... " - "falling back to ELCR\n"); + pr_info("ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) - printk(KERN_ERR "ELCR contains invalid data... " - "not using ELCR\n"); + pr_err("ELCR contains invalid data... not using ELCR\n"); else { - printk(KERN_INFO - "Using ELCR to identify PCI interrupts\n"); + pr_info("Using ELCR to identify PCI interrupts\n"); ELCR_fallback = 1; } } @@ -353,7 +356,7 @@ static void __init construct_ioapic_table(int mpc_default_type) bus.busid = 0; switch (mpc_default_type) { default: - printk(KERN_ERR "???\nUnknown standard configuration %d\n", + pr_err("???\nUnknown standard configuration %d\n", mpc_default_type); /* fall through */ case 1: @@ -462,8 +465,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) #ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 0; #endif - printk(KERN_ERR "BIOS bug, MP table errors detected!...\n" - "... disabling SMP support. (tell your hw vendor)\n"); + pr_err("BIOS bug, MP table errors detected!...\n"); + pr_cont("... disabling SMP support. (tell your hw vendor)\n"); early_iounmap(mpc, size); return -1; } @@ -481,8 +484,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) if (!mp_irq_entries) { struct mpc_bus bus; - printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " - "using default mptable. (tell your hw vendor)\n"); + pr_err("BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); bus.type = MP_BUS; bus.busid = 0; @@ -516,14 +518,14 @@ void __init default_get_smp_config(unsigned int early) if (acpi_lapic && acpi_ioapic) return; - printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", - mpf->specification); + pr_info("Intel MultiProcessor Specification v1.%d\n", + mpf->specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) if (mpf->feature2 & (1 << 7)) { - printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pr_info(" IMCR and PIC compatibility mode.\n"); pic_mode = 1; } else { - printk(KERN_INFO " Virtual Wire compatibility mode.\n"); + pr_info(" Virtual Wire compatibility mode.\n"); pic_mode = 0; } #endif @@ -539,8 +541,7 @@ void __init default_get_smp_config(unsigned int early) return; } - printk(KERN_INFO "Default MP configuration #%d\n", - mpf->feature1); + pr_info("Default MP configuration #%d\n", mpf->feature1); construct_default_ISA_mptable(mpf->feature1); } else if (mpf->physptr) { @@ -550,7 +551,7 @@ void __init default_get_smp_config(unsigned int early) BUG(); if (!early) - printk(KERN_INFO "Processors: %d\n", num_processors); + pr_info("Processors: %d\n", num_processors); /* * Only use the first configuration found. */ @@ -583,10 +584,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) #endif mpf_found = mpf; - printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", - (unsigned long long) virt_to_phys(mpf), - (unsigned long long) virt_to_phys(mpf) + - sizeof(*mpf) - 1, mpf); + pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", + (unsigned long long) virt_to_phys(mpf), + (unsigned long long) virt_to_phys(mpf) + + sizeof(*mpf) - 1, mpf); mem = virt_to_phys(mpf); memblock_reserve(mem, sizeof(*mpf)); @@ -735,7 +736,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, int nr_m_spare = 0; unsigned char *mpt = ((unsigned char *)mpc) + count; - printk(KERN_INFO "mpc_length %x\n", mpc->length); + pr_info("mpc_length %x\n", mpc->length); while (count < mpc->length) { switch (*mpt) { case MP_PROCESSOR: @@ -862,13 +863,13 @@ static int __init update_mp_table(void) if (!smp_check_mpc(mpc, oem, str)) return 0; - printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf)); - printk(KERN_INFO "physptr: %x\n", mpf->physptr); + pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf)); + pr_info("physptr: %x\n", mpf->physptr); if (mpc_new_phys && mpc->length > mpc_new_length) { mpc_new_phys = 0; - printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n", - mpc_new_length); + pr_info("mpc_new_length is %ld, please use alloc_mptable=8k\n", + mpc_new_length); } if (!mpc_new_phys) { @@ -879,10 +880,10 @@ static int __init update_mp_table(void) mpc->checksum = 0xff; new = mpf_checksum((unsigned char *)mpc, mpc->length); if (old == new) { - printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); + pr_info("mpc is readonly, please try alloc_mptable instead\n"); return 0; } - printk(KERN_INFO "use in-position replacing\n"); + pr_info("use in-position replacing\n"); } else { mpf->physptr = mpc_new_phys; mpc_new = phys_to_virt(mpc_new_phys); @@ -892,7 +893,7 @@ static int __init update_mp_table(void) if (mpc_new_phys - mpf->physptr) { struct mpf_intel *mpf_new; /* steal 16 bytes from [0, 1k) */ - printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); + pr_info("mpf new: %x\n", 0x400 - 16); mpf_new = phys_to_virt(0x400 - 16); memcpy(mpf_new, mpf, 16); mpf = mpf_new; @@ -900,7 +901,7 @@ static int __init update_mp_table(void) } mpf->checksum = 0; mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16); - printk(KERN_INFO "physptr new: %x\n", mpf->physptr); + pr_info("physptr new: %x\n", mpf->physptr); } /* diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c new file mode 100644 index 00000000000..0ee5025e0fa --- /dev/null +++ b/arch/x86/kernel/pmc_atom.c @@ -0,0 +1,324 @@ +/* + * Intel Atom SOC Power Management Controller Driver + * Copyright (c) 2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/device.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/io.h> + +#include <asm/pmc_atom.h> + +#define DRIVER_NAME KBUILD_MODNAME + +struct pmc_dev { + u32 base_addr; + void __iomem *regmap; +#ifdef CONFIG_DEBUG_FS + struct dentry *dbgfs_dir; +#endif /* CONFIG_DEBUG_FS */ +}; + +static struct pmc_dev pmc_device; +static u32 acpi_base_addr; + +struct pmc_dev_map { + const char *name; + u32 bit_mask; +}; + +static const struct pmc_dev_map dev_map[] = { + {"0 - LPSS1_F0_DMA", BIT_LPSS1_F0_DMA}, + {"1 - LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1}, + {"2 - LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2}, + {"3 - LPSS1_F3_HSUART1", BIT_LPSS1_F3_HSUART1}, + {"4 - LPSS1_F4_HSUART2", BIT_LPSS1_F4_HSUART2}, + {"5 - LPSS1_F5_SPI", BIT_LPSS1_F5_SPI}, + {"6 - LPSS1_F6_Reserved", BIT_LPSS1_F6_XXX}, + {"7 - LPSS1_F7_Reserved", BIT_LPSS1_F7_XXX}, + {"8 - SCC_EMMC", BIT_SCC_EMMC}, + {"9 - SCC_SDIO", BIT_SCC_SDIO}, + {"10 - SCC_SDCARD", BIT_SCC_SDCARD}, + {"11 - SCC_MIPI", BIT_SCC_MIPI}, + {"12 - HDA", BIT_HDA}, + {"13 - LPE", BIT_LPE}, + {"14 - OTG", BIT_OTG}, + {"15 - USH", BIT_USH}, + {"16 - GBE", BIT_GBE}, + {"17 - SATA", BIT_SATA}, + {"18 - USB_EHCI", BIT_USB_EHCI}, + {"19 - SEC", BIT_SEC}, + {"20 - PCIE_PORT0", BIT_PCIE_PORT0}, + {"21 - PCIE_PORT1", BIT_PCIE_PORT1}, + {"22 - PCIE_PORT2", BIT_PCIE_PORT2}, + {"23 - PCIE_PORT3", BIT_PCIE_PORT3}, + {"24 - LPSS2_F0_DMA", BIT_LPSS2_F0_DMA}, + {"25 - LPSS2_F1_I2C1", BIT_LPSS2_F1_I2C1}, + {"26 - LPSS2_F2_I2C2", BIT_LPSS2_F2_I2C2}, + {"27 - LPSS2_F3_I2C3", BIT_LPSS2_F3_I2C3}, + {"28 - LPSS2_F3_I2C4", BIT_LPSS2_F4_I2C4}, + {"29 - LPSS2_F5_I2C5", BIT_LPSS2_F5_I2C5}, + {"30 - LPSS2_F6_I2C6", BIT_LPSS2_F6_I2C6}, + {"31 - LPSS2_F7_I2C7", BIT_LPSS2_F7_I2C7}, + {"32 - SMB", BIT_SMB}, + {"33 - OTG_SS_PHY", BIT_OTG_SS_PHY}, + {"34 - USH_SS_PHY", BIT_USH_SS_PHY}, + {"35 - DFX", BIT_DFX}, +}; + +static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset) +{ + return readl(pmc->regmap + reg_offset); +} + +static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val) +{ + writel(val, pmc->regmap + reg_offset); +} + +static void pmc_power_off(void) +{ + u16 pm1_cnt_port; + u32 pm1_cnt_value; + + pr_info("Preparing to enter system sleep state S5\n"); + + pm1_cnt_port = acpi_base_addr + PM1_CNT; + + pm1_cnt_value = inl(pm1_cnt_port); + pm1_cnt_value &= SLEEP_TYPE_MASK; + pm1_cnt_value |= SLEEP_TYPE_S5; + pm1_cnt_value |= SLEEP_ENABLE; + + outl(pm1_cnt_value, pm1_cnt_port); +} + +static void pmc_hw_reg_setup(struct pmc_dev *pmc) +{ + /* + * Disable PMC S0IX_WAKE_EN events coming from: + * - LPC clock run + * - GPIO_SUS ored dedicated IRQs + * - GPIO_SCORE ored dedicated IRQs + * - GPIO_SUS shared IRQ + * - GPIO_SCORE shared IRQ + */ + pmc_reg_write(pmc, PMC_S0IX_WAKE_EN, (u32)PMC_WAKE_EN_SETTING); +} + +#ifdef CONFIG_DEBUG_FS +static int pmc_dev_state_show(struct seq_file *s, void *unused) +{ + struct pmc_dev *pmc = s->private; + u32 func_dis, func_dis_2, func_dis_index; + u32 d3_sts_0, d3_sts_1, d3_sts_index; + int dev_num, dev_index, reg_index; + + func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS); + func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2); + d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0); + d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1); + + dev_num = ARRAY_SIZE(dev_map); + + for (dev_index = 0; dev_index < dev_num; dev_index++) { + reg_index = dev_index / PMC_REG_BIT_WIDTH; + if (reg_index) { + func_dis_index = func_dis_2; + d3_sts_index = d3_sts_1; + } else { + func_dis_index = func_dis; + d3_sts_index = d3_sts_0; + } + + seq_printf(s, "Dev: %-32s\tState: %s [%s]\n", + dev_map[dev_index].name, + dev_map[dev_index].bit_mask & func_dis_index ? + "Disabled" : "Enabled ", + dev_map[dev_index].bit_mask & d3_sts_index ? + "D3" : "D0"); + } + return 0; +} + +static int pmc_dev_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, pmc_dev_state_show, inode->i_private); +} + +static const struct file_operations pmc_dev_state_ops = { + .open = pmc_dev_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int pmc_sleep_tmr_show(struct seq_file *s, void *unused) +{ + struct pmc_dev *pmc = s->private; + u64 s0ir_tmr, s0i1_tmr, s0i2_tmr, s0i3_tmr, s0_tmr; + + s0ir_tmr = (u64)pmc_reg_read(pmc, PMC_S0IR_TMR) << PMC_TMR_SHIFT; + s0i1_tmr = (u64)pmc_reg_read(pmc, PMC_S0I1_TMR) << PMC_TMR_SHIFT; + s0i2_tmr = (u64)pmc_reg_read(pmc, PMC_S0I2_TMR) << PMC_TMR_SHIFT; + s0i3_tmr = (u64)pmc_reg_read(pmc, PMC_S0I3_TMR) << PMC_TMR_SHIFT; + s0_tmr = (u64)pmc_reg_read(pmc, PMC_S0_TMR) << PMC_TMR_SHIFT; + + seq_printf(s, "S0IR Residency:\t%lldus\n", s0ir_tmr); + seq_printf(s, "S0I1 Residency:\t%lldus\n", s0i1_tmr); + seq_printf(s, "S0I2 Residency:\t%lldus\n", s0i2_tmr); + seq_printf(s, "S0I3 Residency:\t%lldus\n", s0i3_tmr); + seq_printf(s, "S0 Residency:\t%lldus\n", s0_tmr); + return 0; +} + +static int pmc_sleep_tmr_open(struct inode *inode, struct file *file) +{ + return single_open(file, pmc_sleep_tmr_show, inode->i_private); +} + +static const struct file_operations pmc_sleep_tmr_ops = { + .open = pmc_sleep_tmr_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void pmc_dbgfs_unregister(struct pmc_dev *pmc) +{ + if (!pmc->dbgfs_dir) + return; + + debugfs_remove_recursive(pmc->dbgfs_dir); + pmc->dbgfs_dir = NULL; +} + +static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev) +{ + struct dentry *dir, *f; + + dir = debugfs_create_dir("pmc_atom", NULL); + if (!dir) + return -ENOMEM; + + f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO, + dir, pmc, &pmc_dev_state_ops); + if (!f) { + dev_err(&pdev->dev, "dev_states register failed\n"); + goto err; + } + f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO, + dir, pmc, &pmc_sleep_tmr_ops); + if (!f) { + dev_err(&pdev->dev, "sleep_state register failed\n"); + goto err; + } + pmc->dbgfs_dir = dir; + return 0; +err: + pmc_dbgfs_unregister(pmc); + return -ENODEV; +} +#else +static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev) +{ + return 0; +} +#endif /* CONFIG_DEBUG_FS */ + +static int pmc_setup_dev(struct pci_dev *pdev) +{ + struct pmc_dev *pmc = &pmc_device; + int ret; + + /* Obtain ACPI base address */ + pci_read_config_dword(pdev, ACPI_BASE_ADDR_OFFSET, &acpi_base_addr); + acpi_base_addr &= ACPI_BASE_ADDR_MASK; + + /* Install power off function */ + if (acpi_base_addr != 0 && pm_power_off == NULL) + pm_power_off = pmc_power_off; + + pci_read_config_dword(pdev, PMC_BASE_ADDR_OFFSET, &pmc->base_addr); + pmc->base_addr &= PMC_BASE_ADDR_MASK; + + pmc->regmap = ioremap_nocache(pmc->base_addr, PMC_MMIO_REG_LEN); + if (!pmc->regmap) { + dev_err(&pdev->dev, "error: ioremap failed\n"); + return -ENOMEM; + } + + /* PMC hardware registers setup */ + pmc_hw_reg_setup(pmc); + + ret = pmc_dbgfs_register(pmc, pdev); + if (ret) { + iounmap(pmc->regmap); + } + + return ret; +} + +/* + * Data for PCI driver interface + * + * This data only exists for exporting the supported + * PCI ids via MODULE_DEVICE_TABLE. We do not actually + * register a pci_driver, because lpc_ich will register + * a driver on the same PCI id. + */ +static const struct pci_device_id pmc_pci_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_VLV_PMC) }, + { 0, }, +}; + +MODULE_DEVICE_TABLE(pci, pmc_pci_ids); + +static int __init pmc_atom_init(void) +{ + int err = -ENODEV; + struct pci_dev *pdev = NULL; + const struct pci_device_id *ent; + + /* We look for our device - PCU PMC + * we assume that there is max. one device. + * + * We can't use plain pci_driver mechanism, + * as the device is really a multiple function device, + * main driver that binds to the pci_device is lpc_ich + * and have to find & bind to the device this way. + */ + for_each_pci_dev(pdev) { + ent = pci_match_id(pmc_pci_ids, pdev); + if (ent) { + err = pmc_setup_dev(pdev); + goto out; + } + } + /* Device not found. */ +out: + return err; +} + +module_init(pmc_atom_init); +/* no module_exit, this driver shouldn't be unloaded */ + +MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>"); +MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S deleted file mode 100644 index ca7f0d58a87..00000000000 --- a/arch/x86/kernel/preempt.S +++ /dev/null @@ -1,25 +0,0 @@ - -#include <linux/linkage.h> -#include <asm/dwarf2.h> -#include <asm/asm.h> -#include <asm/calling.h> - -ENTRY(___preempt_schedule) - CFI_STARTPROC - SAVE_ALL - call preempt_schedule - RESTORE_ALL - ret - CFI_ENDPROC - -#ifdef CONFIG_CONTEXT_TRACKING - -ENTRY(___preempt_schedule_context) - CFI_STARTPROC - SAVE_ALL - call preempt_schedule_context - RESTORE_ALL - ret - CFI_ENDPROC - -#endif diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4505e2a950d..e127ddaa2d5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -64,14 +64,16 @@ EXPORT_SYMBOL_GPL(task_xstate_cachep); */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - int ret; - *dst = *src; - if (fpu_allocated(&src->thread.fpu)) { - memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); - ret = fpu_alloc(&dst->thread.fpu); - if (ret) - return ret; + + dst->thread.fpu_counter = 0; + dst->thread.fpu.has_fpu = 0; + dst->thread.fpu.last_cpu = ~0; + dst->thread.fpu.state = NULL; + if (tsk_used_math(src)) { + int err = fpu_alloc(&dst->thread.fpu); + if (err) + return err; fpu_copy(dst, src); } return 0; @@ -93,6 +95,7 @@ void arch_task_cache_init(void) kmem_cache_create("task_xstate", xstate_size, __alignof__(union thread_xstate), SLAB_PANIC | SLAB_NOTRACK, NULL); + setup_xstate_comp(); } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 7bc86bbe748..8f3ebfe710d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -138,6 +138,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.sp = (unsigned long) childregs; p->thread.sp0 = (unsigned long) (childregs+1); + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ @@ -152,9 +153,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, childregs->orig_ax = -1; childregs->cs = __KERNEL_CS | get_kernel_rpl(); childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; - p->thread.fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; - memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); return 0; } *childregs = *current_pt_regs(); @@ -165,13 +164,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.ip = (unsigned long) ret_from_fork; task_user_gs(p) = get_user_gs(current_pt_regs()); - p->thread.fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; tsk = current; err = -ENOMEM; - memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); - if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, IO_BITMAP_BYTES, GFP_KERNEL); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ca5b02d405c..3ed4a68d401 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -163,7 +163,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.sp = (unsigned long) childregs; p->thread.usersp = me->thread.usersp; set_tsk_thread_flag(p, TIF_FORK); - p->thread.fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); @@ -193,8 +192,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, childregs->sp = sp; err = -ENOMEM; - memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); - if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, IO_BITMAP_BYTES, GFP_KERNEL); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index eb1c87f0b03..749b0e42341 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1441,24 +1441,126 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, force_sig_info(SIGTRAP, &info, tsk); } - -#ifdef CONFIG_X86_32 -# define IS_IA32 1 -#elif defined CONFIG_IA32_EMULATION -# define IS_IA32 is_compat_task() -#else -# define IS_IA32 0 +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) +{ +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + audit_syscall_entry(regs->orig_ax, regs->di, + regs->si, regs->dx, regs->r10); + } else #endif + { + audit_syscall_entry(regs->orig_ax, regs->bx, + regs->cx, regs->dx, regs->si); + } +} /* - * We must return the syscall number to actually look up in the table. - * This can be -1L to skip running any syscall at all. + * We can return 0 to resume the syscall or anything else to go to phase + * 2. If we resume the syscall, we need to put something appropriate in + * regs->orig_ax. + * + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax + * are fully functional. + * + * For phase 2's benefit, our return value is: + * 0: resume the syscall + * 1: go to phase 2; no seccomp phase 2 needed + * anything else: go to phase 2; pass return value to seccomp */ -long syscall_trace_enter(struct pt_regs *regs) +unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) +{ + unsigned long ret = 0; + u32 work; + + BUG_ON(regs != task_pt_regs(current)); + + work = ACCESS_ONCE(current_thread_info()->flags) & + _TIF_WORK_SYSCALL_ENTRY; + + /* + * If TIF_NOHZ is set, we are required to call user_exit() before + * doing anything that could touch RCU. + */ + if (work & _TIF_NOHZ) { + user_exit(); + work &= ~TIF_NOHZ; + } + +#ifdef CONFIG_SECCOMP + /* + * Do seccomp first -- it should minimize exposure of other + * code, and keeping seccomp fast is probably more valuable + * than the rest of this. + */ + if (work & _TIF_SECCOMP) { + struct seccomp_data sd; + + sd.arch = arch; + sd.nr = regs->orig_ax; + sd.instruction_pointer = regs->ip; +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + sd.args[0] = regs->di; + sd.args[1] = regs->si; + sd.args[2] = regs->dx; + sd.args[3] = regs->r10; + sd.args[4] = regs->r8; + sd.args[5] = regs->r9; + } else +#endif + { + sd.args[0] = regs->bx; + sd.args[1] = regs->cx; + sd.args[2] = regs->dx; + sd.args[3] = regs->si; + sd.args[4] = regs->di; + sd.args[5] = regs->bp; + } + + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); + + ret = seccomp_phase1(&sd); + if (ret == SECCOMP_PHASE1_SKIP) { + regs->orig_ax = -1; + ret = 0; + } else if (ret != SECCOMP_PHASE1_OK) { + return ret; /* Go directly to phase 2 */ + } + + work &= ~_TIF_SECCOMP; + } +#endif + + /* Do our best to finish without phase 2. */ + if (work == 0) + return ret; /* seccomp and/or nohz only (ret == 0 here) */ + +#ifdef CONFIG_AUDITSYSCALL + if (work == _TIF_SYSCALL_AUDIT) { + /* + * If there is no more work to be done except auditing, + * then audit in phase 1. Phase 2 always audits, so, if + * we audit here, then we can't go on to phase 2. + */ + do_audit_syscall_entry(regs, arch); + return 0; + } +#endif + + return 1; /* Something is enabled that we can't handle in phase 1 */ +} + +/* Returns the syscall nr to run (which should match regs->orig_ax). */ +long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, + unsigned long phase1_result) { long ret = 0; + u32 work = ACCESS_ONCE(current_thread_info()->flags) & + _TIF_WORK_SYSCALL_ENTRY; - user_exit(); + BUG_ON(regs != task_pt_regs(current)); /* * If we stepped into a sysenter/syscall insn, it trapped in @@ -1467,17 +1569,21 @@ long syscall_trace_enter(struct pt_regs *regs) * do_debug() and we need to set it again to restore the user * state. If we entered on the slow path, TF was already set. */ - if (test_thread_flag(TIF_SINGLESTEP)) + if (work & _TIF_SINGLESTEP) regs->flags |= X86_EFLAGS_TF; - /* do the secure computing check first */ - if (secure_computing(regs->orig_ax)) { +#ifdef CONFIG_SECCOMP + /* + * Call seccomp_phase2 before running the other hooks so that + * they can see any changes made by a seccomp tracer. + */ + if (phase1_result > 1 && seccomp_phase2(phase1_result)) { /* seccomp failures shouldn't expose any additional code. */ - ret = -1L; - goto out; + return -1; } +#endif - if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) + if (unlikely(work & _TIF_SYSCALL_EMU)) ret = -1L; if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && @@ -1487,19 +1593,22 @@ long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); - if (IS_IA32) - audit_syscall_entry(regs->orig_ax, regs->bx, regs->cx, - regs->dx, regs->si); -#ifdef CONFIG_X86_64 - else - audit_syscall_entry(regs->orig_ax, regs->di, regs->si, - regs->dx, regs->r10); -#endif + do_audit_syscall_entry(regs, arch); -out: return ret ?: regs->orig_ax; } +long syscall_trace_enter(struct pt_regs *regs) +{ + u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); + + if (phase1_result == 0) + return regs->orig_ax; + else + return syscall_trace_enter_phase2(regs, arch, phase1_result); +} + void syscall_trace_leave(struct pt_regs *regs) { bool step; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index ff898bbf579..176a0f99d4d 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -498,6 +498,24 @@ void force_hpet_resume(void) } /* + * According to the datasheet e6xx systems have the HPET hardwired to + * 0xfed00000 + */ +static void e6xx_force_enable_hpet(struct pci_dev *dev) +{ + if (hpet_address || force_hpet_address) + return; + + force_hpet_address = 0xFED00000; + force_hpet_resume_type = NONE_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + return; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU, + e6xx_force_enable_hpet); + +/* * HPET MSI on some boards (ATI SB700/SB800) has side effect on * floppy DMA. Disable HPET MSI on such platforms. * See erratum #27 (Misinterpreted MSI Requests May Result in diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 52b1157c53e..17962e667a9 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -28,6 +28,7 @@ #include <linux/mc146818rtc.h> #include <asm/realmode.h> #include <asm/x86_init.h> +#include <asm/efi.h> /* * Power off function, if any @@ -401,12 +402,25 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { static int __init reboot_init(void) { + int rv; + /* * Only do the DMI check if reboot_type hasn't been overridden * on the command line */ - if (reboot_default) - dmi_check_system(reboot_dmi_table); + if (!reboot_default) + return 0; + + /* + * The DMI quirks table takes precedence. If no quirks entry + * matches and the ACPI Hardware Reduced bit is set, force EFI + * reboot. + */ + rv = dmi_check_system(reboot_dmi_table); + + if (!rv && efi_reboot_required()) + reboot_type = BOOT_EFI; + return 0; } core_initcall(reboot_init); @@ -528,11 +542,7 @@ static void native_machine_emergency_restart(void) break; case BOOT_EFI: - if (efi_enabled(EFI_RUNTIME_SERVICES)) - efi.reset_system(reboot_mode == REBOOT_WARM ? - EFI_RESET_WARM : - EFI_RESET_COLD, - EFI_SUCCESS, 0, NULL); + efi_reboot(reboot_mode, NULL); reboot_type = BOOT_BIOS; break; diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 2a26819bb6a..80eab01c1a6 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -37,10 +37,12 @@ static void remove_e820_regions(struct resource *avail) void arch_remove_reservations(struct resource *avail) { - /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */ + /* + * Trim out BIOS area (high 2MB) and E820 regions. We do not remove + * the low 1MB unconditionally, as this area is needed for some ISA + * cards requiring a memory range, e.g. the i82365 PCMCIA controller. + */ if (avail->flags & IORESOURCE_MEM) { - if (avail->start < BIOS_END) - avail->start = BIOS_END; resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END); remove_e820_regions(avail); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 78a0e629892..235cfd39e0d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -879,6 +879,15 @@ void __init setup_arch(char **cmdline_p) KERNEL_PGD_PTRS); load_cr3(swapper_pg_dir); + /* + * Note: Quark X1000 CPUs advertise PGE incorrectly and require + * a cr3 based tlb flush, so the following __flush_tlb_all() + * will not flush anything because the cpu quirk which clears + * X86_FEATURE_PGE has not been invoked yet. Though due to the + * load_cr3() above the TLB has been flushed already. The + * quirk is invoked before subsequent calls to __flush_tlb_all() + * so proper operation is guaranteed. + */ __flush_tlb_all(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); @@ -924,10 +933,10 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - "EL32", 4)) { + EFI32_LOADER_SIGNATURE, 4)) { set_bit(EFI_BOOT, &efi.flags); } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - "EL64", 4)) { + EFI64_LOADER_SIGNATURE, 4)) { set_bit(EFI_BOOT, &efi.flags); set_bit(EFI_64BIT, &efi.flags); } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 2851d63c120..ed37a768d0f 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -675,6 +675,11 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) * handler too. */ regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF); + /* + * Ensure the signal handler starts with the new fpu state. + */ + if (used_math()) + drop_init_fpu(current); } signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 5492798930e..2d5200e5635 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -102,6 +102,8 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); +static DEFINE_PER_CPU(struct completion, die_complete); + atomic_t init_deasserted; /* @@ -111,7 +113,6 @@ atomic_t init_deasserted; static void smp_callin(void) { int cpuid, phys_id; - unsigned long timeout; /* * If waken up by an INIT in an 82489DX configuration @@ -130,37 +131,6 @@ static void smp_callin(void) * (This works even if the APIC is not enabled.) */ phys_id = read_apic_id(); - if (cpumask_test_cpu(cpuid, cpu_callin_mask)) { - panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, - phys_id, cpuid); - } - pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); - - /* - * STARTUP IPIs are fragile beasts as they might sometimes - * trigger some glue motherboard logic. Complete APIC bus - * silence for 1 second, this overestimates the time the - * boot CPU is spending to send the up to 2 STARTUP IPIs - * by a factor of two. This should be enough. - */ - - /* - * Waiting 2s total for startup (udelay is not yet working) - */ - timeout = jiffies + 2*HZ; - while (time_before(jiffies, timeout)) { - /* - * Has the boot CPU finished it's STARTUP sequence? - */ - if (cpumask_test_cpu(cpuid, cpu_callout_mask)) - break; - cpu_relax(); - } - - if (!time_before(jiffies, timeout)) { - panic("%s: CPU%d started up but did not get a callout!\n", - __func__, cpuid); - } /* * the boot CPU has finished the init stage and is spinning @@ -168,10 +138,6 @@ static void smp_callin(void) * CPU, first the APIC. (this is probably redundant on most * boards) */ - - pr_debug("CALLIN, before setup_local_APIC()\n"); - if (apic->smp_callin_clear_local_apic) - apic->smp_callin_clear_local_apic(); setup_local_APIC(); end_local_APIC_setup(); @@ -300,11 +266,19 @@ void smp_store_cpu_info(int id) } static bool +topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + return (cpu_to_node(cpu1) == cpu_to_node(cpu2)); +} + +static bool topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; - return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2), + return !WARN_ONCE(!topology_same_node(c, o), "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! " "[node: %d != %d]. Ignoring dependency.\n", cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); @@ -345,17 +319,44 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } -static bool match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +/* + * Unlike the other levels, we do not enforce keeping a + * multicore group inside a NUMA node. If this happens, we will + * discard the MC level of the topology later. + */ +static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (c->phys_proc_id == o->phys_proc_id) { - if (cpu_has(c, X86_FEATURE_AMD_DCM)) - return true; - - return topology_sane(c, o, "mc"); - } + if (c->phys_proc_id == o->phys_proc_id) + return true; return false; } +static struct sched_domain_topology_level numa_inside_package_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif + { NULL, }, +}; +/* + * set_sched_topology() sets the topology internal to a CPU. The + * NUMA topologies are layered on top of it to build the full + * system topology. + * + * If NUMA nodes are observed to occur within a CPU package, this + * function should be called. It forces the sched domain code to + * only use the SMT level for the CPU portion of the topology. + * This essentially falls back to relying on NUMA information + * from the SRAT table to describe the entire system topology + * (except for hyperthreads). + */ +static void primarily_use_numa_for_topology(void) +{ + set_sched_topology(numa_inside_package_topology); +} + void set_cpu_sibling_map(int cpu) { bool has_smt = smp_num_siblings > 1; @@ -392,7 +393,7 @@ void set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { o = &cpu_data(i); - if ((i == cpu) || (has_mp && match_mc(c, o))) { + if ((i == cpu) || (has_mp && match_die(c, o))) { link_mask(core, cpu, i); /* @@ -414,6 +415,8 @@ void set_cpu_sibling_map(int cpu) } else if (i != cpu && !c->booted_cores) c->booted_cores = cpu_data(i).booted_cores; } + if (match_die(c, o) && !topology_same_node(c, o)) + primarily_use_numa_for_topology(); } } @@ -757,8 +760,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) unsigned long start_ip = real_mode_header->trampoline_start; unsigned long boot_error = 0; - int timeout; int cpu0_nmi_registered = 0; + unsigned long timeout; /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); @@ -806,6 +809,15 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) } /* + * AP might wait on cpu_callout_mask in cpu_init() with + * cpu_initialized_mask set if previous attempt to online + * it timed-out. Clear cpu_initialized_mask so that after + * INIT/SIPI it could start with a clean state. + */ + cpumask_clear_cpu(cpu, cpu_initialized_mask); + smp_mb(); + + /* * Wake up a CPU in difference cases: * - Use the method in the APIC driver if it's defined * Otherwise, @@ -819,53 +831,38 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) if (!boot_error) { /* - * allow APs to start initializing. + * Wait 10s total for a response from AP */ - pr_debug("Before Callout %d\n", cpu); - cpumask_set_cpu(cpu, cpu_callout_mask); - pr_debug("After Callout %d\n", cpu); + boot_error = -1; + timeout = jiffies + 10*HZ; + while (time_before(jiffies, timeout)) { + if (cpumask_test_cpu(cpu, cpu_initialized_mask)) { + /* + * Tell AP to proceed with initialization + */ + cpumask_set_cpu(cpu, cpu_callout_mask); + boot_error = 0; + break; + } + udelay(100); + schedule(); + } + } + if (!boot_error) { /* - * Wait 5s total for a response + * Wait till AP completes initial initialization */ - for (timeout = 0; timeout < 50000; timeout++) { - if (cpumask_test_cpu(cpu, cpu_callin_mask)) - break; /* It has booted */ - udelay(100); + while (!cpumask_test_cpu(cpu, cpu_callin_mask)) { /* * Allow other tasks to run while we wait for the * AP to come online. This also gives a chance * for the MTRR work(triggered by the AP coming online) * to be completed in the stop machine context. */ + udelay(100); schedule(); } - - if (cpumask_test_cpu(cpu, cpu_callin_mask)) { - print_cpu_msr(&cpu_data(cpu)); - pr_debug("CPU%d: has booted.\n", cpu); - } else { - boot_error = 1; - if (*trampoline_status == 0xA5A5A5A5) - /* trampoline started but...? */ - pr_err("CPU%d: Stuck ??\n", cpu); - else - /* trampoline code not run */ - pr_err("CPU%d: Not responding\n", cpu); - if (apic->inquire_remote_apic) - apic->inquire_remote_apic(apicid); - } - } - - if (boot_error) { - /* Try to put things back the way they were before ... */ - numa_remove_cpu(cpu); /* was set by numa_add_cpu */ - - /* was set by do_boot_cpu() */ - cpumask_clear_cpu(cpu, cpu_callout_mask); - - /* was set by cpu_init() */ - cpumask_clear_cpu(cpu, cpu_initialized_mask); } /* mark "stuck" area as not stuck */ @@ -1143,10 +1140,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) enable_IO_APIC(); bsp_end_local_APIC_setup(); - - if (apic->setup_portio_remap) - apic->setup_portio_remap(); - smpboot_setup_io_apic(); /* * Set up local APIC timer on boot CPU. @@ -1292,6 +1285,9 @@ static void remove_siblinginfo(int cpu) for_each_cpu(sibling, cpu_sibling_mask(cpu)) cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling)); + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) + cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); + cpumask_clear(cpu_llc_shared_mask(cpu)); cpumask_clear(cpu_sibling_mask(cpu)); cpumask_clear(cpu_core_mask(cpu)); c->phys_proc_id = 0; @@ -1331,26 +1327,24 @@ int native_cpu_disable(void) return ret; clear_local_APIC(); - + init_completion(&per_cpu(die_complete, smp_processor_id())); cpu_disable_common(); + return 0; } void native_cpu_die(unsigned int cpu) { /* We don't do anything here: idle task is faking death itself. */ - unsigned int i; + wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ); - for (i = 0; i < 10; i++) { - /* They ack this in play_dead by setting CPU_DEAD */ - if (per_cpu(cpu_state, cpu) == CPU_DEAD) { - if (system_state == SYSTEM_RUNNING) - pr_info("CPU %u is now offline\n", cpu); - return; - } - msleep(100); + /* They ack this in play_dead() by setting CPU_DEAD */ + if (per_cpu(cpu_state, cpu) == CPU_DEAD) { + if (system_state == SYSTEM_RUNNING) + pr_info("CPU %u is now offline\n", cpu); + } else { + pr_err("CPU %u didn't die...\n", cpu); } - pr_err("CPU %u didn't die...\n", cpu); } void play_dead_common(void) @@ -1362,6 +1356,7 @@ void play_dead_common(void) mb(); /* Ack it */ __this_cpu_write(cpu_state, CPU_DEAD); + complete(&per_cpu(die_complete, smp_processor_id())); /* * With physical CPU hotplug, we should halt the cpu diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index bf7ef5ce29d..0fa29609b2c 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -68,6 +68,8 @@ static struct irqaction irq0 = { void __init setup_default_timer_irq(void) { + if (!nr_legacy_irqs()) + return; setup_irq(0, &irq0); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index ea030319b32..b6025f9e36c 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -234,9 +234,6 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) return ns; } -/* XXX surely we already have this someplace in the kernel?! */ -#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d)) - static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { unsigned long long tsc_now, ns_now; @@ -259,7 +256,9 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) * time function is continuous; see the comment near struct * cyc2ns_data. */ - data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz); + data->cyc2ns_mul = + DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, + cpu_khz); data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; data->cyc2ns_offset = ns_now - mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); @@ -951,7 +950,7 @@ core_initcall(cpufreq_tsc); static struct clocksource clocksource_tsc; /* - * We compare the TSC to the cycle_last value in the clocksource + * We used to compare the TSC to the cycle_last value in the clocksource * structure to avoid a nasty time-warp. This can be observed in a * very small window right after one CPU updated cycle_last under * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which @@ -961,26 +960,23 @@ static struct clocksource clocksource_tsc; * due to the unsigned delta calculation of the time keeping core * code, which is necessary to support wrapping clocksources like pm * timer. + * + * This sanity check is now done in the core timekeeping code. + * checking the result of read_tsc() - cycle_last for being negative. + * That works because CLOCKSOURCE_MASK(64) does not mask out any bit. */ static cycle_t read_tsc(struct clocksource *cs) { - cycle_t ret = (cycle_t)get_cycles(); - - return ret >= clocksource_tsc.cycle_last ? - ret : clocksource_tsc.cycle_last; -} - -static void resume_tsc(struct clocksource *cs) -{ - if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) - clocksource_tsc.cycle_last = 0; + return (cycle_t)get_cycles(); } +/* + * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() + */ static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, - .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index b99b9ad8540..ee22c1d93ae 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -152,7 +152,7 @@ static void __init detect_vsmp_box(void) is_vsmp = 1; } -int is_vsmp_box(void) +static int is_vsmp_box(void) { if (is_vsmp != -1) return is_vsmp; @@ -166,7 +166,7 @@ int is_vsmp_box(void) static void __init detect_vsmp_box(void) { } -int is_vsmp_box(void) +static int is_vsmp_box(void) { return 0; } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index ea5b5709aa7..957779f4eb4 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -81,10 +81,10 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, if (!show_unhandled_signals) return; - pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", - level, current->comm, task_pid_nr(current), - message, regs->ip, regs->cs, - regs->sp, regs->ax, regs->si, regs->di); + printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, current->comm, task_pid_nr(current), + message, regs->ip, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); } static int addr_to_vsyscall_nr(unsigned long addr) @@ -216,7 +216,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) */ regs->orig_ax = syscall_nr; regs->ax = -ENOSYS; - tmp = secure_computing(syscall_nr); + tmp = secure_computing(); if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs, "seccomp tried to change syscall nr or ip"); diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c index 9531fbb123b..c7d791f32b9 100644 --- a/arch/x86/kernel/vsyscall_gtod.c +++ b/arch/x86/kernel/vsyscall_gtod.c @@ -31,29 +31,30 @@ void update_vsyscall(struct timekeeper *tk) gtod_write_begin(vdata); /* copy vsyscall data */ - vdata->vclock_mode = tk->clock->archdata.vclock_mode; - vdata->cycle_last = tk->clock->cycle_last; - vdata->mask = tk->clock->mask; - vdata->mult = tk->mult; - vdata->shift = tk->shift; + vdata->vclock_mode = tk->tkr.clock->archdata.vclock_mode; + vdata->cycle_last = tk->tkr.cycle_last; + vdata->mask = tk->tkr.mask; + vdata->mult = tk->tkr.mult; + vdata->shift = tk->tkr.shift; vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->xtime_nsec; + vdata->wall_time_snsec = tk->tkr.xtime_nsec; vdata->monotonic_time_sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->xtime_nsec + vdata->monotonic_time_snsec = tk->tkr.xtime_nsec + ((u64)tk->wall_to_monotonic.tv_nsec - << tk->shift); + << tk->tkr.shift); while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->shift)) { + (((u64)NSEC_PER_SEC) << tk->tkr.shift)) { vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->shift; + ((u64)NSEC_PER_SEC) << tk->tkr.shift; vdata->monotonic_time_sec++; } vdata->wall_time_coarse_sec = tk->xtime_sec; - vdata->wall_time_coarse_nsec = (long)(tk->xtime_nsec >> tk->shift); + vdata->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >> + tk->tkr.shift); vdata->monotonic_time_coarse_sec = vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index a4b451c6add..4c540c4719d 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -8,6 +8,7 @@ #include <linux/bootmem.h> #include <linux/compat.h> +#include <linux/cpu.h> #include <asm/i387.h> #include <asm/fpu-internal.h> #include <asm/sigframe.h> @@ -24,7 +25,9 @@ u64 pcntxt_mask; struct xsave_struct *init_xstate_buf; static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; -static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; +static unsigned int *xstate_offsets, *xstate_sizes; +static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8]; +static unsigned int xstate_features; /* * If a processor implementation discern that a processor state component is @@ -268,8 +271,6 @@ int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) return -1; - drop_init_fpu(tsk); /* trigger finit */ - return 0; } @@ -283,7 +284,7 @@ sanitize_restored_xstate(struct task_struct *tsk, if (use_xsave()) { /* These bits must be zero. */ - xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + memset(xsave_hdr->reserved, 0, 48); /* * Init the state that is not present in the memory @@ -399,8 +400,11 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) set_used_math(); } - if (use_eager_fpu()) + if (use_eager_fpu()) { + preempt_disable(); math_state_restore(); + preempt_enable(); + } return err; } else { @@ -479,6 +483,52 @@ static void __init setup_xstate_features(void) } /* + * This function sets up offsets and sizes of all extended states in + * xsave area. This supports both standard format and compacted format + * of the xsave aread. + * + * Input: void + * Output: void + */ +void setup_xstate_comp(void) +{ + unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8]; + int i; + + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. + */ + xstate_comp_offsets[0] = 0; + xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space); + + if (!cpu_has_xsaves) { + for (i = 2; i < xstate_features; i++) { + if (test_bit(i, (unsigned long *)&pcntxt_mask)) { + xstate_comp_offsets[i] = xstate_offsets[i]; + xstate_comp_sizes[i] = xstate_sizes[i]; + } + } + return; + } + + xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE; + + for (i = 2; i < xstate_features; i++) { + if (test_bit(i, (unsigned long *)&pcntxt_mask)) + xstate_comp_sizes[i] = xstate_sizes[i]; + else + xstate_comp_sizes[i] = 0; + + if (i > 2) + xstate_comp_offsets[i] = xstate_comp_offsets[i-1] + + xstate_comp_sizes[i-1]; + + } +} + +/* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) @@ -496,15 +546,21 @@ static void __init setup_init_fpu_buf(void) setup_xstate_features(); + if (cpu_has_xsaves) { + init_xstate_buf->xsave_hdr.xcomp_bv = + (u64)1 << 63 | pcntxt_mask; + init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask; + } + /* * Init all the features state with header_bv being 0x0 */ - xrstor_state(init_xstate_buf, -1); + xrstor_state_booting(init_xstate_buf, -1); /* * Dump the init state again. This is to identify the init state * of any feature which is not represented by all zero's. */ - xsave_state(init_xstate_buf, -1); + xsave_state_booting(init_xstate_buf, -1); } static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; @@ -520,6 +576,30 @@ static int __init eager_fpu_setup(char *s) } __setup("eagerfpu=", eager_fpu_setup); + +/* + * Calculate total size of enabled xstates in XCR0/pcntxt_mask. + */ +static void __init init_xstate_size(void) +{ + unsigned int eax, ebx, ecx, edx; + int i; + + if (!cpu_has_xsaves) { + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + xstate_size = ebx; + return; + } + + xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + for (i = 2; i < 64; i++) { + if (test_bit(i, (unsigned long *)&pcntxt_mask)) { + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + xstate_size += eax; + } + } +} + /* * Enable and initialize the xsave feature. */ @@ -551,8 +631,7 @@ static void __init xstate_enable_boot_cpu(void) /* * Recompute the context size for enabled features */ - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xstate_size = ebx; + init_xstate_size(); update_regset_xstate_info(xstate_size, pcntxt_mask); prepare_fx_sw_frame(); @@ -572,8 +651,9 @@ static void __init xstate_enable_boot_cpu(void) } } - pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", - pcntxt_mask, xstate_size); + pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n", + pcntxt_mask, xstate_size, + cpu_has_xsaves ? "compacted form" : "standard form"); } /* @@ -635,3 +715,26 @@ void eager_fpu_init(void) else fxrstor_checking(&init_xstate_buf->i387); } + +/* + * Given the xsave area and a state inside, this function returns the + * address of the state. + * + * This is the API that is called to get xstate address in either + * standard format or compacted format of xsave area. + * + * Inputs: + * xsave: base address of the xsave area; + * xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE, + * etc.) + * Output: + * address of the state in the xsave area. + */ +void *get_xsave_addr(struct xsave_struct *xsave, int xstate) +{ + int feature = fls64(xstate) - 1; + if (!test_bit(feature, (unsigned long *)&pcntxt_mask)) + return NULL; + + return (void *)xsave + xstate_comp_offsets[feature]; +} diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 287e4c85fff..f9d16ff56c6 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -27,6 +27,7 @@ config KVM select MMU_NOTIFIER select ANON_INODES select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 38a0afe83c6..976e3a57f9e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -53,14 +53,14 @@ u64 kvm_supported_xcr0(void) return xcr0; } -void kvm_update_cpuid(struct kvm_vcpu *vcpu) +int kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; struct kvm_lapic *apic = vcpu->arch.apic; best = kvm_find_cpuid_entry(vcpu, 1, 0); if (!best) - return; + return 0; /* Update OSXSAVE bit */ if (cpu_has_xsave && best->function == 0x1) { @@ -88,7 +88,17 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) xstate_required_size(vcpu->arch.xcr0); } + /* + * The existing code assumes virtual address is 48-bit in the canonical + * address checks; exit if it is ever changed. + */ + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + if (best && ((best->eax & 0xff00) >> 8) != 48 && + ((best->eax & 0xff00) >> 8) != 0) + return -EINVAL; + kvm_pmu_cpuid_update(vcpu); + return 0; } static int is_efer_nx(void) @@ -112,8 +122,8 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) break; } } - if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { - entry->edx &= ~(1 << 20); + if (entry && (entry->edx & bit(X86_FEATURE_NX)) && !is_efer_nx()) { + entry->edx &= ~bit(X86_FEATURE_NX); printk(KERN_INFO "kvm: guest NX capability removed\n"); } } @@ -151,10 +161,9 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, } vcpu->arch.cpuid_nent = cpuid->nent; cpuid_fix_nx_cap(vcpu); - r = 0; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - kvm_update_cpuid(vcpu); + r = kvm_update_cpuid(vcpu); out_free: vfree(cpuid_entries); @@ -178,9 +187,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - kvm_update_cpuid(vcpu); - return 0; - + r = kvm_update_cpuid(vcpu); out: return r; } @@ -767,6 +774,12 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) if (!best) best = check_cpuid_limit(vcpu, function, index); + /* + * Perfmon not yet supported for L2 guest. + */ + if (is_guest_mode(vcpu) && function == 0xa) + best = NULL; + if (best) { *eax = best->eax; *ebx = best->ebx; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index f9087315e0c..4452eedfaed 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -3,7 +3,7 @@ #include "x86.h" -void kvm_update_cpuid(struct kvm_vcpu *vcpu); +int kvm_update_cpuid(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, @@ -88,6 +88,14 @@ static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu) return best && (best->ecx & bit(X86_FEATURE_X2APIC)); } +static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0, 0); + return best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx; +} + static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -95,4 +103,12 @@ static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); return best && (best->edx & bit(X86_FEATURE_GBPAGES)); } + +static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_RTM)); +} #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e4e833d3d7d..a46207a0583 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -162,6 +162,10 @@ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ +#define Intercept ((u64)1 << 48) /* Has valid intercept field */ +#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ +#define NoBigReal ((u64)1 << 50) /* No big real mode */ +#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -426,6 +430,7 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, .modrm_reg = ctxt->modrm_reg, .modrm_rm = ctxt->modrm_rm, .src_val = ctxt->src.val64, + .dst_val = ctxt->dst.val64, .src_bytes = ctxt->src.bytes, .dst_bytes = ctxt->dst.bytes, .ad_bytes = ctxt->ad_bytes, @@ -511,12 +516,6 @@ static u32 desc_limit_scaled(struct desc_struct *desc) return desc->g ? (limit << 12) | 0xfff : limit; } -static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg) -{ - ctxt->has_seg_override = true; - ctxt->seg_override = seg; -} - static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) { if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) @@ -525,17 +524,10 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) return ctxt->ops->get_cached_segment_base(ctxt, seg); } -static unsigned seg_override(struct x86_emulate_ctxt *ctxt) -{ - if (!ctxt->has_seg_override) - return 0; - - return ctxt->seg_override; -} - static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { + WARN_ON(vec > 0x1f); ctxt->exception.vector = vec; ctxt->exception.error_code = error; ctxt->exception.error_code_valid = valid; @@ -651,7 +643,12 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, if (!fetch && (desc.type & 8) && !(desc.type & 2)) goto bad; lim = desc_limit_scaled(&desc); - if ((desc.type & 8) || !(desc.type & 4)) { + if ((ctxt->mode == X86EMUL_MODE_REAL) && !fetch && + (ctxt->d & NoBigReal)) { + /* la is between zero and 0xffff */ + if (la > 0xffff || (u32)(la + size - 1) > 0xffff) + goto bad; + } else if ((desc.type & 8) || !(desc.type & 4)) { /* expand-up segment */ if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) goto bad; @@ -716,68 +713,71 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, } /* - * Fetch the next byte of the instruction being emulated which is pointed to - * by ctxt->_eip, then increment ctxt->_eip. - * - * Also prefetch the remaining bytes of the instruction without crossing page + * Prefetch the remaining bytes of the instruction without crossing page * boundary if they are not in fetch_cache yet. */ -static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest) +static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) { - struct fetch_cache *fc = &ctxt->fetch; int rc; - int size, cur_size; - - if (ctxt->_eip == fc->end) { - unsigned long linear; - struct segmented_address addr = { .seg = VCPU_SREG_CS, - .ea = ctxt->_eip }; - cur_size = fc->end - fc->start; - size = min(15UL - cur_size, - PAGE_SIZE - offset_in_page(ctxt->_eip)); - rc = __linearize(ctxt, addr, size, false, true, &linear); - if (unlikely(rc != X86EMUL_CONTINUE)) - return rc; - rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size, - size, &ctxt->exception); - if (unlikely(rc != X86EMUL_CONTINUE)) - return rc; - fc->end += size; - } - *dest = fc->data[ctxt->_eip - fc->start]; - ctxt->_eip++; - return X86EMUL_CONTINUE; -} + unsigned size; + unsigned long linear; + int cur_size = ctxt->fetch.end - ctxt->fetch.data; + struct segmented_address addr = { .seg = VCPU_SREG_CS, + .ea = ctxt->eip + cur_size }; + + size = 15UL ^ cur_size; + rc = __linearize(ctxt, addr, size, false, true, &linear); + if (unlikely(rc != X86EMUL_CONTINUE)) + return rc; -static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - void *dest, unsigned size) -{ - int rc; + size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear)); - /* x86 instructions are limited to 15 bytes. */ - if (unlikely(ctxt->_eip + size - ctxt->eip > 15)) + /* + * One instruction can only straddle two pages, + * and one has been loaded at the beginning of + * x86_decode_insn. So, if not enough bytes + * still, we must have hit the 15-byte boundary. + */ + if (unlikely(size < op_size)) return X86EMUL_UNHANDLEABLE; - while (size--) { - rc = do_insn_fetch_byte(ctxt, dest++); - if (rc != X86EMUL_CONTINUE) - return rc; - } + rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end, + size, &ctxt->exception); + if (unlikely(rc != X86EMUL_CONTINUE)) + return rc; + ctxt->fetch.end += size; return X86EMUL_CONTINUE; } +static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, + unsigned size) +{ + if (unlikely(ctxt->fetch.end - ctxt->fetch.ptr < size)) + return __do_insn_fetch_bytes(ctxt, size); + else + return X86EMUL_CONTINUE; +} + /* Fetch next part of the instruction being emulated. */ #define insn_fetch(_type, _ctxt) \ -({ unsigned long _x; \ - rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \ +({ _type _x; \ + \ + rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ - (_type)_x; \ + ctxt->_eip += sizeof(_type); \ + _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \ + ctxt->fetch.ptr += sizeof(_type); \ + _x; \ }) #define insn_fetch_arr(_arr, _size, _ctxt) \ -({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \ +({ \ + rc = do_insn_fetch_bytes(_ctxt, _size); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ + ctxt->_eip += (_size); \ + memcpy(_arr, ctxt->fetch.ptr, _size); \ + ctxt->fetch.ptr += (_size); \ }) /* @@ -1063,19 +1063,17 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, struct operand *op) { u8 sib; - int index_reg = 0, base_reg = 0, scale; + int index_reg, base_reg, scale; int rc = X86EMUL_CONTINUE; ulong modrm_ea = 0; - if (ctxt->rex_prefix) { - ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1; /* REX.R */ - index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */ - ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ - } + ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */ + index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */ + base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */ - ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; + ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; - ctxt->modrm_rm |= (ctxt->modrm & 0x07); + ctxt->modrm_rm = base_reg | (ctxt->modrm & 0x07); ctxt->modrm_seg = VCPU_SREG_DS; if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) { @@ -1093,7 +1091,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, if (ctxt->d & Mmx) { op->type = OP_MM; op->bytes = 8; - op->addr.xmm = ctxt->modrm_rm & 7; + op->addr.mm = ctxt->modrm_rm & 7; return rc; } fetch_register_operand(op); @@ -1190,6 +1188,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } } op->addr.mem.ea = modrm_ea; + if (ctxt->ad_bytes != 8) + ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea; + done: return rc; } @@ -1220,12 +1221,14 @@ static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt) long sv = 0, mask; if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) { - mask = ~(ctxt->dst.bytes * 8 - 1); + mask = ~((long)ctxt->dst.bytes * 8 - 1); if (ctxt->src.bytes == 2) sv = (s16)ctxt->src.val & (s16)mask; else if (ctxt->src.bytes == 4) sv = (s32)ctxt->src.val & (s32)mask; + else + sv = (s64)ctxt->src.val & (s64)mask; ctxt->dst.addr.mem.ea += (sv >> 3); } @@ -1315,8 +1318,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, in_page = (ctxt->eflags & EFLG_DF) ? offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) : PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)); - n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, - count); + n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count); if (n == 0) n = 1; rc->pos = rc->end = 0; @@ -1358,17 +1360,19 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_ptr *dt) { const struct x86_emulate_ops *ops = ctxt->ops; + u32 base3 = 0; if (selector & 1 << 2) { struct desc_struct desc; u16 sel; memset (dt, 0, sizeof *dt); - if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR)) + if (!ops->get_segment(ctxt, &sel, &desc, &base3, + VCPU_SREG_LDTR)) return; dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ - dt->address = get_desc_base(&desc); + dt->address = get_desc_base(&desc) | ((u64)base3 << 32); } else ops->get_gdt(ctxt, dt); } @@ -1422,6 +1426,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, ulong desc_addr; int ret; u16 dummy; + u32 base3 = 0; memset(&seg_desc, 0, sizeof seg_desc); @@ -1464,7 +1469,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; err_code = selector & 0xfffc; - err_vec = GP_VECTOR; + err_vec = in_task_switch ? TS_VECTOR : GP_VECTOR; /* can't load system descriptor into segment selector */ if (seg <= VCPU_SREG_GS && !seg_desc.s) @@ -1487,9 +1492,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto exception; break; case VCPU_SREG_CS: - if (in_task_switch && rpl != dpl) - goto exception; - if (!(seg_desc.type & 8)) goto exception; @@ -1502,6 +1504,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (rpl > cpl || dpl != cpl) goto exception; } + /* in long-mode d/b must be clear if l is set */ + if (seg_desc.d && seg_desc.l) { + u64 efer = 0; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (efer & EFER_LMA) + goto exception; + } + /* CS(RPL) <- CPL */ selector = (selector & 0xfffc) | cpl; break; @@ -1538,13 +1549,17 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, ret = write_segment_descriptor(ctxt, selector, &seg_desc); if (ret != X86EMUL_CONTINUE) return ret; + } else if (ctxt->mode == X86EMUL_MODE_PROT64) { + ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3, + sizeof(base3), &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + return ret; } load: - ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg); + ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); return X86EMUL_CONTINUE; exception: - emulate_exception(ctxt, err_vec, err_code, true); - return X86EMUL_PROPAGATE_FAULT; + return emulate_exception(ctxt, err_vec, err_code, true); } static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, @@ -1575,34 +1590,28 @@ static void write_register_operand(struct operand *op) static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) { - int rc; - switch (op->type) { case OP_REG: write_register_operand(op); break; case OP_MEM: if (ctxt->lock_prefix) - rc = segmented_cmpxchg(ctxt, + return segmented_cmpxchg(ctxt, + op->addr.mem, + &op->orig_val, + &op->val, + op->bytes); + else + return segmented_write(ctxt, op->addr.mem, - &op->orig_val, &op->val, op->bytes); - else - rc = segmented_write(ctxt, - op->addr.mem, - &op->val, - op->bytes); - if (rc != X86EMUL_CONTINUE) - return rc; break; case OP_MEM_STR: - rc = segmented_write(ctxt, - op->addr.mem, - op->data, - op->bytes * op->count); - if (rc != X86EMUL_CONTINUE) - return rc; + return segmented_write(ctxt, + op->addr.mem, + op->data, + op->bytes * op->count); break; case OP_XMM: write_sse_reg(ctxt, &op->vec_val, op->addr.xmm); @@ -1671,7 +1680,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, return rc; change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF - | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID; switch(ctxt->mode) { case X86EMUL_MODE_PROT64: @@ -1754,6 +1763,9 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; + if (ctxt->modrm_reg == VCPU_SREG_SS) + ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; + rc = load_segment_descriptor(ctxt, (u16)selector, seg); return rc; } @@ -1991,6 +2003,9 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) { u64 old = ctxt->dst.orig_val64; + if (ctxt->dst.bytes == 16) + return X86EMUL_UNHANDLEABLE; + if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) || ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) { *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0); @@ -2017,6 +2032,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) { int rc; unsigned long cs; + int cpl = ctxt->ops->cpl(ctxt); rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -2026,6 +2042,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; + /* Outer-privilege level return is not implemented */ + if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) + return X86EMUL_UNHANDLEABLE; rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); return rc; } @@ -2044,8 +2063,10 @@ static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt) static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) { /* Save real source value, then compare EAX against destination. */ + ctxt->dst.orig_val = ctxt->dst.val; + ctxt->dst.val = reg_read(ctxt, VCPU_REGS_RAX); ctxt->src.orig_val = ctxt->src.val; - ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX); + ctxt->src.val = ctxt->dst.orig_val; fastop(ctxt, em_cmp); if (ctxt->eflags & EFLG_ZF) { @@ -2055,6 +2076,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) /* Failure: write the value we saw to EAX. */ ctxt->dst.type = OP_REG; ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + ctxt->dst.val = ctxt->dst.orig_val; } return X86EMUL_CONTINUE; } @@ -2194,7 +2216,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip; if (efer & EFER_LMA) { #ifdef CONFIG_X86_64 - *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF; + *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags; ops->get_msr(ctxt, ctxt->mode == X86EMUL_MODE_PROT64 ? @@ -2202,14 +2224,14 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->_eip = msr_data; ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); - ctxt->eflags &= ~(msr_data | EFLG_RF); + ctxt->eflags &= ~msr_data; #endif } else { /* legacy mode */ ops->get_msr(ctxt, MSR_STAR, &msr_data); ctxt->_eip = (u32)msr_data; - ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + ctxt->eflags &= ~(EFLG_VM | EFLG_IF); } return X86EMUL_CONTINUE; @@ -2258,7 +2280,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) break; } - ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + ctxt->eflags &= ~(EFLG_VM | EFLG_IF); cs_sel = (u16)msr_data; cs_sel &= ~SELECTOR_RPL_MASK; ss_sel = cs_sel + 8; @@ -2710,8 +2732,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (!next_tss_desc.p || ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || desc_limit < 0x2b)) { - emulate_ts(ctxt, tss_selector & 0xfffc); - return X86EMUL_PROPAGATE_FAULT; + return emulate_ts(ctxt, tss_selector & 0xfffc); } if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { @@ -2964,7 +2985,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt) static int em_mov(struct x86_emulate_ctxt *ctxt) { - memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes); + memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr)); return X86EMUL_CONTINUE; } @@ -3003,7 +3024,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt) ctxt->dst.val = swab64(ctxt->src.val); break; default: - return X86EMUL_PROPAGATE_FAULT; + BUG(); } return X86EMUL_CONTINUE; } @@ -3127,12 +3148,8 @@ static int em_clts(struct x86_emulate_ctxt *ctxt) static int em_vmcall(struct x86_emulate_ctxt *ctxt) { - int rc; + int rc = ctxt->ops->fix_hypercall(ctxt); - if (ctxt->modrm_mod != 3 || ctxt->modrm_rm != 1) - return X86EMUL_UNHANDLEABLE; - - rc = ctxt->ops->fix_hypercall(ctxt); if (rc != X86EMUL_CONTINUE) return rc; @@ -3221,7 +3238,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt) static int em_smsw(struct x86_emulate_ctxt *ctxt) { - ctxt->dst.bytes = 2; + if (ctxt->dst.type == OP_MEM) + ctxt->dst.bytes = 2; ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0); return X86EMUL_CONTINUE; } @@ -3496,7 +3514,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt) u64 rcx = reg_read(ctxt, VCPU_REGS_RCX); if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || - (rcx > 3)) + ctxt->ops->check_pmc(ctxt, rcx)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; @@ -3521,9 +3539,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) } #define D(_y) { .flags = (_y) } -#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } -#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ - .check_perm = (_p) } +#define DI(_y, _i) { .flags = (_y)|Intercept, .intercept = x86_intercept_##_i } +#define DIP(_y, _i, _p) { .flags = (_y)|Intercept|CheckPerm, \ + .intercept = x86_intercept_##_i, .check_perm = (_p) } #define N D(NotImpl) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } @@ -3532,10 +3550,10 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } #define II(_f, _e, _i) \ - { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } + { .flags = (_f)|Intercept, .u.execute = (_e), .intercept = x86_intercept_##_i } #define IIP(_f, _e, _i, _p) \ - { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \ - .check_perm = (_p) } + { .flags = (_f)|Intercept|CheckPerm, .u.execute = (_e), \ + .intercept = x86_intercept_##_i, .check_perm = (_p) } #define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) } #define D2bv(_f) D((_f) | ByteOp), D(_f) @@ -3549,6 +3567,12 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) F2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) +static const struct opcode group7_rm0[] = { + N, + I(SrcNone | Priv | EmulateOnUD, em_vmcall), + N, N, N, N, N, N, +}; + static const struct opcode group7_rm1[] = { DI(SrcNone | Priv, monitor), DI(SrcNone | Priv, mwait), @@ -3634,15 +3658,15 @@ static const struct opcode group6[] = { }; static const struct group_dual group7 = { { - II(Mov | DstMem | Priv, em_sgdt, sgdt), - II(Mov | DstMem | Priv, em_sidt, sidt), + II(Mov | DstMem, em_sgdt, sgdt), + II(Mov | DstMem, em_sidt, sidt), II(SrcMem | Priv, em_lgdt, lgdt), II(SrcMem | Priv, em_lidt, lidt), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg), }, { - I(SrcNone | Priv | EmulateOnUD, em_vmcall), + EXT(0, group7_rm0), EXT(0, group7_rm1), N, EXT(0, group7_rm3), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, @@ -3673,14 +3697,18 @@ static const struct gprefix pfx_0f_6f_0f_7f = { I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; -static const struct gprefix pfx_vmovntpx = { - I(0, em_mov), N, N, N, +static const struct gprefix pfx_0f_2b = { + I(0, em_mov), I(0, em_mov), N, N, }; static const struct gprefix pfx_0f_28_0f_29 = { I(Aligned, em_mov), I(Aligned, em_mov), N, N, }; +static const struct gprefix pfx_0f_e7 = { + N, I(Sse, em_mov), N, N, +}; + static const struct escape escape_d9 = { { N, N, N, N, N, N, N, I(DstMem, em_fnstcw), }, { @@ -3887,7 +3915,7 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), - N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), + N, GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_2b), N, N, N, N, /* 0x30 - 0x3F */ II(ImplicitOps | Priv, em_wrmsr, wrmsr), @@ -3899,7 +3927,7 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ - X16(D(DstReg | SrcMem | ModRM | Mov)), + X16(D(DstReg | SrcMem | ModRM)), /* 0x50 - 0x5F */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0x60 - 0x6F */ @@ -3951,7 +3979,8 @@ static const struct opcode twobyte_table[256] = { /* 0xD0 - 0xDF */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0xE0 - 0xEF */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7), + N, N, N, N, N, N, N, N, /* 0xF0 - 0xFF */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N }; @@ -4061,12 +4090,12 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, mem_common: *op = ctxt->memop; ctxt->memopp = op; - if ((ctxt->d & BitOp) && op == &ctxt->dst) + if (ctxt->d & BitOp) fetch_bit_operand(ctxt); op->orig_val = op->val; break; case OpMem64: - ctxt->memop.bytes = 8; + ctxt->memop.bytes = (ctxt->op_bytes == 8) ? 16 : 8; goto mem_common; case OpAcc: op->type = OP_REG; @@ -4150,7 +4179,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI)); - op->addr.mem.seg = seg_override(ctxt); + op->addr.mem.seg = ctxt->seg_override; op->val = 0; op->count = 1; break; @@ -4161,7 +4190,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, register_address(ctxt, reg_read(ctxt, VCPU_REGS_RBX) + (reg_read(ctxt, VCPU_REGS_RAX) & 0xff)); - op->addr.mem.seg = seg_override(ctxt); + op->addr.mem.seg = ctxt->seg_override; op->val = 0; break; case OpImmFAddr: @@ -4208,16 +4237,22 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) int mode = ctxt->mode; int def_op_bytes, def_ad_bytes, goffset, simd_prefix; bool op_prefix = false; + bool has_seg_override = false; struct opcode opcode; ctxt->memop.type = OP_NONE; ctxt->memopp = NULL; ctxt->_eip = ctxt->eip; - ctxt->fetch.start = ctxt->_eip; - ctxt->fetch.end = ctxt->fetch.start + insn_len; + ctxt->fetch.ptr = ctxt->fetch.data; + ctxt->fetch.end = ctxt->fetch.data + insn_len; ctxt->opcode_len = 1; if (insn_len > 0) memcpy(ctxt->fetch.data, insn, insn_len); + else { + rc = __do_insn_fetch_bytes(ctxt, 1); + if (rc != X86EMUL_CONTINUE) + return rc; + } switch (mode) { case X86EMUL_MODE_REAL: @@ -4261,11 +4296,13 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) case 0x2e: /* CS override */ case 0x36: /* SS override */ case 0x3e: /* DS override */ - set_seg_override(ctxt, (ctxt->b >> 3) & 3); + has_seg_override = true; + ctxt->seg_override = (ctxt->b >> 3) & 3; break; case 0x64: /* FS override */ case 0x65: /* GS override */ - set_seg_override(ctxt, ctxt->b & 7); + has_seg_override = true; + ctxt->seg_override = ctxt->b & 7; break; case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) @@ -4314,6 +4351,13 @@ done_prefixes: if (ctxt->d & ModRM) ctxt->modrm = insn_fetch(u8, ctxt); + /* vex-prefix instructions are not implemented */ + if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) && + (mode == X86EMUL_MODE_PROT64 || + (mode >= X86EMUL_MODE_PROT16 && (ctxt->modrm & 0x80)))) { + ctxt->d = NotImpl; + } + while (ctxt->d & GroupMask) { switch (ctxt->d & GroupMask) { case Group: @@ -4356,49 +4400,59 @@ done_prefixes: ctxt->d |= opcode.flags; } - ctxt->execute = opcode.u.execute; - ctxt->check_perm = opcode.check_perm; - ctxt->intercept = opcode.intercept; - /* Unrecognised? */ - if (ctxt->d == 0 || (ctxt->d & NotImpl)) + if (ctxt->d == 0) return EMULATION_FAILED; - if (!(ctxt->d & EmulateOnUD) && ctxt->ud) + ctxt->execute = opcode.u.execute; + + if (unlikely(ctxt->ud) && likely(!(ctxt->d & EmulateOnUD))) return EMULATION_FAILED; - if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) - ctxt->op_bytes = 8; + if (unlikely(ctxt->d & + (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm))) { + /* + * These are copied unconditionally here, and checked unconditionally + * in x86_emulate_insn. + */ + ctxt->check_perm = opcode.check_perm; + ctxt->intercept = opcode.intercept; + + if (ctxt->d & NotImpl) + return EMULATION_FAILED; - if (ctxt->d & Op3264) { - if (mode == X86EMUL_MODE_PROT64) + if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) ctxt->op_bytes = 8; - else - ctxt->op_bytes = 4; - } - if (ctxt->d & Sse) - ctxt->op_bytes = 16; - else if (ctxt->d & Mmx) - ctxt->op_bytes = 8; + if (ctxt->d & Op3264) { + if (mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; + else + ctxt->op_bytes = 4; + } + + if (ctxt->d & Sse) + ctxt->op_bytes = 16; + else if (ctxt->d & Mmx) + ctxt->op_bytes = 8; + } /* ModRM and SIB bytes. */ if (ctxt->d & ModRM) { rc = decode_modrm(ctxt, &ctxt->memop); - if (!ctxt->has_seg_override) - set_seg_override(ctxt, ctxt->modrm_seg); + if (!has_seg_override) { + has_seg_override = true; + ctxt->seg_override = ctxt->modrm_seg; + } } else if (ctxt->d & MemAbs) rc = decode_abs(ctxt, &ctxt->memop); if (rc != X86EMUL_CONTINUE) goto done; - if (!ctxt->has_seg_override) - set_seg_override(ctxt, VCPU_SREG_DS); - - ctxt->memop.addr.mem.seg = seg_override(ctxt); + if (!has_seg_override) + ctxt->seg_override = VCPU_SREG_DS; - if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8) - ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea; + ctxt->memop.addr.mem.seg = ctxt->seg_override; /* * Decode and fetch the source operand: register, memory @@ -4420,7 +4474,7 @@ done_prefixes: rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); done: - if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative) + if (ctxt->rip_relative) ctxt->memopp->addr.mem.ea += ctxt->_eip; return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; @@ -4495,6 +4549,16 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) return X86EMUL_CONTINUE; } +void init_decode_cache(struct x86_emulate_ctxt *ctxt) +{ + memset(&ctxt->rip_relative, 0, + (void *)&ctxt->modrm - (void *)&ctxt->rip_relative); + + ctxt->io_read.pos = 0; + ctxt->io_read.end = 0; + ctxt->mem_read.end = 0; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { const struct x86_emulate_ops *ops = ctxt->ops; @@ -4503,12 +4567,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) ctxt->mem_read.pos = 0; - if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || - (ctxt->d & Undefined)) { - rc = emulate_ud(ctxt); - goto done; - } - /* LOCK prefix is allowed only with some instructions */ if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) { rc = emulate_ud(ctxt); @@ -4520,69 +4578,82 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) - || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { - rc = emulate_ud(ctxt); - goto done; - } - - if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { - rc = emulate_nm(ctxt); - goto done; - } + if (unlikely(ctxt->d & + (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { + if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || + (ctxt->d & Undefined)) { + rc = emulate_ud(ctxt); + goto done; + } - if (ctxt->d & Mmx) { - rc = flush_pending_x87_faults(ctxt); - if (rc != X86EMUL_CONTINUE) + if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) + || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { + rc = emulate_ud(ctxt); goto done; - /* - * Now that we know the fpu is exception safe, we can fetch - * operands from it. - */ - fetch_possible_mmx_operand(ctxt, &ctxt->src); - fetch_possible_mmx_operand(ctxt, &ctxt->src2); - if (!(ctxt->d & Mov)) - fetch_possible_mmx_operand(ctxt, &ctxt->dst); - } + } - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { - rc = emulator_check_intercept(ctxt, ctxt->intercept, - X86_ICPT_PRE_EXCEPT); - if (rc != X86EMUL_CONTINUE) + if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { + rc = emulate_nm(ctxt); goto done; - } + } - /* Privileged instruction can be executed only in CPL=0 */ - if ((ctxt->d & Priv) && ops->cpl(ctxt)) { - rc = emulate_gp(ctxt, 0); - goto done; - } + if (ctxt->d & Mmx) { + rc = flush_pending_x87_faults(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + /* + * Now that we know the fpu is exception safe, we can fetch + * operands from it. + */ + fetch_possible_mmx_operand(ctxt, &ctxt->src); + fetch_possible_mmx_operand(ctxt, &ctxt->src2); + if (!(ctxt->d & Mov)) + fetch_possible_mmx_operand(ctxt, &ctxt->dst); + } - /* Instruction can only be executed in protected mode */ - if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { - rc = emulate_ud(ctxt); - goto done; - } + if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + rc = emulator_check_intercept(ctxt, ctxt->intercept, + X86_ICPT_PRE_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } - /* Do instruction specific permission checks */ - if (ctxt->check_perm) { - rc = ctxt->check_perm(ctxt); - if (rc != X86EMUL_CONTINUE) + /* Privileged instruction can be executed only in CPL=0 */ + if ((ctxt->d & Priv) && ops->cpl(ctxt)) { + if (ctxt->d & PrivUD) + rc = emulate_ud(ctxt); + else + rc = emulate_gp(ctxt, 0); goto done; - } + } - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { - rc = emulator_check_intercept(ctxt, ctxt->intercept, - X86_ICPT_POST_EXCEPT); - if (rc != X86EMUL_CONTINUE) + /* Instruction can only be executed in protected mode */ + if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { + rc = emulate_ud(ctxt); goto done; - } + } - if (ctxt->rep_prefix && (ctxt->d & String)) { - /* All REP prefixes have the same first termination condition */ - if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { - ctxt->eip = ctxt->_eip; - goto done; + /* Do instruction specific permission checks */ + if (ctxt->d & CheckPerm) { + rc = ctxt->check_perm(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + } + + if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + rc = emulator_check_intercept(ctxt, ctxt->intercept, + X86_ICPT_POST_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } + + if (ctxt->rep_prefix && (ctxt->d & String)) { + /* All REP prefixes have the same first termination condition */ + if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { + ctxt->eip = ctxt->_eip; + ctxt->eflags &= ~EFLG_RF; + goto done; + } } } @@ -4616,13 +4687,18 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { + if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) goto done; } + if (ctxt->rep_prefix && (ctxt->d & String)) + ctxt->eflags |= EFLG_RF; + else + ctxt->eflags &= ~EFLG_RF; + if (ctxt->execute) { if (ctxt->d & Fastop) { void (*fop)(struct fastop *) = (void *)ctxt->execute; @@ -4657,8 +4733,9 @@ special_insn: break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX)) - break; - rc = em_xchg(ctxt); + ctxt->dst.type = OP_NONE; + else + rc = em_xchg(ctxt); break; case 0x98: /* cbw/cwde/cdqe */ switch (ctxt->op_bytes) { @@ -4709,17 +4786,17 @@ special_insn: goto done; writeback: - if (!(ctxt->d & NoWrite)) { - rc = writeback(ctxt, &ctxt->dst); - if (rc != X86EMUL_CONTINUE) - goto done; - } if (ctxt->d & SrcWrite) { BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR); rc = writeback(ctxt, &ctxt->src); if (rc != X86EMUL_CONTINUE) goto done; } + if (!(ctxt->d & NoWrite)) { + rc = writeback(ctxt, &ctxt->dst); + if (rc != X86EMUL_CONTINUE) + goto done; + } /* * restore dst type in case the decoding will be reused @@ -4761,13 +4838,16 @@ writeback: } goto done; /* skip rip writeback */ } + ctxt->eflags &= ~EFLG_RF; } ctxt->eip = ctxt->_eip; done: - if (rc == X86EMUL_PROPAGATE_FAULT) + if (rc == X86EMUL_PROPAGATE_FAULT) { + WARN_ON(ctxt->exception.vector > 0x1f); ctxt->have_exception = true; + } if (rc == X86EMUL_INTERCEPTED) return EMULATION_INTERCEPTED; @@ -4793,8 +4873,10 @@ twobyte_insn: ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); break; case 0x40 ... 0x4f: /* cmov */ - ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val; - if (!test_cc(ctxt->b, ctxt->eflags)) + if (test_cc(ctxt->b, ctxt->eflags)) + ctxt->dst.val = ctxt->src.val; + else if (ctxt->mode != X86EMUL_MODE_PROT64 || + ctxt->op_bytes != 4) ctxt->dst.type = OP_NONE; /* no writeback */ break; case 0x80 ... 0x8f: /* jnz rel, etc*/ @@ -4818,8 +4900,8 @@ twobyte_insn: break; case 0xc3: /* movnti */ ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : - (u64) ctxt->src.val; + ctxt->dst.val = (ctxt->op_bytes == 8) ? (u64) ctxt->src.val : + (u32) ctxt->src.val; break; default: goto cannot_emulate; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index bd0da433e6d..a1ec6a50a05 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -108,7 +108,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) vector = kvm_cpu_get_extint(v); - if (kvm_apic_vid_enabled(v->kvm) || vector != -1) + if (vector != -1) return vector; /* PIC */ return kvm_get_apic_interrupt(v); /* APIC */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 00691185817..b8345dd41b2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -112,17 +112,6 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap) struct static_key_deferred apic_hw_disabled __read_mostly; struct static_key_deferred apic_sw_disabled __read_mostly; -static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) -{ - if ((kvm_apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) { - if (val & APIC_SPIV_APIC_ENABLED) - static_key_slow_dec_deferred(&apic_sw_disabled); - else - static_key_slow_inc(&apic_sw_disabled.key); - } - apic_set_reg(apic, APIC_SPIV, val); -} - static inline int apic_enabled(struct kvm_lapic *apic) { return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic); @@ -210,6 +199,20 @@ out: kvm_vcpu_request_scan_ioapic(kvm); } +static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) +{ + u32 prev = kvm_apic_get_reg(apic, APIC_SPIV); + + apic_set_reg(apic, APIC_SPIV, val); + if ((prev ^ val) & APIC_SPIV_APIC_ENABLED) { + if (val & APIC_SPIV_APIC_ENABLED) { + static_key_slow_dec_deferred(&apic_sw_disabled); + recalculate_apic_map(apic->vcpu->kvm); + } else + static_key_slow_inc(&apic_sw_disabled.key); + } +} + static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) { apic_set_reg(apic, APIC_ID, id << 24); @@ -352,25 +355,46 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) { - apic->irr_pending = false; + struct kvm_vcpu *vcpu; + + vcpu = apic->vcpu; + apic_clear_vector(vec, apic->regs + APIC_IRR); - if (apic_search_irr(apic) != -1) - apic->irr_pending = true; + if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + /* try to update RVI */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + else { + vec = apic_search_irr(apic); + apic->irr_pending = (vec != -1); + } } static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { - /* Note that we never get here with APIC virtualization enabled. */ + struct kvm_vcpu *vcpu; + + if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) + return; + + vcpu = apic->vcpu; - if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) - ++apic->isr_count; - BUG_ON(apic->isr_count > MAX_APIC_VECTOR); /* - * ISR (in service register) bit is set when injecting an interrupt. - * The highest vector is injected. Thus the latest bit set matches - * the highest bit in ISR. + * With APIC virtualization enabled, all caching is disabled + * because the processor can modify ISR under the hood. Instead + * just set SVI. */ - apic->highest_isr_cache = vec; + if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec); + else { + ++apic->isr_count; + BUG_ON(apic->isr_count > MAX_APIC_VECTOR); + /* + * ISR (in service register) bit is set when injecting an interrupt. + * The highest vector is injected. Thus the latest bit set matches + * the highest bit in ISR. + */ + apic->highest_isr_cache = vec; + } } static inline int apic_find_highest_isr(struct kvm_lapic *apic) @@ -685,6 +709,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; + trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector); switch (delivery_mode) { case APIC_DM_LOWEST: vcpu->arch.apic_arb_prio++; @@ -706,8 +732,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } - trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector, false); break; case APIC_DM_REMRD: @@ -1331,6 +1355,9 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) return; hrtimer_cancel(&apic->lapic_timer.timer); + /* Inject here so clearing tscdeadline won't override new value */ + if (apic_has_pending_timer(vcpu)) + kvm_inject_apic_timer_irqs(vcpu); apic->lapic_timer.tscdeadline = data; start_apic_timer(apic); } @@ -1451,7 +1478,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; - apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" + apic_debug("%s: vcpu=%p, id=%d, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, vcpu, kvm_apic_id(apic), vcpu->arch.apic_base, apic->base_address); @@ -1618,6 +1645,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) if (atomic_read(&apic->lapic_timer.pending) > 0) { kvm_apic_local_deliver(apic, APIC_LVTT); + if (apic_lvtt_tscdeadline(apic)) + apic->lapic_timer.tscdeadline = 0; atomic_set(&apic->lapic_timer.pending, 0); } } @@ -1627,11 +1656,16 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; - /* Note that we never get here with APIC virtualization enabled. */ - if (vector == -1) return -1; + /* + * We get here even with APIC virtualization enabled, if doing + * nested virtualization and L1 runs with the "acknowledge interrupt + * on exit" mode. Then we cannot inject the interrupt via RVI, + * because the process would deliver it through the IDT. + */ + apic_set_isr(vector, apic); apic_update_ppr(apic); apic_clear_irr(vector, apic); @@ -1895,7 +1929,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; - pr_debug("vcpu %d received sipi with vector # %x\n", + apic_debug("vcpu %d received sipi with vector # %x\n", vcpu->vcpu_id, sipi_vector); kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 931467881da..ac1c4de3a48 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -199,16 +199,20 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); /* - * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number, - * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation - * number. + * the low bit of the generation number is always presumed to be zero. + * This disables mmio caching during memslot updates. The concept is + * similar to a seqcount but instead of retrying the access we just punt + * and ignore the cache. + * + * spte bits 3-11 are used as bits 1-9 of the generation number, + * the bits 52-61 are used as bits 10-19 of the generation number. */ -#define MMIO_SPTE_GEN_LOW_SHIFT 3 +#define MMIO_SPTE_GEN_LOW_SHIFT 2 #define MMIO_SPTE_GEN_HIGH_SHIFT 52 -#define MMIO_GEN_SHIFT 19 -#define MMIO_GEN_LOW_SHIFT 9 -#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1) +#define MMIO_GEN_SHIFT 20 +#define MMIO_GEN_LOW_SHIFT 10 +#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2) #define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) #define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1) @@ -236,12 +240,7 @@ static unsigned int get_mmio_spte_generation(u64 spte) static unsigned int kvm_current_mmio_generation(struct kvm *kvm) { - /* - * Init kvm generation close to MMIO_MAX_GEN to easily test the - * code of handling generation number wrap-around. - */ - return (kvm_memslots(kvm)->generation + - MMIO_MAX_GEN - 150) & MMIO_GEN_MASK; + return kvm_memslots(kvm)->generation & MMIO_GEN_MASK; } static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, @@ -296,11 +295,6 @@ static bool check_mmio_spte(struct kvm *kvm, u64 spte) return likely(kvm_gen == spte_gen); } -static inline u64 rsvd_bits(int s, int e) -{ - return ((1ULL << (e - s + 1)) - 1) << s; -} - void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask) { @@ -1180,7 +1174,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) * Write-protect on the specified @sptep, @pt_protect indicates whether * spte write-protection is caused by protecting shadow page table. * - * Note: write protection is difference between drity logging and spte + * Note: write protection is difference between dirty logging and spte * protection: * - for dirty logging, the spte can be set to writable at anytime if * its dirty bitmap is properly set. @@ -1268,7 +1262,8 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, unsigned long data) + struct kvm_memory_slot *slot, gfn_t gfn, int level, + unsigned long data) { u64 *sptep; struct rmap_iterator iter; @@ -1276,7 +1271,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, while ((sptep = rmap_get_first(*rmapp, &iter))) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep); + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n", + sptep, *sptep, gfn, level); drop_spte(kvm, sptep); need_tlb_flush = 1; @@ -1286,7 +1282,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, } static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, unsigned long data) + struct kvm_memory_slot *slot, gfn_t gfn, int level, + unsigned long data) { u64 *sptep; struct rmap_iterator iter; @@ -1300,7 +1297,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { BUG_ON(!is_shadow_present_pte(*sptep)); - rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep); + rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", + sptep, *sptep, gfn, level); need_flush = 1; @@ -1334,6 +1332,8 @@ static int kvm_handle_hva_range(struct kvm *kvm, int (*handler)(struct kvm *kvm, unsigned long *rmapp, struct kvm_memory_slot *slot, + gfn_t gfn, + int level, unsigned long data)) { int j; @@ -1363,6 +1363,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { unsigned long idx, idx_end; unsigned long *rmapp; + gfn_t gfn = gfn_start; /* * {idx(page_j) | page_j intersects with @@ -1373,8 +1374,10 @@ static int kvm_handle_hva_range(struct kvm *kvm, rmapp = __gfn_to_rmap(gfn_start, j, memslot); - for (; idx <= idx_end; ++idx) - ret |= handler(kvm, rmapp++, memslot, data); + for (; idx <= idx_end; + ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j))) + ret |= handler(kvm, rmapp++, memslot, + gfn, j, data); } } @@ -1385,6 +1388,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, unsigned long data, int (*handler)(struct kvm *kvm, unsigned long *rmapp, struct kvm_memory_slot *slot, + gfn_t gfn, int level, unsigned long data)) { return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); @@ -1406,24 +1410,14 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) } static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, unsigned long data) + struct kvm_memory_slot *slot, gfn_t gfn, int level, + unsigned long data) { u64 *sptep; struct rmap_iterator uninitialized_var(iter); int young = 0; - /* - * In case of absence of EPT Access and Dirty Bits supports, - * emulate the accessed bit for EPT, by checking if this page has - * an EPT mapping, and clearing it if it does. On the next access, - * a new EPT mapping will be established. - * This has some overhead, but not as much as the cost of swapping - * out actively used pages or breaking up actively used hugepages. - */ - if (!shadow_accessed_mask) { - young = kvm_unmap_rmapp(kvm, rmapp, slot, data); - goto out; - } + BUG_ON(!shadow_accessed_mask); for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { @@ -1435,14 +1429,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, (unsigned long *)sptep); } } -out: - /* @data has hva passed to kvm_age_hva(). */ - trace_kvm_age_page(data, slot, young); + trace_kvm_age_page(gfn, level, slot, young); return young; } static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, unsigned long data) + struct kvm_memory_slot *slot, gfn_t gfn, + int level, unsigned long data) { u64 *sptep; struct rmap_iterator iter; @@ -1480,13 +1473,33 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); - kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0); + kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0); kvm_flush_remote_tlbs(vcpu->kvm); } -int kvm_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp); + /* + * In case of absence of EPT Access and Dirty Bits supports, + * emulate the accessed bit for EPT, by checking if this page has + * an EPT mapping, and clearing it if it does. On the next access, + * a new EPT mapping will be established. + * This has some overhead, but not as much as the cost of swapping + * out actively used pages or breaking up actively used hugepages. + */ + if (!shadow_accessed_mask) { + /* + * We are holding the kvm->mmu_lock, and we are blowing up + * shadow PTEs. MMU notifier consumers need to be kept at bay. + * This is correct as long as we don't decouple the mmu_lock + * protected regions (like invalidate_range_start|end does). + */ + kvm->mmu_notifier_seq++; + return kvm_handle_hva_range(kvm, start, end, 0, + kvm_unmap_rmapp); + } + + return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) @@ -1749,7 +1762,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return 1; } - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); return 0; } @@ -1802,7 +1815,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); if (flush) - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } struct mmu_page_path { @@ -2536,7 +2549,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, true, host_writable)) { if (write_fault) *emulate = 1; - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } if (unlikely(is_mmio_spte(*sptep) && emulate)) @@ -3163,7 +3176,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - vcpu_clear_mmio_info(vcpu, ~0ul); + vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -3206,7 +3219,7 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, { if (exception) exception->error_code = 0; - return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); + return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); } static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) @@ -3450,13 +3463,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->nx = false; } -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.tlb_flush; - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); -} -EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb); - void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); @@ -3518,6 +3524,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; u64 gbpages_bit_rsvd = 0; + u64 nonleaf_bit8_rsvd = 0; context->bad_mt_xwr = 0; @@ -3525,6 +3532,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, exb_bit_rsvd = rsvd_bits(63, 63); if (!guest_cpuid_has_gbpages(vcpu)) gbpages_bit_rsvd = rsvd_bits(7, 7); + + /* + * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for + * leaf entries) on AMD CPUs only. + */ + if (guest_cpuid_is_amd(vcpu)) + nonleaf_bit8_rsvd = rsvd_bits(8, 8); + switch (context->root_level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ @@ -3559,9 +3574,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, break; case PT64_ROOT_LEVEL: context->rsvd_bits_mask[0][3] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7); + nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][2] = exb_bit_rsvd | - gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); + nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][0] = exb_bit_rsvd | @@ -3962,7 +3977,7 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, if (remote_flush) kvm_flush_remote_tlbs(vcpu->kvm); else if (local_flush) - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, @@ -4223,7 +4238,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { vcpu->arch.mmu.invlpg(vcpu, gva); - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); @@ -4433,7 +4448,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) * The very rare case: if the generation-number is round, * zap all shadow pages. */ - if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) { + if (unlikely(kvm_current_mmio_generation(kvm) == 0)) { printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } @@ -4534,7 +4549,7 @@ int kvm_mmu_module_init(void) if (!mmu_page_header_cache) goto nomem; - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) + if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) goto nomem; register_shrinker(&mmu_shrinker); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index b982112d2ca..bde8ee72575 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -56,6 +56,11 @@ #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) +static inline u64 rsvd_bits(int s, int e) +{ + return ((1ULL << (e - s + 1)) - 1) << s; +} + int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 1185fe7a7f4..9ade5cfb5a4 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -273,7 +273,7 @@ static int mmu_audit_set(const char *val, const struct kernel_param *kp) int ret; unsigned long enable; - ret = strict_strtoul(val, 10, &enable); + ret = kstrtoul(val, 10, &enable); if (ret < 0) return -EINVAL; diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 9d2e0ffcb19..5aaf3564176 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -22,7 +22,7 @@ __entry->unsync = sp->unsync; #define KVM_MMU_PAGE_PRINTK() ({ \ - const char *ret = p->buffer + p->len; \ + const u32 saved_len = p->len; \ static const char *access_str[] = { \ "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \ }; \ @@ -41,7 +41,7 @@ role.nxe ? "" : "!", \ __entry->root_count, \ __entry->unsync ? "unsync" : "sync", 0); \ - ret; \ + p->buffer + saved_len; \ }) #define kvm_mmu_trace_pferr_flags \ diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 41077652826..806d58e3c32 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -298,8 +298,7 @@ retry_walk: } #endif walker->max_level = walker->level; - ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); + ASSERT(!is_long_mode(vcpu) && is_pae(vcpu)); accessed_dirty = PT_GUEST_ACCESSED_MASK; pt_access = pte_access = ACC_ALL; @@ -321,9 +320,22 @@ retry_walk: walker->pte_gpa[walker->level - 1] = pte_gpa; real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), - PFERR_USER_MASK|PFERR_WRITE_MASK); + PFERR_USER_MASK|PFERR_WRITE_MASK, + &walker->fault); + + /* + * FIXME: This can happen if emulation (for of an INS/OUTS + * instruction) triggers a nested page fault. The exit + * qualification / exit info field will incorrectly have + * "guest page access" as the nested page fault's cause, + * instead of "guest page structure access". To fix this, + * the x86_exception struct should be augmented with enough + * information to fix the exit_qualification or exit_info_1 + * fields. + */ if (unlikely(real_gfn == UNMAPPED_GVA)) - goto error; + return 0; + real_gfn = gpa_to_gfn(real_gfn); host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn, @@ -364,7 +376,7 @@ retry_walk: if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access); + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault); if (real_gpa == UNMAPPED_GVA) return 0; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index cbecaa90399..8e6b7d869d2 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -15,6 +15,7 @@ #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> +#include <asm/perf_event.h> #include "x86.h" #include "cpuid.h" #include "lapic.h" @@ -428,6 +429,15 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } +int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + bool fixed = pmc & (1u << 30); + pmc &= ~(3u << 30); + return (!fixed && pmc >= pmu->nr_arch_gp_counters) || + (fixed && pmc >= pmu->nr_arch_fixed_counters); +} + int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) { struct kvm_pmu *pmu = &vcpu->arch.pmu; @@ -454,7 +464,8 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = &vcpu->arch.pmu; struct kvm_cpuid_entry2 *entry; - unsigned bitmap_len; + union cpuid10_eax eax; + union cpuid10_edx edx; pmu->nr_arch_gp_counters = 0; pmu->nr_arch_fixed_counters = 0; @@ -466,25 +477,27 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry) return; + eax.full = entry->eax; + edx.full = entry->edx; - pmu->version = entry->eax & 0xff; + pmu->version = eax.split.version_id; if (!pmu->version) return; - pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, - INTEL_PMC_MAX_GENERIC); - pmu->counter_bitmask[KVM_PMC_GP] = - ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; - bitmap_len = (entry->eax >> 24) & 0xff; - pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1); + pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, + INTEL_PMC_MAX_GENERIC); + pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; + pmu->available_event_types = ~entry->ebx & + ((1ull << eax.split.mask_length) - 1); if (pmu->version == 1) { pmu->nr_arch_fixed_counters = 0; } else { - pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), + pmu->nr_arch_fixed_counters = + min_t(int, edx.split.num_counters_fixed, INTEL_PMC_MAX_FIXED); pmu->counter_bitmask[KVM_PMC_FIXED] = - ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; + ((u64)1 << edx.split.bit_width_fixed) - 1; } pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b5e994ad013..65510f624df 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -486,14 +486,14 @@ static int is_external_interrupt(u32 info) return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); } -static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u32 ret = 0; if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) - ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; - return ret & mask; + ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; + return ret; } static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) @@ -622,7 +622,7 @@ static int has_svm(void) return 1; } -static void svm_hardware_disable(void *garbage) +static void svm_hardware_disable(void) { /* Make sure we clean up behind us */ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) @@ -633,7 +633,7 @@ static void svm_hardware_disable(void *garbage) amd_pmu_disable_virt(); } -static int svm_hardware_enable(void *garbage) +static int svm_hardware_enable(void) { struct svm_cpu_data *sd; @@ -670,7 +670,7 @@ static int svm_hardware_enable(void *garbage) if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); - __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; + __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT); } @@ -1257,7 +1257,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->asid_generation = 0; init_vmcb(svm); - svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; @@ -1312,8 +1313,8 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); if (static_cpu_has(X86_FEATURE_TSCRATEMSR) && - svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) { - __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio; + svm->tsc_ratio != __this_cpu_read(current_tsc_ratio)) { + __this_cpu_write(current_tsc_ratio, svm->tsc_ratio); wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio); } } @@ -1415,7 +1416,16 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; - var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; + + /* + * AMD CPUs circa 2014 track the G bit for all segments except CS. + * However, the SVM spec states that the G bit is not observed by the + * CPU, and some VMware virtual CPUs drop the G bit for all segments. + * So let's synthesize a legal G bit for all segments, this helps + * running KVM nested. It also helps cross-vendor migration, because + * Intel's vmentry has a check on the 'G' bit. + */ + var->g = s->limit > 0xfffff; /* * AMD's VMCB does not have an explicit unusable field, so emulate it @@ -1424,14 +1434,6 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->unusable = !var->present || (var->type == 0); switch (seg) { - case VCPU_SREG_CS: - /* - * SVM always stores 0 for the 'G' bit in the CS selector in - * the VMCB on a VMEXIT. This hurts cross-vendor migration: - * Intel's VMENTRY has a check on the 'G' bit. - */ - var->g = s->limit > 0xfffff; - break; case VCPU_SREG_TR: /* * Work around a bug where the busy flag in the tr selector @@ -1973,10 +1975,26 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->control.exit_code = SVM_EXIT_NPF; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = fault->error_code; - svm->vmcb->control.exit_info_2 = fault->address; + if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) { + /* + * TODO: track the cause of the nested page fault, and + * correctly fill in the high bits of exit_info_1. + */ + svm->vmcb->control.exit_code = SVM_EXIT_NPF; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = (1ULL << 32); + svm->vmcb->control.exit_info_2 = fault->address; + } + + svm->vmcb->control.exit_info_1 &= ~0xffffffffULL; + svm->vmcb->control.exit_info_1 |= fault->error_code; + + /* + * The present bit is always zero for page structure faults on real + * hardware. + */ + if (svm->vmcb->control.exit_info_1 & (2ULL << 32)) + svm->vmcb->control.exit_info_1 &= ~1; nested_svm_vmexit(svm); } @@ -2116,22 +2134,27 @@ static void nested_svm_unmap(struct page *page) static int nested_svm_intercept_ioio(struct vcpu_svm *svm) { - unsigned port; - u8 val, bit; + unsigned port, size, iopm_len; + u16 val, mask; + u8 start_bit; u64 gpa; if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT))) return NESTED_EXIT_HOST; port = svm->vmcb->control.exit_info_1 >> 16; + size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >> + SVM_IOIO_SIZE_SHIFT; gpa = svm->nested.vmcb_iopm + (port / 8); - bit = port % 8; - val = 0; + start_bit = port % 8; + iopm_len = (start_bit + size > 8) ? 2 : 1; + mask = (0xf >> (4 - size)) << start_bit; + val = 0; - if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1)) - val &= (1 << bit); + if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len)) + return NESTED_EXIT_DONE; - return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; + return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) @@ -3025,7 +3048,7 @@ static int cr8_write_interception(struct vcpu_svm *svm) return 0; } -u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) +static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); return vmcb->control.tsc_offset + @@ -4205,7 +4228,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, if (info->intercept == x86_intercept_cr_write) icpt_info.exit_code += info->modrm_reg; - if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0) + if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 || + info->intercept == x86_intercept_clts) break; intercept = svm->nested.intercept; @@ -4250,14 +4274,14 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, u64 exit_info; u32 bytes; - exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16; - if (info->intercept == x86_intercept_in || info->intercept == x86_intercept_ins) { - exit_info |= SVM_IOIO_TYPE_MASK; - bytes = info->src_bytes; - } else { + exit_info = ((info->src_val & 0xffff) << 16) | + SVM_IOIO_TYPE_MASK; bytes = info->dst_bytes; + } else { + exit_info = (info->dst_val & 0xffff) << 16; + bytes = info->src_bytes; } if (info->intercept == x86_intercept_outs || @@ -4298,6 +4322,10 @@ static void svm_handle_external_intr(struct kvm_vcpu *vcpu) local_irq_enable(); } +static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) +{ +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -4342,7 +4370,6 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, - .fpu_activate = svm_fpu_activate, .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, @@ -4399,6 +4426,8 @@ static struct kvm_x86_ops svm_x86_ops = { .check_intercept = svm_check_intercept, .handle_external_intr = svm_handle_external_intr, + + .sched_in = svm_sched_in, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 33574c95220..6b06ab8748d 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -415,15 +415,14 @@ TRACE_EVENT(kvm_apic_ipi, ); TRACE_EVENT(kvm_apic_accept_irq, - TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced), - TP_ARGS(apicid, dm, tm, vec, coalesced), + TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec), + TP_ARGS(apicid, dm, tm, vec), TP_STRUCT__entry( __field( __u32, apicid ) __field( __u16, dm ) __field( __u8, tm ) __field( __u8, vec ) - __field( bool, coalesced ) ), TP_fast_assign( @@ -431,14 +430,12 @@ TRACE_EVENT(kvm_apic_accept_irq, __entry->dm = dm; __entry->tm = tm; __entry->vec = vec; - __entry->coalesced = coalesced; ), - TP_printk("apicid %x vec %u (%s|%s)%s", + TP_printk("apicid %x vec %u (%s|%s)", __entry->apicid, __entry->vec, __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode), - __entry->tm ? "level" : "edge", - __entry->coalesced ? " (coalesced)" : "") + __entry->tm ? "level" : "edge") ); TRACE_EVENT(kvm_eoi, @@ -721,10 +718,10 @@ TRACE_EVENT(kvm_emulate_insn, ), TP_fast_assign( - __entry->rip = vcpu->arch.emulate_ctxt.fetch.start; __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); - __entry->len = vcpu->arch.emulate_ctxt._eip - - vcpu->arch.emulate_ctxt.fetch.start; + __entry->len = vcpu->arch.emulate_ctxt.fetch.ptr + - vcpu->arch.emulate_ctxt.fetch.data; + __entry->rip = vcpu->arch.emulate_ctxt._eip - __entry->len; memcpy(__entry->insn, vcpu->arch.emulate_ctxt.fetch.data, 15); @@ -850,6 +847,36 @@ TRACE_EVENT(kvm_track_tsc, #endif /* CONFIG_X86_64 */ +TRACE_EVENT(kvm_ple_window, + TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), + TP_ARGS(grow, vcpu_id, new, old), + + TP_STRUCT__entry( + __field( bool, grow ) + __field( unsigned int, vcpu_id ) + __field( int, new ) + __field( int, old ) + ), + + TP_fast_assign( + __entry->grow = grow; + __entry->vcpu_id = vcpu_id; + __entry->new = new; + __entry->old = old; + ), + + TP_printk("vcpu %u: ple_window %d (%s %d)", + __entry->vcpu_id, + __entry->new, + __entry->grow ? "grow" : "shrink", + __entry->old) +); + +#define trace_kvm_ple_window_grow(vcpu_id, new, old) \ + trace_kvm_ple_window(true, vcpu_id, new, old) +#define trace_kvm_ple_window_shrink(vcpu_id, new, old) \ + trace_kvm_ple_window(false, vcpu_id, new, old) + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 801332edefc..0acac81f198 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -125,14 +125,32 @@ module_param(nested, bool, S_IRUGO); * Time is measured based on a counter that runs at the same rate as the TSC, * refer SDM volume 3b section 21.6.13 & 22.1.3. */ -#define KVM_VMX_DEFAULT_PLE_GAP 128 -#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 +#define KVM_VMX_DEFAULT_PLE_GAP 128 +#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 +#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2 +#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0 +#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \ + INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW + static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; module_param(ple_gap, int, S_IRUGO); static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, int, S_IRUGO); +/* Default doubles per-vcpu window every exit. */ +static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW; +module_param(ple_window_grow, int, S_IRUGO); + +/* Default resets per-vcpu window every exit to ple_window. */ +static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK; +module_param(ple_window_shrink, int, S_IRUGO); + +/* Default is to compute the maximum so we can never overflow. */ +static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; +static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; +module_param(ple_window_max, int, S_IRUGO); + extern const ulong vmx_return; #define NR_AUTOLOAD_MSRS 8 @@ -379,10 +397,14 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + struct page *virtual_apic_page; u64 msr_ia32_feature_control; struct hrtimer preemption_timer; bool preemption_timer_expired; + + /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ + u64 vmcs01_debugctl; }; #define POSTED_INTR_ON 0 @@ -450,6 +472,7 @@ struct vcpu_vmx { int gs_ldt_reload_needed; int fs_reload_needed; u64 msr_host_bndcfgs; + unsigned long vmcs_host_cr4; /* May not match real cr4 */ } host_state; struct { int vm86_active; @@ -481,6 +504,10 @@ struct vcpu_vmx { /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; + + /* Dynamic PLE window. */ + int ple_window; + bool ple_window_dirty; }; enum segment_cache_field { @@ -530,6 +557,7 @@ static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); static unsigned long shadow_read_write_fields[] = { + TPR_THRESHOLD, GUEST_RIP, GUEST_RSP, GUEST_CR0, @@ -740,7 +768,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); -static bool vmx_mpx_supported(void); +static int alloc_identity_pagetable(struct kvm *kvm); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -820,7 +848,6 @@ static const u32 vmx_msr_index[] = { #endif MSR_EFER, MSR_TSC_AUX, MSR_STAR, }; -#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) static inline bool is_page_fault(u32 intr_info) { @@ -1600,7 +1627,7 @@ static void reload_tss(void) /* * VT restores TR but not its size. Useless. */ - struct desc_ptr *gdt = &__get_cpu_var(host_gdt); + struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); struct desc_struct *descs; descs = (void *)gdt->address; @@ -1646,7 +1673,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) static unsigned long segment_base(u16 selector) { - struct desc_ptr *gdt = &__get_cpu_var(host_gdt); + struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); struct desc_struct *d; unsigned long table_base; unsigned long v; @@ -1776,7 +1803,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) */ if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded) stts(); - load_gdt(&__get_cpu_var(host_gdt)); + load_gdt(this_cpu_ptr(&host_gdt)); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -1806,7 +1833,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (vmx->loaded_vmcs->cpu != cpu) { - struct desc_ptr *gdt = &__get_cpu_var(host_gdt); + struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); unsigned long sysenter_esp; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); @@ -1940,7 +1967,7 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); } -static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); int ret = 0; @@ -1950,7 +1977,7 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) if (interruptibility & GUEST_INTR_STATE_MOV_SS) ret |= KVM_X86_SHADOW_INT_MOV_SS; - return ret & mask; + return ret; } static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) @@ -2134,7 +2161,7 @@ static u64 guest_read_tsc(void) * Like guest_read_tsc, but always returns L1's notion of the timestamp * counter, even if a nested guest (L2) is currently running. */ -u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) +static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { u64 tsc_offset; @@ -2239,10 +2266,13 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) * or other means. */ static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; +static u32 nested_vmx_true_procbased_ctls_low; static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; +static u32 nested_vmx_true_exit_ctls_low; static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; +static u32 nested_vmx_true_entry_ctls_low; static u32 nested_vmx_misc_low, nested_vmx_misc_high; static u32 nested_vmx_ept_caps; static __init void nested_vmx_setup_ctls_msrs(void) @@ -2265,21 +2295,13 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* pin-based controls */ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); - /* - * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is - * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR. - */ nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - /* - * Exit controls - * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and - * 17 must be 1. - */ + /* exit controls */ rdmsr(MSR_IA32_VMX_EXIT_CTLS, nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; @@ -2296,10 +2318,13 @@ static __init void nested_vmx_setup_ctls_msrs(void) if (vmx_mpx_supported()) nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + /* We support free control of debug control saving. */ + nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low & + ~VM_EXIT_SAVE_DEBUG_CONTROLS; + /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); - /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_entry_ctls_high &= #ifdef CONFIG_X86_64 @@ -2311,10 +2336,14 @@ static __init void nested_vmx_setup_ctls_msrs(void) if (vmx_mpx_supported()) nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + /* We support free control of debug control loading. */ + nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low & + ~VM_ENTRY_LOAD_DEBUG_CONTROLS; + /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); - nested_vmx_procbased_ctls_low = 0; + nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_procbased_ctls_high &= CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | @@ -2327,7 +2356,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | - CPU_BASED_PAUSE_EXITING | + CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the @@ -2335,7 +2364,12 @@ static __init void nested_vmx_setup_ctls_msrs(void) * can use it to avoid exits to L1 - even when L0 runs L2 * without MSR bitmaps. */ - nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS; + nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + CPU_BASED_USE_MSR_BITMAPS; + + /* We support free control of CR3 access interception. */ + nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low & + ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); /* secondary cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, @@ -2394,7 +2428,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) * guest, and the VMCS structure we give it - not about the * VMX support of the underlying hardware. */ - *pdata = VMCS12_REVISION | + *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); break; @@ -2404,16 +2438,25 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) nested_vmx_pinbased_ctls_high); break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low, + nested_vmx_procbased_ctls_high); + break; case MSR_IA32_VMX_PROCBASED_CTLS: *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: + *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low, + nested_vmx_exit_ctls_high); + break; case MSR_IA32_VMX_EXIT_CTLS: *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low, + nested_vmx_entry_ctls_high); + break; case MSR_IA32_VMX_ENTRY_CTLS: *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); @@ -2442,7 +2485,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) *pdata = -1ULL; break; case MSR_IA32_VMX_VMCS_ENUM: - *pdata = 0x1f; + *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ break; case MSR_IA32_VMX_PROCBASED_CTLS2: *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, @@ -2584,6 +2627,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + return 1; vmcs_write64(GUEST_IA32_PAT, data); vcpu->arch.pat = data; break; @@ -2687,7 +2732,7 @@ static void kvm_cpu_vmxon(u64 addr) : "memory", "cc"); } -static int hardware_enable(void *garbage) +static int hardware_enable(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); @@ -2727,7 +2772,7 @@ static int hardware_enable(void *garbage) ept_sync_global(); } - native_store_gdt(&__get_cpu_var(host_gdt)); + native_store_gdt(this_cpu_ptr(&host_gdt)); return 0; } @@ -2751,7 +2796,7 @@ static void kvm_cpu_vmxoff(void) asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); } -static void hardware_disable(void *garbage) +static void hardware_disable(void) { if (vmm_exclusive) { vmclear_local_loaded_vmcss(); @@ -3090,9 +3135,17 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_unrestricted_guest()) enable_unrestricted_guest = 0; - if (!cpu_has_vmx_flexpriority()) + if (!cpu_has_vmx_flexpriority()) { flexpriority_enabled = 0; + /* + * set_apic_access_page_addr() is used to reload apic access + * page upon invalidation. No need to do anything if the + * processor does not have the APIC_ACCESS_ADDR VMCS field. + */ + kvm_x86_ops->set_apic_access_page_addr = NULL; + } + if (!cpu_has_vmx_tpr_shadow()) kvm_x86_ops->update_cr8_intercept = NULL; @@ -3653,7 +3706,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); out: - vmx->emulation_required |= emulation_required(vcpu); + vmx->emulation_required = emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -3888,7 +3941,7 @@ static int init_rmode_tss(struct kvm *kvm) { gfn_t fn; u16 data = 0; - int r, idx, ret = 0; + int idx, r; idx = srcu_read_lock(&kvm->srcu); fn = kvm->arch.tss_addr >> PAGE_SHIFT; @@ -3910,32 +3963,32 @@ static int init_rmode_tss(struct kvm *kvm) r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, sizeof(u8)); - if (r < 0) - goto out; - - ret = 1; out: srcu_read_unlock(&kvm->srcu, idx); - return ret; + return r; } static int init_rmode_identity_map(struct kvm *kvm) { - int i, idx, r, ret; + int i, idx, r = 0; pfn_t identity_map_pfn; u32 tmp; if (!enable_ept) - return 1; - if (unlikely(!kvm->arch.ept_identity_pagetable)) { - printk(KERN_ERR "EPT: identity-mapping pagetable " - "haven't been allocated!\n"); return 0; - } + + /* Protect kvm->arch.ept_identity_pagetable_done. */ + mutex_lock(&kvm->slots_lock); + if (likely(kvm->arch.ept_identity_pagetable_done)) - return 1; - ret = 0; + goto out2; + identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; + + r = alloc_identity_pagetable(kvm); + if (r < 0) + goto out2; + idx = srcu_read_lock(&kvm->srcu); r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) @@ -3950,10 +4003,13 @@ static int init_rmode_identity_map(struct kvm *kvm) goto out; } kvm->arch.ept_identity_pagetable_done = true; - ret = 1; + out: srcu_read_unlock(&kvm->srcu, idx); - return ret; + +out2: + mutex_unlock(&kvm->slots_lock); + return r; } static void seg_setup(int seg) @@ -3978,23 +4034,28 @@ static int alloc_apic_access_page(struct kvm *kvm) int r = 0; mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_page) + if (kvm->arch.apic_access_page_done) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; + kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); if (r) goto out; - page = gfn_to_page(kvm, 0xfee00); + page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (is_error_page(page)) { r = -EFAULT; goto out; } - kvm->arch.apic_access_page = page; + /* + * Do not pin the page in memory, so that memory hot-unplug + * is able to migrate it. + */ + put_page(page); + kvm->arch.apic_access_page_done = true; out: mutex_unlock(&kvm->slots_lock); return r; @@ -4002,31 +4063,20 @@ out: static int alloc_identity_pagetable(struct kvm *kvm) { - struct page *page; + /* Called with kvm->slots_lock held. */ + struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - mutex_lock(&kvm->slots_lock); - if (kvm->arch.ept_identity_pagetable) - goto out; + BUG_ON(kvm->arch.ept_identity_pagetable_done); + kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; kvm_userspace_mem.guest_phys_addr = kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); - if (r) - goto out; - - page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); - if (is_error_page(page)) { - r = -EFAULT; - goto out; - } - kvm->arch.ept_identity_pagetable = page; -out: - mutex_unlock(&kvm->slots_lock); return r; } @@ -4218,11 +4268,16 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) u32 low32, high32; unsigned long tmpl; struct desc_ptr dt; + unsigned long cr4; vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ - vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + /* Save the most likely value for this task's CR4 in the VMCS. */ + cr4 = read_cr4(); + vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ + vmx->host_state.vmcs_host_cr4 = cr4; + vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 /* @@ -4385,7 +4440,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) if (ple_gap) { vmcs_write32(PLE_GAP, ple_gap); - vmcs_write32(PLE_WINDOW, ple_window); + vmx->ple_window = ple_window; + vmx->ple_window_dirty = true; } vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); @@ -4422,7 +4478,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx->vcpu.arch.pat = host_pat; } - for (i = 0; i < NR_VMX_MSR; ++i) { + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; int j = vmx->nmsrs; @@ -4460,7 +4516,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); - apic_base_msr.data = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&vmx->vcpu)) apic_base_msr.data |= MSR_IA32_APICBASE_BSP; apic_base_msr.host_initiated = true; @@ -4520,9 +4576,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(TPR_THRESHOLD, 0); } - if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + kvm_vcpu_reload_apic_access_page(vcpu); if (vmx_vm_has_apicv(vcpu->kvm)) memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); @@ -4712,10 +4766,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) if (ret) return ret; kvm->arch.tss_addr = addr; - if (!init_rmode_tss(kvm)) - return -ENOMEM; - - return 0; + return init_rmode_tss(kvm); } static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) @@ -4873,7 +4924,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= dr6; + vcpu->arch.dr6 |= dr6 | DR6_RTM; if (!(dr6 & ~DR6_RESERVED)) /* icebp */ skip_emulated_instruction(vcpu); @@ -5039,7 +5090,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - val = kvm_register_read(vcpu, reg); + val = kvm_register_readl(vcpu, reg); trace_kvm_cr_write(cr, val); switch (cr) { case 0: @@ -5056,7 +5107,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) return 1; case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); - u8 cr8 = kvm_register_read(vcpu, reg); + u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); kvm_complete_insn_gp(vcpu, err); if (irqchip_in_kernel(vcpu->kvm)) @@ -5132,7 +5183,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) return 0; } else { vcpu->arch.dr7 &= ~DR7_GD; - vcpu->arch.dr6 |= DR6_BD; + vcpu->arch.dr6 |= DR6_BD | DR6_RTM; vmcs_writel(GUEST_DR7, vcpu->arch.dr7); kvm_queue_exception(vcpu, DB_VECTOR); return 1; @@ -5165,7 +5216,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) return 1; kvm_register_write(vcpu, reg, val); } else - if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg))) + if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) return 1; skip_emulated_instruction(vcpu); @@ -5504,17 +5555,18 @@ static u64 ept_rsvd_mask(u64 spte, int level) for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) mask |= (1ULL << i); - if (level > 2) + if (level == 4) /* bits 7:3 reserved */ mask |= 0xf8; - else if (level == 2) { - if (spte & (1ULL << 7)) - /* 2MB ref, bits 20:12 reserved */ - mask |= 0x1ff000; - else - /* bits 6:3 reserved */ - mask |= 0x78; - } + else if (spte & (1ULL << 7)) + /* + * 1GB/2MB page, bits 29:12 or 20:12 reserved respectively, + * level == 1 if the hypervisor is using the ignored bit 7. + */ + mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE; + else if (level > 1) + /* bits 6:3 reserved */ + mask |= 0x78; return mask; } @@ -5544,7 +5596,8 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, WARN_ON(1); } - if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) { + /* bits 5:3 are _not_ reserved for large page or leaf page */ + if ((rsvd_bits & 0x38) == 0) { u64 ept_mem_type = (spte & 0x38) >> 3; if (ept_mem_type == 2 || ept_mem_type == 3 || @@ -5621,7 +5674,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; - while (!guest_state_valid(vcpu) && count-- != 0) { + while (vmx->emulation_required && count-- != 0) { if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); @@ -5655,17 +5708,89 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) schedule(); } - vmx->emulation_required = emulation_required(vcpu); out: return ret; } +static int __grow_ple_window(int val) +{ + if (ple_window_grow < 1) + return ple_window; + + val = min(val, ple_window_actual_max); + + if (ple_window_grow < ple_window) + val *= ple_window_grow; + else + val += ple_window_grow; + + return val; +} + +static int __shrink_ple_window(int val, int modifier, int minimum) +{ + if (modifier < 1) + return ple_window; + + if (modifier < ple_window) + val /= modifier; + else + val -= modifier; + + return max(val, minimum); +} + +static void grow_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int old = vmx->ple_window; + + vmx->ple_window = __grow_ple_window(old); + + if (vmx->ple_window != old) + vmx->ple_window_dirty = true; + + trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); +} + +static void shrink_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int old = vmx->ple_window; + + vmx->ple_window = __shrink_ple_window(old, + ple_window_shrink, ple_window); + + if (vmx->ple_window != old) + vmx->ple_window_dirty = true; + + trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); +} + +/* + * ple_window_actual_max is computed to be one grow_ple_window() below + * ple_window_max. (See __grow_ple_window for the reason.) + * This prevents overflows, because ple_window_max is int. + * ple_window_max effectively rounded down to a multiple of ple_window_grow in + * this process. + * ple_window_max is also prevented from setting vmx->ple_window < ple_window. + */ +static void update_ple_window_actual_max(void) +{ + ple_window_actual_max = + __shrink_ple_window(max(ple_window_max, ple_window), + ple_window_grow, INT_MIN); +} + /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. */ static int handle_pause(struct kvm_vcpu *vcpu) { + if (ple_gap) + grow_ple_window(vcpu); + skip_emulated_instruction(vcpu); kvm_vcpu_on_spin(vcpu); @@ -5754,22 +5879,27 @@ static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) /* * Free all VMCSs saved for this vcpu, except the one pointed by - * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one - * currently used, if running L2), and vmcs01 when running L2. + * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs + * must be &vmx->vmcs01. */ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) { struct vmcs02_list *item, *n; + + WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { - if (vmx->loaded_vmcs != &item->vmcs02) - free_loaded_vmcs(&item->vmcs02); + /* + * Something will leak if the above WARN triggers. Better than + * a use-after-free. + */ + if (vmx->loaded_vmcs == &item->vmcs02) + continue; + + free_loaded_vmcs(&item->vmcs02); list_del(&item->list); kfree(item); + vmx->nested.vmcs02_num--; } - vmx->nested.vmcs02_num = 0; - - if (vmx->loaded_vmcs != &vmx->vmcs01) - free_loaded_vmcs(&vmx->vmcs01); } /* @@ -5918,7 +6048,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, * which replaces physical address width with 32 * */ - if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failInvalid(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -5936,7 +6066,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, vmx->nested.vmxon_ptr = vmptr; break; case EXIT_REASON_VMCLEAR: - if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); skip_emulated_instruction(vcpu); @@ -5951,7 +6081,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, } break; case EXIT_REASON_VMPTRLD: - if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); skip_emulated_instruction(vcpu); @@ -6086,20 +6216,27 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { u32 exec_control; + if (vmx->nested.current_vmptr == -1ull) + return; + + /* current_vmptr and current_vmcs12 are always set/reset together */ + if (WARN_ON(vmx->nested.current_vmcs12 == NULL)) + return; + if (enable_shadow_vmcs) { - if (vmx->nested.current_vmcs12 != NULL) { - /* copy to memory all shadowed fields in case - they were modified */ - copy_shadow_to_vmcs12(vmx); - vmx->nested.sync_shadow_vmcs = false; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); - vmcs_write64(VMCS_LINK_POINTER, -1ull); - } + /* copy to memory all shadowed fields in case + they were modified */ + copy_shadow_to_vmcs12(vmx); + vmx->nested.sync_shadow_vmcs = false; + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_write64(VMCS_LINK_POINTER, -1ull); } kunmap(vmx->nested.current_vmcs12_page); nested_release_page(vmx->nested.current_vmcs12_page); + vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmcs12 = NULL; } /* @@ -6110,18 +6247,19 @@ static void free_nested(struct vcpu_vmx *vmx) { if (!vmx->nested.vmxon) return; + vmx->nested.vmxon = false; - if (vmx->nested.current_vmptr != -1ull) { - nested_release_vmcs12(vmx); - vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; - } + nested_release_vmcs12(vmx); if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); /* Unpin physical memory we referred to in current vmcs02 */ if (vmx->nested.apic_access_page) { nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = 0; + vmx->nested.apic_access_page = NULL; + } + if (vmx->nested.virtual_apic_page) { + nested_release_page(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; } nested_free_all_saved_vmcss(vmx); @@ -6152,11 +6290,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) return 1; - if (vmptr == vmx->nested.current_vmptr) { + if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vmx); - vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; - } page = nested_get_page(vcpu, vmptr); if (page == NULL) { @@ -6384,7 +6519,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return 1; /* Decode instruction info and find the field to read */ - field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ if (!vmcs12_read_any(vcpu, field, &field_value)) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); @@ -6397,7 +6532,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) * on the guest's mode (32 or 64 bit), not on the given field's length. */ if (vmx_instruction_info & (1u << 10)) { - kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf), + kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), field_value); } else { if (get_vmx_mem_address(vcpu, exit_qualification, @@ -6434,21 +6569,21 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; if (vmx_instruction_info & (1u << 10)) - field_value = kvm_register_read(vcpu, + field_value = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 3) & 0xf)); else { if (get_vmx_mem_address(vcpu, exit_qualification, vmx_instruction_info, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, - &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) { + &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } } - field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); if (vmcs_field_readonly(field)) { nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); @@ -6498,9 +6633,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } - if (vmx->nested.current_vmptr != -1ull) - nested_release_vmcs12(vmx); + nested_release_vmcs12(vmx); vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; @@ -6571,7 +6705,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) } vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; @@ -6596,7 +6730,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) switch (type) { case VMX_EPT_EXTENT_GLOBAL: kvm_mmu_sync_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); nested_vmx_succeed(vcpu); break; default: @@ -6751,7 +6885,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); int cr = exit_qualification & 15; int reg = (exit_qualification >> 8) & 15; - unsigned long val = kvm_register_read(vcpu, reg); + unsigned long val = kvm_register_readl(vcpu, reg); switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ @@ -6871,6 +7005,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_TASK_SWITCH: return 1; case EXIT_REASON_CPUID: + if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) + return 0; return 1; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); @@ -6915,7 +7051,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_MCE_DURING_VMENTRY: return 0; case EXIT_REASON_TPR_BELOW_THRESHOLD: - return 1; + return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); case EXIT_REASON_APIC_ACCESS: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); @@ -7036,6 +7172,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (is_guest_mode(vcpu) && + nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return; + if (irr == -1 || tpr < irr) { vmcs_write32(TPR_THRESHOLD, 0); return; @@ -7073,6 +7215,29 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) vmx_set_msr_bitmap(vcpu); } +static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Currently we do not handle the nested case where L2 has an + * APIC access page of its own; that page is still pinned. + * Hence, we skip the case where the VCPU is in guest mode _and_ + * L1 prepared an APIC access page for L2. + * + * For the case where L1 and L2 share the same APIC access page + * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear + * in the vmcs12), this function will only update either the vmcs01 + * or the vmcs02. If the former, the vmcs02 will be updated by + * prepare_vmcs02. If the latter, the vmcs01 will be updated in + * the next L2->L1 exit. + */ + if (!is_guest_mode(vcpu) || + !nested_cpu_has2(vmx->nested.current_vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + vmcs_write64(APIC_ACCESS_ADDR, hpa); +} + static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) { u16 status; @@ -7112,7 +7277,26 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) if (max_irr == -1) return; - vmx_set_rvi(max_irr); + /* + * If a vmexit is needed, vmx_check_nested_events handles it. + */ + if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + return; + + if (!is_guest_mode(vcpu)) { + vmx_set_rvi(max_irr); + return; + } + + /* + * Fall back to pre-APICv interrupt injection since L2 + * is run without virtual interrupt delivery. + */ + if (!kvm_event_needs_reinjection(vcpu) && + vmx_interrupt_allowed(vcpu)) { + kvm_queue_interrupt(vcpu, max_irr, false); + vmx_inject_irq(vcpu); + } } static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) @@ -7336,7 +7520,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long debugctlmsr; + unsigned long debugctlmsr, cr4; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) @@ -7347,6 +7531,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vmx->emulation_required) return; + if (vmx->ple_window_dirty) { + vmx->ple_window_dirty = false; + vmcs_write32(PLE_WINDOW, vmx->ple_window); + } + if (vmx->nested.sync_shadow_vmcs) { copy_vmcs12_to_shadow(vmx); vmx->nested.sync_shadow_vmcs = false; @@ -7357,6 +7546,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + cr4 = read_cr4(); + if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { + vmcs_writel(HOST_CR4, cr4); + vmx->host_state.vmcs_host_cr4 = cr4; + } + /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug @@ -7520,13 +7715,31 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_complete_interrupts(vmx); } +static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + + if (vmx->loaded_vmcs == &vmx->vmcs01) + return; + + cpu = get_cpu(); + vmx->loaded_vmcs = &vmx->vmcs01; + vmx_vcpu_put(vcpu); + vmx_vcpu_load(vcpu, cpu); + vcpu->cpu = cpu; + put_cpu(); +} + static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); free_vpid(vmx); - free_loaded_vmcs(vmx->loaded_vmcs); + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); free_nested(vmx); + free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); @@ -7548,6 +7761,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vcpu; vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) + > PAGE_SIZE); + err = -ENOMEM; if (!vmx->guest_msrs) { goto uninit_vcpu; @@ -7581,10 +7797,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!kvm->arch.ept_identity_map_addr) kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; - err = -ENOMEM; - if (alloc_identity_pagetable(kvm) != 0) - goto free_vmcs; - if (!init_rmode_identity_map(kvm)) + err = init_rmode_identity_map(kvm); + if (err) goto free_vmcs; } @@ -7763,6 +7977,55 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, kvm_inject_page_fault(vcpu, fault); } +static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + /* TODO: Also verify bits beyond physical address width are 0 */ + if (!PAGE_ALIGNED(vmcs12->apic_access_addr)) + return false; + + /* + * Translate L1 physical address to host physical + * address for vmcs02. Keep the page pinned, so this + * physical address remains valid. We keep a reference + * to it so we can release it later. + */ + if (vmx->nested.apic_access_page) /* shouldn't happen */ + nested_release_page(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = + nested_get_page(vcpu, vmcs12->apic_access_addr); + } + + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { + /* TODO: Also verify bits beyond physical address width are 0 */ + if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr)) + return false; + + if (vmx->nested.virtual_apic_page) /* shouldn't happen */ + nested_release_page(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = + nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); + + /* + * Failing the vm entry is _not_ what the processor does + * but it's basically the only possibility we have. + * We could still enter the guest if CR8 load exits are + * enabled, CR8 store exits are enabled, and virtualize APIC + * access is disabled; in this case the processor would never + * use the TPR shadow and we could simply clear the bit from + * the execution control. But such a configuration is useless, + * so let's keep the code simple. + */ + if (!vmx->nested.virtual_apic_page) + return false; + } + + return true; +} + static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) { u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; @@ -7788,7 +8051,7 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it - * with L0's requirements for its guest (a.k.a. vmsc01), so we can run the L2 + * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 * guest in a way that will both be appropriate to L1's requests, and our * needs. In addition to modifying the active vmcs (which is vmcs02), this * function also has additional necessary side-effects, like setting various @@ -7836,7 +8099,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + } else { + kvm_set_dr(vcpu, 7, vcpu->arch.dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); + } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, vmcs12->vm_entry_intr_info_field); vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, @@ -7846,7 +8115,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, vmcs12->guest_interruptibility_info); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); - kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); vmx_set_rflags(vcpu, vmcs12->guest_rflags); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vmcs12->guest_pending_dbg_exceptions); @@ -7904,16 +8172,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { /* - * Translate L1 physical address to host physical - * address for vmcs02. Keep the page pinned, so this - * physical address remains valid. We keep a reference - * to it so we can release it later. - */ - if (vmx->nested.apic_access_page) /* shouldn't happen */ - nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = - nested_get_page(vcpu, vmcs12->apic_access_addr); - /* * If translation failed, no matter: This feature asks * to exit when accessing the given address, and if it * can never be accessed, this feature won't do @@ -7928,8 +8186,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vcpu->kvm->arch.apic_access_page)); + kvm_vcpu_reload_apic_access_page(vcpu); } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); @@ -7958,6 +8215,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; + + if (exec_control & CPU_BASED_TPR_SHADOW) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + page_to_phys(vmx->nested.virtual_apic_page)); + vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } + /* * Merging of IO and MSR bitmaps not currently supported. * Rather, exit every time. @@ -8113,14 +8377,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) } if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && - !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) { + !PAGE_ALIGNED(vmcs12->msr_bitmap)) { /*TODO: Also verify bits beyond physical address width are 0*/ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && - !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) { + if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { /*TODO: Also verify bits beyond physical address width are 0*/ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; @@ -8136,15 +8399,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) } if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) || + nested_vmx_true_procbased_ctls_low, + nested_vmx_procbased_ctls_high) || !vmx_control_verify(vmcs12->secondary_vm_exec_control, nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || !vmx_control_verify(vmcs12->pin_based_vm_exec_control, nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || !vmx_control_verify(vmcs12->vm_exit_controls, - nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) || + nested_vmx_true_exit_ctls_low, + nested_vmx_exit_ctls_high) || !vmx_control_verify(vmcs12->vm_entry_controls, - nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high)) + nested_vmx_true_entry_ctls_low, + nested_vmx_entry_ctls_high)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; @@ -8221,6 +8487,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); + if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + cpu = get_cpu(); vmx->loaded_vmcs = vmcs02; vmx_vcpu_put(vcpu); @@ -8398,7 +8667,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); - kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); @@ -8477,9 +8745,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { + kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); + vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + } + /* TODO: These cannot have changed unless we have MSR bitmaps and * the relevant bit asks not to trap the change */ - vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) @@ -8670,7 +8942,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, unsigned long exit_qualification) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int cpu; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); /* trying to cancel vmlaunch/vmresume is a bug */ @@ -8680,6 +8951,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + vmx_load_vmcs01(vcpu); + if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) && nested_exit_intr_ack_set(vcpu)) { int irq = kvm_cpu_get_interrupt(vcpu); @@ -8695,13 +8968,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_intr_error_code, KVM_ISA_VMX); - cpu = get_cpu(); - vmx->loaded_vmcs = &vmx->vmcs01; - vmx_vcpu_put(vcpu); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - put_cpu(); - vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); vmx_segment_cache_clear(vmx); @@ -8721,10 +8987,20 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, /* Unpin physical memory we referred to in vmcs02 */ if (vmx->nested.apic_access_page) { nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = 0; + vmx->nested.apic_access_page = NULL; + } + if (vmx->nested.virtual_apic_page) { + nested_release_page(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; } /* + * We are now running in L2, mmu_notifier will force to reload the + * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. + */ + kvm_vcpu_reload_apic_access_page(vcpu); + + /* * Exiting from L2 to L1, we're now back to L1 which thinks it just * finished a VMLAUNCH or VMRESUME instruction, so we need to set the * success or failure flag accordingly. @@ -8777,6 +9053,12 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, return X86EMUL_CONTINUE; } +static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) +{ + if (ple_gap) + shrink_ple_window(vcpu); +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -8821,7 +9103,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, - .fpu_activate = vmx_fpu_activate, .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, @@ -8844,6 +9125,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, + .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .vm_has_apicv = vmx_vm_has_apicv, .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, @@ -8882,6 +9164,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .mpx_supported = vmx_mpx_supported, .check_nested_events = vmx_check_nested_events, + + .sched_in = vmx_sched_in, }; static int __init vmx_init(void) @@ -8890,7 +9174,7 @@ static int __init vmx_init(void) rdmsrl_safe(MSR_EFER, &host_efer); - for (i = 0; i < NR_VMX_MSR; ++i) + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) kvm_define_shared_msr(i, vmx_msr_index[i]); vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); @@ -8996,6 +9280,8 @@ static int __init vmx_init(void) } else kvm_disable_tdp(); + update_ple_window_actual_max(); + return 0; out7: @@ -9029,7 +9315,7 @@ static void __exit vmx_exit(void) free_page((unsigned long)vmx_vmread_bitmap); #ifdef CONFIG_KEXEC - rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL); + RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ef432f891d3..34c8f94331f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -87,6 +87,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); +static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -211,6 +212,7 @@ static void shared_msr_update(unsigned slot, u32 msr) void kvm_define_shared_msr(unsigned slot, u32 msr) { + BUG_ON(slot >= KVM_NR_SHARED_MSRS); if (slot >= shared_msrs_global.nr) shared_msrs_global.nr = slot + 1; shared_msrs_global.msrs[slot] = msr; @@ -244,7 +246,7 @@ void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) } EXPORT_SYMBOL_GPL(kvm_set_shared_msr); -static void drop_user_return_notifiers(void *ignore) +static void drop_user_return_notifiers(void) { unsigned int cpu = smp_processor_id(); struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); @@ -310,6 +312,31 @@ static int exception_class(int vector) return EXCPT_BENIGN; } +#define EXCPT_FAULT 0 +#define EXCPT_TRAP 1 +#define EXCPT_ABORT 2 +#define EXCPT_INTERRUPT 3 + +static int exception_type(int vector) +{ + unsigned int mask; + + if (WARN_ON(vector > 31 || vector == NMI_VECTOR)) + return EXCPT_INTERRUPT; + + mask = 1 << vector; + + /* #DB is trap, as instruction watchpoints are handled elsewhere */ + if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR))) + return EXCPT_TRAP; + + if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR))) + return EXCPT_ABORT; + + /* Reserved exceptions will result in fault */ + return EXCPT_FAULT; +} + static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error, u32 error_code, bool reinject) @@ -381,12 +408,14 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { if (mmu_is_nested(vcpu) && !fault->nested_page_fault) vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); else vcpu->arch.mmu.inject_page_fault(vcpu, fault); + + return fault->nested_page_fault; } void kvm_inject_nmi(struct kvm_vcpu *vcpu) @@ -430,11 +459,12 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t ngfn, void *data, int offset, int len, u32 access) { + struct x86_exception exception; gfn_t real_gfn; gpa_t ngpa; ngpa = gfn_to_gpa(ngfn); - real_gfn = mmu->translate_gpa(vcpu, ngpa, access); + real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception); if (real_gfn == UNMAPPED_GVA) return -EFAULT; @@ -699,7 +729,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); return 0; } @@ -758,6 +788,15 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } +static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) +{ + u64 fixed = DR6_FIXED_1; + + if (!guest_cpuid_has_rtm(vcpu)) + fixed |= DR6_RTM; + return fixed; +} + static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { switch (dr) { @@ -773,7 +812,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) case 6: if (val & 0xffffffff00000000ULL) return -1; /* #GP */ - vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); kvm_update_dr6(vcpu); break; case 5: @@ -984,9 +1023,8 @@ struct pvclock_gtod_data { u32 shift; } clock; - /* open coded 'struct timespec' */ - u64 monotonic_time_snsec; - time_t monotonic_time_sec; + u64 boot_ns; + u64 nsec_base; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -994,27 +1032,21 @@ static struct pvclock_gtod_data pvclock_gtod_data; static void update_pvclock_gtod(struct timekeeper *tk) { struct pvclock_gtod_data *vdata = &pvclock_gtod_data; + u64 boot_ns; + + boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot)); write_seqcount_begin(&vdata->seq); /* copy pvclock gtod data */ - vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; - vdata->clock.cycle_last = tk->clock->cycle_last; - vdata->clock.mask = tk->clock->mask; - vdata->clock.mult = tk->mult; - vdata->clock.shift = tk->shift; - - vdata->monotonic_time_sec = tk->xtime_sec - + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->xtime_nsec - + (tk->wall_to_monotonic.tv_nsec - << tk->shift); - while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->shift)) { - vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->shift; - vdata->monotonic_time_sec++; - } + vdata->clock.vclock_mode = tk->tkr.clock->archdata.vclock_mode; + vdata->clock.cycle_last = tk->tkr.cycle_last; + vdata->clock.mask = tk->tkr.mask; + vdata->clock.mult = tk->tkr.mult; + vdata->clock.shift = tk->tkr.shift; + + vdata->boot_ns = boot_ns; + vdata->nsec_base = tk->tkr.xtime_nsec; write_seqcount_end(&vdata->seq); } @@ -1109,11 +1141,7 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, static inline u64 get_kernel_ns(void) { - struct timespec ts; - - ktime_get_ts(&ts); - monotonic_to_bootbased(&ts); - return timespec_to_ns(&ts); + return ktime_get_boot_ns(); } #ifdef CONFIG_X86_64 @@ -1215,6 +1243,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) unsigned long flags; s64 usdiff; bool matched; + bool already_matched; u64 data = msr->data; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); @@ -1279,6 +1308,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } matched = true; + already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); } else { /* * We split periods of matched TSC writes into generations. @@ -1294,7 +1324,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm->arch.cur_tsc_write = data; kvm->arch.cur_tsc_offset = offset; matched = false; - pr_debug("kvm: new tsc generation %u, clock %llu\n", + pr_debug("kvm: new tsc generation %llu, clock %llu\n", kvm->arch.cur_tsc_generation, data); } @@ -1319,10 +1349,11 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); spin_lock(&kvm->arch.pvclock_gtod_sync_lock); - if (matched) - kvm->arch.nr_vcpus_matched_tsc++; - else + if (!matched) { kvm->arch.nr_vcpus_matched_tsc = 0; + } else if (!already_matched) { + kvm->arch.nr_vcpus_matched_tsc++; + } kvm_track_tsc_matching(vcpu); spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); @@ -1375,23 +1406,22 @@ static inline u64 vgettsc(cycle_t *cycle_now) return v * gtod->clock.mult; } -static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) +static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) { + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; - u64 ns; int mode; - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + u64 ns; - ts->tv_nsec = 0; do { seq = read_seqcount_begin(>od->seq); mode = gtod->clock.vclock_mode; - ts->tv_sec = gtod->monotonic_time_sec; - ns = gtod->monotonic_time_snsec; + ns = gtod->nsec_base; ns += vgettsc(cycle_now); ns >>= gtod->clock.shift; + ns += gtod->boot_ns; } while (unlikely(read_seqcount_retry(>od->seq, seq))); - timespec_add_ns(ts, ns); + *t = ns; return mode; } @@ -1399,19 +1429,11 @@ static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) /* returns true if host is using tsc clocksource */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) { - struct timespec ts; - /* checked again under seqlock below */ if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) return false; - if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC) - return false; - - monotonic_to_bootbased(&ts); - *kernel_ns = timespec_to_ns(&ts); - - return true; + return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; } #endif @@ -1499,7 +1521,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) pvclock_update_vm_gtod_copy(kvm); kvm_for_each_vcpu(i, vcpu, kvm) - set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) @@ -1537,7 +1559,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - this_tsc_khz = __get_cpu_var(cpu_tsc_khz); + this_tsc_khz = __this_cpu_read(cpu_tsc_khz); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -1642,7 +1664,7 @@ static void kvmclock_update_fn(struct work_struct *work) struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { - set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_vcpu_kick(vcpu); } } @@ -1651,7 +1673,7 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) { struct kvm *kvm = v->kvm; - set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); schedule_delayed_work(&kvm->arch.kvmclock_update_work, KVMCLOCK_UPDATE_DELAY); } @@ -1704,9 +1726,10 @@ static bool valid_mtrr_type(unsigned t) return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ } -static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) +bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) { int i; + u64 mask; if (!msr_mtrr_valid(msr)) return false; @@ -1728,14 +1751,31 @@ static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) } /* variable MTRRs */ - return valid_mtrr_type(data & 0xff); + WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); + + mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + if ((msr & 1) == 0) { + /* MTRR base */ + if (!valid_mtrr_type(data & 0xff)) + return false; + mask |= 0xf00; + } else + /* MTRR mask */ + mask |= 0x7ff; + if (data & mask) { + kvm_inject_gp(vcpu, 0); + return false; + } + + return true; } +EXPORT_SYMBOL_GPL(kvm_mtrr_valid); static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; - if (!mtrr_valid(vcpu, msr, data)) + if (!kvm_mtrr_valid(vcpu, msr, data)) return 1; if (msr == MSR_MTRRdefType) { @@ -1786,7 +1826,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; default: if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + msr < MSR_IA32_MCx_CTL(bank_num)) { u32 offset = msr - MSR_IA32_MC0_CTL; /* only 0 or all 1s can be written to IA32_MCi_CTL * some Linux kernels though clear bit 10 in bank 4 to @@ -2032,6 +2072,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ data &= ~(u64)0x8; /* ignore TLB cache disable */ + data &= ~(u64)0x40000; /* ignore Mc status write enable */ if (data != 0) { vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", data); @@ -2144,7 +2185,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: return set_msr_mce(vcpu, msr, data); /* Performance counters are not protected by a CPUID bit, @@ -2310,7 +2351,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) break; default: if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + msr < MSR_IA32_MCx_CTL(bank_num)) { u32 offset = msr - MSR_IA32_MC0_CTL; data = vcpu->arch.mce_banks[offset]; break; @@ -2399,7 +2440,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: case MSR_FAM10H_MMIO_CONF_BASE: @@ -2485,7 +2532,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_CAP: case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr, pdata); case MSR_K7_CLK_CTL: /* @@ -2616,7 +2663,7 @@ out: return r; } -int kvm_dev_ioctl_check_extension(long ext) +int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; @@ -2803,7 +2850,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(vcpu->arch.tsc_offset_adjustment)) { adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); vcpu->arch.tsc_offset_adjustment = 0; - set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { @@ -2974,9 +3021,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; events->interrupt.soft = 0; - events->interrupt.shadow = - kvm_x86_ops->get_interrupt_shadow(vcpu, - KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); + events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending != 0; @@ -4022,16 +4067,16 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, kvm_x86_ops->get_segment(vcpu, var, seg); } -gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception) { gpa_t t_gpa; - struct x86_exception exception; BUG_ON(!mmu_is_nested(vcpu)); /* NPT walks are always user-walks */ access |= PFERR_USER_MASK; - t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception); + t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception); return t_gpa; } @@ -4082,7 +4127,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); + ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data, + offset, toread); if (ret < 0) { r = X86EMUL_IO_NEEDED; goto out; @@ -4103,10 +4149,24 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + unsigned offset; + int ret; - return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, - access | PFERR_FETCH_MASK, - exception); + /* Inline kvm_read_guest_virt_helper for speed. */ + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK, + exception); + if (unlikely(gpa == UNMAPPED_GVA)) + return X86EMUL_PROPAGATE_FAULT; + + offset = addr & (PAGE_SIZE-1); + if (WARN_ON(offset + bytes > PAGE_SIZE)) + bytes = (unsigned)PAGE_SIZE - offset; + ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val, + offset, bytes); + if (unlikely(ret < 0)) + return X86EMUL_IO_NEEDED; + + return X86EMUL_CONTINUE; } int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, @@ -4730,7 +4790,6 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, if (desc->g) var.limit = (var.limit << 12) | 0xfff; var.type = desc->type; - var.present = desc->p; var.dpl = desc->dpl; var.db = desc->d; var.s = desc->s; @@ -4762,6 +4821,12 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, return kvm_set_msr(emul_to_vcpu(ctxt), &msr); } +static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, + u32 pmc) +{ + return kvm_pmu_check_pmc(emul_to_vcpu(ctxt), pmc); +} + static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata) { @@ -4838,6 +4903,7 @@ static const struct x86_emulate_ops emulate_ops = { .set_dr = emulator_set_dr, .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, + .check_pmc = emulator_check_pmc, .read_pmc = emulator_read_pmc, .halt = emulator_halt, .wbinvd = emulator_wbinvd, @@ -4850,7 +4916,7 @@ static const struct x86_emulate_ops emulate_ops = { static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); /* * an sti; sti; sequence only disable interrupts for the first * instruction. So, if the last instruction, be it emulated or @@ -4858,33 +4924,27 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) * means that the last instruction is an sti. We should not * leave the flag on in this case. The same goes for mov ss */ - if (!(int_shadow & mask)) + if (int_shadow & mask) + mask = 0; + if (unlikely(int_shadow || mask)) { kvm_x86_ops->set_interrupt_shadow(vcpu, mask); + if (!mask) + kvm_make_request(KVM_REQ_EVENT, vcpu); + } } -static void inject_emulated_exception(struct kvm_vcpu *vcpu) +static bool inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; if (ctxt->exception.vector == PF_VECTOR) - kvm_propagate_fault(vcpu, &ctxt->exception); - else if (ctxt->exception.error_code_valid) + return kvm_propagate_fault(vcpu, &ctxt->exception); + + if (ctxt->exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception.vector, ctxt->exception.error_code); else kvm_queue_exception(vcpu, ctxt->exception.vector); -} - -static void init_decode_cache(struct x86_emulate_ctxt *ctxt) -{ - memset(&ctxt->opcode_len, 0, - (void *)&ctxt->_regs - (void *)&ctxt->opcode_len); - - ctxt->fetch.start = 0; - ctxt->fetch.end = 0; - ctxt->io_read.pos = 0; - ctxt->io_read.end = 0; - ctxt->mem_read.pos = 0; - ctxt->mem_read.end = 0; + return false; } static void init_emulate_ctxt(struct kvm_vcpu *vcpu) @@ -4941,7 +5001,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); - if (!is_guest_mode(vcpu)) { + if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -5085,23 +5145,22 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r) +static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r) { struct kvm_run *kvm_run = vcpu->run; /* - * Use the "raw" value to see if TF was passed to the processor. - * Note that the new value of the flags has not been saved yet. + * rflags is the old, "raw" value of the flags. The new value has + * not been saved yet. * * This is correct even for TF set by the guest, because "the * processor will not generate this exception after the instruction * that sets the TF flag". */ - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); - if (unlikely(rflags & X86_EFLAGS_TF)) { if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1; + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | + DR6_RTM; kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; @@ -5114,7 +5173,7 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r) * cleared by the processor". */ vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= DR6_BS; + vcpu->arch.dr6 |= DR6_BS | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); } } @@ -5133,7 +5192,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) vcpu->arch.eff_db); if (dr6 != 0) { - kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; kvm_run->debug.arch.pc = kvm_rip_read(vcpu) + get_segment_base(vcpu, VCPU_SREG_CS); @@ -5144,14 +5203,15 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) } } - if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) { + if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) && + !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) { dr6 = kvm_vcpu_check_hw_bp(eip, 0, vcpu->arch.dr7, vcpu->arch.db); if (dr6 != 0) { vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= dr6; + vcpu->arch.dr6 |= dr6 | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); *r = EMULATE_DONE; return true; @@ -5193,6 +5253,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, ctxt->interruptibility = 0; ctxt->have_exception = false; + ctxt->exception.vector = -1; ctxt->perm_ok = false; ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; @@ -5215,6 +5276,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (emulation_type & EMULTYPE_SKIP) { kvm_rip_write(vcpu, ctxt->_eip); + if (ctxt->eflags & X86_EFLAGS_RF) + kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); return EMULATE_DONE; } @@ -5243,8 +5306,9 @@ restart: } if (ctxt->have_exception) { - inject_emulated_exception(vcpu); r = EMULATE_DONE; + if (inject_emulated_exception(vcpu)) + return r; } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) { /* FIXME: return into emulator if single-stepping. */ @@ -5265,13 +5329,22 @@ restart: r = EMULATE_DONE; if (writeback) { + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); - kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); if (r == EMULATE_DONE) - kvm_vcpu_check_singlestep(vcpu, &r); - kvm_set_rflags(vcpu, ctxt->eflags); + kvm_vcpu_check_singlestep(vcpu, rflags, &r); + __kvm_set_rflags(vcpu, ctxt->eflags); + + /* + * For STI, interrupts are shadowed; so KVM_REQ_EVENT will + * do nothing, and it will be requested again as soon as + * the shadow expires. But we still need to check here, + * because POPF has no interrupt shadow. + */ + if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF)) + kvm_make_request(KVM_REQ_EVENT, vcpu); } else vcpu->arch.emulate_regs_need_sync_to_vcpu = true; @@ -5503,7 +5576,7 @@ static void kvm_set_mmio_spte_mask(void) * entry to generate page fault with PFER.RSV = 1. */ /* Mask the reserved physical address bits. */ - mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr; + mask = rsvd_bits(maxphyaddr, 51); /* Bit 62 is always reserved for 32bit host. */ mask |= 0x3ull << 62; @@ -5534,7 +5607,7 @@ static void pvclock_gtod_update_fn(struct work_struct *work) spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) - set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); atomic_set(&kvm_guest_has_master_clock, 0); spin_unlock(&kvm_lock); } @@ -5662,7 +5735,6 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) u64 param, ingpa, outgpa, ret; uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; bool fast, longmode; - int cs_db, cs_l; /* * hypercall generates UD from non zero cpl and real mode @@ -5673,8 +5745,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) return 0; } - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - longmode = is_long_mode(vcpu) && cs_l == 1; + longmode = is_64_bit_mode(vcpu); if (!longmode) { param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | @@ -5739,7 +5810,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; - int r = 1; + int op_64_bit, r = 1; if (kvm_hv_hypercall_enabled(vcpu->kvm)) return kvm_hv_hypercall(vcpu); @@ -5752,7 +5823,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) trace_kvm_hypercall(nr, a0, a1, a2, a3); - if (!is_long_mode(vcpu)) { + op_64_bit = is_64_bit_mode(vcpu); + if (!op_64_bit) { nr &= 0xFFFFFFFF; a0 &= 0xFFFFFFFF; a1 &= 0xFFFFFFFF; @@ -5778,6 +5850,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; } out: + if (!op_64_bit) + ret = (u32)ret; kvm_register_write(vcpu, VCPU_REGS_RAX, ret); ++vcpu->stat.hypercalls; return r; @@ -5856,6 +5930,11 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) trace_kvm_inj_exception(vcpu->arch.exception.nr, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code); + + if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) + __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | + X86_EFLAGS_RF); + kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, @@ -5941,6 +6020,44 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) kvm_apic_update_tmr(vcpu, tmr); } +static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_ops->tlb_flush(vcpu); +} + +void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) +{ + struct page *page = NULL; + + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + if (!kvm_x86_ops->set_apic_access_page_addr) + return; + + page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page)); + + /* + * Do not pin apic access page in memory, the MMU notifier + * will call us again if it is migrated or swapped out. + */ + put_page(page); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); + +void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, + unsigned long address) +{ + /* + * The physical address of apic access page is stored in the VMCS. + * Update it when it becomes invalid. + */ + if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT)) + kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); +} + /* * Returns 1 to let __vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the @@ -5970,7 +6087,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvm_x86_ops->tlb_flush(vcpu); + kvm_vcpu_flush_tlb(vcpu); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; @@ -6001,6 +6118,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_deliver_pmi(vcpu); if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) vcpu_scan_ioapic(vcpu); + if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) + kvm_vcpu_reload_apic_access_page(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -6847,9 +6966,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu) atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; + kvm_clear_interrupt_queue(vcpu); + kvm_clear_exception_queue(vcpu); memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); - vcpu->arch.dr6 = DR6_FIXED_1; + vcpu->arch.dr6 = DR6_INIT; kvm_update_dr6(vcpu); vcpu->arch.dr7 = DR7_FIXED_1; kvm_update_dr7(vcpu); @@ -6884,7 +7005,7 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector) kvm_rip_write(vcpu, 0); } -int kvm_arch_hardware_enable(void *garbage) +int kvm_arch_hardware_enable(void) { struct kvm *kvm; struct kvm_vcpu *vcpu; @@ -6895,7 +7016,7 @@ int kvm_arch_hardware_enable(void *garbage) bool stable, backwards_tsc = false; kvm_shared_msr_cpu_online(); - ret = kvm_x86_ops->hardware_enable(garbage); + ret = kvm_x86_ops->hardware_enable(); if (ret != 0) return ret; @@ -6904,7 +7025,7 @@ int kvm_arch_hardware_enable(void *garbage) list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (!stable && vcpu->cpu == smp_processor_id()) - set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (stable && vcpu->arch.last_host_tsc > local_tsc) { backwards_tsc = true; if (vcpu->arch.last_host_tsc > max_tsc) @@ -6958,8 +7079,7 @@ int kvm_arch_hardware_enable(void *garbage) kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.tsc_offset_adjustment += delta_cyc; vcpu->arch.last_host_tsc = local_tsc; - set_bit(KVM_REQ_MASTERCLOCK_UPDATE, - &vcpu->requests); + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); } /* @@ -6976,10 +7096,10 @@ int kvm_arch_hardware_enable(void *garbage) return 0; } -void kvm_arch_hardware_disable(void *garbage) +void kvm_arch_hardware_disable(void) { - kvm_x86_ops->hardware_disable(garbage); - drop_user_return_notifiers(garbage); + kvm_x86_ops->hardware_disable(); + drop_user_return_notifiers(); } int kvm_arch_hardware_setup(void) @@ -7096,6 +7216,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) static_key_slow_dec(&kvm_no_apic_vcpu); } +void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) +{ + kvm_x86_ops->sched_in(vcpu, cpu); +} + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { if (type) @@ -7187,10 +7312,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); - if (kvm->arch.apic_access_page) - put_page(kvm->arch.apic_access_page); - if (kvm->arch.ept_identity_pagetable) - put_page(kvm->arch.ept_identity_pagetable); kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); } @@ -7405,12 +7526,17 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_rflags); -void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; kvm_x86_ops->set_rflags(vcpu, rflags); +} + +void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + __kvm_set_rflags(vcpu, rflags); kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_set_rflags); @@ -7588,3 +7714,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 8c97bac9a89..7cb9c45a5fe 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -47,6 +47,16 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) #endif } +static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) +{ + int cs_db, cs_l; + + if (!is_long_mode(vcpu)) + return false; + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + return cs_l; +} + static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) { return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; @@ -78,15 +88,23 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, vcpu->arch.mmio_gva = gva & PAGE_MASK; vcpu->arch.access = access; vcpu->arch.mmio_gfn = gfn; + vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation; +} + +static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.mmio_gen == kvm_memslots(vcpu->kvm)->generation; } /* - * Clear the mmio cache info for the given gva, - * specially, if gva is ~0ul, we clear all mmio cache info. + * Clear the mmio cache info for the given gva. If gva is MMIO_GVA_ANY, we + * clear all mmio cache info. */ +#define MMIO_GVA_ANY (~(gva_t)0) + static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva) { - if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK)) + if (gva != MMIO_GVA_ANY && vcpu->arch.mmio_gva != (gva & PAGE_MASK)) return; vcpu->arch.mmio_gva = 0; @@ -94,7 +112,8 @@ static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva) static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva) { - if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK)) + if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva && + vcpu->arch.mmio_gva == (gva & PAGE_MASK)) return true; return false; @@ -102,12 +121,30 @@ static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva) static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) { - if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT) + if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gfn && + vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT) return true; return false; } +static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + unsigned long val = kvm_register_read(vcpu, reg); + + return is_64_bit_mode(vcpu) ? val : (u32)val; +} + +static inline void kvm_register_writel(struct kvm_vcpu *vcpu, + enum kvm_reg reg, + unsigned long val) +{ + if (!is_64_bit_mode(vcpu)) + val = (u32)val; + return kvm_register_write(vcpu, reg, val); +} + void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); @@ -122,6 +159,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); +bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); + #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ | XSTATE_BNDREGS | XSTATE_BNDCSR) extern u64 host_xcr0; diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 4d4f96a2763..db92793b7e2 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -20,7 +20,6 @@ lib-y := delay.o misc.o cmdline.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o -lib-$(CONFIG_SMP) += rwlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o @@ -39,7 +38,7 @@ endif else obj-y += iomap_copy_64.o lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o - lib-y += thunk_64.o clear_page_64.o copy_page_64.o + lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o copy_user_nocache_64.o lib-y += cmpxchg16b_emu.o diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S index 1e572c507d0..40a172541ee 100644 --- a/arch/x86/lib/cmpxchg16b_emu.S +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -6,15 +6,8 @@ * */ #include <linux/linkage.h> -#include <asm/alternative-asm.h> -#include <asm/frame.h> #include <asm/dwarf2.h> - -#ifdef CONFIG_SMP -#define SEG_PREFIX %gs: -#else -#define SEG_PREFIX -#endif +#include <asm/percpu.h> .text @@ -39,24 +32,25 @@ CFI_STARTPROC # *atomic* on a single cpu (as provided by the this_cpu_xx class of # macros). # -this_cpu_cmpxchg16b_emu: - pushf + pushfq_cfi cli - cmpq SEG_PREFIX(%rsi), %rax - jne not_same - cmpq SEG_PREFIX 8(%rsi), %rdx - jne not_same + cmpq PER_CPU_VAR((%rsi)), %rax + jne .Lnot_same + cmpq PER_CPU_VAR(8(%rsi)), %rdx + jne .Lnot_same - movq %rbx, SEG_PREFIX(%rsi) - movq %rcx, SEG_PREFIX 8(%rsi) + movq %rbx, PER_CPU_VAR((%rsi)) + movq %rcx, PER_CPU_VAR(8(%rsi)) - popf + CFI_REMEMBER_STATE + popfq_cfi mov $1, %al ret - not_same: - popf + CFI_RESTORE_STATE +.Lnot_same: + popfq_cfi xor %al,%al ret diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S index 828cb710dec..b4807fce517 100644 --- a/arch/x86/lib/cmpxchg8b_emu.S +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -7,11 +7,8 @@ */ #include <linux/linkage.h> -#include <asm/alternative-asm.h> -#include <asm/frame.h> #include <asm/dwarf2.h> - .text /* @@ -30,27 +27,28 @@ CFI_STARTPROC # set the whole ZF thing (caller will just compare # eax:edx with the expected value) # -cmpxchg8b_emu: - pushfl + pushfl_cfi cli cmpl (%esi), %eax - jne not_same + jne .Lnot_same cmpl 4(%esi), %edx - jne half_same + jne .Lhalf_same movl %ebx, (%esi) movl %ecx, 4(%esi) - popfl + CFI_REMEMBER_STATE + popfl_cfi ret - not_same: + CFI_RESTORE_STATE +.Lnot_same: movl (%esi), %eax - half_same: +.Lhalf_same: movl 4(%esi), %edx - popfl + popfl_cfi ret CFI_ENDPROC diff --git a/arch/x86/lib/rwlock.S b/arch/x86/lib/rwlock.S deleted file mode 100644 index 1cad22139c8..00000000000 --- a/arch/x86/lib/rwlock.S +++ /dev/null @@ -1,44 +0,0 @@ -/* Slow paths of read/write spinlocks. */ - -#include <linux/linkage.h> -#include <asm/alternative-asm.h> -#include <asm/frame.h> -#include <asm/rwlock.h> - -#ifdef CONFIG_X86_32 -# define __lock_ptr eax -#else -# define __lock_ptr rdi -#endif - -ENTRY(__write_lock_failed) - CFI_STARTPROC - FRAME -0: LOCK_PREFIX - WRITE_LOCK_ADD($RW_LOCK_BIAS) (%__lock_ptr) -1: rep; nop - cmpl $WRITE_LOCK_CMP, (%__lock_ptr) - jne 1b - LOCK_PREFIX - WRITE_LOCK_SUB($RW_LOCK_BIAS) (%__lock_ptr) - jnz 0b - ENDFRAME - ret - CFI_ENDPROC -END(__write_lock_failed) - -ENTRY(__read_lock_failed) - CFI_STARTPROC - FRAME -0: LOCK_PREFIX - READ_LOCK_SIZE(inc) (%__lock_ptr) -1: rep; nop - READ_LOCK_SIZE(cmp) $1, (%__lock_ptr) - js 1b - LOCK_PREFIX - READ_LOCK_SIZE(dec) (%__lock_ptr) - js 0b - ENDFRAME - ret - CFI_ENDPROC -END(__read_lock_failed) diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S index 28f85c91671..e28cdaf5ac2 100644 --- a/arch/x86/lib/thunk_32.S +++ b/arch/x86/lib/thunk_32.S @@ -6,25 +6,46 @@ */ #include <linux/linkage.h> #include <asm/asm.h> + #include <asm/dwarf2.h> -#ifdef CONFIG_TRACE_IRQFLAGS /* put return address in eax (arg1) */ - .macro thunk_ra name,func + .macro THUNK name, func, put_ret_addr_in_eax=0 .globl \name \name: - pushl %eax - pushl %ecx - pushl %edx + CFI_STARTPROC + pushl_cfi %eax + CFI_REL_OFFSET eax, 0 + pushl_cfi %ecx + CFI_REL_OFFSET ecx, 0 + pushl_cfi %edx + CFI_REL_OFFSET edx, 0 + + .if \put_ret_addr_in_eax /* Place EIP in the arg1 */ movl 3*4(%esp), %eax + .endif + call \func - popl %edx - popl %ecx - popl %eax + popl_cfi %edx + CFI_RESTORE edx + popl_cfi %ecx + CFI_RESTORE ecx + popl_cfi %eax + CFI_RESTORE eax ret + CFI_ENDPROC _ASM_NOKPROBE(\name) .endm - thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller - thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller +#ifdef CONFIG_TRACE_IRQFLAGS + THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 + THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 +#endif + +#ifdef CONFIG_PREEMPT + THUNK ___preempt_schedule, preempt_schedule +#ifdef CONFIG_CONTEXT_TRACKING + THUNK ___preempt_schedule_context, preempt_schedule_context #endif +#endif + diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index 92d9feaff42..b30b5ebd614 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -38,6 +38,13 @@ THUNK lockdep_sys_exit_thunk,lockdep_sys_exit #endif +#ifdef CONFIG_PREEMPT + THUNK ___preempt_schedule, preempt_schedule +#ifdef CONFIG_CONTEXT_TRACKING + THUNK ___preempt_schedule_context, preempt_schedule_context +#endif +#endif + /* SAVE_ARGS below is used only for the .cfi directives it contains. */ CFI_STARTPROC SAVE_ARGS diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 167ffcac16e..95a427e5788 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -48,7 +48,9 @@ enum address_markers_idx { LOW_KERNEL_NR, VMALLOC_START_NR, VMEMMAP_START_NR, +# ifdef CONFIG_X86_ESPFIX64 ESPFIX_START_NR, +# endif HIGH_KERNEL_NR, MODULES_VADDR_NR, MODULES_END_NR, @@ -71,7 +73,9 @@ static struct addr_marker address_markers[] = { { PAGE_OFFSET, "Low Kernel Mapping" }, { VMALLOC_START, "vmalloc() Area" }, { VMEMMAP_START, "Vmemmap" }, +# ifdef CONFIG_X86_ESPFIX64 { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, +# endif { __START_KERNEL_map, "High Kernel Mapping" }, { MODULES_VADDR, "Modules" }, { MODULES_END, "End Modules" }, diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 36642793e31..d973e61e450 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -3,7 +3,6 @@ * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ -#include <linux/magic.h> /* STACK_END_MAGIC */ #include <linux/sched.h> /* test_thread_flag(), ... */ #include <linux/kdebug.h> /* oops_begin/end, ... */ #include <linux/module.h> /* search_exception_table */ @@ -350,7 +349,7 @@ out: void vmalloc_sync_all(void) { - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0); } /* @@ -577,6 +576,8 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) static const char nx_warning[] = KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; +static const char smep_warning[] = KERN_CRIT +"unable to execute userspace code (SMEP?) (uid: %d)\n"; static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, @@ -597,6 +598,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, if (pte && pte_present(*pte) && !pte_exec(*pte)) printk(nx_warning, from_kuid(&init_user_ns, current_uid())); + if (pte && pte_present(*pte) && pte_exec(*pte) && + (pgd_flags(*pgd) & _PAGE_USER) && + (read_cr4() & X86_CR4_SMEP)) + printk(smep_warning, from_kuid(&init_user_ns, current_uid())); } printk(KERN_ALERT "BUG: unable to handle kernel "); @@ -643,7 +648,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, unsigned long address, int signal, int si_code) { struct task_struct *tsk = current; - unsigned long *stackend; unsigned long flags; int sig; @@ -703,8 +707,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, show_fault_oops(regs, error_code, address); - stackend = end_of_stack(tsk); - if (tsk != &init_task && *stackend != STACK_END_MAGIC) + if (task_stack_end_corrupted(tsk)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; @@ -927,8 +930,17 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) * cross-processor TLB flush, even if no stale TLB entries exist * on other processors. * + * Spurious faults may only occur if the TLB contains an entry with + * fewer permission than the page table entry. Non-present (P = 0) + * and reserved bit (R = 1) faults are never spurious. + * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. + * + * Returns non-zero if a spurious fault was handled, zero otherwise. + * + * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3 + * (Optional Invalidation). */ static noinline int spurious_fault(unsigned long error_code, unsigned long address) @@ -939,8 +951,17 @@ spurious_fault(unsigned long error_code, unsigned long address) pte_t *pte; int ret; - /* Reserved-bit violation or user access to kernel space? */ - if (error_code & (PF_USER | PF_RSVD)) + /* + * Only writes to RO or instruction fetches from NX may cause + * spurious faults. + * + * These could be from user or supervisor accesses but the TLB + * is only lazily flushed after a kernel mapping protection + * change, so user accesses are not expected to cause spurious + * faults. + */ + if (error_code != (PF_WRITE | PF_PROT) + && error_code != (PF_INSTR | PF_PROT)) return 0; pgd = init_mm.pgd + pgd_index(address); @@ -1212,7 +1233,8 @@ good_area: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo - * the fault: + * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if + * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. */ fault = handle_mm_fault(mm, vma, address, flags); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f9713061811..66dba36f234 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -18,6 +18,13 @@ #include <asm/dma.h> /* for MAX_DMA_PFN */ #include <asm/microcode.h> +/* + * We need to define the tracepoints somewhere, and tlb.c + * is only compied when SMP=y. + */ +#define CREATE_TRACE_POINTS +#include <trace/events/tlb.h> + #include "mm_internal.h" static unsigned long __initdata pgt_buf_start; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index e39504878ae..c8140e12816 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -537,7 +537,7 @@ static void __init pagetable_init(void) permanent_kmaps_init(pgd_base); } -pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); EXPORT_SYMBOL_GPL(__supported_pte_mask); /* user-defined highmem size */ @@ -825,7 +825,8 @@ void __init mem_init(void) int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdata = NODE_DATA(nid); - struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM; + struct zone *zone = pgdata->node_zones + + zone_for_memory(nid, start, size, ZONE_HIGHMEM); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index df1a9927ad2..4cb8763868f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -151,7 +151,7 @@ early_param("gbpages", parse_direct_gbpages_on); * around without checking the pgd every time. */ -pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; +pteval_t __supported_pte_mask __read_mostly = ~0; EXPORT_SYMBOL_GPL(__supported_pte_mask); int force_personality32; @@ -178,7 +178,7 @@ __setup("noexec32=", nonx32_setup); * When memory was added/removed make sure all the processes MM have * suitable PGD entries in the local PGD level page. */ -void sync_global_pgds(unsigned long start, unsigned long end) +void sync_global_pgds(unsigned long start, unsigned long end, int removed) { unsigned long address; @@ -186,7 +186,12 @@ void sync_global_pgds(unsigned long start, unsigned long end) const pgd_t *pgd_ref = pgd_offset_k(address); struct page *page; - if (pgd_none(*pgd_ref)) + /* + * When it is called after memory hot remove, pgd_none() + * returns true. In this case (removed == 1), we must clear + * the PGD entries in the local PGD level page. + */ + if (pgd_none(*pgd_ref) && !removed) continue; spin_lock(&pgd_lock); @@ -199,12 +204,18 @@ void sync_global_pgds(unsigned long start, unsigned long end) pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else + if (!pgd_none(*pgd_ref) && !pgd_none(*pgd)) BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + if (removed) { + if (pgd_none(*pgd_ref) && !pgd_none(*pgd)) + pgd_clear(pgd); + } else { + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + } + spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); @@ -633,7 +644,7 @@ kernel_physical_mapping_init(unsigned long start, } if (pgd_changed) - sync_global_pgds(addr, end - 1); + sync_global_pgds(addr, end - 1, 0); __flush_tlb_all(); @@ -691,7 +702,8 @@ static void update_end_of_memory_vars(u64 start, u64 size) int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + ZONE_NORMAL; + struct zone *zone = pgdat->node_zones + + zone_for_memory(nid, start, size, ZONE_NORMAL); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; @@ -975,25 +987,26 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end, bool direct) { unsigned long next; + unsigned long addr; pgd_t *pgd; pud_t *pud; bool pgd_changed = false; - for (; start < end; start = next) { - next = pgd_addr_end(start, end); + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); - pgd = pgd_offset_k(start); + pgd = pgd_offset_k(addr); if (!pgd_present(*pgd)) continue; pud = (pud_t *)pgd_page_vaddr(*pgd); - remove_pud_table(pud, start, next, direct); + remove_pud_table(pud, addr, next, direct); if (free_pud_table(pud, pgd)) pgd_changed = true; } if (pgd_changed) - sync_global_pgds(start, end - 1); + sync_global_pgds(start, end - 1, 1); flush_tlb_all(); } @@ -1340,7 +1353,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) else err = vmemmap_populate_basepages(start, end, node); if (!err) - sync_global_pgds(start, end - 1); + sync_global_pgds(start, end - 1, 0); return err; } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index baff1da354e..af78e50ca6c 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -86,6 +86,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, pgprot_t prot; int retval; void __iomem *ret_addr; + int ram_region; /* Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; @@ -108,12 +109,23 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't allow anybody to remap normal RAM that we're using.. */ - pfn = phys_addr >> PAGE_SHIFT; - last_pfn = last_addr >> PAGE_SHIFT; - if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, - __ioremap_check_ram) == 1) + /* First check if whole region can be identified as RAM or not */ + ram_region = region_is_ram(phys_addr, size); + if (ram_region > 0) { + WARN_ONCE(1, "ioremap on RAM at 0x%lx - 0x%lx\n", + (unsigned long int)phys_addr, + (unsigned long int)last_addr); return NULL; + } + /* If could not be identified(-1), check page by page */ + if (ram_region < 0) { + pfn = phys_addr >> PAGE_SHIFT; + last_pfn = last_addr >> PAGE_SHIFT; + if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, + __ioremap_check_ram) == 1) + return NULL; + } /* * Mappings have to be page-aligned */ diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index dd89a13f105..b4f2e7e9e90 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -140,7 +140,7 @@ static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context); bool kmemcheck_active(struct pt_regs *regs) { - struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); return data->balance > 0; } @@ -148,7 +148,7 @@ bool kmemcheck_active(struct pt_regs *regs) /* Save an address that needs to be shown/hidden */ static void kmemcheck_save_addr(unsigned long addr) { - struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr)); data->addr[data->n_addrs++] = addr; @@ -156,7 +156,7 @@ static void kmemcheck_save_addr(unsigned long addr) static unsigned int kmemcheck_show_all(void) { - struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); unsigned int i; unsigned int n; @@ -169,7 +169,7 @@ static unsigned int kmemcheck_show_all(void) static unsigned int kmemcheck_hide_all(void) { - struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); unsigned int i; unsigned int n; @@ -185,7 +185,7 @@ static unsigned int kmemcheck_hide_all(void) */ void kmemcheck_show(struct pt_regs *regs) { - struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); BUG_ON(!irqs_disabled()); @@ -226,7 +226,7 @@ void kmemcheck_show(struct pt_regs *regs) */ void kmemcheck_hide(struct pt_regs *regs) { - struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); int n; BUG_ON(!irqs_disabled()); @@ -528,7 +528,7 @@ static void kmemcheck_access(struct pt_regs *regs, const uint8_t *insn_primary; unsigned int size; - struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); /* Recursive fault -- ouch. */ if (data->busy) { diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 25e7e1372bb..919b91205cd 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -31,7 +31,7 @@ #include <linux/sched.h> #include <asm/elf.h> -struct __read_mostly va_alignment va_align = { +struct va_alignment __read_mostly va_align = { .flags = -1, }; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index a32b706c401..1a883705a12 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -185,8 +185,8 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) return numa_add_memblk_to(nid, start, end, &numa_meminfo); } -/* Initialize NODE_DATA for a node on the local memory */ -static void __init setup_node_data(int nid, u64 start, u64 end) +/* Allocate NODE_DATA for a node on the local memory */ +static void __init alloc_node_data(int nid) { const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); u64 nd_pa; @@ -194,18 +194,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end) int tnid; /* - * Don't confuse VM with a node that doesn't have the - * minimum amount of memory: - */ - if (end && (end - start) < NODE_MIN_SIZE) - return; - - start = roundup(start, ZONE_ALIGN); - - printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", - nid, start, end - 1); - - /* * Allocate node data. Try node-local memory and then any node. * Never allocate in DMA zone. */ @@ -222,7 +210,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end) nd = __va(nd_pa); /* report and initialize */ - printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n", + printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid, nd_pa, nd_pa + nd_size - 1); tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); if (tnid != nid) @@ -230,9 +218,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end) node_data[nid] = nd; memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); - NODE_DATA(nid)->node_id = nid; - NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; - NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; node_set_online(nid); } @@ -478,6 +463,42 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) return true; } +static void __init numa_clear_kernel_node_hotplug(void) +{ + int i, nid; + nodemask_t numa_kernel_nodes = NODE_MASK_NONE; + unsigned long start, end; + struct memblock_region *r; + + /* + * At this time, all memory regions reserved by memblock are + * used by the kernel. Set the nid in memblock.reserved will + * mark out all the nodes the kernel resides in. + */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + struct numa_memblk *mb = &numa_meminfo.blk[i]; + + memblock_set_node(mb->start, mb->end - mb->start, + &memblock.reserved, mb->nid); + } + + /* Mark all kernel nodes. */ + for_each_memblock(reserved, r) + node_set(r->nid, numa_kernel_nodes); + + /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + nid = numa_meminfo.blk[i].nid; + if (!node_isset(nid, numa_kernel_nodes)) + continue; + + start = numa_meminfo.blk[i].start; + end = numa_meminfo.blk[i].end; + + memblock_clear_hotplug(start, end - start); + } +} + static int __init numa_register_memblks(struct numa_meminfo *mi) { unsigned long uninitialized_var(pfn_align); @@ -496,6 +517,15 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) } /* + * At very early time, the kernel have to use some memory such as + * loading the kernel image. We cannot prevent this anyway. So any + * node the kernel resides in should be un-hotpluggable. + * + * And when we come here, alloc node data won't fail. + */ + numa_clear_kernel_node_hotplug(); + + /* * If sections array is gonna be used for pfn -> nid mapping, check * whether its granularity is fine enough. */ @@ -523,8 +553,17 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) end = max(mi->blk[i].end, end); } - if (start < end) - setup_node_data(nid, start, end); + if (start >= end) + continue; + + /* + * Don't confuse VM with a node that doesn't have the + * minimum amount of memory: + */ + if (end && (end - start) < NODE_MIN_SIZE) + continue; + + alloc_node_data(nid); } /* Dump memblock with node info and return. */ @@ -554,41 +593,6 @@ static void __init numa_init_array(void) } } -static void __init numa_clear_kernel_node_hotplug(void) -{ - int i, nid; - nodemask_t numa_kernel_nodes = NODE_MASK_NONE; - unsigned long start, end; - struct memblock_region *r; - - /* - * At this time, all memory regions reserved by memblock are - * used by the kernel. Set the nid in memblock.reserved will - * mark out all the nodes the kernel resides in. - */ - for (i = 0; i < numa_meminfo.nr_blks; i++) { - struct numa_memblk *mb = &numa_meminfo.blk[i]; - memblock_set_node(mb->start, mb->end - mb->start, - &memblock.reserved, mb->nid); - } - - /* Mark all kernel nodes. */ - for_each_memblock(reserved, r) - node_set(r->nid, numa_kernel_nodes); - - /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ - for (i = 0; i < numa_meminfo.nr_blks; i++) { - nid = numa_meminfo.blk[i].nid; - if (!node_isset(nid, numa_kernel_nodes)) - continue; - - start = numa_meminfo.blk[i].start; - end = numa_meminfo.blk[i].end; - - memblock_clear_hotplug(start, end - start); - } -} - static int __init numa_init(int (*init_func)(void)) { int i; @@ -643,15 +647,6 @@ static int __init numa_init(int (*init_func)(void)) } numa_init_array(); - /* - * At very early time, the kernel have to use some memory such as - * loading the kernel image. We cannot prevent this anyway. So any - * node the kernel resides in should be un-hotpluggable. - * - * And when we come here, numa_init() won't fail. - */ - numa_clear_kernel_node_hotplug(); - return 0; } diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 4dd8cf65257..75cc0978d45 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -59,41 +59,6 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) __flush_tlb_one(vaddr); } -/* - * Associate a large virtual page frame with a given physical page frame - * and protection flags for that frame. pfn is for the base of the page, - * vaddr is what the page gets mapped to - both must be properly aligned. - * The pmd must already be instantiated. Assumes PAE mode. - */ -void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - - if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ - printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n"); - return; /* BUG(); */ - } - if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ - printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n"); - return; /* BUG(); */ - } - pgd = swapper_pg_dir + pgd_index(vaddr); - if (pgd_none(*pgd)) { - printk(KERN_WARNING "set_pmd_pfn: pgd_none\n"); - return; /* BUG(); */ - } - pud = pud_offset(pgd, vaddr); - pmd = pmd_offset(pud, vaddr); - set_pmd(pmd, pfn_pmd(pfn, flags)); - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); -} - unsigned long __FIXADDR_TOP = 0xfffff000; EXPORT_SYMBOL(__FIXADDR_TOP); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index dd8dda167a2..ee61c36d64f 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -49,6 +49,13 @@ void leave_mm(int cpu) if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); load_cr3(swapper_pg_dir); + /* + * This gets called in the idle path where RCU + * functions differently. Tracing normally + * uses RCU, so we have to call the tracepoint + * specially here. + */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } } EXPORT_SYMBOL_GPL(leave_mm); @@ -102,20 +109,24 @@ static void flush_tlb_func(void *info) if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; + if (!f->flush_end) + f->flush_end = f->flush_start + PAGE_SIZE; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { - if (f->flush_end == TLB_FLUSH_ALL) + if (f->flush_end == TLB_FLUSH_ALL) { local_flush_tlb(); - else if (!f->flush_end) - __flush_tlb_single(f->flush_start); - else { + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL); + } else { unsigned long addr; + unsigned long nr_pages = + f->flush_end - f->flush_start / PAGE_SIZE; addr = f->flush_start; while (addr < f->flush_end) { __flush_tlb_single(addr); addr += PAGE_SIZE; } + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages); } } else leave_mm(smp_processor_id()); @@ -153,46 +164,45 @@ void flush_tlb_current_task(void) count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); + trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); preempt_enable(); } +/* + * See Documentation/x86/tlb.txt for details. We choose 33 + * because it is large enough to cover the vast majority (at + * least 95%) of allocations, and is small enough that we are + * confident it will not cause too much overhead. Each single + * flush is about 100 ns, so this caps the maximum overhead at + * _about_ 3,000 ns. + * + * This is in units of pages. + */ +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; + void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { unsigned long addr; - unsigned act_entries, tlb_entries = 0; - unsigned long nr_base_pages; + /* do a global flush by default */ + unsigned long base_pages_to_flush = TLB_FLUSH_ALL; preempt_disable(); if (current->active_mm != mm) - goto flush_all; + goto out; if (!current->mm) { leave_mm(smp_processor_id()); - goto flush_all; + goto out; } - if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 - || vmflag & VM_HUGETLB) { - local_flush_tlb(); - goto flush_all; - } - - /* In modern CPU, last level tlb used for both data/ins */ - if (vmflag & VM_EXEC) - tlb_entries = tlb_lli_4k[ENTRIES]; - else - tlb_entries = tlb_lld_4k[ENTRIES]; + if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) + base_pages_to_flush = (end - start) >> PAGE_SHIFT; - /* Assume all of TLB entries was occupied by this task */ - act_entries = tlb_entries >> tlb_flushall_shift; - act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm; - nr_base_pages = (end - start) >> PAGE_SHIFT; - - /* tlb_flushall_shift is on balance point, details in commit log */ - if (nr_base_pages > act_entries) { + if (base_pages_to_flush > tlb_single_page_flush_ceiling) { + base_pages_to_flush = TLB_FLUSH_ALL; count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { @@ -201,17 +211,15 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } - - if (cpumask_any_but(mm_cpumask(mm), - smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, start, end); - preempt_enable(); - return; } - -flush_all: + trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush); +out: + if (base_pages_to_flush == TLB_FLUSH_ALL) { + start = 0UL; + end = TLB_FLUSH_ALL; + } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); + flush_tlb_others(mm_cpumask(mm), mm, start, end); preempt_enable(); } @@ -260,32 +268,26 @@ static void do_kernel_range_flush(void *info) void flush_tlb_kernel_range(unsigned long start, unsigned long end) { - unsigned act_entries; - struct flush_tlb_info info; - - /* In modern CPU, last level tlb used for both data/ins */ - act_entries = tlb_lld_4k[ENTRIES]; /* Balance as user space task's flush, a bit conservative */ - if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 || - (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) - + if (end == TLB_FLUSH_ALL || + (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) { on_each_cpu(do_flush_tlb_all, NULL, 1); - else { + } else { + struct flush_tlb_info info; info.flush_start = start; info.flush_end = end; on_each_cpu(do_kernel_range_flush, &info, 1); } } -#ifdef CONFIG_DEBUG_TLBFLUSH static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { char buf[32]; unsigned int len; - len = sprintf(buf, "%hd\n", tlb_flushall_shift); + len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } @@ -294,20 +296,20 @@ static ssize_t tlbflush_write_file(struct file *file, { char buf[32]; ssize_t len; - s8 shift; + int ceiling; len = min(count, sizeof(buf) - 1); if (copy_from_user(buf, user_buf, len)) return -EFAULT; buf[len] = '\0'; - if (kstrtos8(buf, 0, &shift)) + if (kstrtoint(buf, 0, &ceiling)) return -EINVAL; - if (shift < -1 || shift >= BITS_PER_LONG) + if (ceiling < 0) return -EINVAL; - tlb_flushall_shift = shift; + tlb_single_page_flush_ceiling = ceiling; return count; } @@ -317,11 +319,10 @@ static const struct file_operations fops_tlbflush = { .llseek = default_llseek, }; -static int __init create_tlb_flushall_shift(void) +static int __init create_tlb_single_page_flush_ceiling(void) { - debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, + debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR, arch_debugfs_dir, NULL, &fops_tlbflush); return 0; } -late_initcall(create_tlb_flushall_shift); -#endif +late_initcall(create_tlb_single_page_flush_ceiling); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 99bef86ed6d..3f627345d51 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -8,12 +8,10 @@ * as published by the Free Software Foundation; version 2 * of the License. */ -#include <linux/moduleloader.h> -#include <asm/cacheflush.h> #include <linux/netdevice.h> #include <linux/filter.h> #include <linux/if_vlan.h> -#include <linux/random.h> +#include <asm/cacheflush.h> int bpf_jit_enable __read_mostly; @@ -109,39 +107,6 @@ static inline void bpf_flush_icache(void *start, void *end) #define CHOOSE_LOAD_FUNC(K, func) \ ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) -struct bpf_binary_header { - unsigned int pages; - /* Note : for security reasons, bpf code will follow a randomly - * sized amount of int3 instructions - */ - u8 image[]; -}; - -static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen, - u8 **image_ptr) -{ - unsigned int sz, hole; - struct bpf_binary_header *header; - - /* Most of BPF filters are really small, - * but if some of them fill a page, allow at least - * 128 extra bytes to insert a random section of int3 - */ - sz = round_up(proglen + sizeof(*header) + 128, PAGE_SIZE); - header = module_alloc(sz); - if (!header) - return NULL; - - memset(header, 0xcc, sz); /* fill whole space with int3 instructions */ - - header->pages = sz / PAGE_SIZE; - hole = min(sz - (proglen + sizeof(*header)), PAGE_SIZE - sizeof(*header)); - - /* insert a random number of int3 instructions before BPF code */ - *image_ptr = &header->image[prandom_u32() % hole]; - return header; -} - /* pick a register outside of BPF range for JIT internal work */ #define AUX_REG (MAX_BPF_REG + 1) @@ -206,17 +171,28 @@ static inline u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3); } +static void jit_fill_hole(void *area, unsigned int size) +{ + /* fill whole space with int3 instructions */ + memset(area, 0xcc, size); +} + struct jit_context { unsigned int cleanup_addr; /* epilogue code offset */ bool seen_ld_abs; }; -static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, +/* maximum number of bytes emitted while JITing one eBPF insn */ +#define BPF_MAX_INSN_SIZE 128 +#define BPF_INSN_SAFETY 64 + +static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { - struct sock_filter_int *insn = bpf_prog->insnsi; + struct bpf_insn *insn = bpf_prog->insnsi; int insn_cnt = bpf_prog->len; - u8 temp[64]; + bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0); + u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; int i; int proglen = 0; u8 *prog = temp; @@ -235,7 +211,7 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, /* mov qword ptr [rbp-X],rbx */ EMIT3_off32(0x48, 0x89, 0x9D, -stacksize); - /* sk_convert_filter() maps classic BPF register X to R7 and uses R8 + /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and * R8(r14). R9(r15) spill could be made conditional, but there is only * one 'bpf_error' return path out of helper functions inside bpf_jit.S @@ -254,7 +230,7 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, EMIT2(0x31, 0xc0); /* xor eax, eax */ EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */ - if (ctx->seen_ld_abs) { + if (seen_ld_abs) { /* r9d : skb->len - skb->data_len (headlen) * r10 : skb->data */ @@ -393,6 +369,23 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); break; + case BPF_LD | BPF_IMM | BPF_DW: + if (insn[1].code != 0 || insn[1].src_reg != 0 || + insn[1].dst_reg != 0 || insn[1].off != 0) { + /* verifier must catch invalid insns */ + pr_err("invalid BPF_LD_IMM64 insn\n"); + return -EINVAL; + } + + /* movabsq %rax, imm64 */ + EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); + EMIT(insn[0].imm, 4); + EMIT(insn[1].imm, 4); + + insn++; + i++; + break; + /* dst %= src, dst /= src, dst %= imm32, dst /= imm32 */ case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU | BPF_DIV | BPF_X: @@ -515,6 +508,48 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, EMIT3(0xC1, add_1reg(b3, dst_reg), imm32); break; + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_ARSH | BPF_X: + case BPF_ALU64 | BPF_LSH | BPF_X: + case BPF_ALU64 | BPF_RSH | BPF_X: + case BPF_ALU64 | BPF_ARSH | BPF_X: + + /* check for bad case when dst_reg == rcx */ + if (dst_reg == BPF_REG_4) { + /* mov r11, dst_reg */ + EMIT_mov(AUX_REG, dst_reg); + dst_reg = AUX_REG; + } + + if (src_reg != BPF_REG_4) { /* common case */ + EMIT1(0x51); /* push rcx */ + + /* mov rcx, src_reg */ + EMIT_mov(BPF_REG_4, src_reg); + } + + /* shl %rax, %cl | shr %rax, %cl | sar %rax, %cl */ + if (BPF_CLASS(insn->code) == BPF_ALU64) + EMIT1(add_1mod(0x48, dst_reg)); + else if (is_ereg(dst_reg)) + EMIT1(add_1mod(0x40, dst_reg)); + + switch (BPF_OP(insn->code)) { + case BPF_LSH: b3 = 0xE0; break; + case BPF_RSH: b3 = 0xE8; break; + case BPF_ARSH: b3 = 0xF8; break; + } + EMIT2(0xD3, add_1reg(b3, dst_reg)); + + if (src_reg != BPF_REG_4) + EMIT1(0x59); /* pop rcx */ + + if (insn->dst_reg == BPF_REG_4) + /* mov dst_reg, r11 */ + EMIT_mov(insn->dst_reg, AUX_REG); + break; + case BPF_ALU | BPF_END | BPF_FROM_BE: switch (imm32) { case 16: @@ -655,7 +690,7 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP | BPF_CALL: func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); - if (ctx->seen_ld_abs) { + if (seen_ld_abs) { EMIT2(0x41, 0x52); /* push %r10 */ EMIT2(0x41, 0x51); /* push %r9 */ /* need to adjust jmp offset, since @@ -669,7 +704,7 @@ xadd: if (is_imm8(insn->off)) return -EINVAL; } EMIT1_off32(0xE8, jmp_offset); - if (ctx->seen_ld_abs) { + if (seen_ld_abs) { EMIT2(0x41, 0x59); /* pop %r9 */ EMIT2(0x41, 0x5A); /* pop %r10 */ } @@ -774,7 +809,8 @@ emit_jmp: goto common_load; case BPF_LD | BPF_ABS | BPF_W: func = CHOOSE_LOAD_FUNC(imm32, sk_load_word); -common_load: ctx->seen_ld_abs = true; +common_load: + ctx->seen_ld_abs = seen_ld_abs = true; jmp_offset = func - (image + addrs[i]); if (!func || !is_simm32(jmp_offset)) { pr_err("unsupported bpf func %d addr %p image %p\n", @@ -841,13 +877,18 @@ common_load: ctx->seen_ld_abs = true; /* By design x64 JIT should support all BPF instructions * This error will be seen if new instruction was added * to interpreter, but not to JIT - * or if there is junk in sk_filter + * or if there is junk in bpf_prog */ pr_err("bpf_jit: unknown opcode %02x\n", insn->code); return -EINVAL; } ilen = prog - temp; + if (ilen > BPF_MAX_INSN_SIZE) { + pr_err("bpf_jit_compile fatal insn size error\n"); + return -EFAULT; + } + if (image) { if (unlikely(proglen + ilen > oldproglen)) { pr_err("bpf_jit_compile fatal error\n"); @@ -862,11 +903,11 @@ common_load: ctx->seen_ld_abs = true; return proglen; } -void bpf_jit_compile(struct sk_filter *prog) +void bpf_jit_compile(struct bpf_prog *prog) { } -void bpf_int_jit_compile(struct sk_filter *prog) +void bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; int proglen, oldproglen = 0; @@ -900,17 +941,20 @@ void bpf_int_jit_compile(struct sk_filter *prog) if (proglen <= 0) { image = NULL; if (header) - module_free(NULL, header); + bpf_jit_binary_free(header); goto out; } if (image) { - if (proglen != oldproglen) + if (proglen != oldproglen) { pr_err("bpf_jit: proglen=%d != oldproglen=%d\n", proglen, oldproglen); + goto out; + } break; } if (proglen == oldproglen) { - header = bpf_alloc_binary(proglen, &image); + header = bpf_jit_binary_alloc(proglen, &image, + 1, jit_fill_hole); if (!header) goto out; } @@ -924,29 +968,23 @@ void bpf_int_jit_compile(struct sk_filter *prog) bpf_flush_icache(header, image + proglen); set_memory_ro((unsigned long)header, header->pages); prog->bpf_func = (void *)image; - prog->jited = 1; + prog->jited = true; } out: kfree(addrs); } -static void bpf_jit_free_deferred(struct work_struct *work) +void bpf_jit_free(struct bpf_prog *fp) { - struct sk_filter *fp = container_of(work, struct sk_filter, work); unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; struct bpf_binary_header *header = (void *)addr; + if (!fp->jited) + goto free_filter; + set_memory_rw(addr, header->pages); - module_free(NULL, header); - kfree(fp); -} + bpf_jit_binary_free(header); -void bpf_jit_free(struct sk_filter *fp) -{ - if (fp->jited) { - INIT_WORK(&fp->work, bpf_jit_free_deferred); - schedule_work(&fp->work); - } else { - kfree(fp); - } +free_filter: + bpf_prog_unlock_free(fp); } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 379e8bd0dee..1d2e6392f5f 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -64,11 +64,11 @@ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, static int profile_exceptions_notify(unsigned int val, struct pt_regs *regs) { if (ctr_running) - model->check_ctrs(regs, &__get_cpu_var(cpu_msrs)); + model->check_ctrs(regs, this_cpu_ptr(&cpu_msrs)); else if (!nmi_enabled) return NMI_DONE; else - model->stop(&__get_cpu_var(cpu_msrs)); + model->stop(this_cpu_ptr(&cpu_msrs)); return NMI_HANDLED; } @@ -91,7 +91,7 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs) static void nmi_cpu_start(void *dummy) { - struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); + struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs); if (!msrs->controls) WARN_ON_ONCE(1); else @@ -111,7 +111,7 @@ static int nmi_start(void) static void nmi_cpu_stop(void *dummy) { - struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); + struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs); if (!msrs->controls) WARN_ON_ONCE(1); else diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 98ab13058f8..ad1d91f475a 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -372,7 +372,7 @@ static unsigned int get_stagger(void) { #ifdef CONFIG_SMP int cpu = smp_processor_id(); - return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map)); + return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map)); #endif return 0; } diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 5075371ab59..cfd1b132b8e 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -448,7 +448,7 @@ static void probe_pci_root_info(struct pci_root_info *info, return; size = sizeof(*info->res) * info->res_num; - info->res = kzalloc(size, GFP_KERNEL); + info->res = kzalloc_node(size, GFP_KERNEL, info->sd.node); if (!info->res) { info->res_num = 0; return; @@ -456,7 +456,7 @@ static void probe_pci_root_info(struct pci_root_info *info, size = sizeof(*info->res_offset) * info->res_num; info->res_num = 0; - info->res_offset = kzalloc(size, GFP_KERNEL); + info->res_offset = kzalloc_node(size, GFP_KERNEL, info->sd.node); if (!info->res_offset) { kfree(info->res); info->res = NULL; @@ -499,7 +499,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) if (node != NUMA_NO_NODE && !node_online(node)) node = NUMA_NO_NODE; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = kzalloc_node(sizeof(*info), GFP_KERNEL, node); if (!info) { printk(KERN_WARNING "pci_bus %04x:%02x: " "ignored (out of memory)\n", domain, busnum); diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 059a76c2973..7b20bccf364 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -81,14 +81,14 @@ struct pci_ops pci_root_ops = { */ DEFINE_RAW_SPINLOCK(pci_config_lock); -static int can_skip_ioresource_align(const struct dmi_system_id *d) +static int __init can_skip_ioresource_align(const struct dmi_system_id *d) { pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; printk(KERN_INFO "PCI: %s detected, can skip ISA alignment\n", d->ident); return 0; } -static const struct dmi_system_id can_skip_pciprobe_dmi_table[] = { +static const struct dmi_system_id can_skip_pciprobe_dmi_table[] __initconst = { /* * Systems where PCI IO resource ISA alignment can be skipped * when the ISA enable bit in the bridge control is not set @@ -186,7 +186,7 @@ void pcibios_remove_bus(struct pci_bus *bus) * on the kernel command line (which was parsed earlier). */ -static int set_bf_sort(const struct dmi_system_id *d) +static int __init set_bf_sort(const struct dmi_system_id *d) { if (pci_bf_sort == pci_bf_sort_default) { pci_bf_sort = pci_dmi_bf; @@ -195,8 +195,8 @@ static int set_bf_sort(const struct dmi_system_id *d) return 0; } -static void read_dmi_type_b1(const struct dmi_header *dm, - void *private_data) +static void __init read_dmi_type_b1(const struct dmi_header *dm, + void *private_data) { u8 *d = (u8 *)dm + 4; @@ -217,7 +217,7 @@ static void read_dmi_type_b1(const struct dmi_header *dm, } } -static int find_sort_method(const struct dmi_system_id *d) +static int __init find_sort_method(const struct dmi_system_id *d) { dmi_walk(read_dmi_type_b1, NULL); @@ -232,7 +232,7 @@ static int find_sort_method(const struct dmi_system_id *d) * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus) */ #ifdef __i386__ -static int assign_all_busses(const struct dmi_system_id *d) +static int __init assign_all_busses(const struct dmi_system_id *d) { pci_probe |= PCI_ASSIGN_ALL_BUSSES; printk(KERN_INFO "%s detected: enabling PCI bus# renumbering" @@ -241,7 +241,7 @@ static int assign_all_busses(const struct dmi_system_id *d) } #endif -static int set_scan_all(const struct dmi_system_id *d) +static int __init set_scan_all(const struct dmi_system_id *d) { printk(KERN_INFO "PCI: %s detected, enabling pci=pcie_scan_all\n", d->ident); @@ -249,7 +249,7 @@ static int set_scan_all(const struct dmi_system_id *d) return 0; } -static const struct dmi_system_id pciprobe_dmi_table[] = { +static const struct dmi_system_id pciprobe_dmi_table[] __initconst = { #ifdef __i386__ /* * Laptops which need pci=assign-busses to see Cardbus cards @@ -512,7 +512,7 @@ int __init pcibios_init(void) return 0; } -char * __init pcibios_setup(char *str) +char *__init pcibios_setup(char *str) { if (!strcmp(str, "off")) { pci_probe = 0; diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index b5e60268d93..9a2b7101ae8 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -350,8 +350,7 @@ static void pci_fixup_video(struct pci_dev *pdev) pci_read_config_word(pdev, PCI_COMMAND, &config); if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) { pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW; - dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n"); - vga_set_default_device(pdev); + dev_printk(KERN_DEBUG, &pdev->dev, "Video device with shadowed ROM\n"); } } } diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index a19ed92e74e..37c1435889c 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -162,6 +162,10 @@ pcibios_align_resource(void *data, const struct resource *res, return start; if (start & 0x300) start = (start + 0x3ff) & ~0x3ff; + } else if (res->flags & IORESOURCE_MEM) { + /* The low 1MB range is reserved for ISA cards */ + if (start < BIOS_END) + start = BIOS_END; } return start; } @@ -438,8 +442,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, */ prot |= _PAGE_CACHE_UC_MINUS; - prot |= _PAGE_IOMAP; /* creating a mapping for IO */ - vma->vm_page_prot = __pgprot(prot); if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 84b9d672843..b9958c36407 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -208,27 +208,31 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, static int intel_mid_pci_irq_enable(struct pci_dev *dev) { - u8 pin; - struct io_apic_irq_attr irq_attr; + int polarity; - pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER) + polarity = 0; /* active high */ + else + polarity = 1; /* active low */ /* * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to * IOAPIC RTE entries, so we just enable RTE for the device. */ - irq_attr.ioapic = mp_find_ioapic(dev->irq); - irq_attr.ioapic_pin = dev->irq; - irq_attr.trigger = 1; /* level */ - if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER) - irq_attr.polarity = 0; /* active high */ - else - irq_attr.polarity = 1; /* active low */ - io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr); + if (mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev))) + return -EBUSY; + if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0) + return -EBUSY; return 0; } +static void intel_mid_pci_irq_disable(struct pci_dev *dev) +{ + if (!mp_should_keep_irq(&dev->dev) && dev->irq > 0) + mp_unmap_irq(dev->irq); +} + struct pci_ops intel_mid_pci_ops = { .read = pci_read, .write = pci_write, @@ -245,6 +249,7 @@ int __init intel_mid_pci_init(void) pr_info("Intel MID platform detected, using MID PCI ops\n"); pci_mmcfg_late_init(); pcibios_enable_irq = intel_mid_pci_irq_enable; + pcibios_disable_irq = intel_mid_pci_irq_disable; pci_root_ops = intel_mid_pci_ops; pci_soc_mode = 1; /* Continue with standard init */ diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 84112f55dd7..eb500c2592a 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -26,6 +26,7 @@ static int acer_tm360_irqrouting; static struct irq_routing_table *pirq_table; static int pirq_enable_irq(struct pci_dev *dev); +static void pirq_disable_irq(struct pci_dev *dev); /* * Never use: 0, 1, 2 (timer, keyboard, and cascade) @@ -53,7 +54,7 @@ struct irq_router_handler { }; int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq; -void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL; +void (*pcibios_disable_irq)(struct pci_dev *dev) = pirq_disable_irq; /* * Check passed address for the PCI IRQ Routing Table signature @@ -1186,7 +1187,7 @@ void pcibios_penalize_isa_irq(int irq, int active) static int pirq_enable_irq(struct pci_dev *dev) { - u8 pin; + u8 pin = 0; pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (pin && !pcibios_lookup_irq(dev, 1)) { @@ -1227,8 +1228,6 @@ static int pirq_enable_irq(struct pci_dev *dev) } dev = temp_dev; if (irq >= 0) { - io_apic_set_pci_routing(&dev->dev, irq, - &irq_attr); dev->irq = irq; dev_info(&dev->dev, "PCI->APIC IRQ transform: " "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); @@ -1254,3 +1253,12 @@ static int pirq_enable_irq(struct pci_dev *dev) } return 0; } + +static void pirq_disable_irq(struct pci_dev *dev) +{ + if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) && + dev->irq) { + mp_unmap_irq(dev->irq); + dev->irq = 0; + } +} diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 248642f4bab..326198a4434 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -31,7 +31,7 @@ static DEFINE_MUTEX(pci_mmcfg_lock); LIST_HEAD(pci_mmcfg_list); -static __init void pci_mmconfig_remove(struct pci_mmcfg_region *cfg) +static void __init pci_mmconfig_remove(struct pci_mmcfg_region *cfg) { if (cfg->res.parent) release_resource(&cfg->res); @@ -39,7 +39,7 @@ static __init void pci_mmconfig_remove(struct pci_mmcfg_region *cfg) kfree(cfg); } -static __init void free_all_mmcfg(void) +static void __init free_all_mmcfg(void) { struct pci_mmcfg_region *cfg, *tmp; @@ -93,7 +93,7 @@ static struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, int start, return new; } -static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, +static struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start, int end, u64 addr) { struct pci_mmcfg_region *new; @@ -125,7 +125,7 @@ struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus) return NULL; } -static const char __init *pci_mmcfg_e7520(void) +static const char *__init pci_mmcfg_e7520(void) { u32 win; raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0xce, 2, &win); @@ -140,7 +140,7 @@ static const char __init *pci_mmcfg_e7520(void) return "Intel Corporation E7520 Memory Controller Hub"; } -static const char __init *pci_mmcfg_intel_945(void) +static const char *__init pci_mmcfg_intel_945(void) { u32 pciexbar, mask = 0, len = 0; @@ -184,7 +184,7 @@ static const char __init *pci_mmcfg_intel_945(void) return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub"; } -static const char __init *pci_mmcfg_amd_fam10h(void) +static const char *__init pci_mmcfg_amd_fam10h(void) { u32 low, high, address; u64 base, msr; @@ -235,21 +235,25 @@ static const char __init *pci_mmcfg_amd_fam10h(void) } static bool __initdata mcp55_checked; -static const char __init *pci_mmcfg_nvidia_mcp55(void) +static const char *__init pci_mmcfg_nvidia_mcp55(void) { int bus; int mcp55_mmconf_found = 0; - static const u32 extcfg_regnum = 0x90; - static const u32 extcfg_regsize = 4; - static const u32 extcfg_enable_mask = 1<<31; - static const u32 extcfg_start_mask = 0xff<<16; - static const int extcfg_start_shift = 16; - static const u32 extcfg_size_mask = 0x3<<28; - static const int extcfg_size_shift = 28; - static const int extcfg_sizebus[] = {0x100, 0x80, 0x40, 0x20}; - static const u32 extcfg_base_mask[] = {0x7ff8, 0x7ffc, 0x7ffe, 0x7fff}; - static const int extcfg_base_lshift = 25; + static const u32 extcfg_regnum __initconst = 0x90; + static const u32 extcfg_regsize __initconst = 4; + static const u32 extcfg_enable_mask __initconst = 1 << 31; + static const u32 extcfg_start_mask __initconst = 0xff << 16; + static const int extcfg_start_shift __initconst = 16; + static const u32 extcfg_size_mask __initconst = 0x3 << 28; + static const int extcfg_size_shift __initconst = 28; + static const int extcfg_sizebus[] __initconst = { + 0x100, 0x80, 0x40, 0x20 + }; + static const u32 extcfg_base_mask[] __initconst = { + 0x7ff8, 0x7ffc, 0x7ffe, 0x7fff + }; + static const int extcfg_base_lshift __initconst = 25; /* * do check if amd fam10h already took over @@ -302,7 +306,7 @@ struct pci_mmcfg_hostbridge_probe { const char *(*probe)(void); }; -static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = { +static const struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initconst = { { 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, pci_mmcfg_e7520 }, { 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_INTEL, diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index c77b24a8b2d..9b83b9051ae 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -79,13 +79,13 @@ union bios32 { static struct { unsigned long address; unsigned short segment; -} bios32_indirect = { 0, __KERNEL_CS }; +} bios32_indirect __initdata = { 0, __KERNEL_CS }; /* * Returns the entry point for the given service, NULL on error */ -static unsigned long bios32_service(unsigned long service) +static unsigned long __init bios32_service(unsigned long service) { unsigned char return_code; /* %al */ unsigned long address; /* %ebx */ @@ -124,7 +124,7 @@ static struct { static int pci_bios_present; -static int check_pcibios(void) +static int __init check_pcibios(void) { u32 signature, eax, ebx, ecx; u8 status, major_ver, minor_ver, hw_mech; @@ -312,7 +312,7 @@ static const struct pci_raw_ops pci_bios_access = { * Try to find PCI BIOS. */ -static const struct pci_raw_ops *pci_find_bios(void) +static const struct pci_raw_ops *__init pci_find_bios(void) { union bios32 *check; unsigned char sum; diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 905956f1646..093f5f4272d 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -23,6 +23,7 @@ #include <xen/features.h> #include <xen/events.h> #include <asm/xen/pci.h> +#include <asm/i8259.h> static int xen_pcifront_enable_irq(struct pci_dev *dev) { @@ -40,7 +41,7 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev) /* In PV DomU the Xen PCI backend puts the PIRQ in the interrupt line.*/ pirq = gsi; - if (gsi < NR_IRQS_LEGACY) + if (gsi < nr_legacy_irqs()) share = 0; rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront"); @@ -511,7 +512,7 @@ int __init pci_xen_initial_domain(void) xen_setup_acpi_sci(); __acpi_register_gsi = acpi_register_gsi_xen; /* Pre-allocate legacy irqs */ - for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { + for (irq = 0; irq < nr_legacy_irqs(); irq++) { int trigger, polarity; if (acpi_get_override_irq(irq, &trigger, &polarity) == -1) @@ -522,7 +523,7 @@ int __init pci_xen_initial_domain(void) true /* Map GSI to PIRQ */); } if (0 == nr_ioapics) { - for (irq = 0; irq < NR_IRQS_LEGACY; irq++) + for (irq = 0; irq < nr_legacy_irqs(); irq++) xen_bind_pirq_gsi_to_irq(irq, irq, 0, "xt-pic"); } return 0; diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index 8244f5ec2f4..701fd5843c8 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -135,14 +135,10 @@ static void __init sdv_arch_setup(void) sdv_serial_fixup(); } -#ifdef CONFIG_X86_IO_APIC static void sdv_pci_init(void) { x86_of_pci_init(); - /* We can't set this earlier, because we need to calibrate the timer */ - legacy_pic = &null_legacy_pic; } -#endif /* * CE4100 specific x86_init function overrides and early setup @@ -155,7 +151,9 @@ void __init x86_ce4100_early_setup(void) x86_init.resources.probe_roms = x86_init_noop; x86_init.mpparse.get_smp_config = x86_init_uint_noop; x86_init.mpparse.find_smp_config = x86_init_noop; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck; x86_init.pci.init = ce4100_pci_init; + x86_init.pci.init_irq = sdv_pci_init; /* * By default, the reboot method is ACPI which is supported by the @@ -166,10 +164,5 @@ void __init x86_ce4100_early_setup(void) */ reboot_type = BOOT_KBD; -#ifdef CONFIG_X86_IO_APIC - x86_init.pci.init_irq = sdv_pci_init; - x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck; -#endif - pm_power_off = ce4100_power_off; } diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile index d51045afcaa..2846aaab510 100644 --- a/arch/x86/platform/efi/Makefile +++ b/arch/x86/platform/efi/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o +obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 87fc96bcc13..850da94fef3 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -56,13 +56,6 @@ #define EFI_DEBUG -#define EFI_MIN_RESERVE 5120 - -#define EFI_DUMMY_GUID \ - EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9) - -static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 }; - struct efi_memory_map memmap; static struct efi efi_phys __initdata; @@ -95,139 +88,6 @@ static int __init setup_add_efi_memmap(char *arg) } early_param("add_efi_memmap", setup_add_efi_memmap); -static bool efi_no_storage_paranoia; - -static int __init setup_storage_paranoia(char *arg) -{ - efi_no_storage_paranoia = true; - return 0; -} -early_param("efi_no_storage_paranoia", setup_storage_paranoia); - -static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) -{ - unsigned long flags; - efi_status_t status; - - spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt(get_time, tm, tc); - spin_unlock_irqrestore(&rtc_lock, flags); - return status; -} - -static efi_status_t virt_efi_set_time(efi_time_t *tm) -{ - unsigned long flags; - efi_status_t status; - - spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt(set_time, tm); - spin_unlock_irqrestore(&rtc_lock, flags); - return status; -} - -static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, - efi_bool_t *pending, - efi_time_t *tm) -{ - unsigned long flags; - efi_status_t status; - - spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt(get_wakeup_time, enabled, pending, tm); - spin_unlock_irqrestore(&rtc_lock, flags); - return status; -} - -static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) -{ - unsigned long flags; - efi_status_t status; - - spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt(set_wakeup_time, enabled, tm); - spin_unlock_irqrestore(&rtc_lock, flags); - return status; -} - -static efi_status_t virt_efi_get_variable(efi_char16_t *name, - efi_guid_t *vendor, - u32 *attr, - unsigned long *data_size, - void *data) -{ - return efi_call_virt(get_variable, - name, vendor, attr, - data_size, data); -} - -static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, - efi_char16_t *name, - efi_guid_t *vendor) -{ - return efi_call_virt(get_next_variable, - name_size, name, vendor); -} - -static efi_status_t virt_efi_set_variable(efi_char16_t *name, - efi_guid_t *vendor, - u32 attr, - unsigned long data_size, - void *data) -{ - return efi_call_virt(set_variable, - name, vendor, attr, - data_size, data); -} - -static efi_status_t virt_efi_query_variable_info(u32 attr, - u64 *storage_space, - u64 *remaining_space, - u64 *max_variable_size) -{ - if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) - return EFI_UNSUPPORTED; - - return efi_call_virt(query_variable_info, attr, storage_space, - remaining_space, max_variable_size); -} - -static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) -{ - return efi_call_virt(get_next_high_mono_count, count); -} - -static void virt_efi_reset_system(int reset_type, - efi_status_t status, - unsigned long data_size, - efi_char16_t *data) -{ - __efi_call_virt(reset_system, reset_type, status, - data_size, data); -} - -static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules, - unsigned long count, - unsigned long sg_list) -{ - if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) - return EFI_UNSUPPORTED; - - return efi_call_virt(update_capsule, capsules, count, sg_list); -} - -static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules, - unsigned long count, - u64 *max_size, - int *reset_type) -{ - if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) - return EFI_UNSUPPORTED; - - return efi_call_virt(query_capsule_caps, capsules, count, max_size, - reset_type); -} - static efi_status_t __init phys_efi_set_virtual_address_map( unsigned long memory_map_size, unsigned long descriptor_size, @@ -244,42 +104,6 @@ static efi_status_t __init phys_efi_set_virtual_address_map( return status; } -int efi_set_rtc_mmss(const struct timespec *now) -{ - unsigned long nowtime = now->tv_sec; - efi_status_t status; - efi_time_t eft; - efi_time_cap_t cap; - struct rtc_time tm; - - status = efi.get_time(&eft, &cap); - if (status != EFI_SUCCESS) { - pr_err("Oops: efitime: can't read time!\n"); - return -1; - } - - rtc_time_to_tm(nowtime, &tm); - if (!rtc_valid_tm(&tm)) { - eft.year = tm.tm_year + 1900; - eft.month = tm.tm_mon + 1; - eft.day = tm.tm_mday; - eft.minute = tm.tm_min; - eft.second = tm.tm_sec; - eft.nanosecond = 0; - } else { - pr_err("%s: Invalid EFI RTC value: write of %lx to EFI RTC failed\n", - __func__, nowtime); - return -1; - } - - status = efi.set_time(&eft); - if (status != EFI_SUCCESS) { - pr_err("Oops: efitime: can't write time!\n"); - return -1; - } - return 0; -} - void efi_get_time(struct timespec *now) { efi_status_t status; @@ -350,6 +174,9 @@ int __init efi_memblock_x86_reserve_range(void) struct efi_info *e = &boot_params.efi_info; unsigned long pmap; + if (efi_enabled(EFI_PARAVIRT)) + return 0; + #ifdef CONFIG_X86_32 /* Can't handle data above 4GB at this time */ if (e->efi_memmap_hi) { @@ -392,69 +219,15 @@ static void __init print_efi_memmap(void) #endif /* EFI_DEBUG */ } -void __init efi_reserve_boot_services(void) -{ - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; - u64 start = md->phys_addr; - u64 size = md->num_pages << EFI_PAGE_SHIFT; - - if (md->type != EFI_BOOT_SERVICES_CODE && - md->type != EFI_BOOT_SERVICES_DATA) - continue; - /* Only reserve where possible: - * - Not within any already allocated areas - * - Not over any memory area (really needed, if above?) - * - Not within any part of the kernel - * - Not the bios reserved area - */ - if ((start + size > __pa_symbol(_text) - && start <= __pa_symbol(_end)) || - !e820_all_mapped(start, start+size, E820_RAM) || - memblock_is_region_reserved(start, size)) { - /* Could not reserve, skip it */ - md->num_pages = 0; - memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n", - start, start+size-1); - } else - memblock_reserve(start, size); - } -} - void __init efi_unmap_memmap(void) { clear_bit(EFI_MEMMAP, &efi.flags); if (memmap.map) { - early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); + early_memunmap(memmap.map, memmap.nr_map * memmap.desc_size); memmap.map = NULL; } } -void __init efi_free_boot_services(void) -{ - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; - unsigned long long start = md->phys_addr; - unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; - - if (md->type != EFI_BOOT_SERVICES_CODE && - md->type != EFI_BOOT_SERVICES_DATA) - continue; - - /* Could not reserve boot area */ - if (!size) - continue; - - free_bootmem_late(start, size); - } - - efi_unmap_memmap(); -} - static int __init efi_systab_init(void *phys) { if (efi_enabled(EFI_64BIT)) { @@ -467,12 +240,12 @@ static int __init efi_systab_init(void *phys) if (!data) return -ENOMEM; } - systab64 = early_ioremap((unsigned long)phys, + systab64 = early_memremap((unsigned long)phys, sizeof(*systab64)); if (systab64 == NULL) { pr_err("Couldn't map the system table!\n"); if (data) - early_iounmap(data, sizeof(*data)); + early_memunmap(data, sizeof(*data)); return -ENOMEM; } @@ -504,9 +277,9 @@ static int __init efi_systab_init(void *phys) systab64->tables; tmp |= data ? data->tables : systab64->tables; - early_iounmap(systab64, sizeof(*systab64)); + early_memunmap(systab64, sizeof(*systab64)); if (data) - early_iounmap(data, sizeof(*data)); + early_memunmap(data, sizeof(*data)); #ifdef CONFIG_X86_32 if (tmp >> 32) { pr_err("EFI data located above 4GB, disabling EFI.\n"); @@ -516,7 +289,7 @@ static int __init efi_systab_init(void *phys) } else { efi_system_table_32_t *systab32; - systab32 = early_ioremap((unsigned long)phys, + systab32 = early_memremap((unsigned long)phys, sizeof(*systab32)); if (systab32 == NULL) { pr_err("Couldn't map the system table!\n"); @@ -537,7 +310,7 @@ static int __init efi_systab_init(void *phys) efi_systab.nr_tables = systab32->nr_tables; efi_systab.tables = systab32->tables; - early_iounmap(systab32, sizeof(*systab32)); + early_memunmap(systab32, sizeof(*systab32)); } efi.systab = &efi_systab; @@ -563,7 +336,7 @@ static int __init efi_runtime_init32(void) { efi_runtime_services_32_t *runtime; - runtime = early_ioremap((unsigned long)efi.systab->runtime, + runtime = early_memremap((unsigned long)efi.systab->runtime, sizeof(efi_runtime_services_32_t)); if (!runtime) { pr_err("Could not map the runtime service table!\n"); @@ -578,7 +351,7 @@ static int __init efi_runtime_init32(void) efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) (unsigned long)runtime->set_virtual_address_map; - early_iounmap(runtime, sizeof(efi_runtime_services_32_t)); + early_memunmap(runtime, sizeof(efi_runtime_services_32_t)); return 0; } @@ -587,7 +360,7 @@ static int __init efi_runtime_init64(void) { efi_runtime_services_64_t *runtime; - runtime = early_ioremap((unsigned long)efi.systab->runtime, + runtime = early_memremap((unsigned long)efi.systab->runtime, sizeof(efi_runtime_services_64_t)); if (!runtime) { pr_err("Could not map the runtime service table!\n"); @@ -602,7 +375,7 @@ static int __init efi_runtime_init64(void) efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) (unsigned long)runtime->set_virtual_address_map; - early_iounmap(runtime, sizeof(efi_runtime_services_64_t)); + early_memunmap(runtime, sizeof(efi_runtime_services_64_t)); return 0; } @@ -616,14 +389,24 @@ static int __init efi_runtime_init(void) * the runtime services table so that we can grab the physical * address of several of the EFI runtime functions, needed to * set the firmware into virtual mode. + * + * When EFI_PARAVIRT is in force then we could not map runtime + * service memory region because we do not have direct access to it. + * However, runtime services are available through proxy functions + * (e.g. in case of Xen dom0 EFI implementation they call special + * hypercall which executes relevant EFI functions) and that is why + * they are always enabled. */ - if (efi_enabled(EFI_64BIT)) - rv = efi_runtime_init64(); - else - rv = efi_runtime_init32(); - if (rv) - return rv; + if (!efi_enabled(EFI_PARAVIRT)) { + if (efi_enabled(EFI_64BIT)) + rv = efi_runtime_init64(); + else + rv = efi_runtime_init32(); + + if (rv) + return rv; + } set_bit(EFI_RUNTIME_SERVICES, &efi.flags); @@ -632,8 +415,11 @@ static int __init efi_runtime_init(void) static int __init efi_memmap_init(void) { + if (efi_enabled(EFI_PARAVIRT)) + return 0; + /* Map the EFI memory map */ - memmap.map = early_ioremap((unsigned long)memmap.phys_map, + memmap.map = early_memremap((unsigned long)memmap.phys_map, memmap.nr_map * memmap.desc_size); if (memmap.map == NULL) { pr_err("Could not map the memory map!\n"); @@ -649,62 +435,6 @@ static int __init efi_memmap_init(void) return 0; } -/* - * A number of config table entries get remapped to virtual addresses - * after entering EFI virtual mode. However, the kexec kernel requires - * their physical addresses therefore we pass them via setup_data and - * correct those entries to their respective physical addresses here. - * - * Currently only handles smbios which is necessary for some firmware - * implementation. - */ -static int __init efi_reuse_config(u64 tables, int nr_tables) -{ - int i, sz, ret = 0; - void *p, *tablep; - struct efi_setup_data *data; - - if (!efi_setup) - return 0; - - if (!efi_enabled(EFI_64BIT)) - return 0; - - data = early_memremap(efi_setup, sizeof(*data)); - if (!data) { - ret = -ENOMEM; - goto out; - } - - if (!data->smbios) - goto out_memremap; - - sz = sizeof(efi_config_table_64_t); - - p = tablep = early_memremap(tables, nr_tables * sz); - if (!p) { - pr_err("Could not map Configuration table!\n"); - ret = -ENOMEM; - goto out_memremap; - } - - for (i = 0; i < efi.systab->nr_tables; i++) { - efi_guid_t guid; - - guid = ((efi_config_table_64_t *)p)->guid; - - if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID)) - ((efi_config_table_64_t *)p)->table = data->smbios; - p += sz; - } - early_iounmap(tablep, nr_tables * sz); - -out_memremap: - early_iounmap(data, sizeof(*data)); -out: - return ret; -} - void __init efi_init(void) { efi_char16_t *c16; @@ -728,8 +458,6 @@ void __init efi_init(void) if (efi_systab_init(efi_phys.systab)) return; - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - efi.config_table = (unsigned long)efi.systab->tables; efi.fw_vendor = (unsigned long)efi.systab->fw_vendor; efi.runtime = (unsigned long)efi.systab->runtime; @@ -737,14 +465,14 @@ void __init efi_init(void) /* * Show what we know for posterity */ - c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); + c16 = tmp = early_memremap(efi.systab->fw_vendor, 2); if (c16) { for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) vendor[i] = *c16++; vendor[i] = '\0'; } else pr_err("Could not map the firmware vendor!\n"); - early_iounmap(tmp, 2); + early_memunmap(tmp, 2); pr_info("EFI v%u.%.02u by %s\n", efi.systab->hdr.revision >> 16, @@ -770,8 +498,6 @@ void __init efi_init(void) if (efi_memmap_init()) return; - set_bit(EFI_MEMMAP, &efi.flags); - print_efi_memmap(); } @@ -847,22 +573,6 @@ void __init old_map_region(efi_memory_desc_t *md) (unsigned long long)md->phys_addr); } -static void native_runtime_setup(void) -{ - efi.get_time = virt_efi_get_time; - efi.set_time = virt_efi_set_time; - efi.get_wakeup_time = virt_efi_get_wakeup_time; - efi.set_wakeup_time = virt_efi_set_wakeup_time; - efi.get_variable = virt_efi_get_variable; - efi.get_next_variable = virt_efi_get_next_variable; - efi.set_variable = virt_efi_set_variable; - efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; - efi.reset_system = virt_efi_reset_system; - efi.query_variable_info = virt_efi_query_variable_info; - efi.update_capsule = virt_efi_update_capsule; - efi.query_capsule_caps = virt_efi_query_capsule_caps; -} - /* Merge contiguous regions of the same type and attribute */ static void __init efi_merge_regions(void) { @@ -1049,7 +759,7 @@ static void __init kexec_enter_virtual_mode(void) */ efi.runtime_version = efi_systab.hdr.revision; - native_runtime_setup(); + efi_native_runtime_setup(); efi.set_virtual_address_map = NULL; @@ -1057,11 +767,7 @@ static void __init kexec_enter_virtual_mode(void) runtime_code_page_mkexec(); /* clean DUMMY object */ - efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, - EFI_VARIABLE_NON_VOLATILE | - EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS, - 0, NULL); + efi_delete_dummy_variable(); #endif } @@ -1142,7 +848,7 @@ static void __init __efi_enter_virtual_mode(void) efi.runtime_version = efi_systab.hdr.revision; if (efi_is_native()) - native_runtime_setup(); + efi_native_runtime_setup(); else efi_thunk_runtime_setup(); @@ -1179,15 +885,14 @@ static void __init __efi_enter_virtual_mode(void) free_pages((unsigned long)new_memmap, pg_shift); /* clean DUMMY object */ - efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, - EFI_VARIABLE_NON_VOLATILE | - EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS, - 0, NULL); + efi_delete_dummy_variable(); } void __init efi_enter_virtual_mode(void) { + if (efi_enabled(EFI_PARAVIRT)) + return; + if (efi_setup) kexec_enter_virtual_mode(); else @@ -1220,6 +925,9 @@ u64 efi_mem_attributes(unsigned long phys_addr) efi_memory_desc_t *md; void *p; + if (!efi_enabled(EFI_MEMMAP)) + return 0; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; if ((md->phys_addr <= phys_addr) && @@ -1230,86 +938,6 @@ u64 efi_mem_attributes(unsigned long phys_addr) return 0; } -/* - * Some firmware implementations refuse to boot if there's insufficient space - * in the variable store. Ensure that we never use more than a safe limit. - * - * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable - * store. - */ -efi_status_t efi_query_variable_store(u32 attributes, unsigned long size) -{ - efi_status_t status; - u64 storage_size, remaining_size, max_size; - - if (!(attributes & EFI_VARIABLE_NON_VOLATILE)) - return 0; - - status = efi.query_variable_info(attributes, &storage_size, - &remaining_size, &max_size); - if (status != EFI_SUCCESS) - return status; - - /* - * We account for that by refusing the write if permitting it would - * reduce the available space to under 5KB. This figure was provided by - * Samsung, so should be safe. - */ - if ((remaining_size - size < EFI_MIN_RESERVE) && - !efi_no_storage_paranoia) { - - /* - * Triggering garbage collection may require that the firmware - * generate a real EFI_OUT_OF_RESOURCES error. We can force - * that by attempting to use more space than is available. - */ - unsigned long dummy_size = remaining_size + 1024; - void *dummy = kzalloc(dummy_size, GFP_ATOMIC); - - if (!dummy) - return EFI_OUT_OF_RESOURCES; - - status = efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, - EFI_VARIABLE_NON_VOLATILE | - EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS, - dummy_size, dummy); - - if (status == EFI_SUCCESS) { - /* - * This should have failed, so if it didn't make sure - * that we delete it... - */ - efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, - EFI_VARIABLE_NON_VOLATILE | - EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS, - 0, dummy); - } - - kfree(dummy); - - /* - * The runtime code may now have triggered a garbage collection - * run, so check the variable info again - */ - status = efi.query_variable_info(attributes, &storage_size, - &remaining_size, &max_size); - - if (status != EFI_SUCCESS) - return status; - - /* - * There still isn't enough room, so return an error - */ - if (remaining_size - size < EFI_MIN_RESERVE) - return EFI_OUT_OF_RESOURCES; - } - - return EFI_SUCCESS; -} -EXPORT_SYMBOL_GPL(efi_query_variable_store); - static int __init parse_efi_cmdline(char *str) { if (*str == '=') @@ -1321,22 +949,3 @@ static int __init parse_efi_cmdline(char *str) return 0; } early_param("efi", parse_efi_cmdline); - -void __init efi_apply_memmap_quirks(void) -{ - /* - * Once setup is done earlier, unmap the EFI memory map on mismatched - * firmware/kernel architectures since there is no support for runtime - * services. - */ - if (!efi_runtime_supported()) { - pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); - efi_unmap_memmap(); - } - - /* - * UV doesn't support the new EFI pagetable mapping yet. - */ - if (is_uv_system()) - set_bit(EFI_OLD_MEMMAP, &efi.flags); -} diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c new file mode 100644 index 00000000000..1c7380da65f --- /dev/null +++ b/arch/x86/platform/efi/quirks.c @@ -0,0 +1,290 @@ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/time.h> +#include <linux/types.h> +#include <linux/efi.h> +#include <linux/slab.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> +#include <linux/acpi.h> +#include <asm/efi.h> +#include <asm/uv/uv.h> + +#define EFI_MIN_RESERVE 5120 + +#define EFI_DUMMY_GUID \ + EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9) + +static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 }; + +static bool efi_no_storage_paranoia; + +/* + * Some firmware implementations refuse to boot if there's insufficient + * space in the variable store. The implementation of garbage collection + * in some FW versions causes stale (deleted) variables to take up space + * longer than intended and space is only freed once the store becomes + * almost completely full. + * + * Enabling this option disables the space checks in + * efi_query_variable_store() and forces garbage collection. + * + * Only enable this option if deleting EFI variables does not free up + * space in your variable store, e.g. if despite deleting variables + * you're unable to create new ones. + */ +static int __init setup_storage_paranoia(char *arg) +{ + efi_no_storage_paranoia = true; + return 0; +} +early_param("efi_no_storage_paranoia", setup_storage_paranoia); + +/* + * Deleting the dummy variable which kicks off garbage collection +*/ +void efi_delete_dummy_variable(void) +{ + efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, + EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS, + 0, NULL); +} + +/* + * Some firmware implementations refuse to boot if there's insufficient space + * in the variable store. Ensure that we never use more than a safe limit. + * + * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable + * store. + */ +efi_status_t efi_query_variable_store(u32 attributes, unsigned long size) +{ + efi_status_t status; + u64 storage_size, remaining_size, max_size; + + if (!(attributes & EFI_VARIABLE_NON_VOLATILE)) + return 0; + + status = efi.query_variable_info(attributes, &storage_size, + &remaining_size, &max_size); + if (status != EFI_SUCCESS) + return status; + + /* + * We account for that by refusing the write if permitting it would + * reduce the available space to under 5KB. This figure was provided by + * Samsung, so should be safe. + */ + if ((remaining_size - size < EFI_MIN_RESERVE) && + !efi_no_storage_paranoia) { + + /* + * Triggering garbage collection may require that the firmware + * generate a real EFI_OUT_OF_RESOURCES error. We can force + * that by attempting to use more space than is available. + */ + unsigned long dummy_size = remaining_size + 1024; + void *dummy = kzalloc(dummy_size, GFP_ATOMIC); + + if (!dummy) + return EFI_OUT_OF_RESOURCES; + + status = efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, + EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS, + dummy_size, dummy); + + if (status == EFI_SUCCESS) { + /* + * This should have failed, so if it didn't make sure + * that we delete it... + */ + efi_delete_dummy_variable(); + } + + kfree(dummy); + + /* + * The runtime code may now have triggered a garbage collection + * run, so check the variable info again + */ + status = efi.query_variable_info(attributes, &storage_size, + &remaining_size, &max_size); + + if (status != EFI_SUCCESS) + return status; + + /* + * There still isn't enough room, so return an error + */ + if (remaining_size - size < EFI_MIN_RESERVE) + return EFI_OUT_OF_RESOURCES; + } + + return EFI_SUCCESS; +} +EXPORT_SYMBOL_GPL(efi_query_variable_store); + +/* + * The UEFI specification makes it clear that the operating system is free to do + * whatever it wants with boot services code after ExitBootServices() has been + * called. Ignoring this recommendation a significant bunch of EFI implementations + * continue calling into boot services code (SetVirtualAddressMap). In order to + * work around such buggy implementations we reserve boot services region during + * EFI init and make sure it stays executable. Then, after SetVirtualAddressMap(), it +* is discarded. +*/ +void __init efi_reserve_boot_services(void) +{ + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + efi_memory_desc_t *md = p; + u64 start = md->phys_addr; + u64 size = md->num_pages << EFI_PAGE_SHIFT; + + if (md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) + continue; + /* Only reserve where possible: + * - Not within any already allocated areas + * - Not over any memory area (really needed, if above?) + * - Not within any part of the kernel + * - Not the bios reserved area + */ + if ((start + size > __pa_symbol(_text) + && start <= __pa_symbol(_end)) || + !e820_all_mapped(start, start+size, E820_RAM) || + memblock_is_region_reserved(start, size)) { + /* Could not reserve, skip it */ + md->num_pages = 0; + memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n", + start, start+size-1); + } else + memblock_reserve(start, size); + } +} + +void __init efi_free_boot_services(void) +{ + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + efi_memory_desc_t *md = p; + unsigned long long start = md->phys_addr; + unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + + if (md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) + continue; + + /* Could not reserve boot area */ + if (!size) + continue; + + free_bootmem_late(start, size); + } + + efi_unmap_memmap(); +} + +/* + * A number of config table entries get remapped to virtual addresses + * after entering EFI virtual mode. However, the kexec kernel requires + * their physical addresses therefore we pass them via setup_data and + * correct those entries to their respective physical addresses here. + * + * Currently only handles smbios which is necessary for some firmware + * implementation. + */ +int __init efi_reuse_config(u64 tables, int nr_tables) +{ + int i, sz, ret = 0; + void *p, *tablep; + struct efi_setup_data *data; + + if (!efi_setup) + return 0; + + if (!efi_enabled(EFI_64BIT)) + return 0; + + data = early_memremap(efi_setup, sizeof(*data)); + if (!data) { + ret = -ENOMEM; + goto out; + } + + if (!data->smbios) + goto out_memremap; + + sz = sizeof(efi_config_table_64_t); + + p = tablep = early_memremap(tables, nr_tables * sz); + if (!p) { + pr_err("Could not map Configuration table!\n"); + ret = -ENOMEM; + goto out_memremap; + } + + for (i = 0; i < efi.systab->nr_tables; i++) { + efi_guid_t guid; + + guid = ((efi_config_table_64_t *)p)->guid; + + if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID)) + ((efi_config_table_64_t *)p)->table = data->smbios; + p += sz; + } + early_memunmap(tablep, nr_tables * sz); + +out_memremap: + early_memunmap(data, sizeof(*data)); +out: + return ret; +} + +void __init efi_apply_memmap_quirks(void) +{ + /* + * Once setup is done earlier, unmap the EFI memory map on mismatched + * firmware/kernel architectures since there is no support for runtime + * services. + */ + if (!efi_runtime_supported()) { + pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); + efi_unmap_memmap(); + } + + /* + * UV doesn't support the new EFI pagetable mapping yet. + */ + if (is_uv_system()) + set_bit(EFI_OLD_MEMMAP, &efi.flags); +} + +/* + * For most modern platforms the preferred method of powering off is via + * ACPI. However, there are some that are known to require the use of + * EFI runtime services and for which ACPI does not work at all. + * + * Using EFI is a last resort, to be used only if no other option + * exists. + */ +bool efi_reboot_required(void) +{ + if (!acpi_gbl_reduced_hardware) + return false; + + efi_reboot_quirk_mode = EFI_RESET_WARM; + return true; +} + +bool efi_poweroff_required(void) +{ + return !!acpi_gbl_reduced_hardware; +} diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c index 973cf3bfa9f..0b283d4d0ad 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c @@ -26,28 +26,18 @@ static struct platform_device wdt_dev = { static int tangier_probe(struct platform_device *pdev) { - int ioapic; - int irq; + int gsi; struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data; - struct io_apic_irq_attr irq_attr = { 0 }; if (!pdata) return -EINVAL; - irq = pdata->irq; - ioapic = mp_find_ioapic(irq); - if (ioapic >= 0) { - int ret; - irq_attr.ioapic = ioapic; - irq_attr.ioapic_pin = irq; - irq_attr.trigger = 1; - /* irq_attr.polarity = 0; -> Active high */ - ret = io_apic_set_pci_routing(NULL, irq, &irq_attr); - if (ret) - return ret; - } else { + /* IOAPIC builds identity mapping between GSI and IRQ on MID */ + gsi = pdata->irq; + if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) || + mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC) <= 0) { dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", - irq); + gsi); return -EINVAL; } diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 994c40bd7cb..3c53a90fdb1 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -432,9 +432,8 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) struct sfi_table_simple *sb; struct sfi_device_table_entry *pentry; struct devs_id *dev = NULL; - int num, i; - int ioapic; - struct io_apic_irq_attr irq_attr; + int num, i, ret; + int polarity; sb = (struct sfi_table_simple *)table; num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry); @@ -448,35 +447,30 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) * devices, but they have separate RTE entry in IOAPIC * so we have to enable them one by one here */ - ioapic = mp_find_ioapic(irq); - if (ioapic >= 0) { - irq_attr.ioapic = ioapic; - irq_attr.ioapic_pin = irq; - irq_attr.trigger = 1; - if (intel_mid_identify_cpu() == - INTEL_MID_CPU_CHIP_TANGIER) { - if (!strncmp(pentry->name, - "r69001-ts-i2c", 13)) - /* active low */ - irq_attr.polarity = 1; - else if (!strncmp(pentry->name, - "synaptics_3202", 14)) - /* active low */ - irq_attr.polarity = 1; - else if (irq == 41) - /* fast_int_1 */ - irq_attr.polarity = 1; - else - /* active high */ - irq_attr.polarity = 0; - } else { - /* PNW and CLV go with active low */ - irq_attr.polarity = 1; - } - io_apic_set_pci_routing(NULL, irq, &irq_attr); + if (intel_mid_identify_cpu() == + INTEL_MID_CPU_CHIP_TANGIER) { + if (!strncmp(pentry->name, "r69001-ts-i2c", 13)) + /* active low */ + polarity = 1; + else if (!strncmp(pentry->name, + "synaptics_3202", 14)) + /* active low */ + polarity = 1; + else if (irq == 41) + /* fast_int_1 */ + polarity = 1; + else + /* active high */ + polarity = 0; + } else { + /* PNW and CLV go with active low */ + polarity = 1; } - } else { - irq = 0; /* No irq */ + + ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE); + if (ret == 0) + ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC); + WARN_ON(ret < 0); } dev = get_device_id(pentry->type, pentry->name); diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index bcd1a703e3e..2a8a74f3bd7 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -25,6 +25,7 @@ #include <linux/init.h> #include <linux/sfi.h> #include <linux/io.h> +#include <linux/irqdomain.h> #include <asm/io_apic.h> #include <asm/mpspec.h> @@ -70,19 +71,26 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table) #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC +static struct irq_domain_ops sfi_ioapic_irqdomain_ops = { + .map = mp_irqdomain_map, +}; static int __init sfi_parse_ioapic(struct sfi_table_header *table) { struct sfi_table_simple *sb; struct sfi_apic_table_entry *pentry; int i, num; + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_STRICT, + .ops = &sfi_ioapic_irqdomain_ops, + }; sb = (struct sfi_table_simple *)table; num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); pentry = (struct sfi_apic_table_entry *)sb->pentry; for (i = 0; i < num; i++) { - mp_register_ioapic(i, pentry->phys_addr, gsi_top); + mp_register_ioapic(i, pentry->phys_addr, gsi_top, &cfg); pentry++; } diff --git a/arch/x86/platform/ts5500/ts5500.c b/arch/x86/platform/ts5500/ts5500.c index 9471b9456f2..baf16e72e66 100644 --- a/arch/x86/platform/ts5500/ts5500.c +++ b/arch/x86/platform/ts5500/ts5500.c @@ -1,7 +1,7 @@ /* * Technologic Systems TS-5500 Single Board Computer support * - * Copyright (C) 2013 Savoir-faire Linux Inc. + * Copyright (C) 2013-2014 Savoir-faire Linux Inc. * Vivien Didelot <vivien.didelot@savoirfairelinux.com> * * This program is free software; you can redistribute it and/or modify it under @@ -15,8 +15,8 @@ * state or available options. For further information about sysfs entries, see * Documentation/ABI/testing/sysfs-platform-ts5500. * - * This code actually supports the TS-5500 platform, but it may be extended to - * support similar Technologic Systems x86-based platforms, such as the TS-5600. + * This code may be extended to support similar x86-based platforms. + * Actually, the TS-5500 and TS-5400 are supported. */ #include <linux/delay.h> @@ -32,6 +32,7 @@ /* Product code register */ #define TS5500_PRODUCT_CODE_ADDR 0x74 #define TS5500_PRODUCT_CODE 0x60 /* TS-5500 product code */ +#define TS5400_PRODUCT_CODE 0x40 /* TS-5400 product code */ /* SRAM/RS-485/ADC options, and RS-485 RTS/Automatic RS-485 flags register */ #define TS5500_SRAM_RS485_ADC_ADDR 0x75 @@ -66,6 +67,7 @@ /** * struct ts5500_sbc - TS-5500 board description + * @name: Board model name. * @id: Board product ID. * @sram: Flag for SRAM option. * @rs485: Flag for RS-485 option. @@ -75,6 +77,7 @@ * @jumpers: Bitfield for jumpers' state. */ struct ts5500_sbc { + const char *name; int id; bool sram; bool rs485; @@ -122,13 +125,16 @@ static int __init ts5500_detect_config(struct ts5500_sbc *sbc) if (!request_region(TS5500_PRODUCT_CODE_ADDR, 4, "ts5500")) return -EBUSY; - tmp = inb(TS5500_PRODUCT_CODE_ADDR); - if (tmp != TS5500_PRODUCT_CODE) { - pr_err("This platform is not a TS-5500 (found ID 0x%x)\n", tmp); + sbc->id = inb(TS5500_PRODUCT_CODE_ADDR); + if (sbc->id == TS5500_PRODUCT_CODE) { + sbc->name = "TS-5500"; + } else if (sbc->id == TS5400_PRODUCT_CODE) { + sbc->name = "TS-5400"; + } else { + pr_err("ts5500: unknown product code 0x%x\n", sbc->id); ret = -ENODEV; goto cleanup; } - sbc->id = tmp; tmp = inb(TS5500_SRAM_RS485_ADC_ADDR); sbc->sram = tmp & TS5500_SRAM; @@ -147,48 +153,52 @@ cleanup: return ret; } -static ssize_t ts5500_show_id(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t name_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct ts5500_sbc *sbc = dev_get_drvdata(dev); - return sprintf(buf, "0x%.2x\n", sbc->id); + return sprintf(buf, "%s\n", sbc->name); } +static DEVICE_ATTR_RO(name); -static ssize_t ts5500_show_jumpers(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t id_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct ts5500_sbc *sbc = dev_get_drvdata(dev); - return sprintf(buf, "0x%.2x\n", sbc->jumpers >> 1); + return sprintf(buf, "0x%.2x\n", sbc->id); } +static DEVICE_ATTR_RO(id); -#define TS5500_SHOW(field) \ - static ssize_t ts5500_show_##field(struct device *dev, \ - struct device_attribute *attr, \ - char *buf) \ - { \ - struct ts5500_sbc *sbc = dev_get_drvdata(dev); \ - return sprintf(buf, "%d\n", sbc->field); \ - } - -TS5500_SHOW(sram) -TS5500_SHOW(rs485) -TS5500_SHOW(adc) -TS5500_SHOW(ereset) -TS5500_SHOW(itr) +static ssize_t jumpers_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ts5500_sbc *sbc = dev_get_drvdata(dev); -static DEVICE_ATTR(id, S_IRUGO, ts5500_show_id, NULL); -static DEVICE_ATTR(jumpers, S_IRUGO, ts5500_show_jumpers, NULL); -static DEVICE_ATTR(sram, S_IRUGO, ts5500_show_sram, NULL); -static DEVICE_ATTR(rs485, S_IRUGO, ts5500_show_rs485, NULL); -static DEVICE_ATTR(adc, S_IRUGO, ts5500_show_adc, NULL); -static DEVICE_ATTR(ereset, S_IRUGO, ts5500_show_ereset, NULL); -static DEVICE_ATTR(itr, S_IRUGO, ts5500_show_itr, NULL); + return sprintf(buf, "0x%.2x\n", sbc->jumpers >> 1); +} +static DEVICE_ATTR_RO(jumpers); + +#define TS5500_ATTR_BOOL(_field) \ + static ssize_t _field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ + { \ + struct ts5500_sbc *sbc = dev_get_drvdata(dev); \ + \ + return sprintf(buf, "%d\n", sbc->_field); \ + } \ + static DEVICE_ATTR_RO(_field) + +TS5500_ATTR_BOOL(sram); +TS5500_ATTR_BOOL(rs485); +TS5500_ATTR_BOOL(adc); +TS5500_ATTR_BOOL(ereset); +TS5500_ATTR_BOOL(itr); static struct attribute *ts5500_attributes[] = { &dev_attr_id.attr, + &dev_attr_name.attr, &dev_attr_jumpers.attr, &dev_attr_sram.attr, &dev_attr_rs485.attr, @@ -311,12 +321,14 @@ static int __init ts5500_init(void) if (err) goto error; - ts5500_dio1_pdev.dev.parent = &pdev->dev; - if (platform_device_register(&ts5500_dio1_pdev)) - dev_warn(&pdev->dev, "DIO1 block registration failed\n"); - ts5500_dio2_pdev.dev.parent = &pdev->dev; - if (platform_device_register(&ts5500_dio2_pdev)) - dev_warn(&pdev->dev, "DIO2 block registration failed\n"); + if (sbc->id == TS5500_PRODUCT_CODE) { + ts5500_dio1_pdev.dev.parent = &pdev->dev; + if (platform_device_register(&ts5500_dio1_pdev)) + dev_warn(&pdev->dev, "DIO1 block registration failed\n"); + ts5500_dio2_pdev.dev.parent = &pdev->dev; + if (platform_device_register(&ts5500_dio2_pdev)) + dev_warn(&pdev->dev, "DIO2 block registration failed\n"); + } if (led_classdev_register(&pdev->dev, &ts5500_led_cdev)) dev_warn(&pdev->dev, "LED registration failed\n"); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index dfe605ac1bc..3968d67d366 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1,7 +1,7 @@ /* * SGI UltraViolet TLB flush routines. * - * (c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI. + * (c) 2008-2014 Cliff Wickman <cpw@sgi.com>, SGI. * * This code is released under the GNU General Public License version 2 or * later. @@ -451,7 +451,7 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) /* * The reverse of the above; converts a duration in ns to a duration in cycles. - */ + */ static inline unsigned long long ns_2_cycles(unsigned long long ns) { struct cyc2ns_data *data = cyc2ns_read_begin(); @@ -563,7 +563,7 @@ static int uv1_wait_completion(struct bau_desc *bau_desc, * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register. * But not currently used. */ -static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) +static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc) { unsigned long descriptor_status; @@ -606,7 +606,7 @@ int handle_uv2_busy(struct bau_control *bcp) return FLUSH_GIVEUP; } -static int uv2_wait_completion(struct bau_desc *bau_desc, +static int uv2_3_wait_completion(struct bau_desc *bau_desc, unsigned long mmr_offset, int right_shift, struct bau_control *bcp, long try) { @@ -616,7 +616,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, long busy_reps = 0; struct ptc_stats *stat = bcp->statp; - descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc); + descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc); /* spin on the status MMR, waiting for it to go idle */ while (descriptor_stat != UV2H_DESC_IDLE) { @@ -658,8 +658,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, /* not to hammer on the clock */ busy_reps = 0; ttm = get_cycles(); - if ((ttm - bcp->send_message) > - bcp->timeout_interval) + if ((ttm - bcp->send_message) > bcp->timeout_interval) return handle_uv2_busy(bcp); } /* @@ -667,8 +666,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, */ cpu_relax(); } - descriptor_stat = uv2_read_status(mmr_offset, right_shift, - desc); + descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc); } bcp->conseccompletes++; return FLUSH_COMPLETE; @@ -679,8 +677,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, * which register to read and position in that register based on cpu in * current hub. */ -static int wait_completion(struct bau_desc *bau_desc, - struct bau_control *bcp, long try) +static int wait_completion(struct bau_desc *bau_desc, struct bau_control *bcp, long try) { int right_shift; unsigned long mmr_offset; @@ -695,11 +692,9 @@ static int wait_completion(struct bau_desc *bau_desc, } if (bcp->uvhub_version == 1) - return uv1_wait_completion(bau_desc, mmr_offset, right_shift, - bcp, try); + return uv1_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try); else - return uv2_wait_completion(bau_desc, mmr_offset, right_shift, - bcp, try); + return uv2_3_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try); } /* @@ -888,7 +883,7 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp, struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster = bcp->uvhub_master; struct uv1_bau_msg_header *uv1_hdr = NULL; - struct uv2_bau_msg_header *uv2_hdr = NULL; + struct uv2_3_bau_msg_header *uv2_3_hdr = NULL; if (bcp->uvhub_version == 1) { uv1 = 1; @@ -902,27 +897,28 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp, if (uv1) uv1_hdr = &bau_desc->header.uv1_hdr; else - uv2_hdr = &bau_desc->header.uv2_hdr; + /* uv2 and uv3 */ + uv2_3_hdr = &bau_desc->header.uv2_3_hdr; do { if (try == 0) { if (uv1) uv1_hdr->msg_type = MSG_REGULAR; else - uv2_hdr->msg_type = MSG_REGULAR; + uv2_3_hdr->msg_type = MSG_REGULAR; seq_number = bcp->message_number++; } else { if (uv1) uv1_hdr->msg_type = MSG_RETRY; else - uv2_hdr->msg_type = MSG_RETRY; + uv2_3_hdr->msg_type = MSG_RETRY; stat->s_retry_messages++; } if (uv1) uv1_hdr->sequence = seq_number; else - uv2_hdr->sequence = seq_number; + uv2_3_hdr->sequence = seq_number; index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; bcp->send_message = get_cycles(); @@ -1080,8 +1076,10 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, * done. The returned pointer is valid till preemption is re-enabled. */ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned int cpu) + struct mm_struct *mm, + unsigned long start, + unsigned long end, + unsigned int cpu) { int locals = 0; int remotes = 0; @@ -1268,6 +1266,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) if (bcp->uvhub_version == 2) process_uv2_message(&msgdesc, bcp); else + /* no error workaround for uv1 or uv3 */ bau_process_message(&msgdesc, bcp, 1); msg++; @@ -1325,8 +1324,12 @@ static void __init enable_timeouts(void) */ mmr_image |= (1L << SOFTACK_MSHIFT); if (is_uv2_hub()) { + /* do not touch the legacy mode bit */ /* hw bug workaround; do not use extended status */ mmr_image &= ~(1L << UV2_EXT_SHFT); + } else if (is_uv3_hub()) { + mmr_image &= ~(1L << PREFETCH_HINT_SHFT); + mmr_image |= (1L << SB_STATUS_SHFT); } write_mmr_misc_control(pnode, mmr_image); } @@ -1476,7 +1479,7 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user, return count; } - if (strict_strtol(optstr, 10, &input_arg) < 0) { + if (kstrtol(optstr, 10, &input_arg) < 0) { printk(KERN_DEBUG "%s is invalid\n", optstr); return -EINVAL; } @@ -1692,7 +1695,7 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) struct bau_desc *bau_desc; struct bau_desc *bd2; struct uv1_bau_msg_header *uv1_hdr; - struct uv2_bau_msg_header *uv2_hdr; + struct uv2_3_bau_msg_header *uv2_3_hdr; struct bau_control *bcp; /* @@ -1739,15 +1742,15 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) */ } else { /* - * BIOS uses legacy mode, but UV2 hardware always + * BIOS uses legacy mode, but uv2 and uv3 hardware always * uses native mode for selective broadcasts. */ - uv2_hdr = &bd2->header.uv2_hdr; - uv2_hdr->swack_flag = 1; - uv2_hdr->base_dest_nasid = + uv2_3_hdr = &bd2->header.uv2_3_hdr; + uv2_3_hdr->swack_flag = 1; + uv2_3_hdr->base_dest_nasid = UV_PNODE_TO_NASID(base_pnode); - uv2_hdr->dest_subnodeid = UV_LB_SUBNODEID; - uv2_hdr->command = UV_NET_ENDPOINT_INTD; + uv2_3_hdr->dest_subnodeid = UV_LB_SUBNODEID; + uv2_3_hdr->command = UV_NET_ENDPOINT_INTD; } } for_each_present_cpu(cpu) { @@ -1858,6 +1861,7 @@ static int calculate_destination_timeout(void) ts_ns *= (mult1 * mult2); ret = ts_ns / 1000; } else { + /* same destination timeout for uv2 and uv3 */ /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL); mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT; @@ -2012,8 +2016,10 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, bcp->uvhub_version = 1; else if (is_uv2_hub()) bcp->uvhub_version = 2; + else if (is_uv3_hub()) + bcp->uvhub_version = 3; else { - printk(KERN_EMERG "uvhub version not 1 or 2\n"); + printk(KERN_EMERG "uvhub version not 1, 2 or 3\n"); return 1; } bcp->uvhub_master = *hmasterp; @@ -2138,9 +2144,10 @@ static int __init uv_bau_init(void) } vector = UV_BAU_MESSAGE; - for_each_possible_blade(uvhub) + for_each_possible_blade(uvhub) { if (uv_blade_nr_possible_cpus(uvhub)) init_uvhub(uvhub, vector, uv_base_pnode); + } alloc_intr_gate(vector, uv_bau_message_intr1); diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index c89c93320c1..c6b146e6711 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -63,8 +63,8 @@ static struct uv_hub_nmi_s **uv_hub_nmi_list; -DEFINE_PER_CPU(struct uv_cpu_nmi_s, __uv_cpu_nmi); -EXPORT_PER_CPU_SYMBOL_GPL(__uv_cpu_nmi); +DEFINE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi); +EXPORT_PER_CPU_SYMBOL_GPL(uv_cpu_nmi); static unsigned long nmi_mmr; static unsigned long nmi_mmr_clear; @@ -215,7 +215,7 @@ static int uv_check_nmi(struct uv_hub_nmi_s *hub_nmi) int nmi = 0; local64_inc(&uv_nmi_count); - uv_cpu_nmi.queries++; + this_cpu_inc(uv_cpu_nmi.queries); do { nmi = atomic_read(&hub_nmi->in_nmi); @@ -293,7 +293,7 @@ static void uv_nmi_nr_cpus_ping(void) int cpu; for_each_cpu(cpu, uv_nmi_cpu_mask) - atomic_set(&uv_cpu_nmi_per(cpu).pinging, 1); + uv_cpu_nmi_per(cpu).pinging = 1; apic->send_IPI_mask(uv_nmi_cpu_mask, APIC_DM_NMI); } @@ -304,8 +304,8 @@ static void uv_nmi_cleanup_mask(void) int cpu; for_each_cpu(cpu, uv_nmi_cpu_mask) { - atomic_set(&uv_cpu_nmi_per(cpu).pinging, 0); - atomic_set(&uv_cpu_nmi_per(cpu).state, UV_NMI_STATE_OUT); + uv_cpu_nmi_per(cpu).pinging = 0; + uv_cpu_nmi_per(cpu).state = UV_NMI_STATE_OUT; cpumask_clear_cpu(cpu, uv_nmi_cpu_mask); } } @@ -328,7 +328,7 @@ static int uv_nmi_wait_cpus(int first) int loop_delay = uv_nmi_loop_delay; for_each_cpu(j, uv_nmi_cpu_mask) { - if (atomic_read(&uv_cpu_nmi_per(j).state)) { + if (uv_cpu_nmi_per(j).state) { cpumask_clear_cpu(j, uv_nmi_cpu_mask); if (++k >= n) break; @@ -359,7 +359,7 @@ static int uv_nmi_wait_cpus(int first) static void uv_nmi_wait(int master) { /* indicate this cpu is in */ - atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_IN); + this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_IN); /* if not the first cpu in (the master), then we are a slave cpu */ if (!master) @@ -419,7 +419,7 @@ static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs) "UV:%sNMI process trace for CPU %d\n", dots, cpu); show_regs(regs); } - atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE); + this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE); } /* Trigger a slave cpu to dump it's state */ @@ -427,20 +427,20 @@ static void uv_nmi_trigger_dump(int cpu) { int retry = uv_nmi_trigger_delay; - if (atomic_read(&uv_cpu_nmi_per(cpu).state) != UV_NMI_STATE_IN) + if (uv_cpu_nmi_per(cpu).state != UV_NMI_STATE_IN) return; - atomic_set(&uv_cpu_nmi_per(cpu).state, UV_NMI_STATE_DUMP); + uv_cpu_nmi_per(cpu).state = UV_NMI_STATE_DUMP; do { cpu_relax(); udelay(10); - if (atomic_read(&uv_cpu_nmi_per(cpu).state) + if (uv_cpu_nmi_per(cpu).state != UV_NMI_STATE_DUMP) return; } while (--retry > 0); pr_crit("UV: CPU %d stuck in process dump function\n", cpu); - atomic_set(&uv_cpu_nmi_per(cpu).state, UV_NMI_STATE_DUMP_DONE); + uv_cpu_nmi_per(cpu).state = UV_NMI_STATE_DUMP_DONE; } /* Wait until all cpus ready to exit */ @@ -488,7 +488,7 @@ static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master) } else { while (!atomic_read(&uv_nmi_slave_continue)) cpu_relax(); - while (atomic_read(&uv_cpu_nmi.state) != UV_NMI_STATE_DUMP) + while (this_cpu_read(uv_cpu_nmi.state) != UV_NMI_STATE_DUMP) cpu_relax(); uv_nmi_dump_state_cpu(cpu, regs); } @@ -615,7 +615,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) local_irq_save(flags); /* If not a UV System NMI, ignore */ - if (!atomic_read(&uv_cpu_nmi.pinging) && !uv_check_nmi(hub_nmi)) { + if (!this_cpu_read(uv_cpu_nmi.pinging) && !uv_check_nmi(hub_nmi)) { local_irq_restore(flags); return NMI_DONE; } @@ -639,7 +639,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) uv_call_kgdb_kdb(cpu, regs, master); /* Clear per_cpu "in nmi" flag */ - atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_OUT); + this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_OUT); /* Clear MMR NMI flag on each hub */ uv_clear_nmi(cpu); @@ -666,16 +666,16 @@ static int uv_handle_nmi_ping(unsigned int reason, struct pt_regs *regs) { int ret; - uv_cpu_nmi.queries++; - if (!atomic_read(&uv_cpu_nmi.pinging)) { + this_cpu_inc(uv_cpu_nmi.queries); + if (!this_cpu_read(uv_cpu_nmi.pinging)) { local64_inc(&uv_nmi_ping_misses); return NMI_DONE; } - uv_cpu_nmi.pings++; + this_cpu_inc(uv_cpu_nmi.pings); local64_inc(&uv_nmi_ping_count); ret = uv_handle_nmi(reason, regs); - atomic_set(&uv_cpu_nmi.pinging, 0); + this_cpu_write(uv_cpu_nmi.pinging, 0); return ret; } diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 5c86786bbfd..a244237f3cf 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -365,7 +365,7 @@ __setup("uvrtcevt", uv_enable_evt_rtc); static __init void uv_rtc_register_clockevents(struct work_struct *dummy) { - struct clock_event_device *ced = &__get_cpu_var(cpu_ced); + struct clock_event_device *ced = this_cpu_ptr(&cpu_ced); *ced = clock_event_device_uv; ced->cpumask = cpumask_of(smp_processor_id()); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 424f4c97a44..6ec7910f59b 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -165,7 +165,7 @@ static void fix_processor_context(void) * by __save_processor_state() * @ctxt - structure to load the registers contents from */ -static void __restore_processor_state(struct saved_context *ctxt) +static void notrace __restore_processor_state(struct saved_context *ctxt) { if (ctxt->misc_enable_saved) wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable); @@ -239,7 +239,7 @@ static void __restore_processor_state(struct saved_context *ctxt) } /* Needed by apm.c */ -void restore_processor_state(void) +void notrace restore_processor_state(void) { __restore_processor_state(&saved_context); } diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 7d28c885d23..291226b952a 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -13,13 +13,11 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/mmzone.h> +#include <asm/sections.h> /* Defined in hibernate_asm_32.S */ extern int restore_image(void); -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; - /* Pointer to the temporary resume page tables */ pgd_t *resume_pg_dir; diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 35e2bb6c0f3..009947d419a 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -17,11 +17,9 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/mtrr.h> +#include <asm/sections.h> #include <asm/suspend.h> -/* References to section boundaries */ -extern __visible const void __nosave_begin, __nosave_end; - /* Defined in hibernate_asm_64.S */ extern asmlinkage __visible int restore_image(void); diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile new file mode 100644 index 00000000000..f52e033557c --- /dev/null +++ b/arch/x86/purgatory/Makefile @@ -0,0 +1,29 @@ +purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string.o + +targets += $(purgatory-y) +PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) + +LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib +targets += purgatory.ro + +# Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That +# in turn leaves some undefined symbols like __fentry__ in purgatory and not +# sure how to relocate those. Like kexec-tools, use custom flags. + +KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -MD -Os -mcmodel=large +KBUILD_CFLAGS += -m$(BITS) + +$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE + $(call if_changed,ld) + +targets += kexec-purgatory.c + +CMD_BIN2C = $(objtree)/scripts/basic/bin2c +quiet_cmd_bin2c = BIN2C $@ + cmd_bin2c = $(CMD_BIN2C) kexec_purgatory < $< > $@ + +$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE + $(call if_changed,bin2c) + + +obj-$(CONFIG_KEXEC_FILE) += kexec-purgatory.o diff --git a/arch/x86/purgatory/entry64.S b/arch/x86/purgatory/entry64.S new file mode 100644 index 00000000000..d1a4291d356 --- /dev/null +++ b/arch/x86/purgatory/entry64.S @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com) + * Copyright (C) 2014 Red Hat Inc. + + * Author(s): Vivek Goyal <vgoyal@redhat.com> + * + * This code has been taken from kexec-tools. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + .text + .balign 16 + .code64 + .globl entry64, entry64_regs + + +entry64: + /* Setup a gdt that should be preserved */ + lgdt gdt(%rip) + + /* load the data segments */ + movl $0x18, %eax /* data segment */ + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + movl %eax, %fs + movl %eax, %gs + + /* Setup new stack */ + leaq stack_init(%rip), %rsp + pushq $0x10 /* CS */ + leaq new_cs_exit(%rip), %rax + pushq %rax + lretq +new_cs_exit: + + /* Load the registers */ + movq rax(%rip), %rax + movq rbx(%rip), %rbx + movq rcx(%rip), %rcx + movq rdx(%rip), %rdx + movq rsi(%rip), %rsi + movq rdi(%rip), %rdi + movq rsp(%rip), %rsp + movq rbp(%rip), %rbp + movq r8(%rip), %r8 + movq r9(%rip), %r9 + movq r10(%rip), %r10 + movq r11(%rip), %r11 + movq r12(%rip), %r12 + movq r13(%rip), %r13 + movq r14(%rip), %r14 + movq r15(%rip), %r15 + + /* Jump to the new code... */ + jmpq *rip(%rip) + + .section ".rodata" + .balign 4 +entry64_regs: +rax: .quad 0x0 +rcx: .quad 0x0 +rdx: .quad 0x0 +rbx: .quad 0x0 +rsp: .quad 0x0 +rbp: .quad 0x0 +rsi: .quad 0x0 +rdi: .quad 0x0 +r8: .quad 0x0 +r9: .quad 0x0 +r10: .quad 0x0 +r11: .quad 0x0 +r12: .quad 0x0 +r13: .quad 0x0 +r14: .quad 0x0 +r15: .quad 0x0 +rip: .quad 0x0 + .size entry64_regs, . - entry64_regs + + /* GDT */ + .section ".rodata" + .balign 16 +gdt: + /* 0x00 unusable segment + * 0x08 unused + * so use them as gdt ptr + */ + .word gdt_end - gdt - 1 + .quad gdt + .word 0, 0, 0 + + /* 0x10 4GB flat code segment */ + .word 0xFFFF, 0x0000, 0x9A00, 0x00AF + + /* 0x18 4GB flat data segment */ + .word 0xFFFF, 0x0000, 0x9200, 0x00CF +gdt_end: +stack: .quad 0, 0 +stack_init: diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c new file mode 100644 index 00000000000..25e068ba338 --- /dev/null +++ b/arch/x86/purgatory/purgatory.c @@ -0,0 +1,72 @@ +/* + * purgatory: Runs between two kernels + * + * Copyright (C) 2014 Red Hat Inc. + * + * Author: + * Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include "sha256.h" +#include "../boot/string.h" + +struct sha_region { + unsigned long start; + unsigned long len; +}; + +unsigned long backup_dest = 0; +unsigned long backup_src = 0; +unsigned long backup_sz = 0; + +u8 sha256_digest[SHA256_DIGEST_SIZE] = { 0 }; + +struct sha_region sha_regions[16] = {}; + +/* + * On x86, second kernel requries first 640K of memory to boot. Copy + * first 640K to a backup region in reserved memory range so that second + * kernel can use first 640K. + */ +static int copy_backup_region(void) +{ + if (backup_dest) + memcpy((void *)backup_dest, (void *)backup_src, backup_sz); + + return 0; +} + +int verify_sha256_digest(void) +{ + struct sha_region *ptr, *end; + u8 digest[SHA256_DIGEST_SIZE]; + struct sha256_state sctx; + + sha256_init(&sctx); + end = &sha_regions[sizeof(sha_regions)/sizeof(sha_regions[0])]; + for (ptr = sha_regions; ptr < end; ptr++) + sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len); + + sha256_final(&sctx, digest); + + if (memcmp(digest, sha256_digest, sizeof(digest))) + return 1; + + return 0; +} + +void purgatory(void) +{ + int ret; + + ret = verify_sha256_digest(); + if (ret) { + /* loop forever */ + for (;;) + ; + } + copy_backup_region(); +} diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S new file mode 100644 index 00000000000..fe3c91ba1bd --- /dev/null +++ b/arch/x86/purgatory/setup-x86_64.S @@ -0,0 +1,58 @@ +/* + * purgatory: setup code + * + * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com) + * Copyright (C) 2014 Red Hat Inc. + * + * This code has been taken from kexec-tools. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + .text + .globl purgatory_start + .balign 16 +purgatory_start: + .code64 + + /* Load a gdt so I know what the segment registers are */ + lgdt gdt(%rip) + + /* load the data segments */ + movl $0x18, %eax /* data segment */ + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + movl %eax, %fs + movl %eax, %gs + + /* Setup a stack */ + leaq lstack_end(%rip), %rsp + + /* Call the C code */ + call purgatory + jmp entry64 + + .section ".rodata" + .balign 16 +gdt: /* 0x00 unusable segment + * 0x08 unused + * so use them as the gdt ptr + */ + .word gdt_end - gdt - 1 + .quad gdt + .word 0, 0, 0 + + /* 0x10 4GB flat code segment */ + .word 0xFFFF, 0x0000, 0x9A00, 0x00AF + + /* 0x18 4GB flat data segment */ + .word 0xFFFF, 0x0000, 0x9200, 0x00CF +gdt_end: + + .bss + .balign 4096 +lstack: + .skip 4096 +lstack_end: diff --git a/arch/x86/purgatory/sha256.c b/arch/x86/purgatory/sha256.c new file mode 100644 index 00000000000..548ca675a14 --- /dev/null +++ b/arch/x86/purgatory/sha256.c @@ -0,0 +1,283 @@ +/* + * SHA-256, as specified in + * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf + * + * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>. + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> + * Copyright (c) 2014 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <linux/bitops.h> +#include <asm/byteorder.h> +#include "sha256.h" +#include "../boot/string.h" + +static inline u32 Ch(u32 x, u32 y, u32 z) +{ + return z ^ (x & (y ^ z)); +} + +static inline u32 Maj(u32 x, u32 y, u32 z) +{ + return (x & y) | (z & (x | y)); +} + +#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22)) +#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25)) +#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3)) +#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)) + +static inline void LOAD_OP(int I, u32 *W, const u8 *input) +{ + W[I] = __be32_to_cpu(((__be32 *)(input))[I]); +} + +static inline void BLEND_OP(int I, u32 *W) +{ + W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; +} + +static void sha256_transform(u32 *state, const u8 *input) +{ + u32 a, b, c, d, e, f, g, h, t1, t2; + u32 W[64]; + int i; + + /* load the input */ + for (i = 0; i < 16; i++) + LOAD_OP(i, W, input); + + /* now blend */ + for (i = 16; i < 64; i++) + BLEND_OP(i, W); + + /* load the state into our registers */ + a = state[0]; b = state[1]; c = state[2]; d = state[3]; + e = state[4]; f = state[5]; g = state[6]; h = state[7]; + + /* now iterate */ + t1 = h + e1(e) + Ch(e, f, g) + 0x428a2f98 + W[0]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x71374491 + W[1]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0xb5c0fbcf + W[2]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0xe9b5dba5 + W[3]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x3956c25b + W[4]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x59f111f1 + W[5]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x923f82a4 + W[6]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0xab1c5ed5 + W[7]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0xd807aa98 + W[8]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x12835b01 + W[9]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x243185be + W[10]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x550c7dc3 + W[11]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x72be5d74 + W[12]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x80deb1fe + W[13]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x9bdc06a7 + W[14]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0xc19bf174 + W[15]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0xe49b69c1 + W[16]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0xefbe4786 + W[17]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x0fc19dc6 + W[18]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x240ca1cc + W[19]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x2de92c6f + W[20]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x4a7484aa + W[21]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x5cb0a9dc + W[22]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x76f988da + W[23]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x983e5152 + W[24]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0xa831c66d + W[25]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0xb00327c8 + W[26]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0xbf597fc7 + W[27]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0xc6e00bf3 + W[28]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0xd5a79147 + W[29]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x06ca6351 + W[30]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x14292967 + W[31]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x27b70a85 + W[32]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x2e1b2138 + W[33]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x4d2c6dfc + W[34]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x53380d13 + W[35]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x650a7354 + W[36]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x766a0abb + W[37]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x81c2c92e + W[38]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x92722c85 + W[39]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0xa2bfe8a1 + W[40]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0xa81a664b + W[41]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0xc24b8b70 + W[42]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0xc76c51a3 + W[43]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0xd192e819 + W[44]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0xd6990624 + W[45]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0xf40e3585 + W[46]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x106aa070 + W[47]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x19a4c116 + W[48]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x1e376c08 + W[49]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x2748774c + W[50]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x34b0bcb5 + W[51]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x391c0cb3 + W[52]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x4ed8aa4a + W[53]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x5b9cca4f + W[54]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x682e6ff3 + W[55]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x748f82ee + W[56]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x78a5636f + W[57]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x84c87814 + W[58]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x8cc70208 + W[59]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x90befffa + W[60]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0xa4506ceb + W[61]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0xbef9a3f7 + W[62]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0xc67178f2 + W[63]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + state[0] += a; state[1] += b; state[2] += c; state[3] += d; + state[4] += e; state[5] += f; state[6] += g; state[7] += h; + + /* clear any sensitive info... */ + a = b = c = d = e = f = g = h = t1 = t2 = 0; + memset(W, 0, 64 * sizeof(u32)); +} + +int sha256_init(struct sha256_state *sctx) +{ + sctx->state[0] = SHA256_H0; + sctx->state[1] = SHA256_H1; + sctx->state[2] = SHA256_H2; + sctx->state[3] = SHA256_H3; + sctx->state[4] = SHA256_H4; + sctx->state[5] = SHA256_H5; + sctx->state[6] = SHA256_H6; + sctx->state[7] = SHA256_H7; + sctx->count = 0; + + return 0; +} + +int sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len) +{ + unsigned int partial, done; + const u8 *src; + + partial = sctx->count & 0x3f; + sctx->count += len; + done = 0; + src = data; + + if ((partial + len) > 63) { + if (partial) { + done = -partial; + memcpy(sctx->buf + partial, data, done + 64); + src = sctx->buf; + } + + do { + sha256_transform(sctx->state, src); + done += 64; + src = data + done; + } while (done + 63 < len); + + partial = 0; + } + memcpy(sctx->buf + partial, src, len - done); + + return 0; +} + +int sha256_final(struct sha256_state *sctx, u8 *out) +{ + __be32 *dst = (__be32 *)out; + __be64 bits; + unsigned int index, pad_len; + int i; + static const u8 padding[64] = { 0x80, }; + + /* Save number of bits */ + bits = cpu_to_be64(sctx->count << 3); + + /* Pad out to 56 mod 64. */ + index = sctx->count & 0x3f; + pad_len = (index < 56) ? (56 - index) : ((64+56) - index); + sha256_update(sctx, padding, pad_len); + + /* Append length (before padding) */ + sha256_update(sctx, (const u8 *)&bits, sizeof(bits)); + + /* Store state in digest */ + for (i = 0; i < 8; i++) + dst[i] = cpu_to_be32(sctx->state[i]); + + /* Zeroize sensitive information. */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h new file mode 100644 index 00000000000..bd15a412773 --- /dev/null +++ b/arch/x86/purgatory/sha256.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2014 Red Hat Inc. + * + * Author: Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#ifndef SHA256_H +#define SHA256_H + + +#include <linux/types.h> +#include <crypto/sha.h> + +extern int sha256_init(struct sha256_state *sctx); +extern int sha256_update(struct sha256_state *sctx, const u8 *input, + unsigned int length); +extern int sha256_final(struct sha256_state *sctx, u8 *hash); + +#endif /* SHA256_H */ diff --git a/arch/x86/purgatory/stack.S b/arch/x86/purgatory/stack.S new file mode 100644 index 00000000000..3cefba1fefc --- /dev/null +++ b/arch/x86/purgatory/stack.S @@ -0,0 +1,19 @@ +/* + * purgatory: stack + * + * Copyright (C) 2014 Red Hat Inc. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + /* A stack for the loaded kernel. + * Seperate and in the data section so it can be prepopulated. + */ + .data + .balign 4096 + .globl stack, stack_end + +stack: + .skip 4096 +stack_end: diff --git a/arch/x86/purgatory/string.c b/arch/x86/purgatory/string.c new file mode 100644 index 00000000000..d886b1fa36f --- /dev/null +++ b/arch/x86/purgatory/string.c @@ -0,0 +1,13 @@ +/* + * Simple string functions. + * + * Copyright (C) 2014 Red Hat Inc. + * + * Author: + * Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include "../boot/string.c" diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index d6b86792161..9fe1b5d002f 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -360,3 +360,7 @@ 351 i386 sched_setattr sys_sched_setattr 352 i386 sched_getattr sys_sched_getattr 353 i386 renameat2 sys_renameat2 +354 i386 seccomp sys_seccomp +355 i386 getrandom sys_getrandom +356 i386 memfd_create sys_memfd_create +357 i386 bpf sys_bpf diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index ec255a1646d..281150b539a 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -323,6 +323,11 @@ 314 common sched_setattr sys_sched_setattr 315 common sched_getattr sys_sched_getattr 316 common renameat2 sys_renameat2 +317 common seccomp sys_seccomp +318 common getrandom sys_getrandom +319 common memfd_create sys_memfd_create +320 common kexec_file_load sys_kexec_file_load +321 common bpf sys_bpf # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index bbb1d2259ec..a5efb21d522 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -695,7 +695,7 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, * */ static int per_cpu_shndx = -1; -Elf_Addr per_cpu_load_addr; +static Elf_Addr per_cpu_load_addr; static void percpu_init(void) { diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index 0feee2fd507..25a1022dd79 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -216,6 +216,5 @@ extern long elf_aux_hwcap; #define ELF_HWCAP (elf_aux_hwcap) #define SET_PERSONALITY(ex) do ; while(0) -#define __HAVE_ARCH_GATE_AREA 1 #endif diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 04f82e020f2..2a206d2b14a 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -25,7 +25,8 @@ static inline void rep_nop(void) __asm__ __volatile__("rep;nop": : :"memory"); } -#define cpu_relax() rep_nop() +#define cpu_relax() rep_nop() +#define cpu_relax_lowlatency() cpu_relax() #include <asm/processor-generic.h> diff --git a/arch/x86/um/checksum_32.S b/arch/x86/um/checksum_32.S index 8d0c420465c..fa4b8b9841f 100644 --- a/arch/x86/um/checksum_32.S +++ b/arch/x86/um/checksum_32.S @@ -214,242 +214,3 @@ csum_partial: ret #endif - -/* -unsigned int csum_partial_copy_generic (const char *src, char *dst, - int len, int sum, int *src_err_ptr, int *dst_err_ptr) - */ - -/* - * Copy from ds while checksumming, otherwise like csum_partial - * - * The macros SRC and DST specify the type of access for the instruction. - * thus we can call a custom exception handler for all access types. - * - * FIXME: could someone double-check whether I haven't mixed up some SRC and - * DST definitions? It's damn hard to trigger all cases. I hope I got - * them all but there's no guarantee. - */ - -#define SRC(y...) \ - 9999: y; \ - _ASM_EXTABLE(9999b, 6001f) - -#define DST(y...) \ - 9999: y; \ - _ASM_EXTABLE(9999b, 6002f) - -.align 4 - -#ifndef CONFIG_X86_USE_PPRO_CHECKSUM - -#define ARGBASE 16 -#define FP 12 - -csum_partial_copy_generic_i386: - subl $4,%esp - pushl %edi - pushl %esi - pushl %ebx - movl ARGBASE+16(%esp),%eax # sum - movl ARGBASE+12(%esp),%ecx # len - movl ARGBASE+4(%esp),%esi # src - movl ARGBASE+8(%esp),%edi # dst - - testl $2, %edi # Check alignment. - jz 2f # Jump if alignment is ok. - subl $2, %ecx # Alignment uses up two bytes. - jae 1f # Jump if we had at least two bytes. - addl $2, %ecx # ecx was < 2. Deal with it. - jmp 4f -SRC(1: movw (%esi), %bx ) - addl $2, %esi -DST( movw %bx, (%edi) ) - addl $2, %edi - addw %bx, %ax - adcl $0, %eax -2: - movl %ecx, FP(%esp) - shrl $5, %ecx - jz 2f - testl %esi, %esi -SRC(1: movl (%esi), %ebx ) -SRC( movl 4(%esi), %edx ) - adcl %ebx, %eax -DST( movl %ebx, (%edi) ) - adcl %edx, %eax -DST( movl %edx, 4(%edi) ) - -SRC( movl 8(%esi), %ebx ) -SRC( movl 12(%esi), %edx ) - adcl %ebx, %eax -DST( movl %ebx, 8(%edi) ) - adcl %edx, %eax -DST( movl %edx, 12(%edi) ) - -SRC( movl 16(%esi), %ebx ) -SRC( movl 20(%esi), %edx ) - adcl %ebx, %eax -DST( movl %ebx, 16(%edi) ) - adcl %edx, %eax -DST( movl %edx, 20(%edi) ) - -SRC( movl 24(%esi), %ebx ) -SRC( movl 28(%esi), %edx ) - adcl %ebx, %eax -DST( movl %ebx, 24(%edi) ) - adcl %edx, %eax -DST( movl %edx, 28(%edi) ) - - lea 32(%esi), %esi - lea 32(%edi), %edi - dec %ecx - jne 1b - adcl $0, %eax -2: movl FP(%esp), %edx - movl %edx, %ecx - andl $0x1c, %edx - je 4f - shrl $2, %edx # This clears CF -SRC(3: movl (%esi), %ebx ) - adcl %ebx, %eax -DST( movl %ebx, (%edi) ) - lea 4(%esi), %esi - lea 4(%edi), %edi - dec %edx - jne 3b - adcl $0, %eax -4: andl $3, %ecx - jz 7f - cmpl $2, %ecx - jb 5f -SRC( movw (%esi), %cx ) - leal 2(%esi), %esi -DST( movw %cx, (%edi) ) - leal 2(%edi), %edi - je 6f - shll $16,%ecx -SRC(5: movb (%esi), %cl ) -DST( movb %cl, (%edi) ) -6: addl %ecx, %eax - adcl $0, %eax -7: -5000: - -# Exception handler: -.section .fixup, "ax" - -6001: - movl ARGBASE+20(%esp), %ebx # src_err_ptr - movl $-EFAULT, (%ebx) - - # zero the complete destination - computing the rest - # is too much work - movl ARGBASE+8(%esp), %edi # dst - movl ARGBASE+12(%esp), %ecx # len - xorl %eax,%eax - rep ; stosb - - jmp 5000b - -6002: - movl ARGBASE+24(%esp), %ebx # dst_err_ptr - movl $-EFAULT,(%ebx) - jmp 5000b - -.previous - - popl %ebx - popl %esi - popl %edi - popl %ecx # equivalent to addl $4,%esp - ret - -#else - -/* Version for PentiumII/PPro */ - -#define ROUND1(x) \ - SRC(movl x(%esi), %ebx ) ; \ - addl %ebx, %eax ; \ - DST(movl %ebx, x(%edi) ) ; - -#define ROUND(x) \ - SRC(movl x(%esi), %ebx ) ; \ - adcl %ebx, %eax ; \ - DST(movl %ebx, x(%edi) ) ; - -#define ARGBASE 12 - -csum_partial_copy_generic_i386: - pushl %ebx - pushl %edi - pushl %esi - movl ARGBASE+4(%esp),%esi #src - movl ARGBASE+8(%esp),%edi #dst - movl ARGBASE+12(%esp),%ecx #len - movl ARGBASE+16(%esp),%eax #sum -# movl %ecx, %edx - movl %ecx, %ebx - movl %esi, %edx - shrl $6, %ecx - andl $0x3c, %ebx - negl %ebx - subl %ebx, %esi - subl %ebx, %edi - lea -1(%esi),%edx - andl $-32,%edx - lea 3f(%ebx,%ebx), %ebx - testl %esi, %esi - jmp *%ebx -1: addl $64,%esi - addl $64,%edi - SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) - ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52) - ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36) - ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20) - ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4) -3: adcl $0,%eax - addl $64, %edx - dec %ecx - jge 1b -4: movl ARGBASE+12(%esp),%edx #len - andl $3, %edx - jz 7f - cmpl $2, %edx - jb 5f -SRC( movw (%esi), %dx ) - leal 2(%esi), %esi -DST( movw %dx, (%edi) ) - leal 2(%edi), %edi - je 6f - shll $16,%edx -5: -SRC( movb (%esi), %dl ) -DST( movb %dl, (%edi) ) -6: addl %edx, %eax - adcl $0, %eax -7: -.section .fixup, "ax" -6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr - movl $-EFAULT, (%ebx) - # zero the complete destination (computing the rest is too much work) - movl ARGBASE+8(%esp),%edi # dst - movl ARGBASE+12(%esp),%ecx # len - xorl %eax,%eax - rep; stosb - jmp 7b -6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr - movl $-EFAULT, (%ebx) - jmp 7b -.previous - - popl %esi - popl %edi - popl %ebx - ret - -#undef ROUND -#undef ROUND1 - -#endif diff --git a/arch/x86/um/mem_64.c b/arch/x86/um/mem_64.c index c6492e75797..f8fecaddcc0 100644 --- a/arch/x86/um/mem_64.c +++ b/arch/x86/um/mem_64.c @@ -9,18 +9,3 @@ const char *arch_vma_name(struct vm_area_struct *vma) return NULL; } - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ - return NULL; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - return 0; -} - -int in_gate_area_no_mm(unsigned long addr) -{ - return 0; -} diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 5e04a1c899f..79d824551c1 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -370,13 +370,12 @@ struct rt_sigframe char retcode[8]; }; -int setup_signal_stack_sc(unsigned long stack_top, int sig, - struct k_sigaction *ka, struct pt_regs *regs, - sigset_t *mask) +int setup_signal_stack_sc(unsigned long stack_top, struct ksignal *ksig, + struct pt_regs *regs, sigset_t *mask) { struct sigframe __user *frame; void __user *restorer; - int err = 0; + int err = 0, sig = ksig->sig; /* This is the same calculation as i386 - ((sp + 4) & 15) == 0 */ stack_top = ((stack_top + 4) & -16UL) - 4; @@ -385,8 +384,8 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, return 1; restorer = frame->retcode; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; err |= __put_user(restorer, &frame->pretcode); err |= __put_user(sig, &frame->sig); @@ -410,20 +409,19 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, return err; PT_REGS_SP(regs) = (unsigned long) frame; - PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; + PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler; PT_REGS_AX(regs) = (unsigned long) sig; PT_REGS_DX(regs) = (unsigned long) 0; PT_REGS_CX(regs) = (unsigned long) 0; return 0; } -int setup_signal_stack_si(unsigned long stack_top, int sig, - struct k_sigaction *ka, struct pt_regs *regs, - siginfo_t *info, sigset_t *mask) +int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, + struct pt_regs *regs, sigset_t *mask) { struct rt_sigframe __user *frame; void __user *restorer; - int err = 0; + int err = 0, sig = ksig->sig; stack_top &= -8UL; frame = (struct rt_sigframe __user *) stack_top - 1; @@ -431,14 +429,14 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, return 1; restorer = frame->retcode; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; err |= __put_user(restorer, &frame->pretcode); err |= __put_user(sig, &frame->sig); err |= __put_user(&frame->info, &frame->pinfo); err |= __put_user(&frame->uc, &frame->puc); - err |= copy_siginfo_to_user(&frame->info, info); + err |= copy_siginfo_to_user(&frame->info, &ksig->info); err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask, PT_REGS_SP(regs)); @@ -457,7 +455,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, return err; PT_REGS_SP(regs) = (unsigned long) frame; - PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; + PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler; PT_REGS_AX(regs) = (unsigned long) sig; PT_REGS_DX(regs) = (unsigned long) &frame->info; PT_REGS_CX(regs) = (unsigned long) &frame->uc; @@ -502,12 +500,11 @@ struct rt_sigframe struct _fpstate fpstate; }; -int setup_signal_stack_si(unsigned long stack_top, int sig, - struct k_sigaction *ka, struct pt_regs * regs, - siginfo_t *info, sigset_t *set) +int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, + struct pt_regs *regs, sigset_t *set) { struct rt_sigframe __user *frame; - int err = 0; + int err = 0, sig = ksig->sig; frame = (struct rt_sigframe __user *) round_down(stack_top - sizeof(struct rt_sigframe), 16); @@ -517,8 +514,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto out; - if (ka->sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, info); + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + err |= copy_siginfo_to_user(&frame->info, &ksig->info); if (err) goto out; } @@ -543,8 +540,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, * already in userspace. */ /* x86-64 should always use SA_RESTORER. */ - if (ka->sa.sa_flags & SA_RESTORER) - err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); + if (ksig->ka.sa.sa_flags & SA_RESTORER) + err |= __put_user(ksig->ka.sa.sa_restorer, &frame->pretcode); else /* could use a vstub here */ return err; @@ -570,7 +567,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, */ PT_REGS_SI(regs) = (unsigned long) &frame->info; PT_REGS_DX(regs) = (unsigned long) &frame->uc; - PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; + PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler; out: return err; } diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 61b04fe36e6..5a4affe025e 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -10,7 +10,7 @@ VDSO32-$(CONFIG_X86_32) := y VDSO32-$(CONFIG_COMPAT) := y # files to link into the vdso -vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vdso-fakesections.o +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o # files to link into kernel obj-y += vma.o @@ -37,7 +37,8 @@ vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) obj-y += $(vdso_img_objs) targets += $(vdso_img_cfiles) targets += $(vdso_img_sodbg) -.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) +.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) \ + $(vdso_img-y:%=$(obj)/vdso%.so) export CPPFLAGS_vdso.lds += -P -C @@ -54,10 +55,10 @@ hostprogs-y += vdso2c quiet_cmd_vdso2c = VDSO2C $@ define cmd_vdso2c - $(obj)/vdso2c $< $@ + $(obj)/vdso2c $< $(<:%.dbg=%) $@ endef -$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso2c FORCE +$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE $(call if_changed,vdso2c) # @@ -113,6 +114,10 @@ $(obj)/%-x32.o: $(obj)/%.o FORCE targets += vdsox32.lds $(vobjx32s-y) +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg + $(call if_changed,objcopy) + $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE $(call if_changed,vdso) @@ -134,7 +139,7 @@ override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) -targets += vdso32/vclock_gettime.o vdso32/vdso-fakesections.o +targets += vdso32/vclock_gettime.o $(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) @@ -156,7 +161,6 @@ $(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ $(obj)/vdso32/vdso32.lds \ $(obj)/vdso32/vclock_gettime.o \ - $(obj)/vdso32/vdso-fakesections.o \ $(obj)/vdso32/note.o \ $(obj)/vdso32/%.o $(call if_changed,vdso) diff --git a/arch/x86/vdso/vdso-fakesections.c b/arch/x86/vdso/vdso-fakesections.c deleted file mode 100644 index aa5fbfab20a..00000000000 --- a/arch/x86/vdso/vdso-fakesections.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright 2014 Andy Lutomirski - * Subject to the GNU Public License, v.2 - * - * String table for loadable section headers. See vdso2c.h for why - * this exists. - */ - -const char fake_shstrtab[] __attribute__((section(".fake_shstrtab"))) = - ".hash\0" - ".dynsym\0" - ".dynstr\0" - ".gnu.version\0" - ".gnu.version_d\0" - ".dynamic\0" - ".rodata\0" - ".fake_shstrtab\0" /* Yay, self-referential code. */ - ".note\0" - ".eh_frame_hdr\0" - ".eh_frame\0" - ".text"; diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index 9197544eea9..de2c921025f 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -18,6 +18,25 @@ SECTIONS { + /* + * User/kernel shared data is before the vDSO. This may be a little + * uglier than putting it after the vDSO, but it avoids issues with + * non-allocatable things that dangle past the end of the PT_LOAD + * segment. + */ + + vvar_start = . - 2 * PAGE_SIZE; + vvar_page = vvar_start; + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; +#define __VVAR_KERNEL_LDS +#include <asm/vvar.h> +#undef __VVAR_KERNEL_LDS +#undef EMIT_VVAR + + hpet_page = vvar_start + PAGE_SIZE; + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text @@ -74,31 +93,6 @@ SECTIONS .altinstructions : { *(.altinstructions) } :text .altinstr_replacement : { *(.altinstr_replacement) } :text - /* - * The remainder of the vDSO consists of special pages that are - * shared between the kernel and userspace. It needs to be at the - * end so that it doesn't overlap the mapping of the actual - * vDSO image. - */ - - . = ALIGN(PAGE_SIZE); - vvar_page = .; - - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; -#define __VVAR_KERNEL_LDS -#include <asm/vvar.h> -#undef __VVAR_KERNEL_LDS -#undef EMIT_VVAR - - . = vvar_page + PAGE_SIZE; - - hpet_page = .; - . = . + PAGE_SIZE; - - . = ALIGN(PAGE_SIZE); - end_mapping = .; - /DISCARD/ : { *(.discard) *(.discard.*) diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c index 238dbe82776..8627db24a7f 100644 --- a/arch/x86/vdso/vdso2c.c +++ b/arch/x86/vdso/vdso2c.c @@ -1,3 +1,53 @@ +/* + * vdso2c - A vdso image preparation tool + * Copyright (c) 2014 Andy Lutomirski and others + * Licensed under the GPL v2 + * + * vdso2c requires stripped and unstripped input. It would be trivial + * to fully strip the input in here, but, for reasons described below, + * we need to write a section table. Doing this is more or less + * equivalent to dropping all non-allocatable sections, but it's + * easier to let objcopy handle that instead of doing it ourselves. + * If we ever need to do something fancier than what objcopy provides, + * it would be straightforward to add here. + * + * We're keep a section table for a few reasons: + * + * The Go runtime had a couple of bugs: it would read the section + * table to try to figure out how many dynamic symbols there were (it + * shouldn't have looked at the section table at all) and, if there + * were no SHT_SYNDYM section table entry, it would use an + * uninitialized value for the number of symbols. An empty DYNSYM + * table would work, but I see no reason not to write a valid one (and + * keep full performance for old Go programs). This hack is only + * needed on x86_64. + * + * The bug was introduced on 2012-08-31 by: + * https://code.google.com/p/go/source/detail?r=56ea40aac72b + * and was fixed on 2014-06-13 by: + * https://code.google.com/p/go/source/detail?r=fc1cd5e12595 + * + * Binutils has issues debugging the vDSO: it reads the section table to + * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which + * would break build-id if we removed the section table. Binutils + * also requires that shstrndx != 0. See: + * https://sourceware.org/bugzilla/show_bug.cgi?id=17064 + * + * elfutils might not look for PT_NOTE if there is a section table at + * all. I don't know whether this matters for any practical purpose. + * + * For simplicity, rather than hacking up a partial section table, we + * just write a mostly complete one. We omit non-dynamic symbols, + * though, since they're rather large. + * + * Once binutils gets fixed, we might be able to drop this for all but + * the 64-bit vdso, since build-id only works in kernel RPMs, and + * systems that update to new enough kernel RPMs will likely update + * binutils in sync. build-id has never worked for home-built kernel + * RPMs without manual symlinking, and I suspect that no one ever does + * that. + */ + #include <inttypes.h> #include <stdint.h> #include <unistd.h> @@ -20,9 +70,9 @@ const char *outfilename; /* Symbols that we need in vdso2c. */ enum { + sym_vvar_start, sym_vvar_page, sym_hpet_page, - sym_end_mapping, sym_VDSO_FAKE_SECTION_TABLE_START, sym_VDSO_FAKE_SECTION_TABLE_END, }; @@ -38,9 +88,9 @@ struct vdso_sym { }; struct vdso_sym required_syms[] = { + [sym_vvar_start] = {"vvar_start", true}, [sym_vvar_page] = {"vvar_page", true}, [sym_hpet_page] = {"hpet_page", true}, - [sym_end_mapping] = {"end_mapping", true}, [sym_VDSO_FAKE_SECTION_TABLE_START] = { "VDSO_FAKE_SECTION_TABLE_START", false }, @@ -61,7 +111,8 @@ static void fail(const char *format, ...) va_start(ap, format); fprintf(stderr, "Error: "); vfprintf(stderr, format, ap); - unlink(outfilename); + if (outfilename) + unlink(outfilename); exit(1); va_end(ap); } @@ -96,9 +147,11 @@ extern void bad_put_le(void); #define NSYMS (sizeof(required_syms) / sizeof(required_syms[0])) -#define BITSFUNC3(name, bits) name##bits -#define BITSFUNC2(name, bits) BITSFUNC3(name, bits) -#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS) +#define BITSFUNC3(name, bits, suffix) name##bits##suffix +#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix) +#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, ) + +#define INT_BITS BITSFUNC2(int, ELF_BITS, _t) #define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x #define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x) @@ -112,30 +165,53 @@ extern void bad_put_le(void); #include "vdso2c.h" #undef ELF_BITS -static void go(void *addr, size_t len, FILE *outfile, const char *name) +static void go(void *raw_addr, size_t raw_len, + void *stripped_addr, size_t stripped_len, + FILE *outfile, const char *name) { - Elf64_Ehdr *hdr = (Elf64_Ehdr *)addr; + Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr; if (hdr->e_ident[EI_CLASS] == ELFCLASS64) { - go64(addr, len, outfile, name); + go64(raw_addr, raw_len, stripped_addr, stripped_len, + outfile, name); } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) { - go32(addr, len, outfile, name); + go32(raw_addr, raw_len, stripped_addr, stripped_len, + outfile, name); } else { fail("unknown ELF class\n"); } } +static void map_input(const char *name, void **addr, size_t *len, int prot) +{ + off_t tmp_len; + + int fd = open(name, O_RDONLY); + if (fd == -1) + err(1, "%s", name); + + tmp_len = lseek(fd, 0, SEEK_END); + if (tmp_len == (off_t)-1) + err(1, "lseek"); + *len = (size_t)tmp_len; + + *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0); + if (*addr == MAP_FAILED) + err(1, "mmap"); + + close(fd); +} + int main(int argc, char **argv) { - int fd; - off_t len; - void *addr; + size_t raw_len, stripped_len; + void *raw_addr, *stripped_addr; FILE *outfile; char *name, *tmp; int namelen; - if (argc != 3) { - printf("Usage: vdso2c INPUT OUTPUT\n"); + if (argc != 4) { + printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n"); return 1; } @@ -143,7 +219,7 @@ int main(int argc, char **argv) * Figure out the struct name. If we're writing to a .so file, * generate raw output insted. */ - name = strdup(argv[2]); + name = strdup(argv[3]); namelen = strlen(name); if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) { name = NULL; @@ -159,26 +235,18 @@ int main(int argc, char **argv) *tmp = '_'; } - fd = open(argv[1], O_RDONLY); - if (fd == -1) - err(1, "%s", argv[1]); - - len = lseek(fd, 0, SEEK_END); - if (len == (off_t)-1) - err(1, "lseek"); - - addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); - if (addr == MAP_FAILED) - err(1, "mmap"); + map_input(argv[1], &raw_addr, &raw_len, PROT_READ); + map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ); - outfilename = argv[2]; + outfilename = argv[3]; outfile = fopen(outfilename, "w"); if (!outfile) err(1, "%s", argv[2]); - go(addr, (size_t)len, outfile, name); + go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name); - munmap(addr, len); + munmap(raw_addr, raw_len); + munmap(stripped_addr, stripped_len); fclose(outfile); return 0; diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h index 11b65d4f941..0224987556c 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/vdso/vdso2c.h @@ -4,139 +4,23 @@ * are built for 32-bit userspace. */ -/* - * We're writing a section table for a few reasons: - * - * The Go runtime had a couple of bugs: it would read the section - * table to try to figure out how many dynamic symbols there were (it - * shouldn't have looked at the section table at all) and, if there - * were no SHT_SYNDYM section table entry, it would use an - * uninitialized value for the number of symbols. An empty DYNSYM - * table would work, but I see no reason not to write a valid one (and - * keep full performance for old Go programs). This hack is only - * needed on x86_64. - * - * The bug was introduced on 2012-08-31 by: - * https://code.google.com/p/go/source/detail?r=56ea40aac72b - * and was fixed on 2014-06-13 by: - * https://code.google.com/p/go/source/detail?r=fc1cd5e12595 - * - * Binutils has issues debugging the vDSO: it reads the section table to - * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which - * would break build-id if we removed the section table. Binutils - * also requires that shstrndx != 0. See: - * https://sourceware.org/bugzilla/show_bug.cgi?id=17064 - * - * elfutils might not look for PT_NOTE if there is a section table at - * all. I don't know whether this matters for any practical purpose. - * - * For simplicity, rather than hacking up a partial section table, we - * just write a mostly complete one. We omit non-dynamic symbols, - * though, since they're rather large. - * - * Once binutils gets fixed, we might be able to drop this for all but - * the 64-bit vdso, since build-id only works in kernel RPMs, and - * systems that update to new enough kernel RPMs will likely update - * binutils in sync. build-id has never worked for home-built kernel - * RPMs without manual symlinking, and I suspect that no one ever does - * that. - */ -struct BITSFUNC(fake_sections) -{ - ELF(Shdr) *table; - unsigned long table_offset; - int count, max_count; - - int in_shstrndx; - unsigned long shstr_offset; - const char *shstrtab; - size_t shstrtab_len; - - int out_shstrndx; -}; - -static unsigned int BITSFUNC(find_shname)(struct BITSFUNC(fake_sections) *out, - const char *name) -{ - const char *outname = out->shstrtab; - while (outname - out->shstrtab < out->shstrtab_len) { - if (!strcmp(name, outname)) - return (outname - out->shstrtab) + out->shstr_offset; - outname += strlen(outname) + 1; - } - - if (*name) - printf("Warning: could not find output name \"%s\"\n", name); - return out->shstr_offset + out->shstrtab_len - 1; /* Use a null. */ -} - -static void BITSFUNC(init_sections)(struct BITSFUNC(fake_sections) *out) -{ - if (!out->in_shstrndx) - fail("didn't find the fake shstrndx\n"); - - memset(out->table, 0, out->max_count * sizeof(ELF(Shdr))); - - if (out->max_count < 1) - fail("we need at least two fake output sections\n"); - - PUT_LE(&out->table[0].sh_type, SHT_NULL); - PUT_LE(&out->table[0].sh_name, BITSFUNC(find_shname)(out, "")); - - out->count = 1; -} - -static void BITSFUNC(copy_section)(struct BITSFUNC(fake_sections) *out, - int in_idx, const ELF(Shdr) *in, - const char *name) -{ - uint64_t flags = GET_LE(&in->sh_flags); - - bool copy = flags & SHF_ALLOC && - (GET_LE(&in->sh_size) || - (GET_LE(&in->sh_type) != SHT_RELA && - GET_LE(&in->sh_type) != SHT_REL)) && - strcmp(name, ".altinstructions") && - strcmp(name, ".altinstr_replacement"); - - if (!copy) - return; - - if (out->count >= out->max_count) - fail("too many copied sections (max = %d)\n", out->max_count); - - if (in_idx == out->in_shstrndx) - out->out_shstrndx = out->count; - - out->table[out->count] = *in; - PUT_LE(&out->table[out->count].sh_name, - BITSFUNC(find_shname)(out, name)); - - /* elfutils requires that a strtab have the correct type. */ - if (!strcmp(name, ".fake_shstrtab")) - PUT_LE(&out->table[out->count].sh_type, SHT_STRTAB); - - out->count++; -} - -static void BITSFUNC(go)(void *addr, size_t len, +static void BITSFUNC(go)(void *raw_addr, size_t raw_len, + void *stripped_addr, size_t stripped_len, FILE *outfile, const char *name) { int found_load = 0; unsigned long load_size = -1; /* Work around bogus warning */ - unsigned long data_size; - ELF(Ehdr) *hdr = (ELF(Ehdr) *)addr; + unsigned long mapping_size; + ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr; int i; unsigned long j; ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, *alt_sec = NULL; ELF(Dyn) *dyn = 0, *dyn_end = 0; const char *secstrings; - uint64_t syms[NSYMS] = {}; - - struct BITSFUNC(fake_sections) fake_sections = {}; + INT_BITS syms[NSYMS] = {}; - ELF(Phdr) *pt = (ELF(Phdr) *)(addr + GET_LE(&hdr->e_phoff)); + ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff)); /* Walk the segment table. */ for (i = 0; i < GET_LE(&hdr->e_phnum); i++) { @@ -154,14 +38,16 @@ static void BITSFUNC(go)(void *addr, size_t len, load_size = GET_LE(&pt[i].p_memsz); found_load = 1; } else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) { - dyn = addr + GET_LE(&pt[i].p_offset); - dyn_end = addr + GET_LE(&pt[i].p_offset) + + dyn = raw_addr + GET_LE(&pt[i].p_offset); + dyn_end = raw_addr + GET_LE(&pt[i].p_offset) + GET_LE(&pt[i].p_memsz); } } if (!found_load) fail("no PT_LOAD seg\n"); - data_size = (load_size + 4095) / 4096 * 4096; + + if (stripped_len < load_size) + fail("stripped input is too short\n"); /* Walk the dynamic table */ for (i = 0; dyn + i < dyn_end && @@ -173,11 +59,11 @@ static void BITSFUNC(go)(void *addr, size_t len, } /* Walk the section table */ - secstrings_hdr = addr + GET_LE(&hdr->e_shoff) + + secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) + GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx); - secstrings = addr + GET_LE(&secstrings_hdr->sh_offset); + secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset); for (i = 0; i < GET_LE(&hdr->e_shnum); i++) { - ELF(Shdr) *sh = addr + GET_LE(&hdr->e_shoff) + + ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) + GET_LE(&hdr->e_shentsize) * i; if (GET_LE(&sh->sh_type) == SHT_SYMTAB) symtab_hdr = sh; @@ -190,7 +76,7 @@ static void BITSFUNC(go)(void *addr, size_t len, if (!symtab_hdr) fail("no symbol table\n"); - strtab_hdr = addr + GET_LE(&hdr->e_shoff) + + strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) + GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link); /* Walk the symbol table */ @@ -198,9 +84,9 @@ static void BITSFUNC(go)(void *addr, size_t len, i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); i++) { int k; - ELF(Sym) *sym = addr + GET_LE(&symtab_hdr->sh_offset) + + ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) + GET_LE(&symtab_hdr->sh_entsize) * i; - const char *name = addr + GET_LE(&strtab_hdr->sh_offset) + + const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) + GET_LE(&sym->st_name); for (k = 0; k < NSYMS; k++) { @@ -209,75 +95,45 @@ static void BITSFUNC(go)(void *addr, size_t len, fail("duplicate symbol %s\n", required_syms[k].name); } + + /* + * Careful: we use negative addresses, but + * st_value is unsigned, so we rely + * on syms[k] being a signed type of the + * correct width. + */ syms[k] = GET_LE(&sym->st_value); } } - - if (!strcmp(name, "fake_shstrtab")) { - ELF(Shdr) *sh; - - fake_sections.in_shstrndx = GET_LE(&sym->st_shndx); - fake_sections.shstrtab = addr + GET_LE(&sym->st_value); - fake_sections.shstrtab_len = GET_LE(&sym->st_size); - sh = addr + GET_LE(&hdr->e_shoff) + - GET_LE(&hdr->e_shentsize) * - fake_sections.in_shstrndx; - fake_sections.shstr_offset = GET_LE(&sym->st_value) - - GET_LE(&sh->sh_addr); - } - } - - /* Build the output section table. */ - if (!syms[sym_VDSO_FAKE_SECTION_TABLE_START] || - !syms[sym_VDSO_FAKE_SECTION_TABLE_END]) - fail("couldn't find fake section table\n"); - if ((syms[sym_VDSO_FAKE_SECTION_TABLE_END] - - syms[sym_VDSO_FAKE_SECTION_TABLE_START]) % sizeof(ELF(Shdr))) - fail("fake section table size isn't a multiple of sizeof(Shdr)\n"); - fake_sections.table = addr + syms[sym_VDSO_FAKE_SECTION_TABLE_START]; - fake_sections.table_offset = syms[sym_VDSO_FAKE_SECTION_TABLE_START]; - fake_sections.max_count = (syms[sym_VDSO_FAKE_SECTION_TABLE_END] - - syms[sym_VDSO_FAKE_SECTION_TABLE_START]) / - sizeof(ELF(Shdr)); - - BITSFUNC(init_sections)(&fake_sections); - for (i = 0; i < GET_LE(&hdr->e_shnum); i++) { - ELF(Shdr) *sh = addr + GET_LE(&hdr->e_shoff) + - GET_LE(&hdr->e_shentsize) * i; - BITSFUNC(copy_section)(&fake_sections, i, sh, - secstrings + GET_LE(&sh->sh_name)); } - if (!fake_sections.out_shstrndx) - fail("didn't generate shstrndx?!?\n"); - - PUT_LE(&hdr->e_shoff, fake_sections.table_offset); - PUT_LE(&hdr->e_shentsize, sizeof(ELF(Shdr))); - PUT_LE(&hdr->e_shnum, fake_sections.count); - PUT_LE(&hdr->e_shstrndx, fake_sections.out_shstrndx); /* Validate mapping addresses. */ for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { - if (!syms[i]) + INT_BITS symval = syms[special_pages[i]]; + + if (!symval) continue; /* The mapping isn't used; ignore it. */ - if (syms[i] % 4096) + if (symval % 4096) fail("%s must be a multiple of 4096\n", required_syms[i].name); - if (syms[i] < data_size) - fail("%s must be after the text mapping\n", + if (symval + 4096 < syms[sym_vvar_start]) + fail("%s underruns vvar_start\n", required_syms[i].name); - if (syms[sym_end_mapping] < syms[i] + 4096) - fail("%s overruns end_mapping\n", + if (symval + 4096 > 0) + fail("%s is on the wrong side of the vdso text\n", required_syms[i].name); } - if (syms[sym_end_mapping] % 4096) - fail("end_mapping must be a multiple of 4096\n"); + if (syms[sym_vvar_start] % 4096) + fail("vvar_begin must be a multiple of 4096\n"); if (!name) { - fwrite(addr, load_size, 1, outfile); + fwrite(stripped_addr, stripped_len, 1, outfile); return; } + mapping_size = (stripped_len + 4095) / 4096 * 4096; + fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n"); fprintf(outfile, "#include <linux/linkage.h>\n"); fprintf(outfile, "#include <asm/page_types.h>\n"); @@ -285,20 +141,21 @@ static void BITSFUNC(go)(void *addr, size_t len, fprintf(outfile, "\n"); fprintf(outfile, "static unsigned char raw_data[%lu] __page_aligned_data = {", - data_size); - for (j = 0; j < load_size; j++) { + mapping_size); + for (j = 0; j < stripped_len; j++) { if (j % 10 == 0) fprintf(outfile, "\n\t"); - fprintf(outfile, "0x%02X, ", (int)((unsigned char *)addr)[j]); + fprintf(outfile, "0x%02X, ", + (int)((unsigned char *)stripped_addr)[j]); } fprintf(outfile, "\n};\n\n"); fprintf(outfile, "static struct page *pages[%lu];\n\n", - data_size / 4096); + mapping_size / 4096); fprintf(outfile, "const struct vdso_image %s = {\n", name); fprintf(outfile, "\t.data = raw_data,\n"); - fprintf(outfile, "\t.size = %lu,\n", data_size); + fprintf(outfile, "\t.size = %lu,\n", mapping_size); fprintf(outfile, "\t.text_mapping = {\n"); fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); fprintf(outfile, "\t\t.pages = pages,\n"); @@ -311,8 +168,8 @@ static void BITSFUNC(go)(void *addr, size_t len, } for (i = 0; i < NSYMS; i++) { if (required_syms[i].export && syms[i]) - fprintf(outfile, "\t.sym_%s = 0x%" PRIx64 ",\n", - required_syms[i].name, syms[i]); + fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n", + required_syms[i].name, (int64_t)syms[i]); } fprintf(outfile, "};\n"); } diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index e4f7781ee16..e904c270573 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -115,23 +115,6 @@ static __init int ia32_binfmt_init(void) return 0; } __initcall(ia32_binfmt_init); -#endif - -#else /* CONFIG_X86_32 */ - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ - return NULL; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - return 0; -} - -int in_gate_area_no_mm(unsigned long addr) -{ - return 0; -} +#endif /* CONFIG_SYSCTL */ #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 5a5176de8d0..970463b566c 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -93,7 +93,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long addr; + unsigned long addr, text_start; int ret = 0; static struct page *no_pages[] = {NULL}; static struct vm_special_mapping vvar_mapping = { @@ -103,26 +103,28 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) if (calculate_addr) { addr = vdso_addr(current->mm->start_stack, - image->sym_end_mapping); + image->size - image->sym_vvar_start); } else { addr = 0; } down_write(&mm->mmap_sem); - addr = get_unmapped_area(NULL, addr, image->sym_end_mapping, 0, 0); + addr = get_unmapped_area(NULL, addr, + image->size - image->sym_vvar_start, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; } - current->mm->context.vdso = (void __user *)addr; + text_start = addr - image->sym_vvar_start; + current->mm->context.vdso = (void __user *)text_start; /* * MAYWRITE to allow gdb to COW and set breakpoints */ vma = _install_special_mapping(mm, - addr, + text_start, image->size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, @@ -134,9 +136,9 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) } vma = _install_special_mapping(mm, - addr + image->size, - image->sym_end_mapping - image->size, - VM_READ, + addr, + -image->sym_vvar_start, + VM_READ|VM_MAYREAD, &vvar_mapping); if (IS_ERR(vma)) { @@ -146,7 +148,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) if (image->sym_vvar_page) ret = remap_pfn_range(vma, - addr + image->sym_vvar_page, + text_start + image->sym_vvar_page, __pa_symbol(&__vvar_page) >> PAGE_SHIFT, PAGE_SIZE, PAGE_READONLY); @@ -157,7 +159,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) #ifdef CONFIG_HPET_TIMER if (hpet_address && image->sym_hpet_page) { ret = io_remap_pfn_range(vma, - addr + image->sym_hpet_page, + text_start + image->sym_hpet_page, hpet_address >> PAGE_SHIFT, PAGE_SIZE, pgprot_noncached(PAGE_READONLY)); diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 96ab2c09cb6..7322755f337 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -22,3 +22,4 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o obj-$(CONFIG_XEN_DOM0) += apic.o vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o +obj-$(CONFIG_XEN_EFI) += efi.o diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c new file mode 100644 index 00000000000..be14cc3e48d --- /dev/null +++ b/arch/x86/xen/efi.c @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2014 Oracle Co., Daniel Kiper + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/bitops.h> +#include <linux/efi.h> +#include <linux/init.h> +#include <linux/string.h> + +#include <xen/xen-ops.h> + +#include <asm/page.h> +#include <asm/setup.h> + +void __init xen_efi_init(void) +{ + efi_system_table_t *efi_systab_xen; + + efi_systab_xen = xen_efi_probe(); + + if (efi_systab_xen == NULL) + return; + + strncpy((char *)&boot_params.efi_info.efi_loader_signature, "Xen", + sizeof(boot_params.efi_info.efi_loader_signature)); + boot_params.efi_info.efi_systab = (__u32)__pa(efi_systab_xen); + boot_params.efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32); + + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_PARAVIRT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); +} diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ffb101e4573..1a3f0445432 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -821,7 +821,7 @@ static void xen_convert_trap_info(const struct desc_ptr *desc, void xen_copy_trap_info(struct trap_info *traps) { - const struct desc_ptr *desc = &__get_cpu_var(idt_desc); + const struct desc_ptr *desc = this_cpu_ptr(&idt_desc); xen_convert_trap_info(desc, traps); } @@ -838,7 +838,7 @@ static void xen_load_idt(const struct desc_ptr *desc) spin_lock(&lock); - __get_cpu_var(idt_desc) = *desc; + memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); xen_convert_trap_info(desc, traps); @@ -1463,6 +1463,7 @@ static void __ref xen_setup_gdt(int cpu) pv_cpu_ops.load_gdt = xen_load_gdt; } +#ifdef CONFIG_XEN_PVH /* * A PV guest starts with default flags that are not set for PVH, set them * here asap. @@ -1508,17 +1509,21 @@ static void __init xen_pvh_early_guest_init(void) return; xen_have_vector_callback = 1; + + xen_pvh_early_cpu_init(0, false); xen_pvh_set_cr_flags(0); #ifdef CONFIG_X86_32 BUG(); /* PVH: Implement proper support. */ #endif } +#endif /* CONFIG_XEN_PVH */ /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; + unsigned long initrd_start = 0; int rc; if (!xen_start_info) @@ -1527,7 +1532,9 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_domain_type = XEN_PV_DOMAIN; xen_setup_features(); +#ifdef CONFIG_XEN_PVH xen_pvh_early_guest_init(); +#endif xen_setup_machphys_mapping(); /* Install Xen paravirt ops */ @@ -1559,8 +1566,6 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); - __supported_pte_mask |= _PAGE_IOMAP; - /* * Prevent page tables from being allocated in highmem, even * if CONFIG_HIGHPTE is enabled. @@ -1667,10 +1672,16 @@ asmlinkage __visible void __init xen_start_kernel(void) new_cpu_data.x86_capability[0] = cpuid_edx(1); #endif + if (xen_start_info->mod_start) { + if (xen_start_info->flags & SIF_MOD_START_PFN) + initrd_start = PFN_PHYS(xen_start_info->mod_start); + else + initrd_start = __pa(xen_start_info->mod_start); + } + /* Poke various useful things into boot_params */ boot_params.hdr.type_of_loader = (9 << 4) | 0; - boot_params.hdr.ramdisk_image = xen_start_info->mod_start - ? __pa(xen_start_info->mod_start) : 0; + boot_params.hdr.ramdisk_image = initrd_start; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); @@ -1718,6 +1729,8 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_setup_runstate_info(0); + xen_efi_init(); + /* Start the world */ #ifdef CONFIG_X86_32 i386_start_kernel(); @@ -1826,8 +1839,19 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_mmu_ops(); } +static bool xen_nopv = false; +static __init int xen_parse_nopv(char *arg) +{ + xen_nopv = true; + return 0; +} +early_param("xen_nopv", xen_parse_nopv); + static uint32_t __init xen_hvm_platform(void) { + if (xen_nopv) + return 0; + if (xen_pv_domain()) return 0; @@ -1836,6 +1860,8 @@ static uint32_t __init xen_hvm_platform(void) bool xen_hvm_need_lapic(void) { + if (xen_nopv) + return false; if (xen_pv_domain()) return false; if (!xen_hvm_domain()) diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index ebfa9b2c871..1580e7a5a4c 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -49,7 +49,7 @@ static struct gnttab_vm_area { struct vm_struct *area; pte_t **ptes; -} gnttab_shared_vm_area, gnttab_status_vm_area; +} gnttab_shared_vm_area; int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, @@ -73,43 +73,16 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, return 0; } -int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, - unsigned long max_nr_gframes, - grant_status_t **__shared) -{ - grant_status_t *shared = *__shared; - unsigned long addr; - unsigned long i; - - if (shared == NULL) - *__shared = shared = gnttab_status_vm_area.area->addr; - - addr = (unsigned long)shared; - - for (i = 0; i < nr_gframes; i++) { - set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i], - mfn_pte(frames[i], PAGE_KERNEL)); - addr += PAGE_SIZE; - } - - return 0; -} - void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) { - pte_t **ptes; unsigned long addr; unsigned long i; - if (shared == gnttab_status_vm_area.area->addr) - ptes = gnttab_status_vm_area.ptes; - else - ptes = gnttab_shared_vm_area.ptes; - addr = (unsigned long)shared; for (i = 0; i < nr_gframes; i++) { - set_pte_at(&init_mm, addr, ptes[i], __pte(0)); + set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i], + __pte(0)); addr += PAGE_SIZE; } } @@ -129,35 +102,12 @@ static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames) return 0; } -static void arch_gnttab_vfree(struct gnttab_vm_area *area) +int arch_gnttab_init(unsigned long nr_shared) { - free_vm_area(area->area); - kfree(area->ptes); -} - -int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status) -{ - int ret; - if (!xen_pv_domain()) return 0; - ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared); - if (ret < 0) - return ret; - - /* - * Always allocate the space for the status frames in case - * we're migrated to a host with V2 support. - */ - ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status); - if (ret < 0) - goto err; - - return 0; - err: - arch_gnttab_vfree(&gnttab_shared_vm_area); - return -ENOMEM; + return arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared); } #ifdef CONFIG_XEN_PVH @@ -168,6 +118,7 @@ static int __init xlated_setup_gnttab_pages(void) { struct page **pages; xen_pfn_t *pfns; + void *vaddr; int rc; unsigned int i; unsigned long nr_grant_frames = gnttab_max_grant_frames(); @@ -193,21 +144,20 @@ static int __init xlated_setup_gnttab_pages(void) for (i = 0; i < nr_grant_frames; i++) pfns[i] = page_to_pfn(pages[i]); - rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames, - &xen_auto_xlat_grant_frames.vaddr); - - if (rc) { + vaddr = vmap(pages, nr_grant_frames, 0, PAGE_KERNEL); + if (!vaddr) { pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__, nr_grant_frames, rc); free_xenballooned_pages(nr_grant_frames, pages); kfree(pages); kfree(pfns); - return rc; + return -ENOMEM; } kfree(pages); xen_auto_xlat_grant_frames.pfn = pfns; xen_auto_xlat_grant_frames.count = nr_grant_frames; + xen_auto_xlat_grant_frames.vaddr = vaddr; return 0; } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index e8a1201c329..f62af7647ec 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -399,38 +399,14 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (unlikely(mfn == INVALID_P2M_ENTRY)) { mfn = 0; flags = 0; - } else { - /* - * Paramount to do this test _after_ the - * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY & - * IDENTITY_FRAME_BIT resolves to true. - */ - mfn &= ~FOREIGN_FRAME_BIT; - if (mfn & IDENTITY_FRAME_BIT) { - mfn &= ~IDENTITY_FRAME_BIT; - flags |= _PAGE_IOMAP; - } - } + } else + mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); val = ((pteval_t)mfn << PAGE_SHIFT) | flags; } return val; } -static pteval_t iomap_pte(pteval_t val) -{ - if (val & _PAGE_PRESENT) { - unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; - pteval_t flags = val & PTE_FLAGS_MASK; - - /* We assume the pte frame number is a MFN, so - just use it as-is. */ - val = ((pteval_t)pfn << PAGE_SHIFT) | flags; - } - - return val; -} - __visible pteval_t xen_pte_val(pte_t pte) { pteval_t pteval = pte.pte; @@ -441,9 +417,6 @@ __visible pteval_t xen_pte_val(pte_t pte) pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; } #endif - if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) - return pteval; - return pte_mfn_to_pfn(pteval); } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); @@ -481,7 +454,6 @@ void xen_set_pat(u64 pat) __visible pte_t xen_make_pte(pteval_t pte) { - phys_addr_t addr = (pte & PTE_PFN_MASK); #if 0 /* If Linux is trying to set a WC pte, then map to the Xen WC. * If _PAGE_PAT is set, then it probably means it is really @@ -496,19 +468,7 @@ __visible pte_t xen_make_pte(pteval_t pte) pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; } #endif - /* - * Unprivileged domains are allowed to do IOMAPpings for - * PCI passthrough, but not map ISA space. The ISA - * mappings are just dummy local mappings to keep other - * parts of the kernel happy. - */ - if (unlikely(pte & _PAGE_IOMAP) && - (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { - pte = iomap_pte(pte); - } else { - pte &= ~_PAGE_IOMAP; - pte = pte_pfn_to_mfn(pte); - } + pte = pte_pfn_to_mfn(pte); return native_make_pte(pte); } @@ -1866,12 +1826,11 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, * * We can construct this by grafting the Xen provided pagetable into * head_64.S's preconstructed pagetables. We copy the Xen L2's into - * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This - * means that only the kernel has a physical mapping to start with - - * but that's enough to get __va working. We need to fill in the rest - * of the physical mapping once some sort of allocator has been set - * up. - * NOTE: for PVH, the page tables are native. + * level2_ident_pgt, and level2_kernel_pgt. This means that only the + * kernel has a physical mapping to start with - but that's enough to + * get __va working. We need to fill in the rest of the physical + * mapping once some sort of allocator has been set up. NOTE: for + * PVH, the page tables are native. */ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { @@ -1902,8 +1861,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* L3_i[0] -> level2_ident_pgt */ convert_pfn_mfn(level3_ident_pgt); /* L3_k[510] -> level2_kernel_pgt - * L3_i[511] -> level2_fixmap_pgt */ + * L3_k[511] -> level2_fixmap_pgt */ convert_pfn_mfn(level3_kernel_pgt); + + /* L3_k[511][506] -> level1_fixmap_pgt */ + convert_pfn_mfn(level2_fixmap_pgt); } /* We get [511][511] and have Xen's version of level2_kernel_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); @@ -1913,21 +1875,15 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) addr[1] = (unsigned long)l3; addr[2] = (unsigned long)l2; /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: - * Both L4[272][0] and L4[511][511] have entries that point to the same + * Both L4[272][0] and L4[511][510] have entries that point to the same * L2 (PMD) tables. Meaning that if you modify it in __va space * it will be also modified in the __ka space! (But if you just * modify the PMD table to point to other PTE's or none, then you * are OK - which is what cleanup_highmap does) */ copy_page(level2_ident_pgt, l2); - /* Graft it onto L4[511][511] */ + /* Graft it onto L4[511][510] */ copy_page(level2_kernel_pgt, l2); - /* Get [511][510] and graft that in level2_fixmap_pgt */ - l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); - l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); - copy_page(level2_fixmap_pgt, l2); - /* Note that we don't do anything with level1_fixmap_pgt which - * we don't need. */ if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); @@ -1937,6 +1893,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); /* Pin down new L4 */ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, @@ -2094,7 +2051,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) default: /* By default, set_fixmap is used for hardware mappings */ - pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP)); + pte = mfn_pte(phys, prot); break; } diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 0d82003e76a..ea54a08d830 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -54,7 +54,7 @@ DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); void xen_mc_flush(void) { - struct mc_buffer *b = &__get_cpu_var(mc_buffer); + struct mc_buffer *b = this_cpu_ptr(&mc_buffer); struct multicall_entry *mc; int ret = 0; unsigned long flags; @@ -131,7 +131,7 @@ void xen_mc_flush(void) struct multicall_space __xen_mc_entry(size_t args) { - struct mc_buffer *b = &__get_cpu_var(mc_buffer); + struct mc_buffer *b = this_cpu_ptr(&mc_buffer); struct multicall_space ret; unsigned argidx = roundup(b->argidx, sizeof(u64)); @@ -162,7 +162,7 @@ struct multicall_space __xen_mc_entry(size_t args) struct multicall_space xen_mc_extend_args(unsigned long op, size_t size) { - struct mc_buffer *b = &__get_cpu_var(mc_buffer); + struct mc_buffer *b = this_cpu_ptr(&mc_buffer); struct multicall_space ret = { NULL, NULL }; BUG_ON(preemptible()); @@ -192,7 +192,7 @@ out: void xen_mc_callback(void (*fn)(void *), void *data) { - struct mc_buffer *b = &__get_cpu_var(mc_buffer); + struct mc_buffer *b = this_cpu_ptr(&mc_buffer); struct callback *cb; if (b->cbidx == MC_BATCH) { diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 9bb3d82ffec..9f5983b01ed 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -173,6 +173,7 @@ #include <xen/balloon.h> #include <xen/grant_table.h> +#include "p2m.h" #include "multicalls.h" #include "xen-ops.h" @@ -180,12 +181,6 @@ static void __init m2p_override_init(void); unsigned long xen_max_p2m_pfn __read_mostly; -#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) -#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - -#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) - /* Placeholders for holes in the address space */ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); @@ -202,16 +197,12 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE); RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); -/* We might hit two boundary violations at the start and end, at max each - * boundary violation will require three middle nodes. */ -RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3); - -/* When we populate back during bootup, the amount of pages can vary. The - * max we have is seen is 395979, but that does not mean it can't be more. - * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle - * it can re-use Xen provided mfn_list array, so we only need to allocate at - * most three P2M top nodes. */ -RESERVE_BRK(p2m_populated, PAGE_SIZE * 3); +/* For each I/O range remapped we may lose up to two leaf pages for the boundary + * violations and three mid pages to cover up to 3GB. With + * early_can_reuse_p2m_middle() most of the leaf pages will be reused by the + * remapped region. + */ +RESERVE_BRK(p2m_identity_remap, PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES); static inline unsigned p2m_top_index(unsigned long pfn) { @@ -841,10 +832,9 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, pfn = ALIGN(pfn, P2M_PER_PAGE); } - if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s), + WARN((pfn - pfn_s) != (pfn_e - pfn_s), "Identity mapping failed. We are %ld short of 1-1 mappings!\n", - (pfn_e - pfn_s) - (pfn - pfn_s))) - printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn); + (pfn_e - pfn_s) - (pfn - pfn_s)); return pfn - pfn_s; } diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h new file mode 100644 index 00000000000..ad8aee24ab7 --- /dev/null +++ b/arch/x86/xen/p2m.h @@ -0,0 +1,15 @@ +#ifndef _XEN_P2M_H +#define _XEN_P2M_H + +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) + +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + +#define MAX_REMAP_RANGES 10 + +extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e); + +#endif /* _XEN_P2M_H */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 2e555163c2f..af7216128d9 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -29,6 +29,7 @@ #include <xen/features.h> #include "xen-ops.h" #include "vdso.h" +#include "p2m.h" /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; @@ -46,6 +47,9 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; +/* Buffer used to remap identity mapped pages */ +unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata; + /* * The maximum amount of extra memory compared to the base size. The * main scaling factor is the size of struct page. At extreme ratios @@ -151,107 +155,325 @@ static unsigned long __init xen_do_chunk(unsigned long start, return len; } -static unsigned long __init xen_release_chunk(unsigned long start, - unsigned long end) -{ - return xen_do_chunk(start, end, true); -} - -static unsigned long __init xen_populate_chunk( +/* + * Finds the next RAM pfn available in the E820 map after min_pfn. + * This function updates min_pfn with the pfn found and returns + * the size of that range or zero if not found. + */ +static unsigned long __init xen_find_pfn_range( const struct e820entry *list, size_t map_size, - unsigned long max_pfn, unsigned long *last_pfn, - unsigned long credits_left) + unsigned long *min_pfn) { const struct e820entry *entry; unsigned int i; unsigned long done = 0; - unsigned long dest_pfn; for (i = 0, entry = list; i < map_size; i++, entry++) { unsigned long s_pfn; unsigned long e_pfn; - unsigned long pfns; - long capacity; - - if (credits_left <= 0) - break; if (entry->type != E820_RAM) continue; e_pfn = PFN_DOWN(entry->addr + entry->size); - /* We only care about E820 after the xen_start_info->nr_pages */ - if (e_pfn <= max_pfn) + /* We only care about E820 after this */ + if (e_pfn < *min_pfn) continue; s_pfn = PFN_UP(entry->addr); - /* If the E820 falls within the nr_pages, we want to start - * at the nr_pages PFN. - * If that would mean going past the E820 entry, skip it + + /* If min_pfn falls within the E820 entry, we want to start + * at the min_pfn PFN. */ - if (s_pfn <= max_pfn) { - capacity = e_pfn - max_pfn; - dest_pfn = max_pfn; + if (s_pfn <= *min_pfn) { + done = e_pfn - *min_pfn; } else { - capacity = e_pfn - s_pfn; - dest_pfn = s_pfn; + done = e_pfn - s_pfn; + *min_pfn = s_pfn; } + break; + } - if (credits_left < capacity) - capacity = credits_left; + return done; +} - pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false); - done += pfns; - *last_pfn = (dest_pfn + pfns); - if (pfns < capacity) - break; - credits_left -= pfns; +/* + * This releases a chunk of memory and then does the identity map. It's used as + * as a fallback if the remapping fails. + */ +static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, + unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity, + unsigned long *released) +{ + WARN_ON(start_pfn > end_pfn); + + /* Need to release pages first */ + *released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true); + *identity += set_phys_range_identity(start_pfn, end_pfn); +} + +/* + * Helper function to update both the p2m and m2p tables. + */ +static unsigned long __init xen_update_mem_tables(unsigned long pfn, + unsigned long mfn) +{ + struct mmu_update update = { + .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, + .val = pfn + }; + + /* Update p2m */ + if (!early_set_phys_to_machine(pfn, mfn)) { + WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n", + pfn, mfn); + return false; } - return done; + + /* Update m2p */ + if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) { + WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n", + mfn, pfn); + return false; + } + + return true; } -static void __init xen_set_identity_and_release_chunk( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, - unsigned long *released, unsigned long *identity) +/* + * This function updates the p2m and m2p tables with an identity map from + * start_pfn to start_pfn+size and remaps the underlying RAM of the original + * allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks + * to not exhaust the reserved brk space. Doing it in properly aligned blocks + * ensures we only allocate the minimum required leaf pages in the p2m table. It + * copies the existing mfns from the p2m table under the 1:1 map, overwrites + * them with the identity map and then updates the p2m and m2p tables with the + * remapped memory. + */ +static unsigned long __init xen_do_set_identity_and_remap_chunk( + unsigned long start_pfn, unsigned long size, unsigned long remap_pfn) { - unsigned long pfn; + unsigned long ident_pfn_iter, remap_pfn_iter; + unsigned long ident_start_pfn_align, remap_start_pfn_align; + unsigned long ident_end_pfn_align, remap_end_pfn_align; + unsigned long ident_boundary_pfn, remap_boundary_pfn; + unsigned long ident_cnt = 0; + unsigned long remap_cnt = 0; + unsigned long left = size; + unsigned long mod; + int i; + + WARN_ON(size == 0); + + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); /* - * If the PFNs are currently mapped, clear the mappings - * (except for the ISA region which must be 1:1 mapped) to - * release the refcounts (in Xen) on the original frames. + * Determine the proper alignment to remap memory in P2M_PER_PAGE sized + * blocks. We need to keep track of both the existing pfn mapping and + * the new pfn remapping. */ - for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { - pte_t pte = __pte_ma(0); + mod = start_pfn % P2M_PER_PAGE; + ident_start_pfn_align = + mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn; + mod = remap_pfn % P2M_PER_PAGE; + remap_start_pfn_align = + mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn; + mod = (start_pfn + size) % P2M_PER_PAGE; + ident_end_pfn_align = start_pfn + size - mod; + mod = (remap_pfn + size) % P2M_PER_PAGE; + remap_end_pfn_align = remap_pfn + size - mod; + + /* Iterate over each p2m leaf node in each range */ + for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align; + ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align; + ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) { + /* Check we aren't past the end */ + BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size); + BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size); + + /* Save p2m mappings */ + for (i = 0; i < P2M_PER_PAGE; i++) + xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i); + + /* Set identity map which will free a p2m leaf */ + ident_cnt += set_phys_range_identity(ident_pfn_iter, + ident_pfn_iter + P2M_PER_PAGE); + +#ifdef DEBUG + /* Helps verify a p2m leaf has been freed */ + for (i = 0; i < P2M_PER_PAGE; i++) { + unsigned int pfn = ident_pfn_iter + i; + BUG_ON(pfn_to_mfn(pfn) != pfn); + } +#endif + /* Now remap memory */ + for (i = 0; i < P2M_PER_PAGE; i++) { + unsigned long mfn = xen_remap_buf[i]; + + /* This will use the p2m leaf freed above */ + if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) { + WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n", + remap_pfn_iter + i, mfn); + return 0; + } + + remap_cnt++; + } - if (pfn < PFN_UP(ISA_END_ADDRESS)) - pte = mfn_pte(pfn, PAGE_KERNEL_IO); + left -= P2M_PER_PAGE; + } - (void)HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0); + /* Max boundary space possible */ + BUG_ON(left > (P2M_PER_PAGE - 1) * 2); + + /* Now handle the boundary conditions */ + ident_boundary_pfn = start_pfn; + remap_boundary_pfn = remap_pfn; + for (i = 0; i < left; i++) { + unsigned long mfn; + + /* These two checks move from the start to end boundaries */ + if (ident_boundary_pfn == ident_start_pfn_align) + ident_boundary_pfn = ident_pfn_iter; + if (remap_boundary_pfn == remap_start_pfn_align) + remap_boundary_pfn = remap_pfn_iter; + + /* Check we aren't past the end */ + BUG_ON(ident_boundary_pfn >= start_pfn + size); + BUG_ON(remap_boundary_pfn >= remap_pfn + size); + + mfn = pfn_to_mfn(ident_boundary_pfn); + + if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) { + WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n", + remap_pfn_iter + i, mfn); + return 0; + } + remap_cnt++; + + ident_boundary_pfn++; + remap_boundary_pfn++; } - if (start_pfn < nr_pages) - *released += xen_release_chunk( - start_pfn, min(end_pfn, nr_pages)); + /* Finish up the identity map */ + if (ident_start_pfn_align >= ident_end_pfn_align) { + /* + * In this case we have an identity range which does not span an + * aligned block so everything needs to be identity mapped here. + * If we didn't check this we might remap too many pages since + * the align boundaries are not meaningful in this case. + */ + ident_cnt += set_phys_range_identity(start_pfn, + start_pfn + size); + } else { + /* Remapped above so check each end of the chunk */ + if (start_pfn < ident_start_pfn_align) + ident_cnt += set_phys_range_identity(start_pfn, + ident_start_pfn_align); + if (start_pfn + size > ident_pfn_iter) + ident_cnt += set_phys_range_identity(ident_pfn_iter, + start_pfn + size); + } - *identity += set_phys_range_identity(start_pfn, end_pfn); + BUG_ON(ident_cnt != size); + BUG_ON(remap_cnt != size); + + return size; } -static unsigned long __init xen_set_identity_and_release( - const struct e820entry *list, size_t map_size, unsigned long nr_pages) +/* + * This function takes a contiguous pfn range that needs to be identity mapped + * and: + * + * 1) Finds a new range of pfns to use to remap based on E820 and remap_pfn. + * 2) Calls the do_ function to actually do the mapping/remapping work. + * + * The goal is to not allocate additional memory but to remap the existing + * pages. In the case of an error the underlying memory is simply released back + * to Xen and not remapped. + */ +static unsigned long __init xen_set_identity_and_remap_chunk( + const struct e820entry *list, size_t map_size, unsigned long start_pfn, + unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, + unsigned long *identity, unsigned long *remapped, + unsigned long *released) +{ + unsigned long pfn; + unsigned long i = 0; + unsigned long n = end_pfn - start_pfn; + + while (i < n) { + unsigned long cur_pfn = start_pfn + i; + unsigned long left = n - i; + unsigned long size = left; + unsigned long remap_range_size; + + /* Do not remap pages beyond the current allocation */ + if (cur_pfn >= nr_pages) { + /* Identity map remaining pages */ + *identity += set_phys_range_identity(cur_pfn, + cur_pfn + size); + break; + } + if (cur_pfn + size > nr_pages) + size = nr_pages - cur_pfn; + + remap_range_size = xen_find_pfn_range(list, map_size, + &remap_pfn); + if (!remap_range_size) { + pr_warning("Unable to find available pfn range, not remapping identity pages\n"); + xen_set_identity_and_release_chunk(cur_pfn, + cur_pfn + left, nr_pages, identity, released); + break; + } + /* Adjust size to fit in current e820 RAM region */ + if (size > remap_range_size) + size = remap_range_size; + + if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) { + WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n", + cur_pfn, size, remap_pfn); + xen_set_identity_and_release_chunk(cur_pfn, + cur_pfn + left, nr_pages, identity, released); + break; + } + + /* Update variables to reflect new mappings. */ + i += size; + remap_pfn += size; + *identity += size; + *remapped += size; + } + + /* + * If the PFNs are currently mapped, the VA mapping also needs + * to be updated to be 1:1. + */ + for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) + (void)HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(pfn, PAGE_KERNEL_IO), 0); + + return remap_pfn; +} + +static unsigned long __init xen_set_identity_and_remap( + const struct e820entry *list, size_t map_size, unsigned long nr_pages, + unsigned long *released) { phys_addr_t start = 0; - unsigned long released = 0; unsigned long identity = 0; + unsigned long remapped = 0; + unsigned long last_pfn = nr_pages; const struct e820entry *entry; + unsigned long num_released = 0; int i; /* * Combine non-RAM regions and gaps until a RAM region (or the * end of the map) is reached, then set the 1:1 map and - * release the pages (if available) in those non-RAM regions. + * remap the memory in those non-RAM regions. * * The combined non-RAM regions are rounded to a whole number * of pages so any partial pages are accessible via the 1:1 @@ -269,22 +491,24 @@ static unsigned long __init xen_set_identity_and_release( end_pfn = PFN_UP(entry->addr); if (start_pfn < end_pfn) - xen_set_identity_and_release_chunk( - start_pfn, end_pfn, nr_pages, - &released, &identity); - + last_pfn = xen_set_identity_and_remap_chunk( + list, map_size, start_pfn, + end_pfn, nr_pages, last_pfn, + &identity, &remapped, + &num_released); start = end; } } - if (released) - printk(KERN_INFO "Released %lu pages of unused memory\n", released); - if (identity) - printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); + *released = num_released; - return released; -} + pr_info("Set %ld page(s) to 1-1 mapping\n", identity); + pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped, + last_pfn); + pr_info("Released %ld page(s)\n", num_released); + return last_pfn; +} static unsigned long __init xen_get_max_pages(void) { unsigned long max_pages = MAX_DOMAIN_PAGES; @@ -347,7 +571,6 @@ char * __init xen_memory_setup(void) unsigned long max_pages; unsigned long last_pfn = 0; unsigned long extra_pages = 0; - unsigned long populated; int i; int op; @@ -392,20 +615,11 @@ char * __init xen_memory_setup(void) extra_pages += max_pages - max_pfn; /* - * Set P2M for all non-RAM pages and E820 gaps to be identity - * type PFNs. Any RAM pages that would be made inaccesible by - * this are first released. + * Set identity map on non-RAM pages and remap the underlying RAM. */ - xen_released_pages = xen_set_identity_and_release( - map, memmap.nr_entries, max_pfn); - - /* - * Populate back the non-RAM pages and E820 gaps that had been - * released. */ - populated = xen_populate_chunk(map, memmap.nr_entries, - max_pfn, &last_pfn, xen_released_pages); + last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, + &xen_released_pages); - xen_released_pages -= populated; extra_pages += xen_released_pages; if (last_pfn > max_pfn) { diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 7005974c3ff..8650cdb5320 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -37,6 +37,7 @@ #include <xen/hvc-console.h> #include "xen-ops.h" #include "mmu.h" +#include "smp.h" cpumask_var_t xen_cpu_initialized_map; @@ -99,10 +100,14 @@ static void cpu_bringup(void) wmb(); /* make sure everything is out */ } -/* Note: cpu parameter is only relevant for PVH */ -static void cpu_bringup_and_idle(int cpu) +/* + * Note: cpu parameter is only relevant for PVH. The reason for passing it + * is we can't do smp_processor_id until the percpu segments are loaded, for + * which we need the cpu number! So we pass it in rdi as first parameter. + */ +asmlinkage __visible void cpu_bringup_and_idle(int cpu) { -#ifdef CONFIG_X86_64 +#ifdef CONFIG_XEN_PVH if (xen_feature(XENFEAT_auto_translated_physmap) && xen_feature(XENFEAT_supervisor_mode_kernel)) xen_pvh_secondary_vcpu_init(cpu); @@ -360,6 +365,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) struct desc_struct *gdt; unsigned long gdt_mfn; + /* used to tell cpu_init() that it can proceed with initialization */ + cpumask_set_cpu(cpu, cpu_callout_mask); if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) return 0; @@ -374,11 +381,10 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->user_regs.fs = __KERNEL_PERCPU; ctxt->user_regs.gs = __KERNEL_STACK_CANARY; #endif - ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; - memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); if (!xen_feature(XENFEAT_auto_translated_physmap)) { + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; ctxt->flags = VGCF_IN_KERNEL; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ ctxt->user_regs.ds = __USER_DS; @@ -413,15 +419,18 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) (unsigned long)xen_failsafe_callback; ctxt->user_regs.cs = __KERNEL_CS; per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); -#ifdef CONFIG_X86_32 } -#else - } else - /* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with - * %rdi having the cpu number - which means are passing in - * as the first parameter the cpu. Subtle! +#ifdef CONFIG_XEN_PVH + else { + /* + * The vcpu comes on kernel page tables which have the NX pte + * bit set. This means before DS/SS is touched, NX in + * EFER must be set. Hence the following assembly glue code. */ + ctxt->user_regs.eip = (unsigned long)xen_pvh_early_cpu_init; ctxt->user_regs.rdi = cpu; + ctxt->user_regs.rsi = true; /* entry == true */ + } #endif ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index c7c2d89efd7..963d62a35c8 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -8,4 +8,12 @@ extern void xen_send_IPI_allbutself(int vector); extern void xen_send_IPI_all(int vector); extern void xen_send_IPI_self(int vector); +#ifdef CONFIG_XEN_PVH +extern void xen_pvh_early_cpu_init(int cpu, bool entry); +#else +static inline void xen_pvh_early_cpu_init(int cpu, bool entry) +{ +} +#endif + #endif diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 0ba5f3b967f..23b45eb9a89 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -109,7 +109,7 @@ static bool xen_pvspin = true; __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { int irq = __this_cpu_read(lock_kicker_irq); - struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting); + struct xen_lock_waiting *w = this_cpu_ptr(&lock_waiting); int cpu = smp_processor_id(); u64 start; unsigned long flags; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 7b78f88c170..a1d430b112b 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -80,7 +80,7 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res) BUG_ON(preemptible()); - state = &__get_cpu_var(xen_runstate); + state = this_cpu_ptr(&xen_runstate); /* * The runstate info is always updated by the hypervisor on @@ -123,7 +123,7 @@ static void do_stolen_accounting(void) WARN_ON(state.state != RUNSTATE_running); - snap = &__get_cpu_var(xen_runstate_snapshot); + snap = this_cpu_ptr(&xen_runstate_snapshot); /* work out how much time the VCPU has not been runn*ing* */ runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; @@ -158,7 +158,7 @@ cycle_t xen_clocksource_read(void) cycle_t ret; preempt_disable_notrace(); - src = &__get_cpu_var(xen_vcpu)->time; + src = this_cpu_ptr(&xen_vcpu->time); ret = pvclock_clocksource_read(src); preempt_enable_notrace(); return ret; @@ -397,7 +397,7 @@ static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt. static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) { - struct clock_event_device *evt = &__get_cpu_var(xen_clock_events).evt; + struct clock_event_device *evt = this_cpu_ptr(&xen_clock_events.evt); irqreturn_t ret; ret = IRQ_NONE; @@ -444,7 +444,7 @@ void xen_setup_timer(int cpu) irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER| - IRQF_FORCE_RESUME, + IRQF_FORCE_RESUME|IRQF_EARLY_RESUME, name, NULL); (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX); @@ -460,7 +460,7 @@ void xen_setup_cpu_clockevents(void) { BUG_ON(preemptible()); - clockevents_register_device(&__get_cpu_var(xen_clock_events).evt); + clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt)); } void xen_timer_resume(void) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 485b6958554..674b222544b 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -47,6 +47,41 @@ ENTRY(startup_xen) __FINIT +#ifdef CONFIG_XEN_PVH +/* + * xen_pvh_early_cpu_init() - early PVH VCPU initialization + * @cpu: this cpu number (%rdi) + * @entry: true if this is a secondary vcpu coming up on this entry + * point, false if this is the boot CPU being initialized for + * the first time (%rsi) + * + * Note: This is called as a function on the boot CPU, and is the entry point + * on the secondary CPU. + */ +ENTRY(xen_pvh_early_cpu_init) + mov %rsi, %r11 + + /* Gather features to see if NX implemented. */ + mov $0x80000001, %eax + cpuid + mov %edx, %esi + + mov $MSR_EFER, %ecx + rdmsr + bts $_EFER_SCE, %eax + + bt $20, %esi + jnc 1f /* No NX, skip setting it */ + bts $_EFER_NX, %eax +1: wrmsr +#ifdef CONFIG_SMP + cmp $0, %r11b + jne cpu_bringup_and_idle +#endif + ret + +#endif /* CONFIG_XEN_PVH */ + .pushsection .text .balign PAGE_SIZE ENTRY(hypercall_page) @@ -124,6 +159,7 @@ NEXT_HYPERCALL(arch_6) ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) + ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN, .long 1) ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START) ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 97d87659f77..28c7e0be56e 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -105,6 +105,14 @@ static inline void __init xen_init_apic(void) } #endif +#ifdef CONFIG_XEN_EFI +extern void xen_efi_init(void); +#else +static inline void __init xen_efi_init(void) +{ +} +#endif + /* Declare an asm function, along with symbols needed to make it inlineable */ #define DECL_ASM(ret, name, ...) \ |