diff options
Diffstat (limited to 'arch/x86')
419 files changed, 17398 insertions, 9019 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 46c3bff3ced..70c0f3da047 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1,7 +1,7 @@ # Select 32 or 64 bit config 64BIT bool "64-bit kernel" if ARCH = "x86" - default ARCH = "x86_64" + default ARCH != "i386" ---help--- Say yes to build a 64-bit kernel - formerly known as x86_64 Say no to build a 32-bit kernel - formerly known as i386 @@ -22,11 +22,12 @@ config X86 def_bool y select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK + select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_WANTS_PROT_NUMA_PROT_NONE select HAVE_IDE select HAVE_OPROFILE select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS - select HAVE_IRQ_WORK select HAVE_IOREMAP_PROT select HAVE_KPROBES select HAVE_MEMBLOCK @@ -38,10 +39,12 @@ config X86 select HAVE_DMA_CONTIGUOUS if !SWIOTLB select HAVE_KRETPROBES select HAVE_OPTPROBES + select HAVE_KPROBES_ON_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FENTRY if X86_64 select HAVE_C_RECORDMCOUNT select HAVE_DYNAMIC_FTRACE + select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_FP_TEST @@ -69,8 +72,8 @@ config X86 select HAVE_PERF_USER_STACK_DUMP select HAVE_DEBUG_KMEMLEAK select ANON_INODES - select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 - select HAVE_CMPXCHG_LOCAL if !M386 + select HAVE_ALIGNED_STRUCT_PAGE if SLUB + select HAVE_CMPXCHG_LOCAL select HAVE_CMPXCHG_DOUBLE select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER @@ -104,14 +107,19 @@ config X86 select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) select GENERIC_TIME_VSYSCALL if X86_64 select KTIME_SCALAR if X86_32 + select ALWAYS_USE_PERSISTENT_CLOCK select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER - select HAVE_RCU_USER_QS if X86_64 + select HAVE_CONTEXT_TRACKING if X86_64 select HAVE_IRQ_TIME_ACCOUNTING - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE + select VIRT_TO_BUS select MODULES_USE_ELF_REL if X86_32 select MODULES_USE_ELF_RELA if X86_64 + select CLONE_BACKWARDS if X86_32 + select ARCH_USE_BUILTIN_BSWAP + select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION + select OLD_SIGACTION if X86_32 + select COMPAT_OLD_SIGACTION if IA32_EMULATION config INSTRUCTION_DECODER def_bool y @@ -171,13 +179,8 @@ config ARCH_MAY_HAVE_PC_FDC def_bool y depends on ISA_DMA_API -config RWSEM_GENERIC_SPINLOCK - def_bool y - depends on !X86_XADD - config RWSEM_XCHGADD_ALGORITHM def_bool y - depends on X86_XADD config GENERIC_CALIBRATE_DELAY def_bool y @@ -225,7 +228,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC config HAVE_INTEL_TXT def_bool y - depends on EXPERIMENTAL && INTEL_IOMMU && ACPI + depends on INTEL_IOMMU && ACPI config X86_32_SMP def_bool y @@ -310,7 +313,7 @@ config X86_X2APIC If you don't know what to do here, say N. config X86_MPPARSE - bool "Enable MPS table" if ACPI + bool "Enable MPS table" if ACPI || SFI default y depends on X86_LOCAL_APIC ---help--- @@ -323,6 +326,10 @@ config X86_BIGSMP ---help--- This option is needed for the systems that have more than 8 CPUs +config GOLDFISH + def_bool y + depends on X86_GOLDFISH + if X86_32 config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" @@ -374,6 +381,7 @@ config X86_NUMACHIP depends on NUMA depends on SMP depends on X86_X2APIC + depends on PCI_MMCONFIG ---help--- Adds support for Numascale NumaChip large-SMP systems. Needed to enable more than ~168 cores. @@ -404,6 +412,14 @@ config X86_UV # Following is an alphabetically sorted list of 32 bit extended platforms # Please maintain the alphabetic order if and when there are additions +config X86_GOLDFISH + bool "Goldfish (Virtual Platform)" + depends on X86_32 + ---help--- + Enable support for the Goldfish virtual platform used primarily + for Android development. Unless you are building for the Android + Goldfish emulator say N here. + config X86_INTEL_CE bool "CE4100 TV platform" depends on PCI @@ -456,6 +472,16 @@ config X86_MDFLD endif +config X86_INTEL_LPSS + bool "Intel Low Power Subsystem Support" + depends on ACPI + select COMMON_CLK + ---help--- + Select to build support for Intel Low Power Subsystem such as + found on Intel Lynxpoint PCH. Selecting this option enables + things like clock tree (common clock framework) which are needed + by the LPSS peripheral drivers. + config X86_RDC321X bool "RDC R-321x SoC" depends on X86_32 @@ -619,7 +645,7 @@ config PARAVIRT config PARAVIRT_SPINLOCKS bool "Paravirtualization layer for spinlocks" - depends on PARAVIRT && SMP && EXPERIMENTAL + depends on PARAVIRT && SMP ---help--- Paravirtualized spinlocks allow a pvops backend to replace the spinlock implementation with something virtualization-friendly @@ -731,7 +757,7 @@ config GART_IOMMU config CALGARY_IOMMU bool "IBM Calgary IOMMU support" select SWIOTLB - depends on X86_64 && PCI && EXPERIMENTAL + depends on X86_64 && PCI ---help--- Support for hardware IOMMUs in IBM's xSeries x366 and x460 systems. Needed to run systems with more than 3GB of memory @@ -773,7 +799,7 @@ config IOMMU_HELPER config MAXSMP bool "Enable Maximum number of SMP Processors and NUMA Nodes" - depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL + depends on X86_64 && SMP && DEBUG_KERNEL select CPUMASK_OFFSTACK ---help--- Enable maximum number of CPUS and NUMA Nodes for this architecture. @@ -1031,6 +1057,24 @@ config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE +config MICROCODE_INTEL_LIB + def_bool y + depends on MICROCODE_INTEL + +config MICROCODE_INTEL_EARLY + bool "Early load microcode" + depends on MICROCODE_INTEL && BLK_DEV_INITRD + default y + help + This option provides functionality to read additional microcode data + at the beginning of initrd image. The data tells kernel to load + microcode to CPU's as early as possible. No functional change if no + microcode data is glued to the initrd, therefore it's safe to say Y. + +config MICROCODE_EARLY + def_bool y + depends on MICROCODE_INTEL_EARLY + config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" ---help--- @@ -1100,7 +1144,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" - depends on !M386 && !M486 + depends on !M486 select X86_PAE ---help--- Select this if you have a 32-bit processor and more than 4 @@ -1109,7 +1153,6 @@ config HIGHMEM64G endchoice choice - depends on EXPERIMENTAL prompt "Memory split" if EXPERT default VMSPLIT_3G depends on X86_32 @@ -1186,7 +1229,7 @@ config DIRECT_GBPAGES config NUMA bool "Numa Memory Allocation and Scheduler Support" depends on SMP - depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) + depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI)) default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) ---help--- Enable NUMA (Non Uniform Memory Access) support. @@ -1255,10 +1298,6 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. -config HAVE_ARCH_ALLOC_REMAP - def_bool y - depends on X86_32 && NUMA - config ARCH_HAVE_MEMORY_PRESENT def_bool y depends on X86_32 && DISCONTIGMEM @@ -1281,7 +1320,7 @@ config ARCH_DISCONTIGMEM_DEFAULT config ARCH_SPARSEMEM_ENABLE def_bool y - depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD + depends on X86_64 || NUMA || X86_32 || X86_32_NON_STANDARD select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 @@ -1595,8 +1634,7 @@ config CRASH_DUMP For more details see Documentation/kdump/kdump.txt config KEXEC_JUMP - bool "kexec jump (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "kexec jump" depends on KEXEC && HIBERNATION ---help--- Jump between original kernel and kexeced kernel and invoke @@ -1698,6 +1736,50 @@ config HOTPLUG_CPU automatically on SMP systems. ) Say N if you want to disable CPU hotplug. +config BOOTPARAM_HOTPLUG_CPU0 + bool "Set default setting of cpu0_hotpluggable" + default n + depends on HOTPLUG_CPU + ---help--- + Set whether default state of cpu0_hotpluggable is on or off. + + Say Y here to enable CPU0 hotplug by default. If this switch + is turned on, there is no need to give cpu0_hotplug kernel + parameter and the CPU0 hotplug feature is enabled by default. + + Please note: there are two known CPU0 dependencies if you want + to enable the CPU0 hotplug feature either by this switch or by + cpu0_hotplug kernel parameter. + + First, resume from hibernate or suspend always starts from CPU0. + So hibernate and suspend are prevented if CPU0 is offline. + + Second dependency is PIC interrupts always go to CPU0. CPU0 can not + offline if any interrupt can not migrate out of CPU0. There may + be other CPU0 dependencies. + + Please make sure the dependencies are under your control before + you enable this feature. + + Say N if you don't want to enable CPU0 hotplug feature by default. + You still can enable the CPU0 hotplug feature at boot by kernel + parameter cpu0_hotplug. + +config DEBUG_HOTPLUG_CPU0 + def_bool n + prompt "Debug CPU0 hotplug" + depends on HOTPLUG_CPU + ---help--- + Enabling this option offlines CPU0 (if CPU0 can be offlined) as + soon as possible and boots up userspace with CPU0 offlined. User + can online CPU0 back after boot time. + + To debug CPU0 hotplug, you need to enable CPU0 offline/online + feature by either turning on CONFIG_BOOTPARAM_HOTPLUG_CPU0 during + compilation or giving cpu0_hotplug kernel parameter at boot. + + If unsure, say N. + config COMPAT_VDSO def_bool y prompt "Compat VDSO support" @@ -1870,6 +1952,7 @@ config APM_DO_ENABLE this feature. config APM_CPU_IDLE + depends on CPU_IDLE bool "Make CPU Idle calls when idle" ---help--- Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop. @@ -1995,7 +2078,7 @@ config PCI_MMCONFIG config PCI_CNB20LE_QUIRK bool "Read CNB20LE Host Bridge Windows" if EXPERT - depends on PCI && EXPERIMENTAL + depends on PCI help Read the PCI windows out of the CNB20LE host bridge. This allows PCI hotplug to work on systems with the CNB20LE chipset which do @@ -2096,6 +2179,7 @@ config OLPC_XO1_RTC config OLPC_XO1_SCI bool "OLPC XO-1 SCI extras" depends on OLPC && OLPC_XO1_PM + depends on INPUT=y select POWER_SUPPLY select GPIO_CS5535 select MFD_CORE @@ -2145,6 +2229,15 @@ config GEOS ---help--- This option enables system support for the Traverse Technologies GEOS. +config TS5500 + bool "Technologic Systems TS-5500 platform support" + depends on MELAN + select CHECK_SIGNATURE + select NEW_LEDS + select LEDS_CLASS + ---help--- + This option enables system support for the Technologic Systems TS-5500. + endif # X86_32 config AMD_NB @@ -2189,8 +2282,8 @@ config IA32_AOUT Support old a.out binaries in the 32bit emulation. config X86_X32 - bool "x32 ABI for 64-bit mode (EXPERIMENTAL)" - depends on X86_64 && IA32_EMULATION && EXPERIMENTAL + bool "x32 ABI for 64-bit mode" + depends on X86_64 && IA32_EMULATION ---help--- Include code to run binaries for the x32 native 32-bit ABI for 64-bit processors. An x32 process gets access to the diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index f3b86d0df44..c026cca5602 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -4,23 +4,24 @@ choice default M686 if X86_32 default GENERIC_CPU if X86_64 -config M386 - bool "386" - depends on X86_32 && !UML +config M486 + bool "486" + depends on X86_32 ---help--- - This is the processor type of your CPU. This information is used for - optimizing purposes. In order to compile a kernel that can run on - all x86 CPU types (albeit not optimally fast), you can specify - "386" here. + This is the processor type of your CPU. This information is + used for optimizing purposes. In order to compile a kernel + that can run on all supported x86 CPU types (albeit not + optimally fast), you can specify "486" here. + + Note that the 386 is no longer supported, this includes + AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI 486DLC/DLC2, + UMC 486SX-S and the NexGen Nx586. The kernel will not necessarily run on earlier architectures than the one you have chosen, e.g. a Pentium optimized kernel will run on a PPro, but not necessarily on a i486. Here are the settings recommended for greatest speed: - - "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI - 486DLC/DLC2, and UMC 486SX-S. Only "386" kernels will run on a 386 - class machine. - "486" for the AMD/Cyrix/IBM/Intel 486DX/DX2/DX4 or SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. - "586" for generic Pentium CPUs lacking the TSC @@ -43,16 +44,7 @@ config M386 - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above). - "VIA C7" for VIA C7. - If you don't know what to do, choose "386". - -config M486 - bool "486" - depends on X86_32 - ---help--- - Select this for a 486 series processor, either Intel or one of the - compatible processors from AMD, Cyrix, IBM, or Intel. Includes DX, - DX2, and DX4 variants; also SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or - U5S. + If you don't know what to do, choose "486". config M586 bool "586/K5/5x86/6x86/6x86MX" @@ -305,24 +297,16 @@ config X86_INTERNODE_CACHE_SHIFT default "12" if X86_VSMP default X86_L1_CACHE_SHIFT -config X86_CMPXCHG - def_bool y - depends on X86_64 || (X86_32 && !M386) - config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU - default "4" if MELAN || M486 || M386 || MGEODEGX1 + default "4" if MELAN || M486 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX -config X86_XADD - def_bool y - depends on !M386 - config X86_PPRO_FENCE bool "PentiumPro memory ordering errata workaround" - depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1 + depends on M686 || M586MMX || M586TSC || M586 || M486 || MGEODEGX1 ---help--- Old PentiumPro multiprocessor systems had errata that could cause memory operations to violate the x86 ordering standard in rare cases. @@ -335,27 +319,11 @@ config X86_PPRO_FENCE config X86_F00F_BUG def_bool y - depends on M586MMX || M586TSC || M586 || M486 || M386 + depends on M586MMX || M586TSC || M586 || M486 config X86_INVD_BUG def_bool y - depends on M486 || M386 - -config X86_WP_WORKS_OK - def_bool y - depends on !M386 - -config X86_INVLPG - def_bool y - depends on X86_32 && !M386 - -config X86_BSWAP - def_bool y - depends on X86_32 && !M386 - -config X86_POPAD_OK - def_bool y - depends on X86_32 && !M386 + depends on M486 config X86_ALIGNMENT_16 def_bool y @@ -412,12 +380,11 @@ config X86_MINIMUM_CPU_FAMILY default "64" if X86_64 default "6" if X86_32 && X86_P6_NOP default "5" if X86_32 && X86_CMPXCHG64 - default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) - default "3" + default "4" config X86_DEBUGCTLMSR def_bool y - depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML + depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486) && !UML menuconfig PROCESSOR_SELECT bool "Supported processor vendors" if EXPERT @@ -441,7 +408,7 @@ config CPU_SUP_INTEL config CPU_SUP_CYRIX_32 default y bool "Support Cyrix processors" if PROCESSOR_SELECT - depends on M386 || M486 || M586 || M586TSC || M586MMX || (EXPERT && !64BIT) + depends on M486 || M586 || M586TSC || M586MMX || (EXPERT && !64BIT) ---help--- This enables detection, tunings and quirks for Cyrix processors @@ -495,7 +462,7 @@ config CPU_SUP_TRANSMETA_32 config CPU_SUP_UMC_32 default y bool "Support UMC processors" if PROCESSOR_SELECT - depends on M386 || M486 || (EXPERT && !64BIT) + depends on M486 || (EXPERT && !64BIT) ---help--- This enables detection, tunings and quirks for UMC processors diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 05afcca66de..5c477260294 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -2,7 +2,11 @@ # select defconfig based on actual architecture ifeq ($(ARCH),x86) + ifeq ($(shell uname -m),x86_64) + KBUILD_DEFCONFIG := x86_64_defconfig + else KBUILD_DEFCONFIG := i386_defconfig + endif else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif @@ -123,9 +127,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI # does binutils support specific instructions? asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) +avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) -KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) -KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) LDFLAGS := -m elf_$(UTS_MACHINE) diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 86cee7b749e..6647ed49c66 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -10,7 +10,6 @@ tune = $(call cc-option,-mcpu=$(1),$(2)) endif align := $(cc-option-align) -cflags-$(CONFIG_M386) += -march=i386 cflags-$(CONFIG_M486) += -march=i486 cflags-$(CONFIG_M586) += -march=i586 cflags-$(CONFIG_M586TSC) += -march=i586 diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index 851fe936d24..e3cf9f682be 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore @@ -2,7 +2,6 @@ bootsect bzImage cpustr.h mkcpustr -offsets.h voffset.h zoffset.h setup diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index ccce0ed67dd..379814bc41e 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -71,7 +71,7 @@ GCOV_PROFILE := n $(obj)/bzImage: asflags-y := $(SVGA_MODE) quiet_cmd_image = BUILD $@ -cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin > $@ +cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/zoffset.h > $@ $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE $(call if_changed,image) @@ -92,7 +92,7 @@ targets += voffset.h $(obj)/voffset.h: vmlinux FORCE $(call if_changed,voffset) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|startup_64\|efi_pe_entry\|efi_stub_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 18997e5a105..5b7531966b8 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -285,16 +285,26 @@ struct biosregs { void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); /* cmdline.c */ -int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize); -int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option); +int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize); +int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option); static inline int cmdline_find_option(const char *option, char *buffer, int bufsize) { - return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize); + unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + + if (cmd_line_ptr >= 0x100000) + return -1; /* inaccessible */ + + return __cmdline_find_option(cmd_line_ptr, option, buffer, bufsize); } static inline int cmdline_find_option_bool(const char *option) { - return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option); + unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + + if (cmd_line_ptr >= 0x100000) + return -1; /* inaccessible */ + + return __cmdline_find_option_bool(cmd_line_ptr, option); } diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c index 6b3b6f708c0..625d21b0cd3 100644 --- a/arch/x86/boot/cmdline.c +++ b/arch/x86/boot/cmdline.c @@ -27,7 +27,7 @@ static inline int myisspace(u8 c) * Returns the length of the argument (regardless of if it was * truncated to fit in the buffer), or -1 on not found. */ -int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize) +int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize) { addr_t cptr; char c; @@ -41,8 +41,8 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int st_bufcpy /* Copying this to buffer */ } state = st_wordstart; - if (!cmdline_ptr || cmdline_ptr >= 0x100000) - return -1; /* No command line, or inaccessible */ + if (!cmdline_ptr) + return -1; /* No command line */ cptr = cmdline_ptr & 0xf; set_fs(cmdline_ptr >> 4); @@ -99,7 +99,7 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int * Returns the position of that option (starts counting with 1) * or 0 on not found */ -int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option) +int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option) { addr_t cptr; char c; @@ -111,8 +111,8 @@ int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option) st_wordskip, /* Miscompare, skip */ } state = st_wordstart; - if (!cmdline_ptr || cmdline_ptr >= 0x100000) - return -1; /* No command line, or inaccessible */ + if (!cmdline_ptr) + return -1; /* No command line */ cptr = cmdline_ptr & 0xf; set_fs(cmdline_ptr >> 4); diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index 10f6b1178c6..bffd73b45b1 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -13,13 +13,21 @@ static inline char rdfs8(addr_t addr) return *((char *)(fs + addr)); } #include "../cmdline.c" +static unsigned long get_cmd_line_ptr(void) +{ + unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr; + + cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32; + + return cmd_line_ptr; +} int cmdline_find_option(const char *option, char *buffer, int bufsize) { - return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize); + return __cmdline_find_option(get_cmd_line_ptr(), option, buffer, bufsize); } int cmdline_find_option_bool(const char *option) { - return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option); + return __cmdline_find_option_bool(get_cmd_line_ptr(), option); } #endif diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index c760e073963..c205035a6b9 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -8,31 +8,39 @@ * ----------------------------------------------------------------------- */ #include <linux/efi.h> +#include <linux/pci.h> #include <asm/efi.h> #include <asm/setup.h> #include <asm/desc.h> +#undef memcpy /* Use memcpy from misc.c */ + #include "eboot.h" static efi_system_table_t *sys_table; +static void efi_char16_printk(efi_char16_t *str) +{ + struct efi_simple_text_output_protocol *out; + + out = (struct efi_simple_text_output_protocol *)sys_table->con_out; + efi_call_phys2(out->output_string, out, str); +} + static void efi_printk(char *str) { char *s8; for (s8 = str; *s8; s8++) { - struct efi_simple_text_output_protocol *out; efi_char16_t ch[2] = { 0 }; ch[0] = *s8; - out = (struct efi_simple_text_output_protocol *)sys_table->con_out; - if (*s8 == '\n') { efi_char16_t nl[2] = { '\r', 0 }; - efi_call_phys2(out->output_string, out, nl); + efi_char16_printk(nl); } - efi_call_phys2(out->output_string, out, ch); + efi_char16_printk(ch); } } @@ -243,6 +251,123 @@ static void find_bits(unsigned long mask, u8 *pos, u8 *size) *size = len; } +static efi_status_t setup_efi_pci(struct boot_params *params) +{ + efi_pci_io_protocol *pci; + efi_status_t status; + void **pci_handle; + efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; + unsigned long nr_pci, size = 0; + int i; + struct setup_data *data; + + data = (struct setup_data *)(unsigned long)params->hdr.setup_data; + + while (data && data->next) + data = (struct setup_data *)(unsigned long)data->next; + + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, &pci_proto, + NULL, &size, pci_handle); + + if (status == EFI_BUFFER_TOO_SMALL) { + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, size, &pci_handle); + + if (status != EFI_SUCCESS) + return status; + + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, &pci_proto, + NULL, &size, pci_handle); + } + + if (status != EFI_SUCCESS) + goto free_handle; + + nr_pci = size / sizeof(void *); + for (i = 0; i < nr_pci; i++) { + void *h = pci_handle[i]; + uint64_t attributes; + struct pci_setup_rom *rom; + + status = efi_call_phys3(sys_table->boottime->handle_protocol, + h, &pci_proto, &pci); + + if (status != EFI_SUCCESS) + continue; + + if (!pci) + continue; + +#ifdef CONFIG_X86_64 + status = efi_call_phys4(pci->attributes, pci, + EfiPciIoAttributeOperationGet, 0, + &attributes); +#else + status = efi_call_phys5(pci->attributes, pci, + EfiPciIoAttributeOperationGet, 0, 0, + &attributes); +#endif + if (status != EFI_SUCCESS) + continue; + + if (!pci->romimage || !pci->romsize) + continue; + + size = pci->romsize + sizeof(*rom); + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, size, &rom); + + if (status != EFI_SUCCESS) + continue; + + rom->data.type = SETUP_PCI; + rom->data.len = size - sizeof(struct setup_data); + rom->data.next = 0; + rom->pcilen = pci->romsize; + + status = efi_call_phys5(pci->pci.read, pci, + EfiPciIoWidthUint16, PCI_VENDOR_ID, + 1, &(rom->vendor)); + + if (status != EFI_SUCCESS) + goto free_struct; + + status = efi_call_phys5(pci->pci.read, pci, + EfiPciIoWidthUint16, PCI_DEVICE_ID, + 1, &(rom->devid)); + + if (status != EFI_SUCCESS) + goto free_struct; + + status = efi_call_phys5(pci->get_location, pci, + &(rom->segment), &(rom->bus), + &(rom->device), &(rom->function)); + + if (status != EFI_SUCCESS) + goto free_struct; + + memcpy(rom->romdata, pci->romimage, pci->romsize); + + if (data) + data->next = (unsigned long)rom; + else + params->hdr.setup_data = (unsigned long)rom; + + data = (struct setup_data *)rom; + + continue; + free_struct: + efi_call_phys1(sys_table->boottime->free_pool, rom); + } + +free_handle: + efi_call_phys1(sys_table->boottime->free_pool, pci_handle); + return status; +} + /* * See if we have Graphics Output Protocol */ @@ -314,10 +439,9 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, * Once we've found a GOP supporting ConOut, * don't bother looking any further. */ + first_gop = gop; if (conout_found) break; - - first_gop = gop; } } @@ -590,7 +714,12 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image, if ((u8 *)p >= (u8 *)filename_16 + sizeof(filename_16)) break; - *p++ = *str++; + if (*str == '/') { + *p++ = '\\'; + *str++; + } else { + *p++ = *str++; + } } *p = '\0'; @@ -618,7 +747,9 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image, status = efi_call_phys5(fh->open, fh, &h, filename_16, EFI_FILE_MODE_READ, (u64)0); if (status != EFI_SUCCESS) { - efi_printk("Failed to open initrd file\n"); + efi_printk("Failed to open initrd file: "); + efi_char16_printk(filename_16); + efi_printk("\n"); goto close_handles; } @@ -1026,6 +1157,8 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table, setup_graphics(boot_params); + setup_efi_pci(boot_params); + status = efi_call_phys3(sys_table->boottime->allocate_pool, EFI_LOADER_DATA, sizeof(*gdt), (void **)&gdt); diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index aa4aaf1b238..1e3184f6072 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -35,11 +35,11 @@ ENTRY(startup_32) #ifdef CONFIG_EFI_STUB jmp preferred_addr - .balign 0x10 /* * We don't need the return address, so set up the stack so - * efi_main() can find its arugments. + * efi_main() can find its arguments. */ +ENTRY(efi_pe_entry) add $0x4, %esp call make_boot_params @@ -50,8 +50,10 @@ ENTRY(startup_32) pushl %eax pushl %esi pushl %ecx + sub $0x4, %esp - .org 0x30,0x90 +ENTRY(efi_stub_entry) + add $0x4, %esp call efi_main cmpl $0, %eax movl %eax, %esi diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 2c4b171eec3..c1d383d1fb7 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -37,6 +37,12 @@ __HEAD .code32 ENTRY(startup_32) + /* + * 32bit entry is 0 and it is ABI so immutable! + * If we come here directly from a bootloader, + * kernel(text+data+bss+brk) ramdisk, zero_page, command line + * all need to be under the 4G limit. + */ cld /* * Test KEEP_SEGMENTS flag to see if the bootloader is asking @@ -154,6 +160,12 @@ ENTRY(startup_32) btsl $_EFER_LME, %eax wrmsr + /* After gdt is loaded */ + xorl %eax, %eax + lldt %ax + movl $0x20, %eax + ltr %ax + /* * Setup for the jump to 64bit mode * @@ -176,37 +188,27 @@ ENTRY(startup_32) lret ENDPROC(startup_32) -no_longmode: - /* This isn't an x86-64 CPU so hang */ -1: - hlt - jmp 1b - -#include "../../kernel/verify_cpu.S" - - /* - * Be careful here startup_64 needs to be at a predictable - * address so I can export it in an ELF header. Bootloaders - * should look at the ELF header to find this address, as - * it may change in the future. - */ .code64 .org 0x200 ENTRY(startup_64) /* + * 64bit entry is 0x200 and it is ABI so immutable! * We come here either from startup_32 or directly from a - * 64bit bootloader. If we come here from a bootloader we depend on - * an identity mapped page table being provied that maps our - * entire text+data+bss and hopefully all of memory. + * 64bit bootloader. + * If we come here from a bootloader, kernel(text+data+bss+brk), + * ramdisk, zero_page, command line could be above 4G. + * We depend on an identity mapped page table being provided + * that maps our entire kernel(text+data+bss+brk), zero page + * and command line. */ #ifdef CONFIG_EFI_STUB /* - * The entry point for the PE/COFF executable is 0x210, so only - * legacy boot loaders will execute this jmp. + * The entry point for the PE/COFF executable is efi_pe_entry, so + * only legacy boot loaders will execute this jmp. */ jmp preferred_addr - .org 0x210 +ENTRY(efi_pe_entry) mov %rcx, %rdi mov %rdx, %rsi pushq %rdi @@ -218,7 +220,7 @@ ENTRY(startup_64) popq %rsi popq %rdi - .org 0x230,0x90 +ENTRY(efi_stub_entry) call efi_main movq %rax,%rsi cmpq $0,%rax @@ -247,9 +249,6 @@ preferred_addr: movl %eax, %ss movl %eax, %fs movl %eax, %gs - lldt %ax - movl $0x20, %eax - ltr %ax /* * Compute the decompressed kernel start address. It is where @@ -349,6 +348,15 @@ relocated: */ jmp *%rbp + .code32 +no_longmode: + /* This isn't an x86-64 CPU so hang */ +1: + hlt + jmp 1b + +#include "../../kernel/verify_cpu.S" + .data gdt: .word gdt_end - gdt diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 88f7ff6da40..7cb56c6ca35 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -325,6 +325,8 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, { real_mode = rmode; + sanitize_boot_params(real_mode); + if (real_mode->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 0e6dc0ee0ee..674019d8e23 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -18,6 +18,7 @@ #include <asm/page.h> #include <asm/boot.h> #include <asm/bootparam.h> +#include <asm/bootparam_utils.h> #define BOOT_BOOT_H #include "../ctype.h" diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 2a017441b8b..9ec06a1f6d6 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -21,6 +21,7 @@ #include <asm/e820.h> #include <asm/page_types.h> #include <asm/setup.h> +#include <asm/bootparam.h> #include "boot.h" #include "voffset.h" #include "zoffset.h" @@ -255,6 +256,9 @@ section_table: # header, from the old boot sector. .section ".header", "a" + .globl sentinel +sentinel: .byte 0xff, 0xff /* Used to detect broken loaders */ + .globl hdr hdr: setup_sects: .byte 0 /* Filled in by build.c */ @@ -279,7 +283,7 @@ _start: # Part 2 of the header, from the old setup.S .ascii "HdrS" # header signature - .word 0x020b # header version number (>= 0x0105) + .word 0x020c # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) .globl realmode_swtch realmode_swtch: .word 0, 0 # default_switch, SETUPSEG @@ -297,13 +301,7 @@ type_of_loader: .byte 0 # 0 means ancient bootloader, newer # flags, unused bits must be zero (RFU) bit within loadflags loadflags: -LOADED_HIGH = 1 # If set, the kernel is loaded high -CAN_USE_HEAP = 0x80 # If set, the loader also has set - # heap_end_ptr to tell how much - # space behind setup.S can be used for - # heap purposes. - # Only the loader knows what is free - .byte LOADED_HIGH + .byte LOADED_HIGH # The kernel is to be loaded high setup_move_size: .word 0x8000 # size to move, when setup is not # loaded at 0x90000. We will move setup @@ -369,7 +367,31 @@ relocatable_kernel: .byte 1 relocatable_kernel: .byte 0 #endif min_alignment: .byte MIN_KERNEL_ALIGN_LG2 # minimum alignment -pad3: .word 0 + +xloadflags: +#ifdef CONFIG_X86_64 +# define XLF0 XLF_KERNEL_64 /* 64-bit kernel */ +#else +# define XLF0 0 +#endif + +#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64) + /* kernel/boot_param/ramdisk could be loaded above 4g */ +# define XLF1 XLF_CAN_BE_LOADED_ABOVE_4G +#else +# define XLF1 0 +#endif + +#ifdef CONFIG_EFI_STUB +# ifdef CONFIG_X86_64 +# define XLF23 XLF_EFI_HANDOVER_64 /* 64-bit EFI handover ok */ +# else +# define XLF23 XLF_EFI_HANDOVER_32 /* 32-bit EFI handover ok */ +# endif +#else +# define XLF23 0 +#endif + .word XLF0 | XLF1 | XLF23 cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, #added with boot protocol @@ -397,8 +419,13 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr #define INIT_SIZE VO_INIT_SIZE #endif init_size: .long INIT_SIZE # kernel initialization size -handover_offset: .long 0x30 # offset to the handover +handover_offset: +#ifdef CONFIG_EFI_STUB + .long 0x30 # offset to the handover # protocol entry point +#else + .long 0 +#endif # End of setup header ##################################################### @@ -476,6 +503,3 @@ die: setup_corrupt: .byte 7 .string "No setup signature found...\n" - - .data -dummy: .long 0 diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 03c0683636b..96a6c756353 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -13,7 +13,7 @@ SECTIONS .bstext : { *(.bstext) } .bsdata : { *(.bsdata) } - . = 497; + . = 495; .header : { *(.header) } .entrytext : { *(.entrytext) } .inittext : { *(.inittext) } diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index 4b8e165ee57..94c54465002 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -52,6 +52,10 @@ int is_big_kernel; #define PECOFF_RELOC_RESERVE 0x20 +unsigned long efi_stub_entry; +unsigned long efi_pe_entry; +unsigned long startup_64; + /*----------------------------------------------------------------------*/ static const u32 crctab32[] = { @@ -132,7 +136,7 @@ static void die(const char * str, ...) static void usage(void) { - die("Usage: build setup system [> image]"); + die("Usage: build setup system [zoffset.h] [> image]"); } #ifdef CONFIG_EFI_STUB @@ -206,30 +210,54 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz) */ put_unaligned_le32(file_sz - 512, &buf[pe_header + 0x1c]); -#ifdef CONFIG_X86_32 /* - * Address of entry point. - * - * The EFI stub entry point is +16 bytes from the start of - * the .text section. + * Address of entry point for PE/COFF executable */ - put_unaligned_le32(text_start + 16, &buf[pe_header + 0x28]); -#else - /* - * Address of entry point. startup_32 is at the beginning and - * the 64-bit entry point (startup_64) is always 512 bytes - * after. The EFI stub entry point is 16 bytes after that, as - * the first instruction allows legacy loaders to jump over - * the EFI stub initialisation - */ - put_unaligned_le32(text_start + 528, &buf[pe_header + 0x28]); -#endif /* CONFIG_X86_32 */ + put_unaligned_le32(text_start + efi_pe_entry, &buf[pe_header + 0x28]); update_pecoff_section_header(".text", text_start, text_sz); } #endif /* CONFIG_EFI_STUB */ + +/* + * Parse zoffset.h and find the entry points. We could just #include zoffset.h + * but that would mean tools/build would have to be rebuilt every time. It's + * not as if parsing it is hard... + */ +#define PARSE_ZOFS(p, sym) do { \ + if (!strncmp(p, "#define ZO_" #sym " ", 11+sizeof(#sym))) \ + sym = strtoul(p + 11 + sizeof(#sym), NULL, 16); \ +} while (0) + +static void parse_zoffset(char *fname) +{ + FILE *file; + char *p; + int c; + + file = fopen(fname, "r"); + if (!file) + die("Unable to open `%s': %m", fname); + c = fread(buf, 1, sizeof(buf) - 1, file); + if (ferror(file)) + die("read-error on `zoffset.h'"); + buf[c] = 0; + + p = (char *)buf; + + while (p && *p) { + PARSE_ZOFS(p, efi_stub_entry); + PARSE_ZOFS(p, efi_pe_entry); + PARSE_ZOFS(p, startup_64); + + p = strchr(p, '\n'); + while (p && (*p == '\r' || *p == '\n')) + p++; + } +} + int main(int argc, char ** argv) { unsigned int i, sz, setup_sectors; @@ -241,7 +269,19 @@ int main(int argc, char ** argv) void *kernel; u32 crc = 0xffffffffUL; - if (argc != 3) + /* Defaults for old kernel */ +#ifdef CONFIG_X86_32 + efi_pe_entry = 0x10; + efi_stub_entry = 0x30; +#else + efi_pe_entry = 0x210; + efi_stub_entry = 0x230; + startup_64 = 0x200; +#endif + + if (argc == 4) + parse_zoffset(argv[3]); + else if (argc != 3) usage(); /* Copy the setup code */ @@ -299,6 +339,11 @@ int main(int argc, char ** argv) #ifdef CONFIG_EFI_STUB update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz)); + +#ifdef CONFIG_X86_64 /* Yes, this is really how we defined it :( */ + efi_stub_entry -= 0x200; +#endif + put_unaligned_le32(efi_stub_entry, &buf[0x264]); #endif crc = partial_crc32(buf, i, crc); diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 5598547281a..94447086e55 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,3 +1,4 @@ +# CONFIG_64BIT is not set CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 5bacb4a226a..63947a8f9f0 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o +obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o @@ -26,6 +27,7 @@ obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o +obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o @@ -34,6 +36,8 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o +camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ + camellia_aesni_avx_glue.o cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o @@ -47,3 +51,6 @@ serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o +crc32c-intel-y := crc32c-intel_glue.o +crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o +crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S index b949ec2f9af..2849dbc59e1 100644 --- a/arch/x86/crypto/aes-i586-asm_32.S +++ b/arch/x86/crypto/aes-i586-asm_32.S @@ -36,6 +36,7 @@ .file "aes-i586-asm.S" .text +#include <linux/linkage.h> #include <asm/asm-offsets.h> #define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words) @@ -219,14 +220,10 @@ // AES (Rijndael) Encryption Subroutine /* void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */ -.global aes_enc_blk - .extern crypto_ft_tab .extern crypto_fl_tab -.align 4 - -aes_enc_blk: +ENTRY(aes_enc_blk) push %ebp mov ctx(%esp),%ebp @@ -290,18 +287,15 @@ aes_enc_blk: mov %r0,(%ebp) pop %ebp ret +ENDPROC(aes_enc_blk) // AES (Rijndael) Decryption Subroutine /* void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */ -.global aes_dec_blk - .extern crypto_it_tab .extern crypto_il_tab -.align 4 - -aes_dec_blk: +ENTRY(aes_dec_blk) push %ebp mov ctx(%esp),%ebp @@ -365,3 +359,4 @@ aes_dec_blk: mov %r0,(%ebp) pop %ebp ret +ENDPROC(aes_dec_blk) diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S index 5b577d5a059..91056554716 100644 --- a/arch/x86/crypto/aes-x86_64-asm_64.S +++ b/arch/x86/crypto/aes-x86_64-asm_64.S @@ -15,6 +15,7 @@ .text +#include <linux/linkage.h> #include <asm/asm-offsets.h> #define R1 %rax @@ -49,10 +50,8 @@ #define R11 %r11 #define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \ - .global FUNC; \ - .type FUNC,@function; \ - .align 8; \ -FUNC: movq r1,r2; \ + ENTRY(FUNC); \ + movq r1,r2; \ movq r3,r4; \ leaq KEY+48(r8),r9; \ movq r10,r11; \ @@ -71,14 +70,15 @@ FUNC: movq r1,r2; \ je B192; \ leaq 32(r9),r9; -#define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \ +#define epilogue(FUNC,r1,r2,r3,r4,r5,r6,r7,r8,r9) \ movq r1,r2; \ movq r3,r4; \ movl r5 ## E,(r9); \ movl r6 ## E,4(r9); \ movl r7 ## E,8(r9); \ movl r8 ## E,12(r9); \ - ret; + ret; \ + ENDPROC(FUNC); #define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \ movzbl r2 ## H,r5 ## E; \ @@ -133,7 +133,7 @@ FUNC: movq r1,r2; \ #define entry(FUNC,KEY,B128,B192) \ prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11) -#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11) +#define return(FUNC) epilogue(FUNC,R8,R2,R9,R7,R5,R6,R3,R4,R11) #define encrypt_round(TAB,OFFSET) \ round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \ @@ -151,12 +151,12 @@ FUNC: movq r1,r2; \ /* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */ - entry(aes_enc_blk,0,enc128,enc192) + entry(aes_enc_blk,0,.Le128,.Le192) encrypt_round(crypto_ft_tab,-96) encrypt_round(crypto_ft_tab,-80) -enc192: encrypt_round(crypto_ft_tab,-64) +.Le192: encrypt_round(crypto_ft_tab,-64) encrypt_round(crypto_ft_tab,-48) -enc128: encrypt_round(crypto_ft_tab,-32) +.Le128: encrypt_round(crypto_ft_tab,-32) encrypt_round(crypto_ft_tab,-16) encrypt_round(crypto_ft_tab, 0) encrypt_round(crypto_ft_tab, 16) @@ -166,16 +166,16 @@ enc128: encrypt_round(crypto_ft_tab,-32) encrypt_round(crypto_ft_tab, 80) encrypt_round(crypto_ft_tab, 96) encrypt_final(crypto_fl_tab,112) - return + return(aes_enc_blk) /* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */ - entry(aes_dec_blk,240,dec128,dec192) + entry(aes_dec_blk,240,.Ld128,.Ld192) decrypt_round(crypto_it_tab,-96) decrypt_round(crypto_it_tab,-80) -dec192: decrypt_round(crypto_it_tab,-64) +.Ld192: decrypt_round(crypto_it_tab,-64) decrypt_round(crypto_it_tab,-48) -dec128: decrypt_round(crypto_it_tab,-32) +.Ld128: decrypt_round(crypto_it_tab,-32) decrypt_round(crypto_it_tab,-16) decrypt_round(crypto_it_tab, 0) decrypt_round(crypto_it_tab, 16) @@ -185,4 +185,4 @@ dec128: decrypt_round(crypto_it_tab,-32) decrypt_round(crypto_it_tab, 80) decrypt_round(crypto_it_tab, 96) decrypt_final(crypto_il_tab,112) - return + return(aes_dec_blk) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 3470624d783..04b797767b9 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -1262,7 +1262,6 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst * poly = x^128 + x^127 + x^126 + x^121 + 1 * *****************************************************************************/ - ENTRY(aesni_gcm_dec) push %r12 push %r13 @@ -1437,6 +1436,7 @@ _return_T_done_decrypt: pop %r13 pop %r12 ret +ENDPROC(aesni_gcm_dec) /***************************************************************************** @@ -1700,10 +1700,12 @@ _return_T_done_encrypt: pop %r13 pop %r12 ret +ENDPROC(aesni_gcm_enc) #endif +.align 4 _key_expansion_128: _key_expansion_256a: pshufd $0b11111111, %xmm1, %xmm1 @@ -1715,6 +1717,8 @@ _key_expansion_256a: movaps %xmm0, (TKEYP) add $0x10, TKEYP ret +ENDPROC(_key_expansion_128) +ENDPROC(_key_expansion_256a) .align 4 _key_expansion_192a: @@ -1739,6 +1743,7 @@ _key_expansion_192a: movaps %xmm1, 0x10(TKEYP) add $0x20, TKEYP ret +ENDPROC(_key_expansion_192a) .align 4 _key_expansion_192b: @@ -1758,6 +1763,7 @@ _key_expansion_192b: movaps %xmm0, (TKEYP) add $0x10, TKEYP ret +ENDPROC(_key_expansion_192b) .align 4 _key_expansion_256b: @@ -1770,6 +1776,7 @@ _key_expansion_256b: movaps %xmm2, (TKEYP) add $0x10, TKEYP ret +ENDPROC(_key_expansion_256b) /* * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, @@ -1882,6 +1889,7 @@ ENTRY(aesni_set_key) popl KEYP #endif ret +ENDPROC(aesni_set_key) /* * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) @@ -1903,6 +1911,7 @@ ENTRY(aesni_enc) popl KEYP #endif ret +ENDPROC(aesni_enc) /* * _aesni_enc1: internal ABI @@ -1960,6 +1969,7 @@ _aesni_enc1: movaps 0x70(TKEYP), KEY AESENCLAST KEY STATE ret +ENDPROC(_aesni_enc1) /* * _aesni_enc4: internal ABI @@ -2068,6 +2078,7 @@ _aesni_enc4: AESENCLAST KEY STATE3 AESENCLAST KEY STATE4 ret +ENDPROC(_aesni_enc4) /* * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) @@ -2090,6 +2101,7 @@ ENTRY(aesni_dec) popl KEYP #endif ret +ENDPROC(aesni_dec) /* * _aesni_dec1: internal ABI @@ -2147,6 +2159,7 @@ _aesni_dec1: movaps 0x70(TKEYP), KEY AESDECLAST KEY STATE ret +ENDPROC(_aesni_dec1) /* * _aesni_dec4: internal ABI @@ -2255,6 +2268,7 @@ _aesni_dec4: AESDECLAST KEY STATE3 AESDECLAST KEY STATE4 ret +ENDPROC(_aesni_dec4) /* * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, @@ -2312,6 +2326,7 @@ ENTRY(aesni_ecb_enc) popl LEN #endif ret +ENDPROC(aesni_ecb_enc) /* * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, @@ -2370,6 +2385,7 @@ ENTRY(aesni_ecb_dec) popl LEN #endif ret +ENDPROC(aesni_ecb_dec) /* * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, @@ -2411,6 +2427,7 @@ ENTRY(aesni_cbc_enc) popl IVP #endif ret +ENDPROC(aesni_cbc_enc) /* * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, @@ -2501,6 +2518,7 @@ ENTRY(aesni_cbc_dec) popl IVP #endif ret +ENDPROC(aesni_cbc_dec) #ifdef __x86_64__ .align 16 @@ -2527,6 +2545,7 @@ _aesni_inc_init: MOVQ_R64_XMM TCTR_LOW INC MOVQ_R64_XMM CTR TCTR_LOW ret +ENDPROC(_aesni_inc_init) /* * _aesni_inc: internal ABI @@ -2555,6 +2574,7 @@ _aesni_inc: movaps CTR, IV PSHUFB_XMM BSWAP_MASK IV ret +ENDPROC(_aesni_inc) /* * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, @@ -2615,4 +2635,5 @@ ENTRY(aesni_ctr_enc) movups IV, (IVP) .Lctr_enc_just_ret: ret +ENDPROC(aesni_ctr_enc) #endif diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 1b9c22bea8a..a0795da22c0 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -40,10 +40,6 @@ #include <linux/workqueue.h> #include <linux/spinlock.h> -#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) -#define HAS_CTR -#endif - #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) #define HAS_PCBC #endif @@ -395,12 +391,6 @@ static int ablk_ctr_init(struct crypto_tfm *tfm) return ablk_init_common(tfm, "__driver-ctr-aes-aesni"); } -#ifdef HAS_CTR -static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm) -{ - return ablk_init_common(tfm, "rfc3686(__driver-ctr-aes-aesni)"); -} -#endif #endif #ifdef HAS_PCBC @@ -1158,33 +1148,6 @@ static struct crypto_alg aesni_algs[] = { { .maxauthsize = 16, }, }, -#ifdef HAS_CTR -}, { - .cra_name = "rfc3686(ctr(aes))", - .cra_driver_name = "rfc3686-ctr-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_helper_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_init = ablk_rfc3686_ctr_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE + - CTR_RFC3686_NONCE_SIZE, - .max_keysize = AES_MAX_KEY_SIZE + - CTR_RFC3686_NONCE_SIZE, - .ivsize = CTR_RFC3686_IV_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - .geniv = "seqiv", - }, - }, -#endif #endif #ifdef HAS_PCBC }, { diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 391d245dc08..246c67006ed 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -20,6 +20,8 @@ * */ +#include <linux/linkage.h> + .file "blowfish-x86_64-asm.S" .text @@ -116,11 +118,7 @@ bswapq RX0; \ xorq RX0, (RIO); -.align 8 -.global __blowfish_enc_blk -.type __blowfish_enc_blk,@function; - -__blowfish_enc_blk: +ENTRY(__blowfish_enc_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -148,19 +146,16 @@ __blowfish_enc_blk: movq %r10, RIO; test %cl, %cl; - jnz __enc_xor; + jnz .L__enc_xor; write_block(); ret; -__enc_xor: +.L__enc_xor: xor_block(); ret; +ENDPROC(__blowfish_enc_blk) -.align 8 -.global blowfish_dec_blk -.type blowfish_dec_blk,@function; - -blowfish_dec_blk: +ENTRY(blowfish_dec_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -189,6 +184,7 @@ blowfish_dec_blk: movq %r11, %rbp; ret; +ENDPROC(blowfish_dec_blk) /********************************************************************** 4-way blowfish, four blocks parallel @@ -300,11 +296,7 @@ blowfish_dec_blk: bswapq RX3; \ xorq RX3, 24(RIO); -.align 8 -.global __blowfish_enc_blk_4way -.type __blowfish_enc_blk_4way,@function; - -__blowfish_enc_blk_4way: +ENTRY(__blowfish_enc_blk_4way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -336,7 +328,7 @@ __blowfish_enc_blk_4way: movq %r11, RIO; test %bpl, %bpl; - jnz __enc_xor4; + jnz .L__enc_xor4; write_block4(); @@ -344,18 +336,15 @@ __blowfish_enc_blk_4way: popq %rbp; ret; -__enc_xor4: +.L__enc_xor4: xor_block4(); popq %rbx; popq %rbp; ret; +ENDPROC(__blowfish_enc_blk_4way) -.align 8 -.global blowfish_dec_blk_4way -.type blowfish_dec_blk_4way,@function; - -blowfish_dec_blk_4way: +ENTRY(blowfish_dec_blk_4way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -387,4 +376,4 @@ blowfish_dec_blk_4way: popq %rbp; ret; - +ENDPROC(blowfish_dec_blk_4way) diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S new file mode 100644 index 00000000000..cfc163469c7 --- /dev/null +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -0,0 +1,1092 @@ +/* + * x86_64/AVX/AES-NI assembler implementation of Camellia + * + * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +/* + * Version licensed under 2-clause BSD License is available at: + * http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz + */ + +#include <linux/linkage.h> + +#define CAMELLIA_TABLE_BYTE_LEN 272 + +/* struct camellia_ctx: */ +#define key_table 0 +#define key_length CAMELLIA_TABLE_BYTE_LEN + +/* register macros */ +#define CTX %rdi + +/********************************************************************** + 16-way camellia + **********************************************************************/ +#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ + vpand x, mask4bit, tmp0; \ + vpandn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + +/* + * IN: + * x0..x7: byte-sliced AB state + * mem_cd: register pointer storing CD state + * key: index for key material + * OUT: + * x0..x7: new byte-sliced CD state + */ +#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ + t7, mem_cd, key) \ + /* \ + * S-function with AES subbytes \ + */ \ + vmovdqa .Linv_shift_row, t4; \ + vbroadcastss .L0f0f0f0f, t7; \ + vmovdqa .Lpre_tf_lo_s1, t0; \ + vmovdqa .Lpre_tf_hi_s1, t1; \ + \ + /* AES inverse shift rows */ \ + vpshufb t4, x0, x0; \ + vpshufb t4, x7, x7; \ + vpshufb t4, x1, x1; \ + vpshufb t4, x4, x4; \ + vpshufb t4, x2, x2; \ + vpshufb t4, x5, x5; \ + vpshufb t4, x3, x3; \ + vpshufb t4, x6, x6; \ + \ + /* prefilter sboxes 1, 2 and 3 */ \ + vmovdqa .Lpre_tf_lo_s4, t2; \ + vmovdqa .Lpre_tf_hi_s4, t3; \ + filter_8bit(x0, t0, t1, t7, t6); \ + filter_8bit(x7, t0, t1, t7, t6); \ + filter_8bit(x1, t0, t1, t7, t6); \ + filter_8bit(x4, t0, t1, t7, t6); \ + filter_8bit(x2, t0, t1, t7, t6); \ + filter_8bit(x5, t0, t1, t7, t6); \ + \ + /* prefilter sbox 4 */ \ + vpxor t4, t4, t4; \ + filter_8bit(x3, t2, t3, t7, t6); \ + filter_8bit(x6, t2, t3, t7, t6); \ + \ + /* AES subbytes + AES shift rows */ \ + vmovdqa .Lpost_tf_lo_s1, t0; \ + vmovdqa .Lpost_tf_hi_s1, t1; \ + vaesenclast t4, x0, x0; \ + vaesenclast t4, x7, x7; \ + vaesenclast t4, x1, x1; \ + vaesenclast t4, x4, x4; \ + vaesenclast t4, x2, x2; \ + vaesenclast t4, x5, x5; \ + vaesenclast t4, x3, x3; \ + vaesenclast t4, x6, x6; \ + \ + /* postfilter sboxes 1 and 4 */ \ + vmovdqa .Lpost_tf_lo_s3, t2; \ + vmovdqa .Lpost_tf_hi_s3, t3; \ + filter_8bit(x0, t0, t1, t7, t6); \ + filter_8bit(x7, t0, t1, t7, t6); \ + filter_8bit(x3, t0, t1, t7, t6); \ + filter_8bit(x6, t0, t1, t7, t6); \ + \ + /* postfilter sbox 3 */ \ + vmovdqa .Lpost_tf_lo_s2, t4; \ + vmovdqa .Lpost_tf_hi_s2, t5; \ + filter_8bit(x2, t2, t3, t7, t6); \ + filter_8bit(x5, t2, t3, t7, t6); \ + \ + vpxor t6, t6, t6; \ + vmovq key, t0; \ + \ + /* postfilter sbox 2 */ \ + filter_8bit(x1, t4, t5, t7, t2); \ + filter_8bit(x4, t4, t5, t7, t2); \ + \ + vpsrldq $5, t0, t5; \ + vpsrldq $1, t0, t1; \ + vpsrldq $2, t0, t2; \ + vpsrldq $3, t0, t3; \ + vpsrldq $4, t0, t4; \ + vpshufb t6, t0, t0; \ + vpshufb t6, t1, t1; \ + vpshufb t6, t2, t2; \ + vpshufb t6, t3, t3; \ + vpshufb t6, t4, t4; \ + vpsrldq $2, t5, t7; \ + vpshufb t6, t7, t7; \ + \ + /* \ + * P-function \ + */ \ + vpxor x5, x0, x0; \ + vpxor x6, x1, x1; \ + vpxor x7, x2, x2; \ + vpxor x4, x3, x3; \ + \ + vpxor x2, x4, x4; \ + vpxor x3, x5, x5; \ + vpxor x0, x6, x6; \ + vpxor x1, x7, x7; \ + \ + vpxor x7, x0, x0; \ + vpxor x4, x1, x1; \ + vpxor x5, x2, x2; \ + vpxor x6, x3, x3; \ + \ + vpxor x3, x4, x4; \ + vpxor x0, x5, x5; \ + vpxor x1, x6, x6; \ + vpxor x2, x7, x7; /* note: high and low parts swapped */ \ + \ + /* \ + * Add key material and result to CD (x becomes new CD) \ + */ \ + \ + vpxor t3, x4, x4; \ + vpxor 0 * 16(mem_cd), x4, x4; \ + \ + vpxor t2, x5, x5; \ + vpxor 1 * 16(mem_cd), x5, x5; \ + \ + vpsrldq $1, t5, t3; \ + vpshufb t6, t5, t5; \ + vpshufb t6, t3, t6; \ + \ + vpxor t1, x6, x6; \ + vpxor 2 * 16(mem_cd), x6, x6; \ + \ + vpxor t0, x7, x7; \ + vpxor 3 * 16(mem_cd), x7, x7; \ + \ + vpxor t7, x0, x0; \ + vpxor 4 * 16(mem_cd), x0, x0; \ + \ + vpxor t6, x1, x1; \ + vpxor 5 * 16(mem_cd), x1, x1; \ + \ + vpxor t5, x2, x2; \ + vpxor 6 * 16(mem_cd), x2, x2; \ + \ + vpxor t4, x3, x3; \ + vpxor 7 * 16(mem_cd), x3, x3; + +/* + * Size optimization... with inlined roundsm16, binary would be over 5 times + * larger and would only be 0.5% faster (on sandy-bridge). + */ +.align 8 +roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd: + roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, + %rcx, (%r9)); + ret; +ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) + +.align 8 +roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: + roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3, + %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11, + %rax, (%r9)); + ret; +ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) + +/* + * IN/OUT: + * x0..x7: byte-sliced AB state preloaded + * mem_ab: byte-sliced AB state in memory + * mem_cb: byte-sliced CD state in memory + */ +#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ + leaq (key_table + (i) * 8)(CTX), %r9; \ + call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \ + \ + vmovdqu x4, 0 * 16(mem_cd); \ + vmovdqu x5, 1 * 16(mem_cd); \ + vmovdqu x6, 2 * 16(mem_cd); \ + vmovdqu x7, 3 * 16(mem_cd); \ + vmovdqu x0, 4 * 16(mem_cd); \ + vmovdqu x1, 5 * 16(mem_cd); \ + vmovdqu x2, 6 * 16(mem_cd); \ + vmovdqu x3, 7 * 16(mem_cd); \ + \ + leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \ + call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \ + \ + store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); + +#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ + +#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ + /* Store new AB state */ \ + vmovdqu x0, 0 * 16(mem_ab); \ + vmovdqu x1, 1 * 16(mem_ab); \ + vmovdqu x2, 2 * 16(mem_ab); \ + vmovdqu x3, 3 * 16(mem_ab); \ + vmovdqu x4, 4 * 16(mem_ab); \ + vmovdqu x5, 5 * 16(mem_ab); \ + vmovdqu x6, 6 * 16(mem_ab); \ + vmovdqu x7, 7 * 16(mem_ab); + +#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i) \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); + +#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i) \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); + +/* + * IN: + * v0..3: byte-sliced 32-bit integers + * OUT: + * v0..3: (IN <<< 1) + */ +#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \ + vpcmpgtb v0, zero, t0; \ + vpaddb v0, v0, v0; \ + vpabsb t0, t0; \ + \ + vpcmpgtb v1, zero, t1; \ + vpaddb v1, v1, v1; \ + vpabsb t1, t1; \ + \ + vpcmpgtb v2, zero, t2; \ + vpaddb v2, v2, v2; \ + vpabsb t2, t2; \ + \ + vpor t0, v1, v1; \ + \ + vpcmpgtb v3, zero, t0; \ + vpaddb v3, v3, v3; \ + vpabsb t0, t0; \ + \ + vpor t1, v2, v2; \ + vpor t2, v3, v3; \ + vpor t0, v0, v0; + +/* + * IN: + * r: byte-sliced AB state in memory + * l: byte-sliced CD state in memory + * OUT: + * x0..x7: new byte-sliced CD state + */ +#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ + tt1, tt2, tt3, kll, klr, krl, krr) \ + /* \ + * t0 = kll; \ + * t0 &= ll; \ + * lr ^= rol32(t0, 1); \ + */ \ + vpxor tt0, tt0, tt0; \ + vmovd kll, t0; \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpand l0, t0, t0; \ + vpand l1, t1, t1; \ + vpand l2, t2, t2; \ + vpand l3, t3, t3; \ + \ + rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ + \ + vpxor l4, t0, l4; \ + vmovdqu l4, 4 * 16(l); \ + vpxor l5, t1, l5; \ + vmovdqu l5, 5 * 16(l); \ + vpxor l6, t2, l6; \ + vmovdqu l6, 6 * 16(l); \ + vpxor l7, t3, l7; \ + vmovdqu l7, 7 * 16(l); \ + \ + /* \ + * t2 = krr; \ + * t2 |= rr; \ + * rl ^= t2; \ + */ \ + \ + vmovd krr, t0; \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpor 4 * 16(r), t0, t0; \ + vpor 5 * 16(r), t1, t1; \ + vpor 6 * 16(r), t2, t2; \ + vpor 7 * 16(r), t3, t3; \ + \ + vpxor 0 * 16(r), t0, t0; \ + vpxor 1 * 16(r), t1, t1; \ + vpxor 2 * 16(r), t2, t2; \ + vpxor 3 * 16(r), t3, t3; \ + vmovdqu t0, 0 * 16(r); \ + vmovdqu t1, 1 * 16(r); \ + vmovdqu t2, 2 * 16(r); \ + vmovdqu t3, 3 * 16(r); \ + \ + /* \ + * t2 = krl; \ + * t2 &= rl; \ + * rr ^= rol32(t2, 1); \ + */ \ + vmovd krl, t0; \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpand 0 * 16(r), t0, t0; \ + vpand 1 * 16(r), t1, t1; \ + vpand 2 * 16(r), t2, t2; \ + vpand 3 * 16(r), t3, t3; \ + \ + rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ + \ + vpxor 4 * 16(r), t0, t0; \ + vpxor 5 * 16(r), t1, t1; \ + vpxor 6 * 16(r), t2, t2; \ + vpxor 7 * 16(r), t3, t3; \ + vmovdqu t0, 4 * 16(r); \ + vmovdqu t1, 5 * 16(r); \ + vmovdqu t2, 6 * 16(r); \ + vmovdqu t3, 7 * 16(r); \ + \ + /* \ + * t0 = klr; \ + * t0 |= lr; \ + * ll ^= t0; \ + */ \ + \ + vmovd klr, t0; \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpor l4, t0, t0; \ + vpor l5, t1, t1; \ + vpor l6, t2, t2; \ + vpor l7, t3, t3; \ + \ + vpxor l0, t0, l0; \ + vmovdqu l0, 0 * 16(l); \ + vpxor l1, t1, l1; \ + vmovdqu l1, 1 * 16(l); \ + vpxor l2, t2, l2; \ + vmovdqu l2, 2 * 16(l); \ + vpxor l3, t3, l3; \ + vmovdqu l3, 3 * 16(l); + +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \ + b3, c3, d3, st0, st1) \ + vmovdqu d2, st0; \ + vmovdqu d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu a0, st0; \ + vmovdqu a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vmovdqu .Lshufb_16x16b, a0; \ + vmovdqu st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu d3, st1; \ + vmovdqu st0, d3; \ + vpshufb a0, d3, a0; \ + vmovdqu d2, st0; \ + \ + transpose_4x4(a0, b0, c0, d0, d2, d3); \ + transpose_4x4(a1, b1, c1, d1, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu b0, st0; \ + vmovdqu b1, st1; \ + transpose_4x4(a2, b2, c2, d2, b0, b1); \ + transpose_4x4(a3, b3, c3, d3, b0, b1); \ + vmovdqu st0, b0; \ + vmovdqu st1, b1; \ + /* does not adjust output bytes inside vectors */ + +/* load blocks to registers and apply pre-whitening */ +#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, rio, key) \ + vmovq key, x0; \ + vpshufb .Lpack_bswap, x0, x0; \ + \ + vpxor 0 * 16(rio), x0, y7; \ + vpxor 1 * 16(rio), x0, y6; \ + vpxor 2 * 16(rio), x0, y5; \ + vpxor 3 * 16(rio), x0, y4; \ + vpxor 4 * 16(rio), x0, y3; \ + vpxor 5 * 16(rio), x0, y2; \ + vpxor 6 * 16(rio), x0, y1; \ + vpxor 7 * 16(rio), x0, y0; \ + vpxor 8 * 16(rio), x0, x7; \ + vpxor 9 * 16(rio), x0, x6; \ + vpxor 10 * 16(rio), x0, x5; \ + vpxor 11 * 16(rio), x0, x4; \ + vpxor 12 * 16(rio), x0, x3; \ + vpxor 13 * 16(rio), x0, x2; \ + vpxor 14 * 16(rio), x0, x1; \ + vpxor 15 * 16(rio), x0, x0; + +/* byteslice pre-whitened blocks and store to temporary memory */ +#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd) \ + byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ + y5, y6, y7, (mem_ab), (mem_cd)); \ + \ + vmovdqu x0, 0 * 16(mem_ab); \ + vmovdqu x1, 1 * 16(mem_ab); \ + vmovdqu x2, 2 * 16(mem_ab); \ + vmovdqu x3, 3 * 16(mem_ab); \ + vmovdqu x4, 4 * 16(mem_ab); \ + vmovdqu x5, 5 * 16(mem_ab); \ + vmovdqu x6, 6 * 16(mem_ab); \ + vmovdqu x7, 7 * 16(mem_ab); \ + vmovdqu y0, 0 * 16(mem_cd); \ + vmovdqu y1, 1 * 16(mem_cd); \ + vmovdqu y2, 2 * 16(mem_cd); \ + vmovdqu y3, 3 * 16(mem_cd); \ + vmovdqu y4, 4 * 16(mem_cd); \ + vmovdqu y5, 5 * 16(mem_cd); \ + vmovdqu y6, 6 * 16(mem_cd); \ + vmovdqu y7, 7 * 16(mem_cd); + +/* de-byteslice, apply post-whitening and store blocks */ +#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ + y5, y6, y7, key, stack_tmp0, stack_tmp1) \ + byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \ + y7, x3, x7, stack_tmp0, stack_tmp1); \ + \ + vmovdqu x0, stack_tmp0; \ + \ + vmovq key, x0; \ + vpshufb .Lpack_bswap, x0, x0; \ + \ + vpxor x0, y7, y7; \ + vpxor x0, y6, y6; \ + vpxor x0, y5, y5; \ + vpxor x0, y4, y4; \ + vpxor x0, y3, y3; \ + vpxor x0, y2, y2; \ + vpxor x0, y1, y1; \ + vpxor x0, y0, y0; \ + vpxor x0, x7, x7; \ + vpxor x0, x6, x6; \ + vpxor x0, x5, x5; \ + vpxor x0, x4, x4; \ + vpxor x0, x3, x3; \ + vpxor x0, x2, x2; \ + vpxor x0, x1, x1; \ + vpxor stack_tmp0, x0, x0; + +#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, rio) \ + vmovdqu x0, 0 * 16(rio); \ + vmovdqu x1, 1 * 16(rio); \ + vmovdqu x2, 2 * 16(rio); \ + vmovdqu x3, 3 * 16(rio); \ + vmovdqu x4, 4 * 16(rio); \ + vmovdqu x5, 5 * 16(rio); \ + vmovdqu x6, 6 * 16(rio); \ + vmovdqu x7, 7 * 16(rio); \ + vmovdqu y0, 8 * 16(rio); \ + vmovdqu y1, 9 * 16(rio); \ + vmovdqu y2, 10 * 16(rio); \ + vmovdqu y3, 11 * 16(rio); \ + vmovdqu y4, 12 * 16(rio); \ + vmovdqu y5, 13 * 16(rio); \ + vmovdqu y6, 14 * 16(rio); \ + vmovdqu y7, 15 * 16(rio); + +.data +.align 16 + +#define SHUFB_BYTES(idx) \ + 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) + +.Lshufb_16x16b: + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3); + +.Lpack_bswap: + .long 0x00010203 + .long 0x04050607 + .long 0x80808080 + .long 0x80808080 + +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +/* + * pre-SubByte transform + * + * pre-lookup for sbox1, sbox2, sbox3: + * swap_bitendianness( + * isom_map_camellia_to_aes( + * camellia_f( + * swap_bitendianess(in) + * ) + * ) + * ) + * + * (note: '⊕ 0xc5' inside camellia_f()) + */ +.Lpre_tf_lo_s1: + .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 + .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 +.Lpre_tf_hi_s1: + .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a + .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 + +/* + * pre-SubByte transform + * + * pre-lookup for sbox4: + * swap_bitendianness( + * isom_map_camellia_to_aes( + * camellia_f( + * swap_bitendianess(in <<< 1) + * ) + * ) + * ) + * + * (note: '⊕ 0xc5' inside camellia_f()) + */ +.Lpre_tf_lo_s4: + .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 + .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 +.Lpre_tf_hi_s4: + .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 + .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf + +/* + * post-SubByte transform + * + * post-lookup for sbox1, sbox4: + * swap_bitendianness( + * camellia_h( + * isom_map_aes_to_camellia( + * swap_bitendianness( + * aes_inverse_affine_transform(in) + * ) + * ) + * ) + * ) + * + * (note: '⊕ 0x6e' inside camellia_h()) + */ +.Lpost_tf_lo_s1: + .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 + .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 +.Lpost_tf_hi_s1: + .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 + .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c + +/* + * post-SubByte transform + * + * post-lookup for sbox2: + * swap_bitendianness( + * camellia_h( + * isom_map_aes_to_camellia( + * swap_bitendianness( + * aes_inverse_affine_transform(in) + * ) + * ) + * ) + * ) <<< 1 + * + * (note: '⊕ 0x6e' inside camellia_h()) + */ +.Lpost_tf_lo_s2: + .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 + .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 +.Lpost_tf_hi_s2: + .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 + .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 + +/* + * post-SubByte transform + * + * post-lookup for sbox3: + * swap_bitendianness( + * camellia_h( + * isom_map_aes_to_camellia( + * swap_bitendianness( + * aes_inverse_affine_transform(in) + * ) + * ) + * ) + * ) >>> 1 + * + * (note: '⊕ 0x6e' inside camellia_h()) + */ +.Lpost_tf_lo_s3: + .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 + .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 +.Lpost_tf_hi_s3: + .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 + .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 + +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +.Linv_shift_row: + .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b + .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 + +/* 4-bit mask */ +.align 4 +.L0f0f0f0f: + .long 0x0f0f0f0f + +.text + +.align 8 +__camellia_enc_blk16: + /* input: + * %rdi: ctx, CTX + * %rax: temporary storage, 256 bytes + * %xmm0..%xmm15: 16 plaintext blocks + * output: + * %xmm0..%xmm15: 16 encrypted blocks, order swapped: + * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 + */ + + leaq 8 * 16(%rax), %rcx; + + inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx); + + enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 0); + + fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, + ((key_table + (8) * 8) + 0)(CTX), + ((key_table + (8) * 8) + 4)(CTX), + ((key_table + (8) * 8) + 8)(CTX), + ((key_table + (8) * 8) + 12)(CTX)); + + enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 8); + + fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, + ((key_table + (16) * 8) + 0)(CTX), + ((key_table + (16) * 8) + 4)(CTX), + ((key_table + (16) * 8) + 8)(CTX), + ((key_table + (16) * 8) + 12)(CTX)); + + enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 16); + + movl $24, %r8d; + cmpl $16, key_length(CTX); + jne .Lenc_max32; + +.Lenc_done: + /* load CD for output */ + vmovdqu 0 * 16(%rcx), %xmm8; + vmovdqu 1 * 16(%rcx), %xmm9; + vmovdqu 2 * 16(%rcx), %xmm10; + vmovdqu 3 * 16(%rcx), %xmm11; + vmovdqu 4 * 16(%rcx), %xmm12; + vmovdqu 5 * 16(%rcx), %xmm13; + vmovdqu 6 * 16(%rcx), %xmm14; + vmovdqu 7 * 16(%rcx), %xmm15; + + outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax)); + + ret; + +.align 8 +.Lenc_max32: + movl $32, %r8d; + + fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, + ((key_table + (24) * 8) + 0)(CTX), + ((key_table + (24) * 8) + 4)(CTX), + ((key_table + (24) * 8) + 8)(CTX), + ((key_table + (24) * 8) + 12)(CTX)); + + enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 24); + + jmp .Lenc_done; +ENDPROC(__camellia_enc_blk16) + +.align 8 +__camellia_dec_blk16: + /* input: + * %rdi: ctx, CTX + * %rax: temporary storage, 256 bytes + * %r8d: 24 for 16 byte key, 32 for larger + * %xmm0..%xmm15: 16 encrypted blocks + * output: + * %xmm0..%xmm15: 16 plaintext blocks, order swapped: + * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 + */ + + leaq 8 * 16(%rax), %rcx; + + inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx); + + cmpl $32, %r8d; + je .Ldec_max32; + +.Ldec_max24: + dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 16); + + fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, + ((key_table + (16) * 8) + 8)(CTX), + ((key_table + (16) * 8) + 12)(CTX), + ((key_table + (16) * 8) + 0)(CTX), + ((key_table + (16) * 8) + 4)(CTX)); + + dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 8); + + fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, + ((key_table + (8) * 8) + 8)(CTX), + ((key_table + (8) * 8) + 12)(CTX), + ((key_table + (8) * 8) + 0)(CTX), + ((key_table + (8) * 8) + 4)(CTX)); + + dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 0); + + /* load CD for output */ + vmovdqu 0 * 16(%rcx), %xmm8; + vmovdqu 1 * 16(%rcx), %xmm9; + vmovdqu 2 * 16(%rcx), %xmm10; + vmovdqu 3 * 16(%rcx), %xmm11; + vmovdqu 4 * 16(%rcx), %xmm12; + vmovdqu 5 * 16(%rcx), %xmm13; + vmovdqu 6 * 16(%rcx), %xmm14; + vmovdqu 7 * 16(%rcx), %xmm15; + + outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax)); + + ret; + +.align 8 +.Ldec_max32: + dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 24); + + fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, + ((key_table + (24) * 8) + 8)(CTX), + ((key_table + (24) * 8) + 12)(CTX), + ((key_table + (24) * 8) + 0)(CTX), + ((key_table + (24) * 8) + 4)(CTX)); + + jmp .Ldec_max24; +ENDPROC(__camellia_dec_blk16) + +ENTRY(camellia_ecb_enc_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + */ + + inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx, (key_table)(CTX)); + + /* now dst can be used as temporary buffer (even in src == dst case) */ + movq %rsi, %rax; + + call __camellia_enc_blk16; + + write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, + %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, + %xmm8, %rsi); + + ret; +ENDPROC(camellia_ecb_enc_16way) + +ENTRY(camellia_ecb_dec_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + */ + + cmpl $16, key_length(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx, (key_table)(CTX, %r8, 8)); + + /* now dst can be used as temporary buffer (even in src == dst case) */ + movq %rsi, %rax; + + call __camellia_dec_blk16; + + write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, + %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, + %xmm8, %rsi); + + ret; +ENDPROC(camellia_ecb_dec_16way) + +ENTRY(camellia_cbc_dec_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + */ + + cmpl $16, key_length(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx, (key_table)(CTX, %r8, 8)); + + /* + * dst might still be in-use (in case dst == src), so use stack for + * temporary storage. + */ + subq $(16 * 16), %rsp; + movq %rsp, %rax; + + call __camellia_dec_blk16; + + addq $(16 * 16), %rsp; + + vpxor (0 * 16)(%rdx), %xmm6, %xmm6; + vpxor (1 * 16)(%rdx), %xmm5, %xmm5; + vpxor (2 * 16)(%rdx), %xmm4, %xmm4; + vpxor (3 * 16)(%rdx), %xmm3, %xmm3; + vpxor (4 * 16)(%rdx), %xmm2, %xmm2; + vpxor (5 * 16)(%rdx), %xmm1, %xmm1; + vpxor (6 * 16)(%rdx), %xmm0, %xmm0; + vpxor (7 * 16)(%rdx), %xmm15, %xmm15; + vpxor (8 * 16)(%rdx), %xmm14, %xmm14; + vpxor (9 * 16)(%rdx), %xmm13, %xmm13; + vpxor (10 * 16)(%rdx), %xmm12, %xmm12; + vpxor (11 * 16)(%rdx), %xmm11, %xmm11; + vpxor (12 * 16)(%rdx), %xmm10, %xmm10; + vpxor (13 * 16)(%rdx), %xmm9, %xmm9; + vpxor (14 * 16)(%rdx), %xmm8, %xmm8; + write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, + %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, + %xmm8, %rsi); + + ret; +ENDPROC(camellia_cbc_dec_16way) + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +ENTRY(camellia_ctr_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv (little endian, 128bit) + */ + + subq $(16 * 16), %rsp; + movq %rsp, %rax; + + vmovdqa .Lbswap128_mask, %xmm14; + + /* load IV and byteswap */ + vmovdqu (%rcx), %xmm0; + vpshufb %xmm14, %xmm0, %xmm15; + vmovdqu %xmm15, 15 * 16(%rax); + + vpcmpeqd %xmm15, %xmm15, %xmm15; + vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */ + + /* construct IVs */ + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm13; + vmovdqu %xmm13, 14 * 16(%rax); + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm13; + vmovdqu %xmm13, 13 * 16(%rax); + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm12; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm11; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm10; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm9; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm8; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm7; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm6; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm5; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm4; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm3; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm2; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm1; + inc_le128(%xmm0, %xmm15, %xmm13); + vmovdqa %xmm0, %xmm13; + vpshufb %xmm14, %xmm0, %xmm0; + inc_le128(%xmm13, %xmm15, %xmm14); + vmovdqu %xmm13, (%rcx); + + /* inpack16_pre: */ + vmovq (key_table)(CTX), %xmm15; + vpshufb .Lpack_bswap, %xmm15, %xmm15; + vpxor %xmm0, %xmm15, %xmm0; + vpxor %xmm1, %xmm15, %xmm1; + vpxor %xmm2, %xmm15, %xmm2; + vpxor %xmm3, %xmm15, %xmm3; + vpxor %xmm4, %xmm15, %xmm4; + vpxor %xmm5, %xmm15, %xmm5; + vpxor %xmm6, %xmm15, %xmm6; + vpxor %xmm7, %xmm15, %xmm7; + vpxor %xmm8, %xmm15, %xmm8; + vpxor %xmm9, %xmm15, %xmm9; + vpxor %xmm10, %xmm15, %xmm10; + vpxor %xmm11, %xmm15, %xmm11; + vpxor %xmm12, %xmm15, %xmm12; + vpxor 13 * 16(%rax), %xmm15, %xmm13; + vpxor 14 * 16(%rax), %xmm15, %xmm14; + vpxor 15 * 16(%rax), %xmm15, %xmm15; + + call __camellia_enc_blk16; + + addq $(16 * 16), %rsp; + + vpxor 0 * 16(%rdx), %xmm7, %xmm7; + vpxor 1 * 16(%rdx), %xmm6, %xmm6; + vpxor 2 * 16(%rdx), %xmm5, %xmm5; + vpxor 3 * 16(%rdx), %xmm4, %xmm4; + vpxor 4 * 16(%rdx), %xmm3, %xmm3; + vpxor 5 * 16(%rdx), %xmm2, %xmm2; + vpxor 6 * 16(%rdx), %xmm1, %xmm1; + vpxor 7 * 16(%rdx), %xmm0, %xmm0; + vpxor 8 * 16(%rdx), %xmm15, %xmm15; + vpxor 9 * 16(%rdx), %xmm14, %xmm14; + vpxor 10 * 16(%rdx), %xmm13, %xmm13; + vpxor 11 * 16(%rdx), %xmm12, %xmm12; + vpxor 12 * 16(%rdx), %xmm11, %xmm11; + vpxor 13 * 16(%rdx), %xmm10, %xmm10; + vpxor 14 * 16(%rdx), %xmm9, %xmm9; + vpxor 15 * 16(%rdx), %xmm8, %xmm8; + write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, + %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, + %xmm8, %rsi); + + ret; +ENDPROC(camellia_ctr_16way) diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 0b3374335fd..310319c601e 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -20,6 +20,8 @@ * */ +#include <linux/linkage.h> + .file "camellia-x86_64-asm_64.S" .text @@ -188,10 +190,7 @@ bswapq RAB0; \ movq RAB0, 4*2(RIO); -.global __camellia_enc_blk; -.type __camellia_enc_blk,@function; - -__camellia_enc_blk: +ENTRY(__camellia_enc_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -214,33 +213,31 @@ __camellia_enc_blk: movl $24, RT1d; /* max */ cmpb $16, key_length(CTX); - je __enc_done; + je .L__enc_done; enc_fls(24); enc_rounds(24); movl $32, RT1d; /* max */ -__enc_done: +.L__enc_done: testb RXORbl, RXORbl; movq RDST, RIO; - jnz __enc_xor; + jnz .L__enc_xor; enc_outunpack(mov, RT1); movq RRBP, %rbp; ret; -__enc_xor: +.L__enc_xor: enc_outunpack(xor, RT1); movq RRBP, %rbp; ret; +ENDPROC(__camellia_enc_blk) -.global camellia_dec_blk; -.type camellia_dec_blk,@function; - -camellia_dec_blk: +ENTRY(camellia_dec_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -258,12 +255,12 @@ camellia_dec_blk: dec_inpack(RT2); cmpb $24, RT2bl; - je __dec_rounds16; + je .L__dec_rounds16; dec_rounds(24); dec_fls(24); -__dec_rounds16: +.L__dec_rounds16: dec_rounds(16); dec_fls(16); dec_rounds(8); @@ -276,6 +273,7 @@ __dec_rounds16: movq RRBP, %rbp; ret; +ENDPROC(camellia_dec_blk) /********************************************************************** 2-way camellia @@ -426,10 +424,7 @@ __dec_rounds16: bswapq RAB1; \ movq RAB1, 12*2(RIO); -.global __camellia_enc_blk_2way; -.type __camellia_enc_blk_2way,@function; - -__camellia_enc_blk_2way: +ENTRY(__camellia_enc_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -453,16 +448,16 @@ __camellia_enc_blk_2way: movl $24, RT2d; /* max */ cmpb $16, key_length(CTX); - je __enc2_done; + je .L__enc2_done; enc_fls2(24); enc_rounds2(24); movl $32, RT2d; /* max */ -__enc2_done: +.L__enc2_done: test RXORbl, RXORbl; movq RDST, RIO; - jnz __enc2_xor; + jnz .L__enc2_xor; enc_outunpack2(mov, RT2); @@ -470,17 +465,15 @@ __enc2_done: popq %rbx; ret; -__enc2_xor: +.L__enc2_xor: enc_outunpack2(xor, RT2); movq RRBP, %rbp; popq %rbx; ret; +ENDPROC(__camellia_enc_blk_2way) -.global camellia_dec_blk_2way; -.type camellia_dec_blk_2way,@function; - -camellia_dec_blk_2way: +ENTRY(camellia_dec_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -499,12 +492,12 @@ camellia_dec_blk_2way: dec_inpack2(RT2); cmpb $24, RT2bl; - je __dec2_rounds16; + je .L__dec2_rounds16; dec_rounds2(24); dec_fls2(24); -__dec2_rounds16: +.L__dec2_rounds16: dec_rounds2(16); dec_fls2(16); dec_rounds2(8); @@ -518,3 +511,4 @@ __dec2_rounds16: movq RRBP, %rbp; movq RXOR, %rbx; ret; +ENDPROC(camellia_dec_blk_2way) diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c new file mode 100644 index 00000000000..96cbb6068fc --- /dev/null +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -0,0 +1,558 @@ +/* + * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia + * + * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/crypto.h> +#include <linux/err.h> +#include <crypto/algapi.h> +#include <crypto/ctr.h> +#include <crypto/lrw.h> +#include <crypto/xts.h> +#include <asm/xcr.h> +#include <asm/xsave.h> +#include <asm/crypto/camellia.h> +#include <asm/crypto/ablk_helper.h> +#include <asm/crypto/glue_helper.h> + +#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 + +/* 16-way AES-NI parallel cipher functions */ +asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src); + +asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); + +static const struct common_glue_ctx camellia_enc = { + .num_funcs = 3, + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) } + }, { + .num_blocks = 2, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } + } } +}; + +static const struct common_glue_ctx camellia_ctr = { + .num_funcs = 3, + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) } + }, { + .num_blocks = 2, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } + }, { + .num_blocks = 1, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } + } } +}; + +static const struct common_glue_ctx camellia_dec = { + .num_funcs = 3, + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) } + }, { + .num_blocks = 2, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } + } } +}; + +static const struct common_glue_ctx camellia_dec_cbc = { + .num_funcs = 3, + .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) } + }, { + .num_blocks = 2, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } + }, { + .num_blocks = 1, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } + } } +}; + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes); +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc, + dst, src, nbytes); +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src, + nbytes); +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes); +} + +static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ + return glue_fpu_begin(CAMELLIA_BLOCK_SIZE, + CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled, + nbytes); +} + +static inline void camellia_fpu_end(bool fpu_enabled) +{ + glue_fpu_end(fpu_enabled); +} + +static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len, + &tfm->crt_flags); +} + +struct crypt_priv { + struct camellia_ctx *ctx; + bool fpu_enabled; +}; + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = CAMELLIA_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { + camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst); + srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; + nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; + } + + while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { + camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst); + srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; + nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + camellia_enc_blk(ctx->ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = CAMELLIA_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { + camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst); + srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; + nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; + } + + while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { + camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst); + srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; + nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + camellia_dec_blk(ctx->ctx, srcdst, srcdst); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->camellia_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + camellia_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->camellia_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + camellia_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + camellia_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + camellia_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static struct crypto_alg cmll_algs[10] = { { + .cra_name = "__ecb-camellia-aesni", + .cra_driver_name = "__driver-ecb-camellia-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .setkey = camellia_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "__cbc-camellia-aesni", + .cra_driver_name = "__driver-cbc-camellia-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .setkey = camellia_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "__ctr-camellia-aesni", + .cra_driver_name = "__driver-ctr-camellia-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct camellia_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = camellia_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}, { + .cra_name = "__lrw-camellia-aesni", + .cra_driver_name = "__driver-lrw-camellia-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_exit = lrw_camellia_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE + + CAMELLIA_BLOCK_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE + + CAMELLIA_BLOCK_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = lrw_camellia_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}, { + .cra_name = "__xts-camellia-aesni", + .cra_driver_name = "__driver-xts-camellia-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct camellia_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, + .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = xts_camellia_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}, { + .cra_name = "ecb(camellia)", + .cra_driver_name = "ecb-camellia-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "cbc(camellia)", + .cra_driver_name = "cbc-camellia-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = __ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "ctr(camellia)", + .cra_driver_name = "ctr-camellia-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_encrypt, + .geniv = "chainiv", + }, + }, +}, { + .cra_name = "lrw(camellia)", + .cra_driver_name = "lrw-camellia-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE + + CAMELLIA_BLOCK_SIZE, + .max_keysize = CAMELLIA_MAX_KEY_SIZE + + CAMELLIA_BLOCK_SIZE, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "xts(camellia)", + .cra_driver_name = "xts-camellia-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = CAMELLIA_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, + .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, + .ivsize = CAMELLIA_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +} }; + +static int __init camellia_aesni_init(void) +{ + u64 xcr0; + + if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { + pr_info("AVX or AES-NI instructions are not detected.\n"); + return -ENODEV; + } + + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { + pr_info("AVX detected but unusable.\n"); + return -ENODEV; + } + + return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); +} + +static void __exit camellia_aesni_fini(void) +{ + crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); +} + +module_init(camellia_aesni_init); +module_exit(camellia_aesni_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized"); +MODULE_ALIAS("camellia"); +MODULE_ALIAS("camellia-asm"); diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 42ffd2bbab5..5cb86ccd4ac 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -32,53 +32,24 @@ #include <crypto/algapi.h> #include <crypto/lrw.h> #include <crypto/xts.h> +#include <asm/crypto/camellia.h> #include <asm/crypto/glue_helper.h> -#define CAMELLIA_MIN_KEY_SIZE 16 -#define CAMELLIA_MAX_KEY_SIZE 32 -#define CAMELLIA_BLOCK_SIZE 16 -#define CAMELLIA_TABLE_BYTE_LEN 272 - -struct camellia_ctx { - u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)]; - u32 key_length; -}; - /* regular block cipher functions */ asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, const u8 *src, bool xor); +EXPORT_SYMBOL_GPL(__camellia_enc_blk); asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst, const u8 *src); +EXPORT_SYMBOL_GPL(camellia_dec_blk); /* 2-way parallel cipher functions */ asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, const u8 *src, bool xor); +EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way); asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, const u8 *src); - -static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, - const u8 *src) -{ - __camellia_enc_blk(ctx, dst, src, false); -} - -static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst, - const u8 *src) -{ - __camellia_enc_blk(ctx, dst, src, true); -} - -static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src) -{ - __camellia_enc_blk_2way(ctx, dst, src, false); -} - -static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst, - const u8 *src) -{ - __camellia_enc_blk_2way(ctx, dst, src, true); -} +EXPORT_SYMBOL_GPL(camellia_dec_blk_2way); static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { @@ -1275,9 +1246,8 @@ static void camellia_setup192(const unsigned char *key, u64 *subkey) camellia_setup256(kk, subkey); } -static int __camellia_setkey(struct camellia_ctx *cctx, - const unsigned char *key, - unsigned int key_len, u32 *flags) +int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key, + unsigned int key_len, u32 *flags) { if (key_len != 16 && key_len != 24 && key_len != 32) { *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; @@ -1300,6 +1270,7 @@ static int __camellia_setkey(struct camellia_ctx *cctx, return 0; } +EXPORT_SYMBOL_GPL(__camellia_setkey); static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) @@ -1308,7 +1279,7 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, &tfm->crt_flags); } -static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) +void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) { u128 iv = *src; @@ -1316,22 +1287,23 @@ static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) u128_xor(&dst[1], &dst[1], &iv); } +EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way); -static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) { be128 ctrblk; if (dst != src) *dst = *src; - u128_to_be128(&ctrblk, iv); - u128_inc(iv); + le128_to_be128(&ctrblk, iv); + le128_inc(iv); camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); } +EXPORT_SYMBOL_GPL(camellia_crypt_ctr); -static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, - u128 *iv) +void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, le128 *iv) { be128 ctrblks[2]; @@ -1340,13 +1312,14 @@ static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, dst[1] = src[1]; } - u128_to_be128(&ctrblks[0], iv); - u128_inc(iv); - u128_to_be128(&ctrblks[1], iv); - u128_inc(iv); + le128_to_be128(&ctrblks[0], iv); + le128_inc(iv); + le128_to_be128(&ctrblks[1], iv); + le128_inc(iv); camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks); } +EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way); static const struct common_glue_ctx camellia_enc = { .num_funcs = 2, @@ -1464,13 +1437,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) camellia_dec_blk(ctx, srcdst, srcdst); } -struct camellia_lrw_ctx { - struct lrw_table_ctx lrw_table; - struct camellia_ctx camellia_ctx; -}; - -static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) +int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) { struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); int err; @@ -1484,6 +1452,7 @@ static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, return lrw_init_table(&ctx->lrw_table, key + keylen - CAMELLIA_BLOCK_SIZE); } +EXPORT_SYMBOL_GPL(lrw_camellia_setkey); static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -1519,20 +1488,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return lrw_crypt(desc, dst, src, nbytes, &req); } -static void lrw_exit_tfm(struct crypto_tfm *tfm) +void lrw_camellia_exit_tfm(struct crypto_tfm *tfm) { struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); lrw_free_table(&ctx->lrw_table); } +EXPORT_SYMBOL_GPL(lrw_camellia_exit_tfm); -struct camellia_xts_ctx { - struct camellia_ctx tweak_ctx; - struct camellia_ctx crypt_ctx; -}; - -static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) +int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) { struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm); u32 *flags = &tfm->crt_flags; @@ -1555,6 +1520,7 @@ static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, flags); } +EXPORT_SYMBOL_GPL(xts_camellia_setkey); static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -1679,7 +1645,7 @@ static struct crypto_alg camellia_algs[6] = { { .cra_alignmask = 0, .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_exit = lrw_exit_tfm, + .cra_exit = lrw_camellia_exit_tfm, .cra_u = { .blkcipher = { .min_keysize = CAMELLIA_MIN_KEY_SIZE + diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index a41a3aaba22..c35fd5d6ecd 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -23,12 +23,14 @@ * */ +#include <linux/linkage.h> + .file "cast5-avx-x86_64-asm_64.S" -.extern cast5_s1 -.extern cast5_s2 -.extern cast5_s3 -.extern cast5_s4 +.extern cast_s1 +.extern cast_s2 +.extern cast_s3 +.extern cast_s4 /* structure of crypto context */ #define km 0 @@ -36,10 +38,10 @@ #define rr ((16*4)+16) /* s-boxes */ -#define s1 cast5_s1 -#define s2 cast5_s2 -#define s3 cast5_s3 -#define s4 cast5_s4 +#define s1 cast_s1 +#define s2 cast_s2 +#define s3 cast_s3 +#define s4 cast_s4 /********************************************************************** 16-way AVX cast5 @@ -180,31 +182,17 @@ vpunpcklqdq t1, t0, x0; \ vpunpckhqdq t1, t0, x1; -#define inpack_blocks(in, x0, x1, t0, t1, rmask) \ - vmovdqu (0*4*4)(in), x0; \ - vmovdqu (1*4*4)(in), x1; \ +#define inpack_blocks(x0, x1, t0, t1, rmask) \ vpshufb rmask, x0, x0; \ vpshufb rmask, x1, x1; \ \ transpose_2x4(x0, x1, t0, t1) -#define outunpack_blocks(out, x0, x1, t0, t1, rmask) \ +#define outunpack_blocks(x0, x1, t0, t1, rmask) \ transpose_2x4(x0, x1, t0, t1) \ \ vpshufb rmask, x0, x0; \ - vpshufb rmask, x1, x1; \ - vmovdqu x0, (0*4*4)(out); \ - vmovdqu x1, (1*4*4)(out); - -#define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \ - transpose_2x4(x0, x1, t0, t1) \ - \ - vpshufb rmask, x0, x0; \ - vpshufb rmask, x1, x1; \ - vpxor (0*4*4)(out), x0, x0; \ - vmovdqu x0, (0*4*4)(out); \ - vpxor (1*4*4)(out), x1, x1; \ - vmovdqu x1, (1*4*4)(out); + vpshufb rmask, x1, x1; .data @@ -213,6 +201,8 @@ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.Lbswap_iv_mask: + .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0 .L16_mask: .byte 16, 16, 16, 16 .L32_mask: @@ -223,35 +213,40 @@ .text .align 16 -.global __cast5_enc_blk_16way -.type __cast5_enc_blk_16way,@function; - -__cast5_enc_blk_16way: +__cast5_enc_blk16: /* input: * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: bool, if true: xor output + * RL1: blocks 1 and 2 + * RR1: blocks 3 and 4 + * RL2: blocks 5 and 6 + * RR2: blocks 7 and 8 + * RL3: blocks 9 and 10 + * RR3: blocks 11 and 12 + * RL4: blocks 13 and 14 + * RR4: blocks 15 and 16 + * output: + * RL1: encrypted blocks 1 and 2 + * RR1: encrypted blocks 3 and 4 + * RL2: encrypted blocks 5 and 6 + * RR2: encrypted blocks 7 and 8 + * RL3: encrypted blocks 9 and 10 + * RR3: encrypted blocks 11 and 12 + * RL4: encrypted blocks 13 and 14 + * RR4: encrypted blocks 15 and 16 */ pushq %rbp; pushq %rbx; - pushq %rcx; vmovdqa .Lbswap_mask, RKM; vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; enc_preload_rkr(); - leaq 1*(2*4*4)(%rdx), %rax; - inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM); - inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM); - leaq 2*(2*4*4)(%rdx), %rax; - inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM); - leaq 3*(2*4*4)(%rdx), %rax; - inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM); - - movq %rsi, %r11; + inpack_blocks(RL1, RR1, RTMP, RX, RKM); + inpack_blocks(RL2, RR2, RTMP, RX, RKM); + inpack_blocks(RL3, RR3, RTMP, RX, RKM); + inpack_blocks(RL4, RR4, RTMP, RX, RKM); round(RL, RR, 0, 1); round(RR, RL, 1, 2); @@ -268,52 +263,48 @@ __cast5_enc_blk_16way: movzbl rr(CTX), %eax; testl %eax, %eax; - jnz __skip_enc; + jnz .L__skip_enc; round(RL, RR, 12, 1); round(RR, RL, 13, 2); round(RL, RR, 14, 3); round(RR, RL, 15, 1); -__skip_enc: - popq %rcx; +.L__skip_enc: popq %rbx; popq %rbp; vmovdqa .Lbswap_mask, RKM; - leaq 1*(2*4*4)(%r11), %rax; - - testb %cl, %cl; - jnz __enc_xor16; - - outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM); - outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM); - leaq 2*(2*4*4)(%r11), %rax; - outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM); - leaq 3*(2*4*4)(%r11), %rax; - outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM); - - ret; -__enc_xor16: - outunpack_xor_blocks(%r11, RR1, RL1, RTMP, RX, RKM); - outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM); - leaq 2*(2*4*4)(%r11), %rax; - outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM); - leaq 3*(2*4*4)(%r11), %rax; - outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM); + outunpack_blocks(RR1, RL1, RTMP, RX, RKM); + outunpack_blocks(RR2, RL2, RTMP, RX, RKM); + outunpack_blocks(RR3, RL3, RTMP, RX, RKM); + outunpack_blocks(RR4, RL4, RTMP, RX, RKM); ret; +ENDPROC(__cast5_enc_blk16) .align 16 -.global cast5_dec_blk_16way -.type cast5_dec_blk_16way,@function; - -cast5_dec_blk_16way: +__cast5_dec_blk16: /* input: * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src + * RL1: encrypted blocks 1 and 2 + * RR1: encrypted blocks 3 and 4 + * RL2: encrypted blocks 5 and 6 + * RR2: encrypted blocks 7 and 8 + * RL3: encrypted blocks 9 and 10 + * RR3: encrypted blocks 11 and 12 + * RL4: encrypted blocks 13 and 14 + * RR4: encrypted blocks 15 and 16 + * output: + * RL1: decrypted blocks 1 and 2 + * RR1: decrypted blocks 3 and 4 + * RL2: decrypted blocks 5 and 6 + * RR2: decrypted blocks 7 and 8 + * RL3: decrypted blocks 9 and 10 + * RR3: decrypted blocks 11 and 12 + * RL4: decrypted blocks 13 and 14 + * RR4: decrypted blocks 15 and 16 */ pushq %rbp; @@ -324,26 +315,21 @@ cast5_dec_blk_16way: vmovd .L32_mask, R32; dec_preload_rkr(); - leaq 1*(2*4*4)(%rdx), %rax; - inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM); - inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM); - leaq 2*(2*4*4)(%rdx), %rax; - inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM); - leaq 3*(2*4*4)(%rdx), %rax; - inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM); - - movq %rsi, %r11; + inpack_blocks(RL1, RR1, RTMP, RX, RKM); + inpack_blocks(RL2, RR2, RTMP, RX, RKM); + inpack_blocks(RL3, RR3, RTMP, RX, RKM); + inpack_blocks(RL4, RR4, RTMP, RX, RKM); movzbl rr(CTX), %eax; testl %eax, %eax; - jnz __skip_dec; + jnz .L__skip_dec; round(RL, RR, 15, 1); round(RR, RL, 14, 3); round(RL, RR, 13, 2); round(RR, RL, 12, 1); -__dec_tail: +.L__dec_tail: round(RL, RR, 11, 3); round(RR, RL, 10, 2); round(RL, RR, 9, 1); @@ -361,16 +347,200 @@ __dec_tail: popq %rbx; popq %rbp; - leaq 1*(2*4*4)(%r11), %rax; - outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM); - outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM); - leaq 2*(2*4*4)(%r11), %rax; - outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM); - leaq 3*(2*4*4)(%r11), %rax; - outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM); + outunpack_blocks(RR1, RL1, RTMP, RX, RKM); + outunpack_blocks(RR2, RL2, RTMP, RX, RKM); + outunpack_blocks(RR3, RL3, RTMP, RX, RKM); + outunpack_blocks(RR4, RL4, RTMP, RX, RKM); ret; -__skip_dec: +.L__skip_dec: vpsrldq $4, RKR, RKR; - jmp __dec_tail; + jmp .L__dec_tail; +ENDPROC(__cast5_dec_blk16) + +ENTRY(cast5_ecb_enc_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + movq %rsi, %r11; + + vmovdqu (0*4*4)(%rdx), RL1; + vmovdqu (1*4*4)(%rdx), RR1; + vmovdqu (2*4*4)(%rdx), RL2; + vmovdqu (3*4*4)(%rdx), RR2; + vmovdqu (4*4*4)(%rdx), RL3; + vmovdqu (5*4*4)(%rdx), RR3; + vmovdqu (6*4*4)(%rdx), RL4; + vmovdqu (7*4*4)(%rdx), RR4; + + call __cast5_enc_blk16; + + vmovdqu RR1, (0*4*4)(%r11); + vmovdqu RL1, (1*4*4)(%r11); + vmovdqu RR2, (2*4*4)(%r11); + vmovdqu RL2, (3*4*4)(%r11); + vmovdqu RR3, (4*4*4)(%r11); + vmovdqu RL3, (5*4*4)(%r11); + vmovdqu RR4, (6*4*4)(%r11); + vmovdqu RL4, (7*4*4)(%r11); + + ret; +ENDPROC(cast5_ecb_enc_16way) + +ENTRY(cast5_ecb_dec_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + movq %rsi, %r11; + + vmovdqu (0*4*4)(%rdx), RL1; + vmovdqu (1*4*4)(%rdx), RR1; + vmovdqu (2*4*4)(%rdx), RL2; + vmovdqu (3*4*4)(%rdx), RR2; + vmovdqu (4*4*4)(%rdx), RL3; + vmovdqu (5*4*4)(%rdx), RR3; + vmovdqu (6*4*4)(%rdx), RL4; + vmovdqu (7*4*4)(%rdx), RR4; + + call __cast5_dec_blk16; + + vmovdqu RR1, (0*4*4)(%r11); + vmovdqu RL1, (1*4*4)(%r11); + vmovdqu RR2, (2*4*4)(%r11); + vmovdqu RL2, (3*4*4)(%r11); + vmovdqu RR3, (4*4*4)(%r11); + vmovdqu RL3, (5*4*4)(%r11); + vmovdqu RR4, (6*4*4)(%r11); + vmovdqu RL4, (7*4*4)(%r11); + + ret; +ENDPROC(cast5_ecb_dec_16way) + +ENTRY(cast5_cbc_dec_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + pushq %r12; + + movq %rsi, %r11; + movq %rdx, %r12; + + vmovdqu (0*16)(%rdx), RL1; + vmovdqu (1*16)(%rdx), RR1; + vmovdqu (2*16)(%rdx), RL2; + vmovdqu (3*16)(%rdx), RR2; + vmovdqu (4*16)(%rdx), RL3; + vmovdqu (5*16)(%rdx), RR3; + vmovdqu (6*16)(%rdx), RL4; + vmovdqu (7*16)(%rdx), RR4; + + call __cast5_dec_blk16; + + /* xor with src */ + vmovq (%r12), RX; + vpshufd $0x4f, RX, RX; + vpxor RX, RR1, RR1; + vpxor 0*16+8(%r12), RL1, RL1; + vpxor 1*16+8(%r12), RR2, RR2; + vpxor 2*16+8(%r12), RL2, RL2; + vpxor 3*16+8(%r12), RR3, RR3; + vpxor 4*16+8(%r12), RL3, RL3; + vpxor 5*16+8(%r12), RR4, RR4; + vpxor 6*16+8(%r12), RL4, RL4; + + vmovdqu RR1, (0*16)(%r11); + vmovdqu RL1, (1*16)(%r11); + vmovdqu RR2, (2*16)(%r11); + vmovdqu RL2, (3*16)(%r11); + vmovdqu RR3, (4*16)(%r11); + vmovdqu RL3, (5*16)(%r11); + vmovdqu RR4, (6*16)(%r11); + vmovdqu RL4, (7*16)(%r11); + + popq %r12; + + ret; +ENDPROC(cast5_cbc_dec_16way) + +ENTRY(cast5_ctr_16way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: iv (big endian, 64bit) + */ + + pushq %r12; + + movq %rsi, %r11; + movq %rdx, %r12; + + vpcmpeqd RTMP, RTMP, RTMP; + vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */ + + vpcmpeqd RKR, RKR, RKR; + vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */ + vmovdqa .Lbswap_iv_mask, R1ST; + vmovdqa .Lbswap128_mask, RKM; + + /* load IV and byteswap */ + vmovq (%rcx), RX; + vpshufb R1ST, RX, RX; + + /* construct IVs */ + vpsubq RTMP, RX, RX; /* le: IV1, IV0 */ + vpshufb RKM, RX, RL1; /* be: IV0, IV1 */ + vpsubq RKR, RX, RX; + vpshufb RKM, RX, RR1; /* be: IV2, IV3 */ + vpsubq RKR, RX, RX; + vpshufb RKM, RX, RL2; /* be: IV4, IV5 */ + vpsubq RKR, RX, RX; + vpshufb RKM, RX, RR2; /* be: IV6, IV7 */ + vpsubq RKR, RX, RX; + vpshufb RKM, RX, RL3; /* be: IV8, IV9 */ + vpsubq RKR, RX, RX; + vpshufb RKM, RX, RR3; /* be: IV10, IV11 */ + vpsubq RKR, RX, RX; + vpshufb RKM, RX, RL4; /* be: IV12, IV13 */ + vpsubq RKR, RX, RX; + vpshufb RKM, RX, RR4; /* be: IV14, IV15 */ + + /* store last IV */ + vpsubq RTMP, RX, RX; /* le: IV16, IV14 */ + vpshufb R1ST, RX, RX; /* be: IV16, IV16 */ + vmovq RX, (%rcx); + + call __cast5_enc_blk16; + + /* dst = src ^ iv */ + vpxor (0*16)(%r12), RR1, RR1; + vpxor (1*16)(%r12), RL1, RL1; + vpxor (2*16)(%r12), RR2, RR2; + vpxor (3*16)(%r12), RL2, RL2; + vpxor (4*16)(%r12), RR3, RR3; + vpxor (5*16)(%r12), RL3, RL3; + vpxor (6*16)(%r12), RR4, RR4; + vpxor (7*16)(%r12), RL4, RL4; + vmovdqu RR1, (0*16)(%r11); + vmovdqu RL1, (1*16)(%r11); + vmovdqu RR2, (2*16)(%r11); + vmovdqu RL2, (3*16)(%r11); + vmovdqu RR3, (4*16)(%r11); + vmovdqu RL3, (5*16)(%r11); + vmovdqu RR4, (6*16)(%r11); + vmovdqu RL4, (7*16)(%r11); + + popq %r12; + + ret; +ENDPROC(cast5_ctr_16way) diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index e0ea14f9547..c6631813dc1 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -37,29 +37,14 @@ #define CAST5_PARALLEL_BLOCKS 16 -asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst, +asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src); - -static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst, - const u8 *src) -{ - __cast5_enc_blk_16way(ctx, dst, src, false); -} - -static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst, - const u8 *src) -{ - __cast5_enc_blk_16way(ctx, dst, src, true); -} - -static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst, - const u8 *src) -{ - cast5_dec_blk_16way(ctx, dst, src); -} - +asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src, + __be64 *iv); static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes) { @@ -79,8 +64,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); const unsigned int bsize = CAST5_BLOCK_SIZE; unsigned int nbytes; + void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src); int err; + fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way; + err = blkcipher_walk_virt(desc, walk); desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; @@ -93,10 +81,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, /* Process multi-block batch */ if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { do { - if (enc) - cast5_enc_blk_xway(ctx, wdst, wsrc); - else - cast5_dec_blk_xway(ctx, wdst, wsrc); + fn(ctx, wdst, wsrc); wsrc += bsize * CAST5_PARALLEL_BLOCKS; wdst += bsize * CAST5_PARALLEL_BLOCKS; @@ -107,12 +92,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, goto done; } + fn = (enc) ? __cast5_encrypt : __cast5_decrypt; + /* Handle leftovers */ do { - if (enc) - __cast5_encrypt(ctx, wdst, wsrc); - else - __cast5_decrypt(ctx, wdst, wsrc); + fn(ctx, wdst, wsrc); wsrc += bsize; wdst += bsize; @@ -194,9 +178,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, unsigned int nbytes = walk->nbytes; u64 *src = (u64 *)walk->src.virt.addr; u64 *dst = (u64 *)walk->dst.virt.addr; - u64 ivs[CAST5_PARALLEL_BLOCKS - 1]; u64 last_iv; - int i; /* Start of the last block. */ src += nbytes / bsize - 1; @@ -211,13 +193,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, src -= CAST5_PARALLEL_BLOCKS - 1; dst -= CAST5_PARALLEL_BLOCKS - 1; - for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++) - ivs[i] = src[i]; - - cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - - for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++) - *(dst + (i + 1)) ^= *(ivs + i); + cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src); nbytes -= bsize; if (nbytes < bsize) @@ -298,23 +274,12 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc, unsigned int nbytes = walk->nbytes; u64 *src = (u64 *)walk->src.virt.addr; u64 *dst = (u64 *)walk->dst.virt.addr; - u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); - __be64 ctrblocks[CAST5_PARALLEL_BLOCKS]; - int i; /* Process multi-block batch */ if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { do { - /* create ctrblks for parallel encrypt */ - for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) { - if (dst != src) - dst[i] = src[i]; - - ctrblocks[i] = cpu_to_be64(ctrblk++); - } - - cast5_enc_blk_xway_xor(ctx, (u8 *)dst, - (u8 *)ctrblocks); + cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src, + (__be64 *)walk->iv); src += CAST5_PARALLEL_BLOCKS; dst += CAST5_PARALLEL_BLOCKS; @@ -327,13 +292,16 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc, /* Handle leftovers */ do { + u64 ctrblk; + if (dst != src) *dst = *src; - ctrblocks[0] = cpu_to_be64(ctrblk++); + ctrblk = *(u64 *)walk->iv; + be64_add_cpu((__be64 *)walk->iv, 1); - __cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); - *dst ^= ctrblocks[0]; + __cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); + *dst ^= ctrblk; src += 1; dst += 1; @@ -341,7 +309,6 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc, } while (nbytes >= bsize); done: - *(__be64 *)walk->iv = cpu_to_be64(ctrblk); return nbytes; } diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 218d283772f..f93b6105a0c 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -23,22 +23,25 @@ * */ +#include <linux/linkage.h> +#include "glue_helper-asm-avx.S" + .file "cast6-avx-x86_64-asm_64.S" -.extern cast6_s1 -.extern cast6_s2 -.extern cast6_s3 -.extern cast6_s4 +.extern cast_s1 +.extern cast_s2 +.extern cast_s3 +.extern cast_s4 /* structure of crypto context */ #define km 0 #define kr (12*4*4) /* s-boxes */ -#define s1 cast6_s1 -#define s2 cast6_s2 -#define s3 cast6_s3 -#define s4 cast6_s4 +#define s1 cast_s1 +#define s2 cast_s2 +#define s3 cast_s3 +#define s4 cast_s4 /********************************************************************** 8-way AVX cast6 @@ -205,11 +208,7 @@ vpunpcklqdq x3, t2, x2; \ vpunpckhqdq x3, t2, x3; -#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \ - vmovdqu (0*4*4)(in), x0; \ - vmovdqu (1*4*4)(in), x1; \ - vmovdqu (2*4*4)(in), x2; \ - vmovdqu (3*4*4)(in), x3; \ +#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \ vpshufb rmask, x0, x0; \ vpshufb rmask, x1, x1; \ vpshufb rmask, x2, x2; \ @@ -217,39 +216,21 @@ \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) -#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \ - transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ - \ - vpshufb rmask, x0, x0; \ - vpshufb rmask, x1, x1; \ - vpshufb rmask, x2, x2; \ - vpshufb rmask, x3, x3; \ - vmovdqu x0, (0*4*4)(out); \ - vmovdqu x1, (1*4*4)(out); \ - vmovdqu x2, (2*4*4)(out); \ - vmovdqu x3, (3*4*4)(out); - -#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \ +#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ \ vpshufb rmask, x0, x0; \ vpshufb rmask, x1, x1; \ vpshufb rmask, x2, x2; \ - vpshufb rmask, x3, x3; \ - vpxor (0*4*4)(out), x0, x0; \ - vmovdqu x0, (0*4*4)(out); \ - vpxor (1*4*4)(out), x1, x1; \ - vmovdqu x1, (1*4*4)(out); \ - vpxor (2*4*4)(out), x2, x2; \ - vmovdqu x2, (2*4*4)(out); \ - vpxor (3*4*4)(out), x3, x3; \ - vmovdqu x3, (3*4*4)(out); + vpshufb rmask, x3, x3; .data .align 16 .Lbswap_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 .Lrkr_enc_Q_Q_QBAR_QBAR: .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12 .Lrkr_enc_QBAR_QBAR_QBAR_QBAR: @@ -269,31 +250,24 @@ .text -.align 16 -.global __cast6_enc_blk_8way -.type __cast6_enc_blk_8way,@function; - -__cast6_enc_blk_8way: +.align 8 +__cast6_enc_blk8: /* input: * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: bool, if true: xor output + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks + * output: + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks */ pushq %rbp; pushq %rbx; - pushq %rcx; vmovdqa .Lbswap_mask, RKM; vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; - leaq (4*4*4)(%rdx), %rax; - inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); - inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); - - movq %rsi, %r11; + inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); + inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); preload_rkr(0, dummy, none); Q(0); @@ -311,36 +285,24 @@ __cast6_enc_blk_8way: QBAR(10); QBAR(11); - popq %rcx; popq %rbx; popq %rbp; vmovdqa .Lbswap_mask, RKM; - leaq (4*4*4)(%r11), %rax; - - testb %cl, %cl; - jnz __enc_xor8; - outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); - outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); + outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); + outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); ret; +ENDPROC(__cast6_enc_blk8) -__enc_xor8: - outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); - outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); - - ret; - -.align 16 -.global cast6_dec_blk_8way -.type cast6_dec_blk_8way,@function; - -cast6_dec_blk_8way: +.align 8 +__cast6_dec_blk8: /* input: * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks + * output: + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks */ pushq %rbp; @@ -350,11 +312,8 @@ cast6_dec_blk_8way: vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; - leaq (4*4*4)(%rdx), %rax; - inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); - inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); - - movq %rsi, %r11; + inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); + inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q); Q(11); @@ -376,8 +335,92 @@ cast6_dec_blk_8way: popq %rbp; vmovdqa .Lbswap_mask, RKM; - leaq (4*4*4)(%r11), %rax; - outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); - outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); + outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); + outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); + + ret; +ENDPROC(__cast6_dec_blk8) + +ENTRY(cast6_ecb_enc_8way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + movq %rsi, %r11; + + load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + call __cast6_enc_blk8; + + store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + ret; +ENDPROC(cast6_ecb_enc_8way) + +ENTRY(cast6_ecb_dec_8way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + movq %rsi, %r11; + + load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + call __cast6_dec_blk8; + + store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + ret; +ENDPROC(cast6_ecb_dec_8way) + +ENTRY(cast6_cbc_dec_8way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + pushq %r12; + + movq %rsi, %r11; + movq %rdx, %r12; + + load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + call __cast6_dec_blk8; + + store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + popq %r12; + + ret; +ENDPROC(cast6_cbc_dec_8way) + +ENTRY(cast6_ctr_8way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: iv (little endian, 128bit) + */ + + pushq %r12; + + movq %rsi, %r11; + movq %rdx, %r12; + + load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, + RD2, RX, RKR, RKM); + + call __cast6_enc_blk8; + + store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + popq %r12; ret; +ENDPROC(cast6_ctr_8way) diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 15e5f85a501..92f7ca24790 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -40,79 +40,34 @@ #define CAST6_PARALLEL_BLOCKS 8 -asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst, +asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src); -static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst, - const u8 *src) -{ - __cast6_enc_blk_8way(ctx, dst, src, false); -} - -static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst, - const u8 *src) -{ - __cast6_enc_blk_8way(ctx, dst, src, true); -} - -static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst, - const u8 *src) -{ - cast6_dec_blk_8way(ctx, dst, src); -} - - -static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) -{ - u128 ivs[CAST6_PARALLEL_BLOCKS - 1]; - unsigned int j; - - for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++) - ivs[j] = src[j]; - - cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - - for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++) - u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); -} +asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src, + le128 *iv); -static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) { be128 ctrblk; - u128_to_be128(&ctrblk, iv); - u128_inc(iv); + le128_to_be128(&ctrblk, iv); + le128_inc(iv); __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); u128_xor(dst, src, (u128 *)&ctrblk); } -static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, - u128 *iv) -{ - be128 ctrblks[CAST6_PARALLEL_BLOCKS]; - unsigned int i; - - for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) { - if (dst != src) - dst[i] = src[i]; - - u128_to_be128(&ctrblks[i], iv); - u128_inc(iv); - } - - cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); -} - static const struct common_glue_ctx cast6_enc = { .num_funcs = 2, .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) } + .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) } }, { .num_blocks = 1, .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) } @@ -125,7 +80,7 @@ static const struct common_glue_ctx cast6_ctr = { .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) } + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) } }, { .num_blocks = 1, .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) } @@ -138,7 +93,7 @@ static const struct common_glue_ctx cast6_dec = { .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) } + .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) } }, { .num_blocks = 1, .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) } @@ -151,7 +106,7 @@ static const struct common_glue_ctx cast6_dec_cbc = { .funcs = { { .num_blocks = CAST6_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) } + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) } }, { .num_blocks = 1, .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) } @@ -215,7 +170,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { - cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst); + cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst); return; } @@ -232,7 +187,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { - cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst); + cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst); return; } diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S new file mode 100644 index 00000000000..c8335014a04 --- /dev/null +++ b/arch/x86/crypto/crc32-pclmul_asm.S @@ -0,0 +1,246 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 + * calculation. + * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) + * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found + * at: + * http://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2B: Instruction Set Reference, N-Z + * + * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com> + * Alexander Boyko <Alexander_Boyko@xyratex.com> + */ + +#include <linux/linkage.h> +#include <asm/inst.h> + + +.align 16 +/* + * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 + * #define CONSTANT_R1 0x154442bd4LL + * + * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 + * #define CONSTANT_R2 0x1c6e41596LL + */ +.Lconstant_R2R1: + .octa 0x00000001c6e415960000000154442bd4 +/* + * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 + * #define CONSTANT_R3 0x1751997d0LL + * + * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e + * #define CONSTANT_R4 0x0ccaa009eLL + */ +.Lconstant_R4R3: + .octa 0x00000000ccaa009e00000001751997d0 +/* + * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 + * #define CONSTANT_R5 0x163cd6124LL + */ +.Lconstant_R5: + .octa 0x00000000000000000000000163cd6124 +.Lconstant_mask32: + .octa 0x000000000000000000000000FFFFFFFF +/* + * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL + * + * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL + * #define CONSTANT_RU 0x1F7011641LL + */ +.Lconstant_RUpoly: + .octa 0x00000001F701164100000001DB710641 + +#define CONSTANT %xmm0 + +#ifdef __x86_64__ +#define BUF %rdi +#define LEN %rsi +#define CRC %edx +#else +#define BUF %eax +#define LEN %edx +#define CRC %ecx +#endif + + + +.text +/** + * Calculate crc32 + * BUF - buffer (16 bytes aligned) + * LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63 + * CRC - initial crc32 + * return %eax crc32 + * uint crc32_pclmul_le_16(unsigned char const *buffer, + * size_t len, uint crc32) + */ +.globl crc32_pclmul_le_16 +.align 4, 0x90 +crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */ + movdqa (BUF), %xmm1 + movdqa 0x10(BUF), %xmm2 + movdqa 0x20(BUF), %xmm3 + movdqa 0x30(BUF), %xmm4 + movd CRC, CONSTANT + pxor CONSTANT, %xmm1 + sub $0x40, LEN + add $0x40, BUF +#ifndef __x86_64__ + /* This is for position independent code(-fPIC) support for 32bit */ + call delta +delta: + pop %ecx +#endif + cmp $0x40, LEN + jb less_64 + +#ifdef __x86_64__ + movdqa .Lconstant_R2R1(%rip), CONSTANT +#else + movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT +#endif + +loop_64:/* 64 bytes Full cache line folding */ + prefetchnta 0x40(BUF) + movdqa %xmm1, %xmm5 + movdqa %xmm2, %xmm6 + movdqa %xmm3, %xmm7 +#ifdef __x86_64__ + movdqa %xmm4, %xmm8 +#endif + PCLMULQDQ 00, CONSTANT, %xmm1 + PCLMULQDQ 00, CONSTANT, %xmm2 + PCLMULQDQ 00, CONSTANT, %xmm3 +#ifdef __x86_64__ + PCLMULQDQ 00, CONSTANT, %xmm4 +#endif + PCLMULQDQ 0x11, CONSTANT, %xmm5 + PCLMULQDQ 0x11, CONSTANT, %xmm6 + PCLMULQDQ 0x11, CONSTANT, %xmm7 +#ifdef __x86_64__ + PCLMULQDQ 0x11, CONSTANT, %xmm8 +#endif + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 +#ifdef __x86_64__ + pxor %xmm8, %xmm4 +#else + /* xmm8 unsupported for x32 */ + movdqa %xmm4, %xmm5 + PCLMULQDQ 00, CONSTANT, %xmm4 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm4 +#endif + + pxor (BUF), %xmm1 + pxor 0x10(BUF), %xmm2 + pxor 0x20(BUF), %xmm3 + pxor 0x30(BUF), %xmm4 + + sub $0x40, LEN + add $0x40, BUF + cmp $0x40, LEN + jge loop_64 +less_64:/* Folding cache line into 128bit */ +#ifdef __x86_64__ + movdqa .Lconstant_R4R3(%rip), CONSTANT +#else + movdqa .Lconstant_R4R3 - delta(%ecx), CONSTANT +#endif + prefetchnta (BUF) + + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm2, %xmm1 + + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm3, %xmm1 + + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm4, %xmm1 + + cmp $0x10, LEN + jb fold_64 +loop_16:/* Folding rest buffer into 128bit */ + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor (BUF), %xmm1 + sub $0x10, LEN + add $0x10, BUF + cmp $0x10, LEN + jge loop_16 + +fold_64: + /* perform the last 64 bit fold, also adds 32 zeroes + * to the input stream */ + PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ + psrldq $0x08, %xmm1 + pxor CONSTANT, %xmm1 + + /* final 32-bit fold */ + movdqa %xmm1, %xmm2 +#ifdef __x86_64__ + movdqa .Lconstant_R5(%rip), CONSTANT + movdqa .Lconstant_mask32(%rip), %xmm3 +#else + movdqa .Lconstant_R5 - delta(%ecx), CONSTANT + movdqa .Lconstant_mask32 - delta(%ecx), %xmm3 +#endif + psrldq $0x04, %xmm2 + pand %xmm3, %xmm1 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + pxor %xmm2, %xmm1 + + /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ +#ifdef __x86_64__ + movdqa .Lconstant_RUpoly(%rip), CONSTANT +#else + movdqa .Lconstant_RUpoly - delta(%ecx), CONSTANT +#endif + movdqa %xmm1, %xmm2 + pand %xmm3, %xmm1 + PCLMULQDQ 0x10, CONSTANT, %xmm1 + pand %xmm3, %xmm1 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + pxor %xmm2, %xmm1 + pextrd $0x01, %xmm1, %eax + + ret diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c new file mode 100644 index 00000000000..9d014a74ef9 --- /dev/null +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -0,0 +1,201 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Wrappers for kernel crypto shash api to pclmulqdq crc32 imlementation. + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/crc32.h> +#include <crypto/internal/hash.h> + +#include <asm/cpufeature.h> +#include <asm/cpu_device_id.h> +#include <asm/i387.h> + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +#define PCLMUL_MIN_LEN 64L /* minimum size of buffer + * for crc32_pclmul_le_16 */ +#define SCALE_F 16L /* size of xmm register */ +#define SCALE_F_MASK (SCALE_F - 1) + +u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32); + +static u32 __attribute__((pure)) + crc32_pclmul_le(u32 crc, unsigned char const *p, size_t len) +{ + unsigned int iquotient; + unsigned int iremainder; + unsigned int prealign; + + if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !irq_fpu_usable()) + return crc32_le(crc, p, len); + + if ((long)p & SCALE_F_MASK) { + /* align p to 16 byte */ + prealign = SCALE_F - ((long)p & SCALE_F_MASK); + + crc = crc32_le(crc, p, prealign); + len -= prealign; + p = (unsigned char *)(((unsigned long)p + SCALE_F_MASK) & + ~SCALE_F_MASK); + } + iquotient = len & (~SCALE_F_MASK); + iremainder = len & SCALE_F_MASK; + + kernel_fpu_begin(); + crc = crc32_pclmul_le_16(p, iquotient, crc); + kernel_fpu_end(); + + if (iremainder) + crc = crc32_le(crc, p + iquotient, iremainder); + + return crc; +} + +static int crc32_pclmul_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 0; + + return 0; +} + +static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32_pclmul_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); + + *crcp = *mctx; + + return 0; +} + +static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + *crcp = crc32_pclmul_le(*crcp, data, len); + return 0; +} + +/* No final XOR 0xFFFFFFFF, like crc32_le */ +static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + *(__le32 *)out = cpu_to_le32(crc32_pclmul_le(*crcp, data, len)); + return 0; +} + +static int crc32_pclmul_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_pclmul_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32_pclmul_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); + + *(__le32 *)out = cpu_to_le32p(crcp); + return 0; +} + +static int crc32_pclmul_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} + +static struct shash_alg alg = { + .setkey = crc32_pclmul_setkey, + .init = crc32_pclmul_init, + .update = crc32_pclmul_update, + .final = crc32_pclmul_final, + .finup = crc32_pclmul_finup, + .digest = crc32_pclmul_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32", + .cra_driver_name = "crc32-pclmul", + .cra_priority = 200, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = crc32_pclmul_cra_init, + } +}; + +static const struct x86_cpu_id crc32pclmul_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, crc32pclmul_cpu_id); + + +static int __init crc32_pclmul_mod_init(void) +{ + + if (!x86_match_cpu(crc32pclmul_cpu_id)) { + pr_info("PCLMULQDQ-NI instructions are not detected.\n"); + return -ENODEV; + } + return crypto_register_shash(&alg); +} + +static void __exit crc32_pclmul_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(crc32_pclmul_mod_init); +module_exit(crc32_pclmul_mod_fini); + +MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>"); +MODULE_LICENSE("GPL"); + +MODULE_ALIAS("crc32"); +MODULE_ALIAS("crc32-pclmul"); diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel_glue.c index 493f959261f..6812ad98355 100644 --- a/arch/x86/crypto/crc32c-intel.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -32,6 +32,8 @@ #include <asm/cpufeature.h> #include <asm/cpu_device_id.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h> #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -44,6 +46,31 @@ #define REX_PRE #endif +#ifdef CONFIG_X86_64 +/* + * use carryless multiply version of crc32c when buffer + * size is >= 512 (when eager fpu is enabled) or + * >= 1024 (when eager fpu is disabled) to account + * for fpu state save/restore overhead. + */ +#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512 +#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024 + +asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, + unsigned int crc_init); +static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU; +#if defined(X86_FEATURE_EAGER_FPU) +#define set_pcl_breakeven_point() \ +do { \ + if (!use_eager_fpu()) \ + crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \ +} while (0) +#else +#define set_pcl_breakeven_point() \ + (crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU) +#endif +#endif /* CONFIG_X86_64 */ + static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) { while (length--) { @@ -154,6 +181,52 @@ static int crc32c_intel_cra_init(struct crypto_tfm *tfm) return 0; } +#ifdef CONFIG_X86_64 +static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + /* + * use faster PCL version if datasize is large enough to + * overcome kernel fpu state save/restore overhead + */ + if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + kernel_fpu_begin(); + *crcp = crc_pcl(data, len, *crcp); + kernel_fpu_end(); + } else + *crcp = crc32c_intel_le_hw(*crcp, data, len); + return 0; +} + +static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + kernel_fpu_begin(); + *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); + kernel_fpu_end(); + } else + *(__le32 *)out = + ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len)); + return 0; +} + +static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} +#endif /* CONFIG_X86_64 */ + static struct shash_alg alg = { .setkey = crc32c_intel_setkey, .init = crc32c_intel_init, @@ -184,6 +257,14 @@ static int __init crc32c_intel_mod_init(void) { if (!x86_match_cpu(crc32c_cpu_id)) return -ENODEV; +#ifdef CONFIG_X86_64 + if (cpu_has_pclmulqdq) { + alg.update = crc32c_pcl_intel_update; + alg.finup = crc32c_pcl_intel_finup; + alg.digest = crc32c_pcl_intel_digest; + set_pcl_breakeven_point(); + } +#endif return crypto_register_shash(&alg); } diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S new file mode 100644 index 00000000000..cf1a7ec4cc3 --- /dev/null +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -0,0 +1,464 @@ +/* + * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) + * + * The white paper on CRC32C calculations with PCLMULQDQ instruction can be + * downloaded from: + * http://download.intel.com/design/intarch/papers/323405.pdf + * + * Copyright (C) 2012 Intel Corporation. + * + * Authors: + * Wajdi Feghali <wajdi.k.feghali@intel.com> + * James Guilford <james.guilford@intel.com> + * David Cote <david.m.cote@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/linkage.h> + +## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +.macro LABEL prefix n +\prefix\n\(): +.endm + +.macro JMPTBL_ENTRY i +.word crc_\i - crc_array +.endm + +.macro JNC_LESS_THAN j + jnc less_than_\j +.endm + +# Define threshold where buffers are considered "small" and routed to more +# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so +# SMALL_SIZE can be no larger than 255. + +#define SMALL_SIZE 200 + +.if (SMALL_SIZE > 255) +.error "SMALL_ SIZE must be < 256" +.endif + +# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); + +ENTRY(crc_pcl) +#define bufp %rdi +#define bufp_dw %edi +#define bufp_w %di +#define bufp_b %dil +#define bufptmp %rcx +#define block_0 %rcx +#define block_1 %rdx +#define block_2 %r11 +#define len %rsi +#define len_dw %esi +#define len_w %si +#define len_b %sil +#define crc_init_arg %rdx +#define tmp %rbx +#define crc_init %r8 +#define crc_init_dw %r8d +#define crc1 %r9 +#define crc2 %r10 + + pushq %rbx + pushq %rdi + pushq %rsi + + ## Move crc_init for Linux to a different + mov crc_init_arg, crc_init + + ################################################################ + ## 1) ALIGN: + ################################################################ + + mov bufp, bufptmp # rdi = *buf + neg bufp + and $7, bufp # calculate the unalignment amount of + # the address + je proc_block # Skip if aligned + + ## If len is less than 8 and we're unaligned, we need to jump + ## to special code to avoid reading beyond the end of the buffer + cmp $8, len + jae do_align + # less_than_8 expects length in upper 3 bits of len_dw + # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] + shl $32-3+1, len_dw + jmp less_than_8_post_shl1 + +do_align: + #### Calculate CRC of unaligned bytes of the buffer (if any) + movq (bufptmp), tmp # load a quadward from the buffer + add bufp, bufptmp # align buffer pointer for quadword + # processing + sub bufp, len # update buffer length +align_loop: + crc32b %bl, crc_init_dw # compute crc32 of 1-byte + shr $8, tmp # get next byte + dec bufp + jne align_loop + +proc_block: + + ################################################################ + ## 2) PROCESS BLOCKS: + ################################################################ + + ## compute num of bytes to be processed + movq len, tmp # save num bytes in tmp + + cmpq $128*24, len + jae full_block + +continue_block: + cmpq $SMALL_SIZE, len + jb small + + ## len < 128*24 + movq $2731, %rax # 2731 = ceil(2^16 / 24) + mul len_dw + shrq $16, %rax + + ## eax contains floor(bytes / 24) = num 24-byte chunks to do + + ## process rax 24-byte chunks (128 >= rax >= 0) + + ## compute end address of each block + ## block 0 (base addr + RAX * 8) + ## block 1 (base addr + RAX * 16) + ## block 2 (base addr + RAX * 24) + lea (bufptmp, %rax, 8), block_0 + lea (block_0, %rax, 8), block_1 + lea (block_1, %rax, 8), block_2 + + xor crc1, crc1 + xor crc2, crc2 + + ## branch into array + lea jump_table(%rip), bufp + movzxw (bufp, %rax, 2), len + offset=crc_array-jump_table + lea offset(bufp, len, 1), bufp + jmp *bufp + + ################################################################ + ## 2a) PROCESS FULL BLOCKS: + ################################################################ +full_block: + movq $128,%rax + lea 128*8*2(block_0), block_1 + lea 128*8*3(block_0), block_2 + add $128*8*1, block_0 + + xor crc1,crc1 + xor crc2,crc2 + + # Fall thruogh into top of crc array (crc_128) + + ################################################################ + ## 3) CRC Array: + ################################################################ + +crc_array: + i=128 +.rept 128-1 +.altmacro +LABEL crc_ %i +.noaltmacro + crc32q -i*8(block_0), crc_init + crc32q -i*8(block_1), crc1 + crc32q -i*8(block_2), crc2 + i=(i-1) +.endr + +.altmacro +LABEL crc_ %i +.noaltmacro + crc32q -i*8(block_0), crc_init + crc32q -i*8(block_1), crc1 +# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet + + mov block_2, block_0 + + ################################################################ + ## 4) Combine three results: + ################################################################ + + lea (K_table-16)(%rip), bufp # first entry is for idx 1 + shlq $3, %rax # rax *= 8 + subq %rax, tmp # tmp -= rax*8 + shlq $1, %rax + subq %rax, tmp # tmp -= rax*16 + # (total tmp -= rax*24) + addq %rax, bufp + + movdqa (bufp), %xmm0 # 2 consts: K1:K2 + + movq crc_init, %xmm1 # CRC for block 1 + pclmulqdq $0x00,%xmm0,%xmm1 # Multiply by K2 + + movq crc1, %xmm2 # CRC for block 2 + pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 + + pxor %xmm2,%xmm1 + movq %xmm1, %rax + xor -i*8(block_2), %rax + mov crc2, crc_init + crc32 %rax, crc_init + +################################################################ +## 5) Check for end: +################################################################ + +LABEL crc_ 0 + mov tmp, len + cmp $128*24, tmp + jae full_block + cmp $24, tmp + jae continue_block + +less_than_24: + shl $32-4, len_dw # less_than_16 expects length + # in upper 4 bits of len_dw + jnc less_than_16 + crc32q (bufptmp), crc_init + crc32q 8(bufptmp), crc_init + jz do_return + add $16, bufptmp + # len is less than 8 if we got here + # less_than_8 expects length in upper 3 bits of len_dw + # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] + shl $2, len_dw + jmp less_than_8_post_shl1 + + ####################################################################### + ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) + ####################################################################### +small: + shl $32-8, len_dw # Prepare len_dw for less_than_256 + j=256 +.rept 5 # j = {256, 128, 64, 32, 16} +.altmacro +LABEL less_than_ %j # less_than_j: Length should be in + # upper lg(j) bits of len_dw + j=(j/2) + shl $1, len_dw # Get next MSB + JNC_LESS_THAN %j +.noaltmacro + i=0 +.rept (j/8) + crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data + i=i+8 +.endr + jz do_return # Return if remaining length is zero + add $j, bufptmp # Advance buf +.endr + +less_than_8: # Length should be stored in + # upper 3 bits of len_dw + shl $1, len_dw +less_than_8_post_shl1: + jnc less_than_4 + crc32l (bufptmp), crc_init_dw # CRC of 4 bytes + jz do_return # return if remaining data is zero + add $4, bufptmp +less_than_4: # Length should be stored in + # upper 2 bits of len_dw + shl $1, len_dw + jnc less_than_2 + crc32w (bufptmp), crc_init_dw # CRC of 2 bytes + jz do_return # return if remaining data is zero + add $2, bufptmp +less_than_2: # Length should be stored in the MSB + # of len_dw + shl $1, len_dw + jnc less_than_1 + crc32b (bufptmp), crc_init_dw # CRC of 1 byte +less_than_1: # Length should be zero +do_return: + movq crc_init, %rax + popq %rsi + popq %rdi + popq %rbx + ret + + ################################################################ + ## jump table Table is 129 entries x 2 bytes each + ################################################################ +.align 4 +jump_table: + i=0 +.rept 129 +.altmacro +JMPTBL_ENTRY %i +.noaltmacro + i=i+1 +.endr + +ENDPROC(crc_pcl) + + ################################################################ + ## PCLMULQDQ tables + ## Table is 128 entries x 2 quad words each + ################################################################ +.data +.align 64 +K_table: + .quad 0x14cd00bd6,0x105ec76f0 + .quad 0x0ba4fc28e,0x14cd00bd6 + .quad 0x1d82c63da,0x0f20c0dfe + .quad 0x09e4addf8,0x0ba4fc28e + .quad 0x039d3b296,0x1384aa63a + .quad 0x102f9b8a2,0x1d82c63da + .quad 0x14237f5e6,0x01c291d04 + .quad 0x00d3b6092,0x09e4addf8 + .quad 0x0c96cfdc0,0x0740eef02 + .quad 0x18266e456,0x039d3b296 + .quad 0x0daece73e,0x0083a6eec + .quad 0x0ab7aff2a,0x102f9b8a2 + .quad 0x1248ea574,0x1c1733996 + .quad 0x083348832,0x14237f5e6 + .quad 0x12c743124,0x02ad91c30 + .quad 0x0b9e02b86,0x00d3b6092 + .quad 0x018b33a4e,0x06992cea2 + .quad 0x1b331e26a,0x0c96cfdc0 + .quad 0x17d35ba46,0x07e908048 + .quad 0x1bf2e8b8a,0x18266e456 + .quad 0x1a3e0968a,0x11ed1f9d8 + .quad 0x0ce7f39f4,0x0daece73e + .quad 0x061d82e56,0x0f1d0f55e + .quad 0x0d270f1a2,0x0ab7aff2a + .quad 0x1c3f5f66c,0x0a87ab8a8 + .quad 0x12ed0daac,0x1248ea574 + .quad 0x065863b64,0x08462d800 + .quad 0x11eef4f8e,0x083348832 + .quad 0x1ee54f54c,0x071d111a8 + .quad 0x0b3e32c28,0x12c743124 + .quad 0x0064f7f26,0x0ffd852c6 + .quad 0x0dd7e3b0c,0x0b9e02b86 + .quad 0x0f285651c,0x0dcb17aa4 + .quad 0x010746f3c,0x018b33a4e + .quad 0x1c24afea4,0x0f37c5aee + .quad 0x0271d9844,0x1b331e26a + .quad 0x08e766a0c,0x06051d5a2 + .quad 0x093a5f730,0x17d35ba46 + .quad 0x06cb08e5c,0x11d5ca20e + .quad 0x06b749fb2,0x1bf2e8b8a + .quad 0x1167f94f2,0x021f3d99c + .quad 0x0cec3662e,0x1a3e0968a + .quad 0x19329634a,0x08f158014 + .quad 0x0e6fc4e6a,0x0ce7f39f4 + .quad 0x08227bb8a,0x1a5e82106 + .quad 0x0b0cd4768,0x061d82e56 + .quad 0x13c2b89c4,0x188815ab2 + .quad 0x0d7a4825c,0x0d270f1a2 + .quad 0x10f5ff2ba,0x105405f3e + .quad 0x00167d312,0x1c3f5f66c + .quad 0x0f6076544,0x0e9adf796 + .quad 0x026f6a60a,0x12ed0daac + .quad 0x1a2adb74e,0x096638b34 + .quad 0x19d34af3a,0x065863b64 + .quad 0x049c3cc9c,0x1e50585a0 + .quad 0x068bce87a,0x11eef4f8e + .quad 0x1524fa6c6,0x19f1c69dc + .quad 0x16cba8aca,0x1ee54f54c + .quad 0x042d98888,0x12913343e + .quad 0x1329d9f7e,0x0b3e32c28 + .quad 0x1b1c69528,0x088f25a3a + .quad 0x02178513a,0x0064f7f26 + .quad 0x0e0ac139e,0x04e36f0b0 + .quad 0x0170076fa,0x0dd7e3b0c + .quad 0x141a1a2e2,0x0bd6f81f8 + .quad 0x16ad828b4,0x0f285651c + .quad 0x041d17b64,0x19425cbba + .quad 0x1fae1cc66,0x010746f3c + .quad 0x1a75b4b00,0x18db37e8a + .quad 0x0f872e54c,0x1c24afea4 + .quad 0x01e41e9fc,0x04c144932 + .quad 0x086d8e4d2,0x0271d9844 + .quad 0x160f7af7a,0x052148f02 + .quad 0x05bb8f1bc,0x08e766a0c + .quad 0x0a90fd27a,0x0a3c6f37a + .quad 0x0b3af077a,0x093a5f730 + .quad 0x04984d782,0x1d22c238e + .quad 0x0ca6ef3ac,0x06cb08e5c + .quad 0x0234e0b26,0x063ded06a + .quad 0x1d88abd4a,0x06b749fb2 + .quad 0x04597456a,0x04d56973c + .quad 0x0e9e28eb4,0x1167f94f2 + .quad 0x07b3ff57a,0x19385bf2e + .quad 0x0c9c8b782,0x0cec3662e + .quad 0x13a9cba9e,0x0e417f38a + .quad 0x093e106a4,0x19329634a + .quad 0x167001a9c,0x14e727980 + .quad 0x1ddffc5d4,0x0e6fc4e6a + .quad 0x00df04680,0x0d104b8fc + .quad 0x02342001e,0x08227bb8a + .quad 0x00a2a8d7e,0x05b397730 + .quad 0x168763fa6,0x0b0cd4768 + .quad 0x1ed5a407a,0x0e78eb416 + .quad 0x0d2c3ed1a,0x13c2b89c4 + .quad 0x0995a5724,0x1641378f0 + .quad 0x19b1afbc4,0x0d7a4825c + .quad 0x109ffedc0,0x08d96551c + .quad 0x0f2271e60,0x10f5ff2ba + .quad 0x00b0bf8ca,0x00bf80dd2 + .quad 0x123888b7a,0x00167d312 + .quad 0x1e888f7dc,0x18dcddd1c + .quad 0x002ee03b2,0x0f6076544 + .quad 0x183e8d8fe,0x06a45d2b2 + .quad 0x133d7a042,0x026f6a60a + .quad 0x116b0f50c,0x1dd3e10e8 + .quad 0x05fabe670,0x1a2adb74e + .quad 0x130004488,0x0de87806c + .quad 0x000bcf5f6,0x19d34af3a + .quad 0x18f0c7078,0x014338754 + .quad 0x017f27698,0x049c3cc9c + .quad 0x058ca5f00,0x15e3e77ee + .quad 0x1af900c24,0x068bce87a + .quad 0x0b5cfca28,0x0dd07448e + .quad 0x0ded288f8,0x1524fa6c6 + .quad 0x059f229bc,0x1d8048348 + .quad 0x06d390dec,0x16cba8aca + .quad 0x037170390,0x0a3e3e02c + .quad 0x06353c1cc,0x042d98888 + .quad 0x0c4584f5c,0x0d73c7bea + .quad 0x1f16a3418,0x1329d9f7e + .quad 0x0531377e2,0x185137662 + .quad 0x1d8d9ca7c,0x1b1c69528 + .quad 0x0b25b29f2,0x18a08b5bc + .quad 0x19fb2a8b0,0x02178513a + .quad 0x1a08fe6ac,0x1da758ae0 + .quad 0x045cddf4e,0x0e0ac139e + .quad 0x1a91647f2,0x169cf9eb0 + .quad 0x1a0f717c4,0x0170076fa diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 1eb7f90cb7b..586f41aac36 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -94,6 +94,7 @@ __clmul_gf128mul_ble: pxor T2, T1 pxor T1, DATA ret +ENDPROC(__clmul_gf128mul_ble) /* void clmul_ghash_mul(char *dst, const be128 *shash) */ ENTRY(clmul_ghash_mul) @@ -105,6 +106,7 @@ ENTRY(clmul_ghash_mul) PSHUFB_XMM BSWAP DATA movups DATA, (%rdi) ret +ENDPROC(clmul_ghash_mul) /* * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, @@ -131,6 +133,7 @@ ENTRY(clmul_ghash_update) movups DATA, (%rdi) .Lupdate_just_ret: ret +ENDPROC(clmul_ghash_update) /* * void clmul_ghash_setkey(be128 *shash, const u8 *key); @@ -155,3 +158,4 @@ ENTRY(clmul_ghash_setkey) pxor %xmm1, %xmm0 movups %xmm0, (%rdi) ret +ENDPROC(clmul_ghash_setkey) diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S new file mode 100644 index 00000000000..f7b6ea2ddfd --- /dev/null +++ b/arch/x86/crypto/glue_helper-asm-avx.S @@ -0,0 +1,91 @@ +/* + * Shared glue code for 128bit block ciphers, AVX assembler macros + * + * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \ + vmovdqu (0*16)(src), x0; \ + vmovdqu (1*16)(src), x1; \ + vmovdqu (2*16)(src), x2; \ + vmovdqu (3*16)(src), x3; \ + vmovdqu (4*16)(src), x4; \ + vmovdqu (5*16)(src), x5; \ + vmovdqu (6*16)(src), x6; \ + vmovdqu (7*16)(src), x7; + +#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ + vmovdqu x0, (0*16)(dst); \ + vmovdqu x1, (1*16)(dst); \ + vmovdqu x2, (2*16)(dst); \ + vmovdqu x3, (3*16)(dst); \ + vmovdqu x4, (4*16)(dst); \ + vmovdqu x5, (5*16)(dst); \ + vmovdqu x6, (6*16)(dst); \ + vmovdqu x7, (7*16)(dst); + +#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ + vpxor (0*16)(src), x1, x1; \ + vpxor (1*16)(src), x2, x2; \ + vpxor (2*16)(src), x3, x3; \ + vpxor (3*16)(src), x4, x4; \ + vpxor (4*16)(src), x5, x5; \ + vpxor (5*16)(src), x6, x6; \ + vpxor (6*16)(src), x7, x7; \ + store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \ + vpcmpeqd t0, t0, t0; \ + vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \ + vmovdqa bswap, t1; \ + \ + /* load IV and byteswap */ \ + vmovdqu (iv), x7; \ + vpshufb t1, x7, x0; \ + \ + /* construct IVs */ \ + inc_le128(x7, t0, t2); \ + vpshufb t1, x7, x1; \ + inc_le128(x7, t0, t2); \ + vpshufb t1, x7, x2; \ + inc_le128(x7, t0, t2); \ + vpshufb t1, x7, x3; \ + inc_le128(x7, t0, t2); \ + vpshufb t1, x7, x4; \ + inc_le128(x7, t0, t2); \ + vpshufb t1, x7, x5; \ + inc_le128(x7, t0, t2); \ + vpshufb t1, x7, x6; \ + inc_le128(x7, t0, t2); \ + vmovdqa x7, t2; \ + vpshufb t1, x7, x7; \ + inc_le128(t2, t0, t1); \ + vmovdqu t2, (iv); + +#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ + vpxor (0*16)(src), x0, x0; \ + vpxor (1*16)(src), x1, x1; \ + vpxor (2*16)(src), x2, x2; \ + vpxor (3*16)(src), x3, x3; \ + vpxor (4*16)(src), x4, x4; \ + vpxor (5*16)(src), x5, x5; \ + vpxor (6*16)(src), x6, x6; \ + vpxor (7*16)(src), x7, x7; \ + store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 30b3927bd73..22ce4f683e5 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -221,16 +221,16 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr, u8 *src = (u8 *)walk->src.virt.addr; u8 *dst = (u8 *)walk->dst.virt.addr; unsigned int nbytes = walk->nbytes; - u128 ctrblk; + le128 ctrblk; u128 tmp; - be128_to_u128(&ctrblk, (be128 *)walk->iv); + be128_to_le128(&ctrblk, (be128 *)walk->iv); memcpy(&tmp, src, nbytes); fn_ctr(ctx, &tmp, &tmp, &ctrblk); memcpy(dst, &tmp, nbytes); - u128_to_be128((be128 *)walk->iv, &ctrblk); + le128_to_be128((be128 *)walk->iv, &ctrblk); } EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit); @@ -243,11 +243,11 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, unsigned int nbytes = walk->nbytes; u128 *src = (u128 *)walk->src.virt.addr; u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ctrblk; + le128 ctrblk; unsigned int num_blocks, func_bytes; unsigned int i; - be128_to_u128(&ctrblk, (be128 *)walk->iv); + be128_to_le128(&ctrblk, (be128 *)walk->iv); /* Process multi-block batch */ for (i = 0; i < gctx->num_funcs; i++) { @@ -269,7 +269,7 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, } done: - u128_to_be128((be128 *)walk->iv, &ctrblk); + le128_to_be128((be128 *)walk->iv, &ctrblk); return nbytes; } diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S index 72eb306680b..329452b8f79 100644 --- a/arch/x86/crypto/salsa20-i586-asm_32.S +++ b/arch/x86/crypto/salsa20-i586-asm_32.S @@ -2,11 +2,12 @@ # D. J. Bernstein # Public domain. -# enter ECRYPT_encrypt_bytes +#include <linux/linkage.h> + .text -.p2align 5 -.globl ECRYPT_encrypt_bytes -ECRYPT_encrypt_bytes: + +# enter salsa20_encrypt_bytes +ENTRY(salsa20_encrypt_bytes) mov %esp,%eax and $31,%eax add $256,%eax @@ -933,11 +934,10 @@ ECRYPT_encrypt_bytes: add $64,%esi # goto bytesatleast1 jmp ._bytesatleast1 -# enter ECRYPT_keysetup -.text -.p2align 5 -.globl ECRYPT_keysetup -ECRYPT_keysetup: +ENDPROC(salsa20_encrypt_bytes) + +# enter salsa20_keysetup +ENTRY(salsa20_keysetup) mov %esp,%eax and $31,%eax add $256,%eax @@ -1060,11 +1060,10 @@ ECRYPT_keysetup: # leave add %eax,%esp ret -# enter ECRYPT_ivsetup -.text -.p2align 5 -.globl ECRYPT_ivsetup -ECRYPT_ivsetup: +ENDPROC(salsa20_keysetup) + +# enter salsa20_ivsetup +ENTRY(salsa20_ivsetup) mov %esp,%eax and $31,%eax add $256,%eax @@ -1112,3 +1111,4 @@ ECRYPT_ivsetup: # leave add %eax,%esp ret +ENDPROC(salsa20_ivsetup) diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S index 6214a9b0970..9279e0b2d60 100644 --- a/arch/x86/crypto/salsa20-x86_64-asm_64.S +++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S @@ -1,8 +1,7 @@ -# enter ECRYPT_encrypt_bytes -.text -.p2align 5 -.globl ECRYPT_encrypt_bytes -ECRYPT_encrypt_bytes: +#include <linux/linkage.h> + +# enter salsa20_encrypt_bytes +ENTRY(salsa20_encrypt_bytes) mov %rsp,%r11 and $31,%r11 add $256,%r11 @@ -802,11 +801,10 @@ ECRYPT_encrypt_bytes: # comment:fp stack unchanged by jump # goto bytesatleast1 jmp ._bytesatleast1 -# enter ECRYPT_keysetup -.text -.p2align 5 -.globl ECRYPT_keysetup -ECRYPT_keysetup: +ENDPROC(salsa20_encrypt_bytes) + +# enter salsa20_keysetup +ENTRY(salsa20_keysetup) mov %rsp,%r11 and $31,%r11 add $256,%r11 @@ -892,11 +890,10 @@ ECRYPT_keysetup: mov %rdi,%rax mov %rsi,%rdx ret -# enter ECRYPT_ivsetup -.text -.p2align 5 -.globl ECRYPT_ivsetup -ECRYPT_ivsetup: +ENDPROC(salsa20_keysetup) + +# enter salsa20_ivsetup +ENTRY(salsa20_ivsetup) mov %rsp,%r11 and $31,%r11 add $256,%r11 @@ -918,3 +915,4 @@ ECRYPT_ivsetup: mov %rdi,%rax mov %rsi,%rdx ret +ENDPROC(salsa20_ivsetup) diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c index a3a3c0205c1..5e8e67739bb 100644 --- a/arch/x86/crypto/salsa20_glue.c +++ b/arch/x86/crypto/salsa20_glue.c @@ -26,11 +26,6 @@ #define SALSA20_MIN_KEY_SIZE 16U #define SALSA20_MAX_KEY_SIZE 32U -// use the ECRYPT_* function names -#define salsa20_keysetup ECRYPT_keysetup -#define salsa20_ivsetup ECRYPT_ivsetup -#define salsa20_encrypt_bytes ECRYPT_encrypt_bytes - struct salsa20_ctx { u32 input[16]; diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 504106bf04a..43c938612b7 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -24,7 +24,17 @@ * */ +#include <linux/linkage.h> +#include "glue_helper-asm-avx.S" + .file "serpent-avx-x86_64-asm_64.S" + +.data +.align 16 + +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + .text #define CTX %rdi @@ -550,51 +560,25 @@ vpunpcklqdq x3, t2, x2; \ vpunpckhqdq x3, t2, x3; -#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ - vmovdqu (0*4*4)(in), x0; \ - vmovdqu (1*4*4)(in), x1; \ - vmovdqu (2*4*4)(in), x2; \ - vmovdqu (3*4*4)(in), x3; \ - \ +#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) -#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ - transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ - \ - vmovdqu x0, (0*4*4)(out); \ - vmovdqu x1, (1*4*4)(out); \ - vmovdqu x2, (2*4*4)(out); \ - vmovdqu x3, (3*4*4)(out); - -#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ - transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ - \ - vpxor (0*4*4)(out), x0, x0; \ - vmovdqu x0, (0*4*4)(out); \ - vpxor (1*4*4)(out), x1, x1; \ - vmovdqu x1, (1*4*4)(out); \ - vpxor (2*4*4)(out), x2, x2; \ - vmovdqu x2, (2*4*4)(out); \ - vpxor (3*4*4)(out), x3, x3; \ - vmovdqu x3, (3*4*4)(out); +#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) .align 8 -.global __serpent_enc_blk_8way_avx -.type __serpent_enc_blk_8way_avx,@function; - -__serpent_enc_blk_8way_avx: +__serpent_enc_blk8_avx: /* input: * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: bool, if true: xor output + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks + * output: + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks */ vpcmpeqd RNOT, RNOT, RNOT; - leaq (4*4*4)(%rdx), %rax; - read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); - read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); + read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); K2(RA, RB, RC, RD, RE, 0); S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); @@ -630,38 +614,25 @@ __serpent_enc_blk_8way_avx: S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); - leaq (4*4*4)(%rsi), %rax; - - testb %cl, %cl; - jnz __enc_xor8; - - write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); - write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); - - ret; - -__enc_xor8: - xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); - xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); + write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); ret; +ENDPROC(__serpent_enc_blk8_avx) .align 8 -.global serpent_dec_blk_8way_avx -.type serpent_dec_blk_8way_avx,@function; - -serpent_dec_blk_8way_avx: +__serpent_dec_blk8_avx: /* input: * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks + * output: + * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks */ vpcmpeqd RNOT, RNOT, RNOT; - leaq (4*4*4)(%rdx), %rax; - read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); - read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); + read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); K2(RA, RB, RC, RD, RE, 32); SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); @@ -697,8 +668,74 @@ serpent_dec_blk_8way_avx: SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); - leaq (4*4*4)(%rsi), %rax; - write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); - write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); + write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2); + write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); + + ret; +ENDPROC(__serpent_dec_blk8_avx) + +ENTRY(serpent_ecb_enc_8way_avx) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + call __serpent_enc_blk8_avx; + + store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + ret; +ENDPROC(serpent_ecb_enc_8way_avx) + +ENTRY(serpent_ecb_dec_8way_avx) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + call __serpent_dec_blk8_avx; + + store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); + + ret; +ENDPROC(serpent_ecb_dec_8way_avx) + +ENTRY(serpent_cbc_dec_8way_avx) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + call __serpent_dec_blk8_avx; + + store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); + + ret; +ENDPROC(serpent_cbc_dec_8way_avx) + +ENTRY(serpent_ctr_8way_avx) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: iv (little endian, 128bit) + */ + + load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, + RD2, RK0, RK1, RK2); + + call __serpent_enc_blk8_avx; + + store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); ret; +ENDPROC(serpent_ctr_8way_avx) diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S index c00053d42f9..d348f1553a7 100644 --- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S +++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S @@ -24,6 +24,8 @@ * */ +#include <linux/linkage.h> + .file "serpent-sse2-i586-asm_32.S" .text @@ -510,11 +512,7 @@ pxor t0, x3; \ movdqu x3, (3*4*4)(out); -.align 8 -.global __serpent_enc_blk_4way -.type __serpent_enc_blk_4way,@function; - -__serpent_enc_blk_4way: +ENTRY(__serpent_enc_blk_4way) /* input: * arg_ctx(%esp): ctx, CTX * arg_dst(%esp): dst @@ -566,22 +564,19 @@ __serpent_enc_blk_4way: movl arg_dst(%esp), %eax; cmpb $0, arg_xor(%esp); - jnz __enc_xor4; + jnz .L__enc_xor4; write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); ret; -__enc_xor4: +.L__enc_xor4: xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); ret; +ENDPROC(__serpent_enc_blk_4way) -.align 8 -.global serpent_dec_blk_4way -.type serpent_dec_blk_4way,@function; - -serpent_dec_blk_4way: +ENTRY(serpent_dec_blk_4way) /* input: * arg_ctx(%esp): ctx, CTX * arg_dst(%esp): dst @@ -633,3 +628,4 @@ serpent_dec_blk_4way: write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA); ret; +ENDPROC(serpent_dec_blk_4way) diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S index 3ee1ff04d3e..acc066c7c6b 100644 --- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S @@ -24,6 +24,8 @@ * */ +#include <linux/linkage.h> + .file "serpent-sse2-x86_64-asm_64.S" .text @@ -632,11 +634,7 @@ pxor t0, x3; \ movdqu x3, (3*4*4)(out); -.align 8 -.global __serpent_enc_blk_8way -.type __serpent_enc_blk_8way,@function; - -__serpent_enc_blk_8way: +ENTRY(__serpent_enc_blk_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -687,24 +685,21 @@ __serpent_enc_blk_8way: leaq (4*4*4)(%rsi), %rax; testb %cl, %cl; - jnz __enc_xor8; + jnz .L__enc_xor8; write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); ret; -__enc_xor8: +.L__enc_xor8: xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); ret; +ENDPROC(__serpent_enc_blk_8way) -.align 8 -.global serpent_dec_blk_8way -.type serpent_dec_blk_8way,@function; - -serpent_dec_blk_8way: +ENTRY(serpent_dec_blk_8way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -756,3 +751,4 @@ serpent_dec_blk_8way: write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); ret; +ENDPROC(serpent_dec_blk_8way) diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 3f543a04cf1..52abaaf28e7 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -42,55 +42,24 @@ #include <asm/crypto/ablk_helper.h> #include <asm/crypto/glue_helper.h> -static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) -{ - u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; - unsigned int j; - - for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) - ivs[j] = src[j]; - - serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - - for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) - u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); -} - -static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) { be128 ctrblk; - u128_to_be128(&ctrblk, iv); - u128_inc(iv); + le128_to_be128(&ctrblk, iv); + le128_inc(iv); __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); u128_xor(dst, src, (u128 *)&ctrblk); } -static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, - u128 *iv) -{ - be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; - unsigned int i; - - for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { - if (dst != src) - dst[i] = src[i]; - - u128_to_be128(&ctrblks[i], iv); - u128_inc(iv); - } - - serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); -} - static const struct common_glue_ctx serpent_enc = { .num_funcs = 2, .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) } + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) } }, { .num_blocks = 1, .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } @@ -103,7 +72,7 @@ static const struct common_glue_ctx serpent_ctr = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) } + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } }, { .num_blocks = 1, .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } @@ -116,7 +85,7 @@ static const struct common_glue_ctx serpent_dec = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) } + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) } }, { .num_blocks = 1, .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } @@ -129,7 +98,7 @@ static const struct common_glue_ctx serpent_dec_cbc = { .funcs = { { .num_blocks = SERPENT_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) } + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) } }, { .num_blocks = 1, .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } @@ -193,7 +162,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { - serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst); + serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst); return; } @@ -210,7 +179,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { - serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst); + serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst); return; } diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 9107a9908c4..97a356ece24 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -59,19 +59,19 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); } -static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) { be128 ctrblk; - u128_to_be128(&ctrblk, iv); - u128_inc(iv); + le128_to_be128(&ctrblk, iv); + le128_inc(iv); __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); u128_xor(dst, src, (u128 *)&ctrblk); } static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, - u128 *iv) + le128 *iv) { be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; unsigned int i; @@ -80,8 +80,8 @@ static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, if (dst != src) dst[i] = src[i]; - u128_to_be128(&ctrblks[i], iv); - u128_inc(iv); + le128_to_be128(&ctrblks[i], iv); + le128_inc(iv); } serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index 49d6987a73d..a4109506a5e 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -28,6 +28,8 @@ * (at your option) any later version. */ +#include <linux/linkage.h> + #define CTX %rdi // arg1 #define BUF %rsi // arg2 #define CNT %rdx // arg3 @@ -69,10 +71,8 @@ * param: function's name */ .macro SHA1_VECTOR_ASM name - .global \name - .type \name, @function - .align 32 -\name: + ENTRY(\name) + push %rbx push %rbp push %r12 @@ -106,7 +106,7 @@ pop %rbx ret - .size \name, .-\name + ENDPROC(\name) .endm /* diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index 1585abb13dd..8d3e113b2c9 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -23,7 +23,17 @@ * */ +#include <linux/linkage.h> +#include "glue_helper-asm-avx.S" + .file "twofish-avx-x86_64-asm_64.S" + +.data +.align 16 + +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + .text /* structure of crypto context */ @@ -217,69 +227,43 @@ vpunpcklqdq x3, t2, x2; \ vpunpckhqdq x3, t2, x3; -#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \ - vpxor (0*4*4)(in), wkey, x0; \ - vpxor (1*4*4)(in), wkey, x1; \ - vpxor (2*4*4)(in), wkey, x2; \ - vpxor (3*4*4)(in), wkey, x3; \ +#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \ + vpxor x0, wkey, x0; \ + vpxor x1, wkey, x1; \ + vpxor x2, wkey, x2; \ + vpxor x3, wkey, x3; \ \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) -#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \ - transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ - \ - vpxor x0, wkey, x0; \ - vmovdqu x0, (0*4*4)(out); \ - vpxor x1, wkey, x1; \ - vmovdqu x1, (1*4*4)(out); \ - vpxor x2, wkey, x2; \ - vmovdqu x2, (2*4*4)(out); \ - vpxor x3, wkey, x3; \ - vmovdqu x3, (3*4*4)(out); - -#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \ +#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ \ - vpxor x0, wkey, x0; \ - vpxor (0*4*4)(out), x0, x0; \ - vmovdqu x0, (0*4*4)(out); \ - vpxor x1, wkey, x1; \ - vpxor (1*4*4)(out), x1, x1; \ - vmovdqu x1, (1*4*4)(out); \ - vpxor x2, wkey, x2; \ - vpxor (2*4*4)(out), x2, x2; \ - vmovdqu x2, (2*4*4)(out); \ - vpxor x3, wkey, x3; \ - vpxor (3*4*4)(out), x3, x3; \ - vmovdqu x3, (3*4*4)(out); + vpxor x0, wkey, x0; \ + vpxor x1, wkey, x1; \ + vpxor x2, wkey, x2; \ + vpxor x3, wkey, x3; .align 8 -.global __twofish_enc_blk_8way -.type __twofish_enc_blk_8way,@function; - -__twofish_enc_blk_8way: +__twofish_enc_blk8: /* input: * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: bool, if true: xor output + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks + * output: + * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks */ + vmovdqu w(CTX), RK1; + pushq %rbp; pushq %rbx; pushq %rcx; - vmovdqu w(CTX), RK1; - - leaq (4*4*4)(%rdx), %rax; - inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); + inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); preload_rgi(RA1); rotate_1l(RD1); - inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); + inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); rotate_1l(RD2); - movq %rsi, %r11; - encrypt_cycle(0); encrypt_cycle(1); encrypt_cycle(2); @@ -295,47 +279,32 @@ __twofish_enc_blk_8way: popq %rbx; popq %rbp; - leaq (4*4*4)(%r11), %rax; - - testb %cl, %cl; - jnz __enc_xor8; - - outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); - outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); - - ret; - -__enc_xor8: - outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); - outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); + outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); + outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); ret; +ENDPROC(__twofish_enc_blk8) .align 8 -.global twofish_dec_blk_8way -.type twofish_dec_blk_8way,@function; - -twofish_dec_blk_8way: +__twofish_dec_blk8: /* input: * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src + * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks + * output: + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks */ + vmovdqu (w+4*4)(CTX), RK1; + pushq %rbp; pushq %rbx; - vmovdqu (w+4*4)(CTX), RK1; - - leaq (4*4*4)(%rdx), %rax; - inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); + inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); preload_rgi(RC1); rotate_1l(RA1); - inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); + inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); rotate_1l(RA2); - movq %rsi, %r11; - decrypt_cycle(7); decrypt_cycle(6); decrypt_cycle(5); @@ -350,8 +319,92 @@ twofish_dec_blk_8way: popq %rbx; popq %rbp; - leaq (4*4*4)(%r11), %rax; - outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); - outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); + outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); + outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); + + ret; +ENDPROC(__twofish_dec_blk8) + +ENTRY(twofish_ecb_enc_8way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + movq %rsi, %r11; + + load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + call __twofish_enc_blk8; + + store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); + + ret; +ENDPROC(twofish_ecb_enc_8way) + +ENTRY(twofish_ecb_dec_8way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + movq %rsi, %r11; + + load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); + + call __twofish_dec_blk8; + + store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + ret; +ENDPROC(twofish_ecb_dec_8way) + +ENTRY(twofish_cbc_dec_8way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + pushq %r12; + + movq %rsi, %r11; + movq %rdx, %r12; + + load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); + + call __twofish_dec_blk8; + + store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + popq %r12; + + ret; +ENDPROC(twofish_cbc_dec_8way) + +ENTRY(twofish_ctr_8way) + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: iv (little endian, 128bit) + */ + + pushq %r12; + + movq %rsi, %r11; + movq %rdx, %r12; + + load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, + RD2, RX0, RX1, RY0); + + call __twofish_enc_blk8; + + store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); + + popq %r12; ret; +ENDPROC(twofish_ctr_8way) diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S index 658af4bb35c..694ea4587ba 100644 --- a/arch/x86/crypto/twofish-i586-asm_32.S +++ b/arch/x86/crypto/twofish-i586-asm_32.S @@ -20,6 +20,7 @@ .file "twofish-i586-asm.S" .text +#include <linux/linkage.h> #include <asm/asm-offsets.h> /* return address at 0 */ @@ -219,11 +220,7 @@ xor %esi, d ## D;\ ror $1, d ## D; -.align 4 -.global twofish_enc_blk -.global twofish_dec_blk - -twofish_enc_blk: +ENTRY(twofish_enc_blk) push %ebp /* save registers according to calling convention*/ push %ebx push %esi @@ -277,8 +274,9 @@ twofish_enc_blk: pop %ebp mov $1, %eax ret +ENDPROC(twofish_enc_blk) -twofish_dec_blk: +ENTRY(twofish_dec_blk) push %ebp /* save registers according to calling convention*/ push %ebx push %esi @@ -333,3 +331,4 @@ twofish_dec_blk: pop %ebp mov $1, %eax ret +ENDPROC(twofish_dec_blk) diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index 5b012a2c511..1c3b7ceb36d 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -20,6 +20,8 @@ * */ +#include <linux/linkage.h> + .file "twofish-x86_64-asm-3way.S" .text @@ -214,11 +216,7 @@ rorq $32, RAB2; \ outunpack3(mov, RIO, 2, RAB, 2); -.align 8 -.global __twofish_enc_blk_3way -.type __twofish_enc_blk_3way,@function; - -__twofish_enc_blk_3way: +ENTRY(__twofish_enc_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -250,7 +248,7 @@ __twofish_enc_blk_3way: popq %rbp; /* bool xor */ testb %bpl, %bpl; - jnz __enc_xor3; + jnz .L__enc_xor3; outunpack_enc3(mov); @@ -262,7 +260,7 @@ __twofish_enc_blk_3way: popq %r15; ret; -__enc_xor3: +.L__enc_xor3: outunpack_enc3(xor); popq %rbx; @@ -272,11 +270,9 @@ __enc_xor3: popq %r14; popq %r15; ret; +ENDPROC(__twofish_enc_blk_3way) -.global twofish_dec_blk_3way -.type twofish_dec_blk_3way,@function; - -twofish_dec_blk_3way: +ENTRY(twofish_dec_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -313,4 +309,4 @@ twofish_dec_blk_3way: popq %r14; popq %r15; ret; - +ENDPROC(twofish_dec_blk_3way) diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index 7bcf3fcc366..a039d21986a 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -20,6 +20,7 @@ .file "twofish-x86_64-asm.S" .text +#include <linux/linkage.h> #include <asm/asm-offsets.h> #define a_offset 0 @@ -214,11 +215,7 @@ xor %r8d, d ## D;\ ror $1, d ## D; -.align 8 -.global twofish_enc_blk -.global twofish_dec_blk - -twofish_enc_blk: +ENTRY(twofish_enc_blk) pushq R1 /* %rdi contains the ctx address */ @@ -269,8 +266,9 @@ twofish_enc_blk: popq R1 movq $1,%rax ret +ENDPROC(twofish_enc_blk) -twofish_dec_blk: +ENTRY(twofish_dec_blk) pushq R1 /* %rdi contains the ctx address */ @@ -320,3 +318,4 @@ twofish_dec_blk: popq R1 movq $1,%rax ret +ENDPROC(twofish_dec_blk) diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index e7708b5442e..94ac91d26e4 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -45,66 +45,23 @@ #define TWOFISH_PARALLEL_BLOCKS 8 -static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src) -{ - __twofish_enc_blk_3way(ctx, dst, src, false); -} - /* 8-way parallel cipher functions */ -asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst, +asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, const u8 *src); -static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst, - const u8 *src) -{ - __twofish_enc_blk_8way(ctx, dst, src, false); -} - -static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst, - const u8 *src) -{ - __twofish_enc_blk_8way(ctx, dst, src, true); -} +asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); -static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst, +static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src) { - twofish_dec_blk_8way(ctx, dst, src); -} - -static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src) -{ - u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1]; - unsigned int j; - - for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++) - ivs[j] = src[j]; - - twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - - for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++) - u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); + __twofish_enc_blk_3way(ctx, dst, src, false); } -static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src, - u128 *iv) -{ - be128 ctrblks[TWOFISH_PARALLEL_BLOCKS]; - unsigned int i; - - for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) { - if (dst != src) - dst[i] = src[i]; - - u128_to_be128(&ctrblks[i], iv); - u128_inc(iv); - } - - twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); -} static const struct common_glue_ctx twofish_enc = { .num_funcs = 3, @@ -112,7 +69,7 @@ static const struct common_glue_ctx twofish_enc = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) } + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) } }, { .num_blocks = 3, .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } @@ -128,7 +85,7 @@ static const struct common_glue_ctx twofish_ctr = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) } + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) } }, { .num_blocks = 3, .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } @@ -144,7 +101,7 @@ static const struct common_glue_ctx twofish_dec = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) } + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) } }, { .num_blocks = 3, .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } @@ -160,7 +117,7 @@ static const struct common_glue_ctx twofish_dec_cbc = { .funcs = { { .num_blocks = TWOFISH_PARALLEL_BLOCKS, - .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) } + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) } }, { .num_blocks = 3, .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } @@ -227,7 +184,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { - twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst); + twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst); return; } @@ -249,7 +206,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { - twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst); + twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst); return; } diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index aa3eb358b7e..13e63b3e1df 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -62,15 +62,15 @@ void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src) } EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way); -void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) { be128 ctrblk; if (dst != src) *dst = *src; - u128_to_be128(&ctrblk, iv); - u128_inc(iv); + le128_to_be128(&ctrblk, iv); + le128_inc(iv); twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); u128_xor(dst, dst, (u128 *)&ctrblk); @@ -78,7 +78,7 @@ void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr); void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, - u128 *iv) + le128 *iv) { be128 ctrblks[3]; @@ -88,12 +88,12 @@ void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, dst[2] = src[2]; } - u128_to_be128(&ctrblks[0], iv); - u128_inc(iv); - u128_to_be128(&ctrblks[1], iv); - u128_inc(iv); - u128_to_be128(&ctrblks[2], iv); - u128_inc(iv); + le128_to_be128(&ctrblks[0], iv); + le128_inc(iv); + le128_to_be128(&ctrblks[1], iv); + le128_inc(iv); + le128_to_be128(&ctrblks[2], iv); + le128_inc(iv); twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks); } diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 07b3a68d2d2..03abf9b7001 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -35,7 +35,7 @@ #undef WARN_OLD #undef CORE_DUMP /* definitely broken */ -static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs); +static int load_aout_binary(struct linux_binprm *); static int load_aout_library(struct file *); #ifdef CORE_DUMP @@ -260,9 +260,10 @@ static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) * These are the functions used to load a.out style executables and shared * libraries. There is no binary dependent code anywhere else. */ -static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) +static int load_aout_binary(struct linux_binprm *bprm) { unsigned long error, fd_offset, rlim; + struct pt_regs *regs = current_pt_regs(); struct exec ex; int retval; @@ -270,7 +271,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || N_TRSIZE(ex) || N_DRSIZE(ex) || - i_size_read(bprm->file->f_path.dentry->d_inode) < + i_size_read(file_inode(bprm->file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { return -ENOEXEC; } @@ -424,12 +425,10 @@ beyond_if: static int load_aout_library(struct file *file) { - struct inode *inode; unsigned long bss, start_addr, len, error; int retval; struct exec ex; - inode = file->f_path.dentry->d_inode; retval = -ENOEXEC; error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); @@ -439,7 +438,7 @@ static int load_aout_library(struct file *file) /* We come in here for the regular a.out style of shared libraries */ if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || - i_size_read(inode) < + i_size_read(file_inode(file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { goto out; } diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index efc6a958b71..cf1a471a18a 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -129,59 +129,6 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) return err; } -asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask) -{ - sigset_t blocked; - siginitset(&blocked, mask); - return sigsuspend(&blocked); -} - -asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, - stack_ia32_t __user *uoss_ptr, - struct pt_regs *regs) -{ - stack_t uss, uoss; - int ret, err = 0; - mm_segment_t seg; - - if (uss_ptr) { - u32 ptr; - - memset(&uss, 0, sizeof(stack_t)); - if (!access_ok(VERIFY_READ, uss_ptr, sizeof(stack_ia32_t))) - return -EFAULT; - - get_user_try { - get_user_ex(ptr, &uss_ptr->ss_sp); - get_user_ex(uss.ss_flags, &uss_ptr->ss_flags); - get_user_ex(uss.ss_size, &uss_ptr->ss_size); - } get_user_catch(err); - - if (err) - return -EFAULT; - uss.ss_sp = compat_ptr(ptr); - } - seg = get_fs(); - set_fs(KERNEL_DS); - ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL), - (stack_t __force __user *) &uoss, regs->sp); - set_fs(seg); - if (ret >= 0 && uoss_ptr) { - if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t))) - return -EFAULT; - - put_user_try { - put_user_ex(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp); - put_user_ex(uoss.ss_flags, &uoss_ptr->ss_flags); - put_user_ex(uoss.ss_size, &uoss_ptr->ss_size); - } put_user_catch(err); - - if (err) - ret = -EFAULT; - } - return ret; -} - /* * Do a signal return; undo the signal stack. */ @@ -261,8 +208,9 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, return err; } -asmlinkage long sys32_sigreturn(struct pt_regs *regs) +asmlinkage long sys32_sigreturn(void) { + struct pt_regs *regs = current_pt_regs(); struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); sigset_t set; unsigned int ax; @@ -287,12 +235,12 @@ badframe: return 0; } -asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) +asmlinkage long sys32_rt_sigreturn(void) { + struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_ia32 __user *frame; sigset_t set; unsigned int ax; - struct pt_regs tregs; frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); @@ -306,8 +254,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; - tregs = *regs; - if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) + if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; return ax; @@ -362,7 +309,7 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, /* * Determine which stack to use.. */ -static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, +static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { @@ -372,16 +319,13 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, sp = regs->sp; /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(sp) == 0) - sp = current->sas_ss_sp + current->sas_ss_size; - } - + if (ksig->ka.sa.sa_flags & SA_ONSTACK) + sp = sigsp(sp, ksig); /* This is the legacy signal stack switching. */ else if ((regs->ss & 0xffff) != __USER32_DS && - !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) - sp = (unsigned long) ka->sa.sa_restorer; + !(ksig->ka.sa.sa_flags & SA_RESTORER) && + ksig->ka.sa.sa_restorer) + sp = (unsigned long) ksig->ka.sa.sa_restorer; if (used_math()) { unsigned long fx_aligned, math_size; @@ -400,7 +344,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, return (void __user *) sp; } -int ia32_setup_frame(int sig, struct k_sigaction *ka, +int ia32_setup_frame(int sig, struct ksignal *ksig, compat_sigset_t *set, struct pt_regs *regs) { struct sigframe_ia32 __user *frame; @@ -419,7 +363,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, 0x80cd, /* int $0x80 */ }; - frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return -EFAULT; @@ -436,8 +380,8 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, return -EFAULT; } - if (ka->sa.sa_flags & SA_RESTORER) { - restorer = ka->sa.sa_restorer; + if (ksig->ka.sa.sa_flags & SA_RESTORER) { + restorer = ksig->ka.sa.sa_restorer; } else { /* Return stub is in 32bit vsyscall page */ if (current->mm->context.vdso) @@ -462,7 +406,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; - regs->ip = (unsigned long) ka->sa.sa_handler; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; /* Make -mregparm=3 work */ regs->ax = sig; @@ -478,7 +422,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, return 0; } -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +int ia32_setup_rt_frame(int sig, struct ksignal *ksig, compat_sigset_t *set, struct pt_regs *regs) { struct rt_sigframe_ia32 __user *frame; @@ -499,7 +443,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 0, }; - frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return -EFAULT; @@ -515,13 +459,10 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - put_user_ex(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; else restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); @@ -534,7 +475,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode); } put_user_catch(err); - err |= copy_siginfo_to_user32(&frame->info, info); + err |= copy_siginfo_to_user32(&frame->info, &ksig->info); err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); @@ -544,7 +485,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; - regs->ip = (unsigned long) ka->sa.sa_handler; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; /* Make -mregparm=3 work */ regs->ax = sig; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 076745fc804..474dc1b59f7 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -207,7 +207,7 @@ sysexit_from_sys_call: testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz ia32_ret_from_sys_call TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) movl %eax,%esi /* second arg, syscall return value */ cmpl $-MAX_ERRNO,%eax /* is it an error ? */ jbe 1f @@ -217,7 +217,7 @@ sysexit_from_sys_call: call __audit_syscall_exit movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jz \exit @@ -456,20 +456,22 @@ ia32_badsys: ALIGN GLOBAL(\label) leaq \func(%rip),%rax - leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ jmp ia32_ptregs_common .endm CFI_STARTPROC32 - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi - PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi - PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx - PTREGSCALL stub32_execve, compat_sys_execve, %rcx - PTREGSCALL stub32_fork, sys_fork, %rdi - PTREGSCALL stub32_clone, sys32_clone, %rdx - PTREGSCALL stub32_vfork, sys_vfork, %rdi - PTREGSCALL stub32_iopl, sys_iopl, %rsi + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn + PTREGSCALL stub32_sigreturn, sys32_sigreturn + PTREGSCALL stub32_execve, compat_sys_execve + PTREGSCALL stub32_fork, sys_fork + PTREGSCALL stub32_vfork, sys_vfork + + ALIGN +GLOBAL(stub32_clone) + leaq sys_clone(%rip),%rax + mov %r8, %rcx + jmp ia32_ptregs_common ALIGN ia32_ptregs_common: diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 86d68d1c880..ad7a20cbc69 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -172,183 +172,12 @@ asmlinkage long sys32_mprotect(unsigned long start, size_t len, return sys_mprotect(start, len, prot); } -asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act, - struct sigaction32 __user *oact, - unsigned int sigsetsize) -{ - struct k_sigaction new_ka, old_ka; - int ret; - compat_sigset_t set32; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - - if (act) { - compat_uptr_t handler, restorer; - - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || - __get_user(handler, &act->sa_handler) || - __get_user(new_ka.sa.sa_flags, &act->sa_flags) || - __get_user(restorer, &act->sa_restorer) || - __copy_from_user(&set32, &act->sa_mask, - sizeof(compat_sigset_t))) - return -EFAULT; - new_ka.sa.sa_handler = compat_ptr(handler); - new_ka.sa.sa_restorer = compat_ptr(restorer); - - /* - * FIXME: here we rely on _COMPAT_NSIG_WORS to be >= - * than _NSIG_WORDS << 1 - */ - switch (_NSIG_WORDS) { - case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6] - | (((long)set32.sig[7]) << 32); - case 3: new_ka.sa.sa_mask.sig[2] = set32.sig[4] - | (((long)set32.sig[5]) << 32); - case 2: new_ka.sa.sa_mask.sig[1] = set32.sig[2] - | (((long)set32.sig[3]) << 32); - case 1: new_ka.sa.sa_mask.sig[0] = set32.sig[0] - | (((long)set32.sig[1]) << 32); - } - } - - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - - if (!ret && oact) { - /* - * FIXME: here we rely on _COMPAT_NSIG_WORS to be >= - * than _NSIG_WORDS << 1 - */ - switch (_NSIG_WORDS) { - case 4: - set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32); - set32.sig[6] = old_ka.sa.sa_mask.sig[3]; - case 3: - set32.sig[5] = (old_ka.sa.sa_mask.sig[2] >> 32); - set32.sig[4] = old_ka.sa.sa_mask.sig[2]; - case 2: - set32.sig[3] = (old_ka.sa.sa_mask.sig[1] >> 32); - set32.sig[2] = old_ka.sa.sa_mask.sig[1]; - case 1: - set32.sig[1] = (old_ka.sa.sa_mask.sig[0] >> 32); - set32.sig[0] = old_ka.sa.sa_mask.sig[0]; - } - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(ptr_to_compat(old_ka.sa.sa_handler), - &oact->sa_handler) || - __put_user(ptr_to_compat(old_ka.sa.sa_restorer), - &oact->sa_restorer) || - __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || - __copy_to_user(&oact->sa_mask, &set32, - sizeof(compat_sigset_t))) - return -EFAULT; - } - - return ret; -} - -asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act, - struct old_sigaction32 __user *oact) -{ - struct k_sigaction new_ka, old_ka; - int ret; - - if (act) { - compat_old_sigset_t mask; - compat_uptr_t handler, restorer; - - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || - __get_user(handler, &act->sa_handler) || - __get_user(new_ka.sa.sa_flags, &act->sa_flags) || - __get_user(restorer, &act->sa_restorer) || - __get_user(mask, &act->sa_mask)) - return -EFAULT; - - new_ka.sa.sa_handler = compat_ptr(handler); - new_ka.sa.sa_restorer = compat_ptr(restorer); - - siginitset(&new_ka.sa.sa_mask, mask); - } - - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - - if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(ptr_to_compat(old_ka.sa.sa_handler), - &oact->sa_handler) || - __put_user(ptr_to_compat(old_ka.sa.sa_restorer), - &oact->sa_restorer) || - __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || - __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) - return -EFAULT; - } - - return ret; -} - asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int __user *stat_addr, int options) { return compat_sys_wait4(pid, stat_addr, options, NULL); } -/* 32-bit timeval and related flotsam. */ - -asmlinkage long sys32_sched_rr_get_interval(compat_pid_t pid, - struct compat_timespec __user *interval) -{ - struct timespec t; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); - set_fs(old_fs); - if (put_compat_timespec(&t, interval)) - return -EFAULT; - return ret; -} - -asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *set, - compat_size_t sigsetsize) -{ - sigset_t s; - compat_sigset_t s32; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize); - set_fs(old_fs); - if (!ret) { - switch (_NSIG_WORDS) { - case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; - case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2]; - case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; - case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; - } - if (copy_to_user(set, &s32, sizeof(compat_sigset_t))) - return -EFAULT; - } - return ret; -} - -asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig, - compat_siginfo_t __user *uinfo) -{ - siginfo_t info; - int ret; - mm_segment_t old_fs = get_fs(); - - if (copy_siginfo_from_user32(&info, uinfo)) - return -EFAULT; - set_fs(KERNEL_DS); - ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info); - set_fs(old_fs); - return ret; -} - /* warning: next two assume little endian */ asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) @@ -385,26 +214,10 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd, return ret; } -asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp, - struct pt_regs *regs) -{ - void __user *parent_tid = (void __user *)regs->dx; - void __user *child_tid = (void __user *)regs->di; - - if (!newsp) - newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); -} - /* * Some system calls that need sign extended arguments. This could be * done by a generic wrapper. */ -long sys32_lseek(unsigned int fd, int offset, unsigned int whence) -{ - return sys_lseek(fd, offset, whence); -} - long sys32_kill(int pid, int sig) { return sys_kill(pid, sig); diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 66e5f0ef052..7f669853317 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -1,27 +1,4 @@ -include include/asm-generic/Kbuild.asm -header-y += boot.h -header-y += bootparam.h -header-y += debugreg.h -header-y += e820.h -header-y += hw_breakpoint.h -header-y += hyperv.h -header-y += ist.h -header-y += ldt.h -header-y += mce.h -header-y += msr-index.h -header-y += msr.h -header-y += mtrr.h -header-y += posix_types_32.h -header-y += posix_types_64.h -header-y += posix_types_x32.h -header-y += prctl.h -header-y += processor-flags.h -header-y += ptrace-abi.h -header-y += sigcontext32.h -header-y += ucontext.h -header-y += vm86.h -header-y += vsyscall.h genhdr-y += unistd_32.h genhdr-y += unistd_64.h diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 0c44630d178..b31bf97775f 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -49,10 +49,6 @@ /* Asm macros */ -#define ACPI_ASM_MACROS -#define BREAKPOINT3 -#define ACPI_DISABLE_IRQS() local_irq_disable() -#define ACPI_ENABLE_IRQS() local_irq_enable() #define ACPI_FLUSH_CPU_CACHE() wbinvd() int __acpi_acquire_global_lock(unsigned int *lock); diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index b3341e9cd8f..a54ee1d054d 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -81,6 +81,23 @@ static inline struct amd_northbridge *node_to_amd_nb(int node) return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; } +static inline u16 amd_get_node_id(struct pci_dev *pdev) +{ + struct pci_dev *misc; + int i; + + for (i = 0; i != amd_nb_num(); i++) { + misc = node_to_amd_nb(i)->misc; + + if (pci_domain_nr(misc->bus) == pci_domain_nr(pdev->bus) && + PCI_SLOT(misc->devfn) == PCI_SLOT(pdev->devfn)) + return i; + } + + WARN(1, "Unable to find AMD Northbridge id for %s\n", pci_name(pdev)); + return 0; +} + #else #define amd_nb_num(x) 0 diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index b6c3b821acf..722aa3b0462 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -172,23 +172,7 @@ static inline int atomic_add_negative(int i, atomic_t *v) */ static inline int atomic_add_return(int i, atomic_t *v) { -#ifdef CONFIG_M386 - int __i; - unsigned long flags; - if (unlikely(boot_cpu_data.x86 <= 3)) - goto no_xadd; -#endif - /* Modern 486+ processor */ return i + xadd(&v->counter, i); - -#ifdef CONFIG_M386 -no_xadd: /* Legacy 386 processor */ - raw_local_irq_save(flags); - __i = atomic_read(v); - atomic_set(v, i + __i); - raw_local_irq_restore(flags); - return i + __i; -#endif } /** diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index b13fe63bdc5..4fa687a47a6 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -1,14 +1,9 @@ #ifndef _ASM_X86_BOOT_H #define _ASM_X86_BOOT_H -/* Internal svga startup constants */ -#define NORMAL_VGA 0xffff /* 80x25 mode */ -#define EXTENDED_VGA 0xfffe /* 80x50 mode */ -#define ASK_VGA 0xfffd /* ask for it at bootup */ - -#ifdef __KERNEL__ #include <asm/pgtable_types.h> +#include <uapi/asm/boot.h> /* Physical address where kernel should be loaded. */ #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ @@ -42,6 +37,4 @@ #define BOOT_STACK_SIZE 0x1000 #endif -#endif /* __KERNEL__ */ - #endif /* _ASM_X86_BOOT_H */ diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h new file mode 100644 index 00000000000..653668d140f --- /dev/null +++ b/arch/x86/include/asm/bootparam_utils.h @@ -0,0 +1,54 @@ +#ifndef _ASM_X86_BOOTPARAM_UTILS_H +#define _ASM_X86_BOOTPARAM_UTILS_H + +#include <asm/bootparam.h> + +/* + * This file is included from multiple environments. Do not + * add completing #includes to make it standalone. + */ + +/* + * Deal with bootloaders which fail to initialize unknown fields in + * boot_params to zero. The list fields in this list are taken from + * analysis of kexec-tools; if other broken bootloaders initialize a + * different set of fields we will need to figure out how to disambiguate. + * + * Note: efi_info is commonly left uninitialized, but that field has a + * private magic, so it is better to leave it unchanged. + */ +static void sanitize_boot_params(struct boot_params *boot_params) +{ + /* + * IMPORTANT NOTE TO BOOTLOADER AUTHORS: do not simply clear + * this field. The purpose of this field is to guarantee + * compliance with the x86 boot spec located in + * Documentation/x86/boot.txt . That spec says that the + * *whole* structure should be cleared, after which only the + * portion defined by struct setup_header (boot_params->hdr) + * should be copied in. + * + * If you're having an issue because the sentinel is set, you + * need to change the whole structure to be cleared, not this + * (or any other) individual field, or you will soon have + * problems again. + */ + if (boot_params->sentinel) { + /* fields in boot_params are left uninitialized, clear them */ + memset(&boot_params->olpc_ofw_header, 0, + (char *)&boot_params->efi_info - + (char *)&boot_params->olpc_ofw_header); + memset(&boot_params->kbd_status, 0, + (char *)&boot_params->hdr - + (char *)&boot_params->kbd_status); + memset(&boot_params->_pad7[0], 0, + (char *)&boot_params->edd_mbr_sig_buffer[0] - + (char *)&boot_params->_pad7[0]); + memset(&boot_params->_pad8[0], 0, + (char *)&boot_params->eddbuf[0] - + (char *)&boot_params->_pad8[0]); + memset(&boot_params->_pad9[0], 0, sizeof(boot_params->_pad9)); + } +} + +#endif /* _ASM_X86_BOOTPARAM_UTILS_H */ diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index 0bdbbb3b9ce..16a57f4ed64 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -8,6 +8,7 @@ #define VCLOCK_NONE 0 /* No vDSO clock available. */ #define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ #define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ +#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ struct arch_clocksource_data { int vclock_mode; diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 53f4b219336..f8bf2eecab8 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -34,9 +34,7 @@ static inline void set_64bit(volatile u64 *ptr, u64 value) : "memory"); } -#ifdef CONFIG_X86_CMPXCHG #define __HAVE_ARCH_CMPXCHG 1 -#endif #ifdef CONFIG_X86_CMPXCHG64 #define cmpxchg64(ptr, o, n) \ @@ -73,59 +71,6 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) return prev; } -#ifndef CONFIG_X86_CMPXCHG -/* - * Building a kernel capable running on 80386. It may be necessary to - * simulate the cmpxchg on the 80386 CPU. For that purpose we define - * a function for each of the sizes we support. - */ - -extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8); -extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16); -extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32); - -static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, - unsigned long new, int size) -{ - switch (size) { - case 1: - return cmpxchg_386_u8(ptr, old, new); - case 2: - return cmpxchg_386_u16(ptr, old, new); - case 4: - return cmpxchg_386_u32(ptr, old, new); - } - return old; -} - -#define cmpxchg(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - if (likely(boot_cpu_data.x86 > 3)) \ - __ret = (__typeof__(*(ptr)))__cmpxchg((ptr), \ - (unsigned long)(o), (unsigned long)(n), \ - sizeof(*(ptr))); \ - else \ - __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \ - (unsigned long)(o), (unsigned long)(n), \ - sizeof(*(ptr))); \ - __ret; \ -}) -#define cmpxchg_local(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - if (likely(boot_cpu_data.x86 > 3)) \ - __ret = (__typeof__(*(ptr)))__cmpxchg_local((ptr), \ - (unsigned long)(o), (unsigned long)(n), \ - sizeof(*(ptr))); \ - else \ - __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \ - (unsigned long)(o), (unsigned long)(n), \ - sizeof(*(ptr))); \ - __ret; \ -}) -#endif - #ifndef CONFIG_X86_CMPXCHG64 /* * Building a kernel capable running on 80386 and 80486. It may be necessary diff --git a/arch/x86/include/asm/rcu.h b/arch/x86/include/asm/context_tracking.h index d1ac07a2397..1616562683e 100644 --- a/arch/x86/include/asm/rcu.h +++ b/arch/x86/include/asm/context_tracking.h @@ -1,27 +1,26 @@ -#ifndef _ASM_X86_RCU_H -#define _ASM_X86_RCU_H +#ifndef _ASM_X86_CONTEXT_TRACKING_H +#define _ASM_X86_CONTEXT_TRACKING_H #ifndef __ASSEMBLY__ - -#include <linux/rcupdate.h> +#include <linux/context_tracking.h> #include <asm/ptrace.h> static inline void exception_enter(struct pt_regs *regs) { - rcu_user_exit(); + user_exit(); } static inline void exception_exit(struct pt_regs *regs) { -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_CONTEXT_TRACKING if (user_mode(regs)) - rcu_user_enter(); + user_enter(); #endif } #else /* __ASSEMBLY__ */ -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_CONTEXT_TRACKING # define SCHEDULE_USER call schedule_user #else # define SCHEDULE_USER call schedule diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 4564c8e28a3..5f9a1243190 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -28,6 +28,10 @@ struct x86_cpu { #ifdef CONFIG_HOTPLUG_CPU extern int arch_register_cpu(int num); extern void arch_unregister_cpu(int); +extern void __cpuinit start_cpu0(void); +#ifdef CONFIG_DEBUG_HOTPLUG_CPU0 +extern int _debug_hotplug_cpu(int cpu, int action); +#endif #endif DECLARE_PER_CPU(int, cpu_state); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 8c297aa53ee..93fe929d1ce 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -167,6 +167,7 @@ #define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ #define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ #define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB (6*32+24) /* NB performance counter extensions */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -202,6 +203,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_TSC_ADJUST (9*32+ 1) /* TSC adjustment MSR 0x3b */ #define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ #define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ @@ -308,15 +310,11 @@ extern const char * const x86_power_flags[32]; #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) #define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE) +#define cpu_has_perfctr_nb boot_cpu_has(X86_FEATURE_PERFCTR_NB) #define cpu_has_cx8 boot_cpu_has(X86_FEATURE_CX8) #define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16) #define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU) - -#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) -# define cpu_has_invlpg 1 -#else -# define cpu_has_invlpg (boot_cpu_data.x86 > 3) -#endif +#define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT) #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h new file mode 100644 index 00000000000..98038add801 --- /dev/null +++ b/arch/x86/include/asm/crypto/camellia.h @@ -0,0 +1,82 @@ +#ifndef ASM_X86_CAMELLIA_H +#define ASM_X86_CAMELLIA_H + +#include <linux/kernel.h> +#include <linux/crypto.h> + +#define CAMELLIA_MIN_KEY_SIZE 16 +#define CAMELLIA_MAX_KEY_SIZE 32 +#define CAMELLIA_BLOCK_SIZE 16 +#define CAMELLIA_TABLE_BYTE_LEN 272 +#define CAMELLIA_PARALLEL_BLOCKS 2 + +struct camellia_ctx { + u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)]; + u32 key_length; +}; + +struct camellia_lrw_ctx { + struct lrw_table_ctx lrw_table; + struct camellia_ctx camellia_ctx; +}; + +struct camellia_xts_ctx { + struct camellia_ctx tweak_ctx; + struct camellia_ctx crypt_ctx; +}; + +extern int __camellia_setkey(struct camellia_ctx *cctx, + const unsigned char *key, + unsigned int key_len, u32 *flags); + +extern int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen); +extern void lrw_camellia_exit_tfm(struct crypto_tfm *tfm); + +extern int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen); + +/* regular block cipher functions */ +asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst, + const u8 *src); + +/* 2-way parallel cipher functions */ +asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, + const u8 *src) +{ + __camellia_enc_blk(ctx, dst, src, false); +} + +static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst, + const u8 *src) +{ + __camellia_enc_blk(ctx, dst, src, true); +} + +static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src) +{ + __camellia_enc_blk_2way(ctx, dst, src, false); +} + +static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst, + const u8 *src) +{ + __camellia_enc_blk_2way(ctx, dst, src, true); +} + +/* glue helpers */ +extern void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src); +extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, + le128 *iv); +extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, + le128 *iv); + +#endif /* ASM_X86_CAMELLIA_H */ diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index 3e408bddc96..e2d65b061d2 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -13,7 +13,7 @@ typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, - u128 *iv); + le128 *iv); #define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) #define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) @@ -71,23 +71,29 @@ static inline void glue_fpu_end(bool fpu_enabled) kernel_fpu_end(); } -static inline void u128_to_be128(be128 *dst, const u128 *src) +static inline void le128_to_be128(be128 *dst, const le128 *src) { - dst->a = cpu_to_be64(src->a); - dst->b = cpu_to_be64(src->b); + dst->a = cpu_to_be64(le64_to_cpu(src->a)); + dst->b = cpu_to_be64(le64_to_cpu(src->b)); } -static inline void be128_to_u128(u128 *dst, const be128 *src) +static inline void be128_to_le128(le128 *dst, const be128 *src) { - dst->a = be64_to_cpu(src->a); - dst->b = be64_to_cpu(src->b); + dst->a = cpu_to_le64(be64_to_cpu(src->a)); + dst->b = cpu_to_le64(be64_to_cpu(src->b)); } -static inline void u128_inc(u128 *i) +static inline void le128_inc(le128 *i) { - i->b++; - if (!i->b) - i->a++; + u64 a = le64_to_cpu(i->a); + u64 b = le64_to_cpu(i->b); + + b++; + if (!b) + a++; + + i->a = cpu_to_le64(a); + i->b = cpu_to_le64(b); } extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h index 432deedd294..0da1d3e2a55 100644 --- a/arch/x86/include/asm/crypto/serpent-avx.h +++ b/arch/x86/include/asm/crypto/serpent-avx.h @@ -6,27 +6,14 @@ #define SERPENT_PARALLEL_BLOCKS 8 -asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, +asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, const u8 *src); -static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - __serpent_enc_blk_8way_avx(ctx, dst, src, false); -} - -static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - __serpent_enc_blk_8way_avx(ctx, dst, src, true); -} - -static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, - const u8 *src) -{ - serpent_dec_blk_8way_avx(ctx, dst, src); -} +asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, le128 *iv); #endif diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h index 9d2c514bd5f..878c51ceebb 100644 --- a/arch/x86/include/asm/crypto/twofish.h +++ b/arch/x86/include/asm/crypto/twofish.h @@ -31,9 +31,9 @@ asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, /* helpers from twofish_x86_64-3way module */ extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, - u128 *iv); + le128 *iv); extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, - u128 *iv); + le128 *iv); extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen); diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 2d91580bf22..4b528a970bd 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -2,83 +2,8 @@ #define _ASM_X86_DEBUGREG_H -/* Indicate the register numbers for a number of the specific - debug registers. Registers 0-3 contain the addresses we wish to trap on */ -#define DR_FIRSTADDR 0 /* u_debugreg[DR_FIRSTADDR] */ -#define DR_LASTADDR 3 /* u_debugreg[DR_LASTADDR] */ - -#define DR_STATUS 6 /* u_debugreg[DR_STATUS] */ -#define DR_CONTROL 7 /* u_debugreg[DR_CONTROL] */ - -/* Define a few things for the status register. We can use this to determine - which debugging register was responsible for the trap. The other bits - are either reserved or not of interest to us. */ - -/* Define reserved bits in DR6 which are always set to 1 */ -#define DR6_RESERVED (0xFFFF0FF0) - -#define DR_TRAP0 (0x1) /* db0 */ -#define DR_TRAP1 (0x2) /* db1 */ -#define DR_TRAP2 (0x4) /* db2 */ -#define DR_TRAP3 (0x8) /* db3 */ -#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3) - -#define DR_STEP (0x4000) /* single-step */ -#define DR_SWITCH (0x8000) /* task switch */ - -/* Now define a bunch of things for manipulating the control register. - The top two bytes of the control register consist of 4 fields of 4 - bits - each field corresponds to one of the four debug registers, - and indicates what types of access we trap on, and how large the data - field is that we are looking at */ - -#define DR_CONTROL_SHIFT 16 /* Skip this many bits in ctl register */ -#define DR_CONTROL_SIZE 4 /* 4 control bits per register */ - -#define DR_RW_EXECUTE (0x0) /* Settings for the access types to trap on */ -#define DR_RW_WRITE (0x1) -#define DR_RW_READ (0x3) - -#define DR_LEN_1 (0x0) /* Settings for data length to trap on */ -#define DR_LEN_2 (0x4) -#define DR_LEN_4 (0xC) -#define DR_LEN_8 (0x8) - -/* The low byte to the control register determine which registers are - enabled. There are 4 fields of two bits. One bit is "local", meaning - that the processor will reset the bit after a task switch and the other - is global meaning that we have to explicitly reset the bit. With linux, - you can use either one, since we explicitly zero the register when we enter - kernel mode. */ - -#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */ -#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */ -#define DR_LOCAL_ENABLE (0x1) /* Local enable for reg 0 */ -#define DR_GLOBAL_ENABLE (0x2) /* Global enable for reg 0 */ -#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */ - -#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */ -#define DR_GLOBAL_ENABLE_MASK (0xAA) /* Set global bits for all 4 regs */ - -/* The second byte to the control register has a few special things. - We can slow the instruction pipeline for instructions coming via the - gdt or the ldt if we want to. I am not sure why this is an advantage */ - -#ifdef __i386__ -#define DR_CONTROL_RESERVED (0xFC00) /* Reserved by Intel */ -#else -#define DR_CONTROL_RESERVED (0xFFFFFFFF0000FC00UL) /* Reserved */ -#endif - -#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */ -#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */ - -/* - * HW breakpoint additions - */ -#ifdef __KERNEL__ - #include <linux/bug.h> +#include <uapi/asm/debugreg.h> DECLARE_PER_CPU(unsigned long, cpu_dr7); @@ -190,6 +115,4 @@ static inline void debug_stack_usage_dec(void) { } #endif /* X86_64 */ -#endif /* __KERNEL__ */ - #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 93e1c55f14a..03dd72957d2 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -2,9 +2,6 @@ #define _ASM_X86_DEVICE_H struct dev_archdata { -#ifdef CONFIG_ACPI - void *acpi_handle; -#endif #ifdef CONFIG_X86_DEV_DMA_OPS struct dma_map_ops *dma_ops; #endif diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index f7b4c7903e7..808dae63eee 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -47,6 +47,7 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { struct dma_map_ops *ops = get_dma_ops(dev); + debug_dma_mapping_error(dev, dma_addr); if (ops->mapping_error) return ops->mapping_error(dev, dma_addr); diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 37782566af2..cccd07fa5e3 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -1,81 +1,14 @@ #ifndef _ASM_X86_E820_H #define _ASM_X86_E820_H -#define E820MAP 0x2d0 /* our map */ -#define E820MAX 128 /* number of entries in E820MAP */ -/* - * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the - * constrained space in the zeropage. If we have more nodes than - * that, and if we've booted off EFI firmware, then the EFI tables - * passed us from the EFI firmware can list more nodes. Size our - * internal memory map tables to have room for these additional - * nodes, based on up to three entries per node for which the - * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT), - * plus E820MAX, allowing space for the possible duplicate E820 - * entries that might need room in the same arrays, prior to the - * call to sanitize_e820_map() to remove duplicates. The allowance - * of three memory map entries per node is "enough" entries for - * the initial hardware platform motivating this mechanism to make - * use of additional EFI map entries. Future platforms may want - * to allow more than three entries per node or otherwise refine - * this size. - */ - -/* - * Odd: 'make headers_check' complains about numa.h if I try - * to collapse the next two #ifdef lines to a single line: - * #if defined(__KERNEL__) && defined(CONFIG_EFI) - */ -#ifdef __KERNEL__ #ifdef CONFIG_EFI #include <linux/numa.h> #define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES) #else /* ! CONFIG_EFI */ #define E820_X_MAX E820MAX #endif -#else /* ! __KERNEL__ */ -#define E820_X_MAX E820MAX -#endif - -#define E820NR 0x1e8 /* # entries in E820MAP */ - -#define E820_RAM 1 -#define E820_RESERVED 2 -#define E820_ACPI 3 -#define E820_NVS 4 -#define E820_UNUSABLE 5 - -/* - * reserved RAM used by kernel itself - * if CONFIG_INTEL_TXT is enabled, memory of this type will be - * included in the S3 integrity calculation and so should not include - * any memory that BIOS might alter over the S3 transition - */ -#define E820_RESERVED_KERN 128 - +#include <uapi/asm/e820.h> #ifndef __ASSEMBLY__ -#include <linux/types.h> -struct e820entry { - __u64 addr; /* start of memory segment */ - __u64 size; /* size of memory segment */ - __u32 type; /* type of memory segment */ -} __attribute__((packed)); - -struct e820map { - __u32 nr_map; - struct e820entry map[E820_X_MAX]; -}; - -#define ISA_START_ADDRESS 0xa0000 -#define ISA_END_ADDRESS 0x100000 - -#define BIOS_BEGIN 0x000a0000 -#define BIOS_END 0x00100000 - -#define BIOS_ROM_BASE 0xffe00000 -#define BIOS_ROM_END 0xffffffff - -#ifdef __KERNEL__ /* see comment in arch/x86/kernel/e820.c */ extern struct e820map e820; extern struct e820map e820_saved; @@ -137,13 +70,8 @@ static inline bool is_ISA_range(u64 s, u64 e) return s >= ISA_START_ADDRESS && e <= ISA_END_ADDRESS; } -#endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ - -#ifdef __KERNEL__ #include <linux/ioport.h> #define HIGH_MEMORY (1024*1024) -#endif /* __KERNEL__ */ - #endif /* _ASM_X86_E820_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 6e8fdf5ad11..60c89f30c72 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -94,6 +94,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, #endif /* CONFIG_X86_32 */ extern int add_efi_memmap; +extern unsigned long x86_efi_facility; extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern int efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); @@ -101,7 +102,14 @@ extern void efi_call_phys_epilog(void); extern void efi_unmap_memmap(void); extern void efi_memory_uc(u64 addr, unsigned long size); -#ifndef CONFIG_EFI +#ifdef CONFIG_EFI + +static inline bool efi_is_native(void) +{ + return IS_ENABLED(CONFIG_X86_64) == efi_enabled(EFI_64BIT); +} + +#else /* * IF EFI is not configured, have the EFI calls return -ENOSYS. */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 5939f44fe0c..9c999c1674f 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -354,12 +354,10 @@ static inline int mmap_is_ia32(void) return 0; } -/* The first two values are special, do not change. See align_addr() */ +/* Do not change the values. See get_align_mask() */ enum align_flags { ALIGN_VA_32 = BIT(0), ALIGN_VA_64 = BIT(1), - ALIGN_VDSO = BIT(2), - ALIGN_TOPDOWN = BIT(3), }; struct va_alignment { @@ -368,5 +366,5 @@ struct va_alignment { } ____cacheline_aligned; extern struct va_alignment va_align; -extern unsigned long align_addr(unsigned long, struct file *, enum align_flags); +extern unsigned long align_vdso_addr(unsigned long); #endif /* _ASM_X86_ELF_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 4da3c0c4c97..a09c2857106 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -19,6 +19,7 @@ #include <asm/acpi.h> #include <asm/apicdef.h> #include <asm/page.h> +#include <asm/pvclock.h> #ifdef CONFIG_X86_32 #include <linux/threads.h> #include <asm/kmap_types.h> @@ -81,6 +82,10 @@ enum fixed_addresses { VVAR_PAGE, VSYSCALL_HPET, #endif +#ifdef CONFIG_PARAVIRT_CLOCK + PVCLOCK_FIXMAP_BEGIN, + PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1, +#endif FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 831dbb9c6c0..e25cc33ec54 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -26,9 +26,10 @@ #ifdef CONFIG_X86_64 # include <asm/sigcontext32.h> # include <asm/user32.h> -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +struct ksignal; +int ia32_setup_rt_frame(int sig, struct ksignal *ksig, compat_sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct k_sigaction *ka, +int ia32_setup_frame(int sig, struct ksignal *ksig, compat_sigset_t *set, struct pt_regs *regs); #else # define user_i387_ia32_struct user_i387_struct @@ -399,14 +400,17 @@ static inline void drop_init_fpu(struct task_struct *tsk) typedef struct { int preload; } fpu_switch_t; /* - * FIXME! We could do a totally lazy restore, but we need to - * add a per-cpu "this was the task that last touched the FPU - * on this CPU" variable, and the task needs to have a "I last - * touched the FPU on this CPU" and check them. + * Must be run with preemption disabled: this clears the fpu_owner_task, + * on this CPU. * - * We don't do that yet, so "fpu_lazy_restore()" always returns - * false, but some day.. + * This will disable any lazy FPU state restore of the current FPU state, + * but if the current thread owns the FPU, it will still be saved by. */ +static inline void __cpu_disable_lazy_restore(unsigned int cpu) +{ + per_cpu(fpu_owner_task, cpu) = NULL; +} + static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) { return new == this_cpu_read_stable(fpu_owner_task) && diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9a25b522d37..0525a8bdf65 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -44,7 +44,6 @@ #ifdef CONFIG_DYNAMIC_FTRACE #define ARCH_SUPPORTS_FTRACE_OPS 1 -#define ARCH_SUPPORTS_FTRACE_SAVE_REGS #endif #ifndef __ASSEMBLY__ @@ -73,4 +72,28 @@ int ftrace_int3_handler(struct pt_regs *regs); #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ + +#if !defined(__ASSEMBLY__) && !defined(COMPILE_OFFSETS) + +#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_IA32_EMULATION) +#include <asm/compat.h> + +/* + * Because ia32 syscalls do not map to x86_64 syscall numbers + * this screws up the trace output when tracing a ia32 task. + * Instead of reporting bogus syscalls, just do not trace them. + * + * If the user realy wants these, then they should use the + * raw syscall tracepoints with filtering. + */ +#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1 +static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) +{ + if (is_compat_task()) + return true; + return false; +} +#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */ +#endif /* !__ASSEMBLY__ && !COMPILE_OFFSETS */ + #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index f373046e63e..be27ba1e947 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -55,12 +55,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; -#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) - /* Real i386 machines can only support FUTEX_OP_SET */ - if (op != FUTEX_OP_SET && boot_cpu_data.x86 == 3) - return -ENOSYS; -#endif - pagefault_disable(); switch (op) { @@ -118,12 +112,6 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, { int ret = 0; -#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) - /* Real i386 machines have no cmpxchg instruction */ - if (boot_cpu_data.x86 == 3) - return -ENOSYS; -#endif - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 434e2106cc8..b18df579c0e 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -80,9 +80,9 @@ extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg); extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg); #ifdef CONFIG_PCI_MSI -extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id); +extern int default_setup_hpet_msi(unsigned int irq, unsigned int id); #else -static inline int arch_setup_hpet_msi(unsigned int irq, unsigned int id) +static inline int default_setup_hpet_msi(unsigned int irq, unsigned int id) { return -EINVAL; } @@ -111,6 +111,7 @@ extern void hpet_unregister_irq_handler(rtc_irq_handler handler); static inline int hpet_enable(void) { return 0; } static inline int is_hpet_enabled(void) { return 0; } #define hpet_readl(a) 0 +#define default_setup_hpet_msi NULL #endif #endif /* _ASM_X86_HPET_H */ diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 824ca07860d..ef1c4d2d41e 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -1,7 +1,8 @@ #ifndef _I386_HW_BREAKPOINT_H #define _I386_HW_BREAKPOINT_H -#ifdef __KERNEL__ +#include <uapi/asm/hw_breakpoint.h> + #define __ARCH_HW_BREAKPOINT_H /* @@ -71,6 +72,4 @@ extern int arch_bp_generic_fields(int x86_len, int x86_type, extern struct pmu perf_ops_bp; -#endif /* __KERNEL__ */ #endif /* _I386_HW_BREAKPOINT_H */ - diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index eb92a6ed2be..10a78c3d3d5 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -101,6 +101,7 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, irq_attr->polarity = polarity; } +/* Intel specific interrupt remapping information */ struct irq_2_iommu { struct intel_iommu *iommu; u16 irte_index; @@ -108,6 +109,12 @@ struct irq_2_iommu { u8 irte_mask; }; +/* AMD specific interrupt remapping information */ +struct irq_2_irte { + u16 devid; /* Device ID for IRTE table */ + u16 index; /* Index into IRTE table*/ +}; + /* * This is performance-critical, we want to do it O(1) * @@ -120,7 +127,11 @@ struct irq_cfg { u8 vector; u8 move_in_progress : 1; #ifdef CONFIG_IRQ_REMAP - struct irq_2_iommu irq_2_iommu; + u8 remapped : 1; + union { + struct irq_2_iommu irq_2_iommu; + struct irq_2_irte irq_2_irte; + }; #endif }; diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index b518c750993..86095ed1413 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -25,6 +25,7 @@ extern void init_hypervisor(struct cpuinfo_x86 *c); extern void init_hypervisor_platform(void); +extern bool hypervisor_x2apic_available(void); /* * x86 hypervisor information @@ -41,6 +42,9 @@ struct hypervisor_x86 { /* Platform setup (run once per boot) */ void (*init_platform)(void); + + /* X2APIC detection (run once per boot) */ + bool (*x2apic_available)(void); }; extern const struct hypervisor_x86 *x86_hyper; @@ -51,13 +55,4 @@ extern const struct hypervisor_x86 x86_hyper_ms_hyperv; extern const struct hypervisor_x86 x86_hyper_xen_hvm; extern const struct hypervisor_x86 x86_hyper_kvm; -static inline bool hypervisor_x2apic_available(void) -{ - if (kvm_para_available()) - return true; - if (xen_x2apic_para_available()) - return true; - return false; -} - #endif diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index e6232773ce4..d0e8e014104 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -13,32 +13,11 @@ #include <asm/sigcontext32.h> /* signal.h */ -struct sigaction32 { - unsigned int sa_handler; /* Really a pointer, but need to deal - with 32 bits */ - unsigned int sa_flags; - unsigned int sa_restorer; /* Another 32 bit pointer */ - compat_sigset_t sa_mask; /* A 32 bit mask */ -}; - -struct old_sigaction32 { - unsigned int sa_handler; /* Really a pointer, but need to deal - with 32 bits */ - compat_old_sigset_t sa_mask; /* A 32 bit mask */ - unsigned int sa_flags; - unsigned int sa_restorer; /* Another 32 bit pointer */ -}; - -typedef struct sigaltstack_ia32 { - unsigned int ss_sp; - int ss_flags; - unsigned int ss_size; -} stack_ia32_t; struct ucontext_ia32 { unsigned int uc_flags; unsigned int uc_link; - stack_ia32_t uc_stack; + compat_stack_t uc_stack; struct sigcontext_ia32 uc_mcontext; compat_sigset_t uc_sigmask; /* mask last for extensibility */ }; @@ -46,7 +25,7 @@ struct ucontext_ia32 { struct ucontext_x32 { unsigned int uc_flags; unsigned int uc_link; - stack_ia32_t uc_stack; + compat_stack_t uc_stack; unsigned int uc__pad0; /* needed for alignment */ struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */ compat_sigset_t uc_sigmask; /* mask last for extensibility */ diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index adcc0ae73d0..223042086f4 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -1,20 +1,14 @@ -#ifndef _ASM_X86_INIT_32_H -#define _ASM_X86_INIT_32_H +#ifndef _ASM_X86_INIT_H +#define _ASM_X86_INIT_H -#ifdef CONFIG_X86_32 -extern void __init early_ioremap_page_table_range_init(void); -#endif +struct x86_mapping_info { + void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ + void *context; /* context for alloc_pgt_page */ + unsigned long pmd_flag; /* page flag for PMD entry */ + bool kernel_mapping; /* kernel mapping or ident mapping */ +}; -extern void __init zone_sizes_init(void); +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end); -extern unsigned long __init -kernel_physical_mapping_init(unsigned long start, - unsigned long end, - unsigned long page_size_mask); - - -extern unsigned long __initdata pgt_buf_start; -extern unsigned long __meminitdata pgt_buf_end; -extern unsigned long __meminitdata pgt_buf_top; - -#endif /* _ASM_X86_INIT_32_H */ +#endif /* _ASM_X86_INIT_H */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 73d8c5398ea..459e50a424d 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -144,11 +144,24 @@ extern int timer_through_8259; (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) struct io_apic_irq_attr; +struct irq_cfg; extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); void setup_IO_APIC_irq_extra(u32 gsi); extern void ioapic_insert_resources(void); +extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *, + unsigned int, int, + struct io_apic_irq_attr *); +extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *, + unsigned int, int, + struct io_apic_irq_attr *); +extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg); + +extern void native_compose_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id); +extern void native_eoi_ioapic_pin(int apic, int pin, int vector); int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr); extern int save_ioapic_entries(void); @@ -179,6 +192,12 @@ extern void __init native_io_apic_init_mappings(void); extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val); extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); +extern void native_disable_io_apic(void); +extern void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries); +extern void intel_ir_io_apic_print_entries(unsigned int apic, unsigned int nr_entries); +extern int native_ioapic_set_affinity(struct irq_data *, + const struct cpumask *, + bool); static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) { @@ -193,6 +212,9 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned { x86_io_apic_ops.modify(apic, reg, value); } + +extern void io_apic_eoi(unsigned int apic, unsigned int vector); + #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 @@ -223,6 +245,12 @@ static inline void disable_ioapic_support(void) { } #define native_io_apic_read NULL #define native_io_apic_write NULL #define native_io_apic_modify NULL +#define native_disable_io_apic NULL +#define native_io_apic_print_entries NULL +#define native_ioapic_set_affinity NULL +#define native_setup_ioapic_entry NULL +#define native_compose_msi_msg NULL +#define native_eoi_ioapic_pin NULL #endif #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 5fb9bbbd2f1..95fd3527f63 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -26,8 +26,6 @@ #ifdef CONFIG_IRQ_REMAP -extern int irq_remapping_enabled; - extern void setup_irq_remapping_ops(void); extern int irq_remapping_supported(void); extern int irq_remapping_prepare(void); @@ -40,21 +38,19 @@ extern int setup_ioapic_remapped_entry(int irq, unsigned int destination, int vector, struct io_apic_irq_attr *attr); -extern int set_remapped_irq_affinity(struct irq_data *data, - const struct cpumask *mask, - bool force); extern void free_remapped_irq(int irq); extern void compose_remapped_msi_msg(struct pci_dev *pdev, unsigned int irq, unsigned int dest, struct msi_msg *msg, u8 hpet_id); -extern int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec); -extern int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, - int index, int sub_handle); extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id); +extern void panic_if_irq_remap(const char *msg); +extern bool setup_remapped_irq(int irq, + struct irq_cfg *cfg, + struct irq_chip *chip); -#else /* CONFIG_IRQ_REMAP */ +void irq_remap_modify_chip_defaults(struct irq_chip *chip); -#define irq_remapping_enabled 0 +#else /* CONFIG_IRQ_REMAP */ static inline void setup_irq_remapping_ops(void) { } static inline int irq_remapping_supported(void) { return 0; } @@ -71,30 +67,30 @@ static inline int setup_ioapic_remapped_entry(int irq, { return -ENODEV; } -static inline int set_remapped_irq_affinity(struct irq_data *data, - const struct cpumask *mask, - bool force) -{ - return 0; -} static inline void free_remapped_irq(int irq) { } static inline void compose_remapped_msi_msg(struct pci_dev *pdev, unsigned int irq, unsigned int dest, struct msi_msg *msg, u8 hpet_id) { } -static inline int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec) +static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) { return -ENODEV; } -static inline int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, - int index, int sub_handle) + +static inline void panic_if_irq_remap(const char *msg) +{ +} + +static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip) { - return -ENODEV; } -static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) + +static inline bool setup_remapped_irq(int irq, + struct irq_cfg *cfg, + struct irq_chip *chip) { - return -ENODEV; + return false; } #endif /* CONFIG_IRQ_REMAP */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 1508e518c7e..aac5fa62a86 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -109,8 +109,8 @@ #define UV_BAU_MESSAGE 0xf5 -/* Xen vector callback to receive events in a HVM domain */ -#define XEN_HVM_EVTCHN_CALLBACK 0xf3 +/* Vector on which hypervisor callbacks will be delivered */ +#define HYPERVISOR_CALLBACK_VECTOR 0xf3 /* * Local APIC timer IRQ vector is on a different priority level, diff --git a/arch/x86/include/asm/ist.h b/arch/x86/include/asm/ist.h index 7e5dff1de0e..c9803f1a203 100644 --- a/arch/x86/include/asm/ist.h +++ b/arch/x86/include/asm/ist.h @@ -1,6 +1,3 @@ -#ifndef _ASM_X86_IST_H -#define _ASM_X86_IST_H - /* * Include file for the interface to IST BIOS * Copyright 2002 Andy Grover <andrew.grover@intel.com> @@ -15,20 +12,12 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ +#ifndef _ASM_X86_IST_H +#define _ASM_X86_IST_H +#include <uapi/asm/ist.h> -#include <linux/types.h> - -struct ist_info { - __u32 signature; - __u32 command; - __u32 event; - __u32 perf_level; -}; - -#ifdef __KERNEL__ extern struct ist_info ist_info; -#endif /* __KERNEL__ */ #endif /* _ASM_X86_IST_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 317ff1703d0..17483a492f1 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -48,11 +48,11 @@ # define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64) #else /* Maximum physical address we can use pages from */ -# define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL) +# define KEXEC_SOURCE_MEMORY_LIMIT (MAXMEM-1) /* Maximum address we can reach in physical address mode */ -# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL) +# define KEXEC_DESTINATION_MEMORY_LIMIT (MAXMEM-1) /* Maximum address we can use for the control pages */ -# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL) +# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1) /* Allocate one page for the pdp and the second for the code */ # define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL) @@ -163,6 +163,9 @@ struct kimage_arch { }; #endif +typedef void crash_vmclear_fn(void); +extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h new file mode 100644 index 00000000000..a92b1763c41 --- /dev/null +++ b/arch/x86/include/asm/kvm_guest.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_KVM_GUEST_H +#define _ASM_X86_KVM_GUEST_H + +int kvm_setup_vsyscall_timeinfo(void); + +#endif /* _ASM_X86_KVM_GUEST_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b2e11f45243..635a74d2240 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -22,6 +22,8 @@ #include <linux/kvm_para.h> #include <linux/kvm_types.h> #include <linux/perf_event.h> +#include <linux/pvclock_gtod.h> +#include <linux/clocksource.h> #include <asm/pvclock-abi.h> #include <asm/desc.h> @@ -31,10 +33,10 @@ #define KVM_MAX_VCPUS 254 #define KVM_SOFT_MAX_VCPUS 160 -#define KVM_MEMORY_SLOTS 32 -/* memory slots that does not exposed to userspace */ -#define KVM_PRIVATE_MEM_SLOTS 4 -#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) +#define KVM_USER_MEM_SLOTS 125 +/* memory slots that are not exposed to userspace */ +#define KVM_PRIVATE_MEM_SLOTS 3 +#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) #define KVM_MMIO_SIZE 16 @@ -217,11 +219,6 @@ struct kvm_mmu_page { u64 *spt; /* hold the gfn of each spte inside spt */ gfn_t *gfns; - /* - * One bit set per slot which has memory - * in this shadow page. - */ - DECLARE_BITMAP(slot_bitmap, KVM_MEM_SLOTS_NUM); bool unsync; int root_count; /* Currently serving as active root */ unsigned int unsync_children; @@ -442,6 +439,7 @@ struct kvm_vcpu_arch { s8 virtual_tsc_shift; u32 virtual_tsc_mult; u32 virtual_tsc_khz; + s64 ia32_tsc_adjust_msr; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ @@ -499,6 +497,13 @@ struct kvm_vcpu_arch { u64 msr_val; struct gfn_to_hva_cache data; } pv_eoi; + + /* + * Indicate whether the access faults on its page table in guest + * which is set when fix page fault and used to detect unhandeable + * instruction. + */ + bool write_fault_to_shadow_pgtable; }; struct kvm_lpage_info { @@ -559,6 +564,12 @@ struct kvm_arch { u64 cur_tsc_write; u64 cur_tsc_offset; u8 cur_tsc_generation; + int nr_vcpus_matched_tsc; + + spinlock_t pvclock_gtod_sync_lock; + bool use_master_clock; + u64 master_kernel_ns; + cycle_t master_cycle_now; struct kvm_xen_hvm_config xen_hvm_config; @@ -612,6 +623,12 @@ struct kvm_vcpu_stat { struct x86_instruction_info; +struct msr_data { + bool host_initiated; + u32 index; + u64 data; +}; + struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ @@ -634,7 +651,7 @@ struct kvm_x86_ops { void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); - int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); + int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); void (*get_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -682,6 +699,11 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); + int (*vm_has_apicv)(struct kvm *kvm); + void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); + void (*hwapic_isr_update)(struct kvm *kvm, int isr); + void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); + void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); @@ -697,10 +719,11 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); + u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); - u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu); + u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); @@ -785,7 +808,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu, void kvm_enable_efer_bits(u64); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); +int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); struct x86_emulate_ctxt; @@ -812,7 +835,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); @@ -975,6 +998,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); +int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index eb3e9d85e1f..695399f2d5e 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -1,103 +1,8 @@ #ifndef _ASM_X86_KVM_PARA_H #define _ASM_X86_KVM_PARA_H -#include <linux/types.h> -#include <asm/hyperv.h> - -/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It - * should be used to determine that a VM is running under KVM. - */ -#define KVM_CPUID_SIGNATURE 0x40000000 - -/* This CPUID returns a feature bitmap in eax. Before enabling a particular - * paravirtualization, the appropriate feature bit should be checked. - */ -#define KVM_CPUID_FEATURES 0x40000001 -#define KVM_FEATURE_CLOCKSOURCE 0 -#define KVM_FEATURE_NOP_IO_DELAY 1 -#define KVM_FEATURE_MMU_OP 2 -/* This indicates that the new set of kvmclock msrs - * are available. The use of 0x11 and 0x12 is deprecated - */ -#define KVM_FEATURE_CLOCKSOURCE2 3 -#define KVM_FEATURE_ASYNC_PF 4 -#define KVM_FEATURE_STEAL_TIME 5 -#define KVM_FEATURE_PV_EOI 6 - -/* The last 8 bits are used to indicate how to interpret the flags field - * in pvclock structure. If no bits are set, all flags are ignored. - */ -#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 - -#define MSR_KVM_WALL_CLOCK 0x11 -#define MSR_KVM_SYSTEM_TIME 0x12 - -#define KVM_MSR_ENABLED 1 -/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ -#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 -#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 -#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 -#define MSR_KVM_STEAL_TIME 0x4b564d03 -#define MSR_KVM_PV_EOI_EN 0x4b564d04 - -struct kvm_steal_time { - __u64 steal; - __u32 version; - __u32 flags; - __u32 pad[12]; -}; - -#define KVM_STEAL_ALIGNMENT_BITS 5 -#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) -#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) - -#define KVM_MAX_MMU_OP_BATCH 32 - -#define KVM_ASYNC_PF_ENABLED (1 << 0) -#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) - -/* Operations for KVM_HC_MMU_OP */ -#define KVM_MMU_OP_WRITE_PTE 1 -#define KVM_MMU_OP_FLUSH_TLB 2 -#define KVM_MMU_OP_RELEASE_PT 3 - -/* Payload for KVM_HC_MMU_OP */ -struct kvm_mmu_op_header { - __u32 op; - __u32 pad; -}; - -struct kvm_mmu_op_write_pte { - struct kvm_mmu_op_header header; - __u64 pte_phys; - __u64 pte_val; -}; - -struct kvm_mmu_op_flush_tlb { - struct kvm_mmu_op_header header; -}; - -struct kvm_mmu_op_release_pt { - struct kvm_mmu_op_header header; - __u64 pt_phys; -}; - -#define KVM_PV_REASON_PAGE_NOT_PRESENT 1 -#define KVM_PV_REASON_PAGE_READY 2 - -struct kvm_vcpu_pv_apf_data { - __u32 reason; - __u8 pad[60]; - __u32 enabled; -}; - -#define KVM_PV_EOI_BIT 0 -#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT) -#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK -#define KVM_PV_EOI_DISABLED 0x0 - -#ifdef __KERNEL__ #include <asm/processor.h> +#include <uapi/asm/kvm_para.h> extern void kvmclock_init(void); extern int kvm_register_clock(char *txt); @@ -122,7 +27,7 @@ static inline bool kvm_check_and_clear_guest_paused(void) * * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. * The hypercall number should be placed in rax and the return value will be - * placed in rax. No other registers will be clobbered unless explicited + * placed in rax. No other registers will be clobbered unless explicitly * noted by the particular hypercall. */ @@ -180,13 +85,13 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, return ret; } -static inline int kvm_para_available(void) +static inline bool kvm_para_available(void) { unsigned int eax, ebx, ecx, edx; char signature[13]; if (boot_cpu_data.cpuid_level < 0) - return 0; /* So we don't blow up on old processors */ + return false; /* So we don't blow up on old processors */ if (cpu_has_hypervisor) { cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); @@ -196,10 +101,10 @@ static inline int kvm_para_available(void) signature[12] = 0; if (strcmp(signature, "KVMKVMKVM") == 0) - return 1; + return true; } - return 0; + return false; } static inline unsigned int kvm_arch_para_features(void) @@ -228,6 +133,4 @@ static inline void kvm_disable_steal_time(void) } #endif -#endif /* __KERNEL__ */ - #endif /* _ASM_X86_KVM_PARA_H */ diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 48142971b25..79327e9483a 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -27,20 +27,20 @@ #define __asmlinkage_protect0(ret) \ __asmlinkage_protect_n(ret) #define __asmlinkage_protect1(ret, arg1) \ - __asmlinkage_protect_n(ret, "g" (arg1)) + __asmlinkage_protect_n(ret, "m" (arg1)) #define __asmlinkage_protect2(ret, arg1, arg2) \ - __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2)) + __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2)) #define __asmlinkage_protect3(ret, arg1, arg2, arg3) \ - __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3)) + __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3)) #define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \ - __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ - "g" (arg4)) + __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \ + "m" (arg4)) #define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \ - __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ - "g" (arg4), "g" (arg5)) + __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \ + "m" (arg4), "m" (arg5)) #define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \ - __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ - "g" (arg4), "g" (arg5), "g" (arg6)) + __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \ + "m" (arg4), "m" (arg5), "m" (arg6)) #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index c8bed0da434..2d89e3980cb 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -124,27 +124,11 @@ static inline int local_add_negative(long i, local_t *l) */ static inline long local_add_return(long i, local_t *l) { - long __i; -#ifdef CONFIG_M386 - unsigned long flags; - if (unlikely(boot_cpu_data.x86 <= 3)) - goto no_xadd; -#endif - /* Modern 486+ processor */ - __i = i; + long __i = i; asm volatile(_ASM_XADD "%0, %1;" : "+r" (i), "+m" (l->a.counter) : : "memory"); return i + __i; - -#ifdef CONFIG_M386 -no_xadd: /* Legacy 386 processor */ - local_irq_save(flags); - __i = local_read(l); - local_set(l, i + __i); - local_irq_restore(flags); - return i + __i; -#endif } static inline long local_sub_return(long i, local_t *l) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 54d73b1f00a..f4076af1f4e 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -1,8 +1,7 @@ #ifndef _ASM_X86_MCE_H #define _ASM_X86_MCE_H -#include <linux/types.h> -#include <asm/ioctls.h> +#include <uapi/asm/mce.h> /* * Machine Check support for x86 @@ -16,7 +15,7 @@ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ #define MCG_EXT_CNT_SHIFT 16 #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) -#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ +#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ /* MCG_STATUS register defines */ #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ @@ -64,28 +63,15 @@ #define MCJ_EXCEPTION 0x8 /* raise as exception */ #define MCJ_IRQ_BRAODCAST 0x10 /* do IRQ broadcasting */ -/* Fields are zero when not available */ -struct mce { - __u64 status; - __u64 misc; - __u64 addr; - __u64 mcgstatus; - __u64 ip; - __u64 tsc; /* cpu time stamp counter */ - __u64 time; /* wall time_t when error was detected */ - __u8 cpuvendor; /* cpu vendor as encoded in system.h */ - __u8 inject_flags; /* software inject flags */ - __u16 pad; - __u32 cpuid; /* CPUID 1 EAX */ - __u8 cs; /* code segment */ - __u8 bank; /* machine check bank */ - __u8 cpu; /* cpu number; obsolete; use extcpu now */ - __u8 finished; /* entry is valid */ - __u32 extcpu; /* linux cpu number that detected the error */ - __u32 socketid; /* CPU socket ID */ - __u32 apicid; /* CPU initial apic ID */ - __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ -}; +#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */ + +/* Software defined banks */ +#define MCE_EXTENDED_BANK 128 +#define MCE_THERMAL_BANK (MCE_EXTENDED_BANK + 0) +#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) + +#define MCE_LOG_LEN 32 +#define MCE_LOG_SIGNATURE "MACHINECHECK" /* * This structure contains all data related to the MCE log. Also @@ -93,9 +79,6 @@ struct mce { * debugging tools. Each entry is only valid when its finished flag * is set. */ - -#define MCE_LOG_LEN 32 - struct mce_log { char signature[12]; /* "MACHINECHECK" */ unsigned len; /* = MCE_LOG_LEN */ @@ -105,20 +88,22 @@ struct mce_log { struct mce entry[MCE_LOG_LEN]; }; -#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */ - -#define MCE_LOG_SIGNATURE "MACHINECHECK" - -#define MCE_GET_RECORD_LEN _IOR('M', 1, int) -#define MCE_GET_LOG_LEN _IOR('M', 2, int) -#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int) - -/* Software defined banks */ -#define MCE_EXTENDED_BANK 128 -#define MCE_THERMAL_BANK MCE_EXTENDED_BANK + 0 -#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) +struct mca_config { + bool dont_log_ce; + bool cmci_disabled; + bool ignore_ce; + bool disabled; + bool ser; + bool bios_cmci_threshold; + u8 banks; + s8 bootlog; + int tolerant; + int monarch_timeout; + int panic_timeout; + u32 rip_msr; +}; -#ifdef __KERNEL__ +extern struct mca_config mca_cfg; extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); @@ -126,7 +111,6 @@ extern void mce_unregister_decode_chain(struct notifier_block *nb); #include <linux/init.h> #include <linux/atomic.h> -extern int mce_disabled; extern int mce_p5_enabled; #ifdef CONFIG_X86_MCE @@ -159,9 +143,6 @@ DECLARE_PER_CPU(struct device *, mce_device); #define MAX_NR_BANKS 32 #ifdef CONFIG_X86_MCE_INTEL -extern int mce_cmci_disabled; -extern int mce_ignore_ce; -extern int mce_bios_cmci_threshold; void mce_intel_feature_init(struct cpuinfo_x86 *c); void cmci_clear(void); void cmci_reenable(void); @@ -247,5 +228,4 @@ struct cper_sec_mem_err; extern void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err); -#endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 43d921b4752..6825e2efd1b 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -57,4 +57,18 @@ static inline struct microcode_ops * __init init_amd_microcode(void) static inline void __exit exit_amd_microcode(void) {} #endif +#ifdef CONFIG_MICROCODE_EARLY +#define MAX_UCODE_COUNT 128 +extern void __init load_ucode_bsp(void); +extern __init void load_ucode_ap(void); +extern int __init save_microcode_in_initrd(void); +#else +static inline void __init load_ucode_bsp(void) {} +static inline __init void load_ucode_ap(void) {} +static inline int __init save_microcode_in_initrd(void) +{ + return 0; +} +#endif + #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h new file mode 100644 index 00000000000..5356f927d41 --- /dev/null +++ b/arch/x86/include/asm/microcode_intel.h @@ -0,0 +1,85 @@ +#ifndef _ASM_X86_MICROCODE_INTEL_H +#define _ASM_X86_MICROCODE_INTEL_H + +#include <asm/microcode.h> + +struct microcode_header_intel { + unsigned int hdrver; + unsigned int rev; + unsigned int date; + unsigned int sig; + unsigned int cksum; + unsigned int ldrver; + unsigned int pf; + unsigned int datasize; + unsigned int totalsize; + unsigned int reserved[3]; +}; + +struct microcode_intel { + struct microcode_header_intel hdr; + unsigned int bits[0]; +}; + +/* microcode format is extended from prescott processors */ +struct extended_signature { + unsigned int sig; + unsigned int pf; + unsigned int cksum; +}; + +struct extended_sigtable { + unsigned int count; + unsigned int cksum; + unsigned int reserved[3]; + struct extended_signature sigs[0]; +}; + +#define DEFAULT_UCODE_DATASIZE (2000) +#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) +#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) +#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) +#define DWSIZE (sizeof(u32)) + +#define get_totalsize(mc) \ + (((struct microcode_intel *)mc)->hdr.totalsize ? \ + ((struct microcode_intel *)mc)->hdr.totalsize : \ + DEFAULT_UCODE_TOTALSIZE) + +#define get_datasize(mc) \ + (((struct microcode_intel *)mc)->hdr.datasize ? \ + ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) + +#define sigmatch(s1, s2, p1, p2) \ + (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) + +#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) + +extern int +get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev); +extern int microcode_sanity_check(void *mc, int print_err); +extern int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev); +extern int +update_match_revision(struct microcode_header_intel *mc_header, int rev); + +#ifdef CONFIG_MICROCODE_INTEL_EARLY +extern void __init load_ucode_intel_bsp(void); +extern void __cpuinit load_ucode_intel_ap(void); +extern void show_ucode_info_early(void); +#else +static inline __init void load_ucode_intel_bsp(void) {} +static inline __cpuinit void load_ucode_intel_ap(void) {} +static inline void show_ucode_info_early(void) {} +#endif + +#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) +extern int save_mc_for_early(u8 *mc); +#else +static inline int save_mc_for_early(u8 *mc) +{ + return 0; +} +#endif + +#endif /* _ASM_X86_MICROCODE_INTEL_H */ diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index eb05fb3b02f..8a9b3e288cb 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -14,12 +14,6 @@ extern struct pglist_data *node_data[]; #include <asm/numaq.h> -extern void resume_map_numa_kva(pgd_t *pgd); - -#else /* !CONFIG_NUMA */ - -static inline void resume_map_numa_kva(pgd_t *pgd) {} - #endif /* CONFIG_NUMA */ #ifdef CONFIG_DISCONTIGMEM diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 9eae7752ae9..e3b7819caee 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -5,8 +5,6 @@ #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ -#elif defined CONFIG_M386 -#define MODULE_PROC_FAMILY "386 " #elif defined CONFIG_M486 #define MODULE_PROC_FAMILY "486 " #elif defined CONFIG_M586 diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 79ce5685ab6..c2934be2446 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -11,4 +11,8 @@ struct ms_hyperv_info { extern struct ms_hyperv_info ms_hyperv; +void hyperv_callback_vector(void); +void hyperv_vector_handler(struct pt_regs *regs); +void hv_register_vmbus_handler(int irq, irq_handler_t handler); + #endif diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 813ed103f45..9264802e282 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -1,18 +1,10 @@ #ifndef _ASM_X86_MSR_H #define _ASM_X86_MSR_H -#include <asm/msr-index.h> +#include <uapi/asm/msr.h> #ifndef __ASSEMBLY__ -#include <linux/types.h> -#include <linux/ioctl.h> - -#define X86_IOC_RDMSR_REGS _IOWR('c', 0xA0, __u32[8]) -#define X86_IOC_WRMSR_REGS _IOWR('c', 0xA1, __u32[8]) - -#ifdef __KERNEL__ - #include <asm/asm.h> #include <asm/errno.h> #include <asm/cpumask.h> @@ -271,6 +263,5 @@ static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) return wrmsr_safe_regs(regs); } #endif /* CONFIG_SMP */ -#endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_MSR_H */ diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 7e3f17f92c6..e235582f993 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -23,97 +23,8 @@ #ifndef _ASM_X86_MTRR_H #define _ASM_X86_MTRR_H -#include <linux/types.h> -#include <linux/ioctl.h> -#include <linux/errno.h> +#include <uapi/asm/mtrr.h> -#define MTRR_IOCTL_BASE 'M' - -/* Warning: this structure has a different order from i386 - on x86-64. The 32bit emulation code takes care of that. - But you need to use this for 64bit, otherwise your X server - will break. */ - -#ifdef __i386__ -struct mtrr_sentry { - unsigned long base; /* Base address */ - unsigned int size; /* Size of region */ - unsigned int type; /* Type of region */ -}; - -struct mtrr_gentry { - unsigned int regnum; /* Register number */ - unsigned long base; /* Base address */ - unsigned int size; /* Size of region */ - unsigned int type; /* Type of region */ -}; - -#else /* __i386__ */ - -struct mtrr_sentry { - __u64 base; /* Base address */ - __u32 size; /* Size of region */ - __u32 type; /* Type of region */ -}; - -struct mtrr_gentry { - __u64 base; /* Base address */ - __u32 size; /* Size of region */ - __u32 regnum; /* Register number */ - __u32 type; /* Type of region */ - __u32 _pad; /* Unused */ -}; - -#endif /* !__i386__ */ - -struct mtrr_var_range { - __u32 base_lo; - __u32 base_hi; - __u32 mask_lo; - __u32 mask_hi; -}; - -/* In the Intel processor's MTRR interface, the MTRR type is always held in - an 8 bit field: */ -typedef __u8 mtrr_type; - -#define MTRR_NUM_FIXED_RANGES 88 -#define MTRR_MAX_VAR_RANGES 256 - -struct mtrr_state_type { - struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES]; - mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES]; - unsigned char enabled; - unsigned char have_fixed; - mtrr_type def_type; -}; - -#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) -#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) - -/* These are the various ioctls */ -#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry) -#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry) -#define MTRRIOC_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry) -#define MTRRIOC_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry) -#define MTRRIOC_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry) -#define MTRRIOC_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry) -#define MTRRIOC_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry) -#define MTRRIOC_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry) -#define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry) -#define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry) - -/* These are the region types */ -#define MTRR_TYPE_UNCACHABLE 0 -#define MTRR_TYPE_WRCOMB 1 -/*#define MTRR_TYPE_ 2*/ -/*#define MTRR_TYPE_ 3*/ -#define MTRR_TYPE_WRTHROUGH 4 -#define MTRR_TYPE_WRPROT 5 -#define MTRR_TYPE_WRBACK 6 -#define MTRR_NUM_TYPES 7 - -#ifdef __KERNEL__ /* The following functions are for use by other drivers */ # ifdef CONFIG_MTRR @@ -208,6 +119,4 @@ struct mtrr_gentry32 { _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32) #endif /* CONFIG_COMPAT */ -#endif /* __KERNEL__ */ - #endif /* _ASM_X86_MTRR_H */ diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index bcdff997668..2f366d0ac6b 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -4,7 +4,8 @@ #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf #define MWAIT_SUBSTATE_SIZE 4 -#define MWAIT_MAX_NUM_CSTATES 8 +#define MWAIT_HINT2CSTATE(hint) (((hint) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) +#define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK) #define CPUID_MWAIT_LEAF 5 #define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 49119fcea2d..1b99ee5c9f0 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -54,13 +54,11 @@ static inline int numa_cpu_node(int cpu) #ifdef CONFIG_X86_32 # include <asm/numa_32.h> -#else -# include <asm/numa_64.h> #endif #ifdef CONFIG_NUMA -extern void __cpuinit numa_set_node(int cpu, int node); -extern void __cpuinit numa_clear_node(int cpu); +extern void numa_set_node(int cpu, int node); +extern void numa_clear_node(int cpu); extern void __init init_cpu_to_node(void); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h deleted file mode 100644 index 0c05f7ae46e..00000000000 --- a/arch/x86/include/asm/numa_64.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_X86_NUMA_64_H -#define _ASM_X86_NUMA_64_H - -extern unsigned long numa_free_all_bootmem(void); - -#endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/include/asm/numachip/numachip.h b/arch/x86/include/asm/numachip/numachip.h new file mode 100644 index 00000000000..1c6f7f6212c --- /dev/null +++ b/arch/x86/include/asm/numachip/numachip.h @@ -0,0 +1,19 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-specific header file + * + * Copyright (C) 2012 Numascale AS. All rights reserved. + * + * Send feedback to <support@numascale.com> + * + */ + +#ifndef _ASM_X86_NUMACHIP_NUMACHIP_H +#define _ASM_X86_NUMACHIP_NUMACHIP_H + +extern int __init pci_numachip_init(void); + +#endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */ diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 8ca82839288..c87892442e5 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -17,6 +17,10 @@ struct page; +#include <linux/range.h> +extern struct range pfn_mapped[]; +extern int nr_pfn_mapped; + static inline void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { @@ -44,7 +48,8 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, * case properly. Once all supported versions of gcc understand it, we can * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated) */ -#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x))) +#define __pa_symbol(x) \ + __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index da4e762406f..4d550d04b60 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -15,6 +15,7 @@ extern unsigned long __phys_addr(unsigned long); #else #define __phys_addr(x) __phys_addr_nodebug(x) #endif +#define __phys_addr_symbol(x) __phys_addr(x) #define __phys_reloc_hide(x) RELOC_HIDE((x), 0) #ifdef CONFIG_FLATMEM diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 072694ed81a..0f1ddee6a0c 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -3,4 +3,40 @@ #include <asm/page_64_types.h> +#ifndef __ASSEMBLY__ + +/* duplicated to the one in bootmem.h */ +extern unsigned long max_pfn; +extern unsigned long phys_base; + +static inline unsigned long __phys_addr_nodebug(unsigned long x) +{ + unsigned long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET)); + + return x; +} + +#ifdef CONFIG_DEBUG_VIRTUAL +extern unsigned long __phys_addr(unsigned long); +extern unsigned long __phys_addr_symbol(unsigned long); +#else +#define __phys_addr(x) __phys_addr_nodebug(x) +#define __phys_addr_symbol(x) \ + ((unsigned long)(x) - __START_KERNEL_map + phys_base) +#endif + +#define __phys_reloc_hide(x) (x) + +#ifdef CONFIG_FLATMEM +#define pfn_valid(pfn) ((pfn) < max_pfn) +#endif + +void clear_page(void *page); +void copy_page(void *to, void *from); + +#endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_PAGE_64_H */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 320f7bb95f7..8b491e66eaa 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -50,26 +50,4 @@ #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) #define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL) -#ifndef __ASSEMBLY__ -void clear_page(void *page); -void copy_page(void *to, void *from); - -/* duplicated to the one in bootmem.h */ -extern unsigned long max_pfn; -extern unsigned long phys_base; - -extern unsigned long __phys_addr(unsigned long); -#define __phys_reloc_hide(x) (x) - -#define vmemmap ((struct page *)VMEMMAP_START) - -extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); -extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); - -#endif /* !__ASSEMBLY__ */ - -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_pfn) -#endif - #endif /* _ASM_X86_PAGE_64_DEFS_H */ diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index e21fdd10479..54c97879195 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -51,6 +51,8 @@ static inline phys_addr_t get_max_mapped(void) return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; } +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn); + extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a0facf3908d..5edd1742cfd 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -528,7 +528,6 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { @@ -539,7 +538,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, native_pmd_val(pmd)); } -#endif static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { diff --git a/arch/x86/include/asm/parport.h b/arch/x86/include/asm/parport.h index 3c4ffeb467e..0d2d3b29118 100644 --- a/arch/x86/include/asm/parport.h +++ b/arch/x86/include/asm/parport.h @@ -1,8 +1,8 @@ #ifndef _ASM_X86_PARPORT_H #define _ASM_X86_PARPORT_H -static int __devinit parport_pc_find_isa_ports(int autoirq, int autodma); -static int __devinit parport_pc_find_nonpci_ports(int autoirq, int autodma) +static int parport_pc_find_isa_ports(int autoirq, int autodma); +static int parport_pc_find_nonpci_ports(int autoirq, int autodma) { return parport_pc_find_isa_ports(autoirq, autodma); } diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 6e41b934392..d9e9e6c7ed3 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -14,6 +14,9 @@ struct pci_sysdata { int domain; /* PCI domain */ int node; /* NUMA node */ +#ifdef CONFIG_ACPI + void *acpi; /* ACPI-specific data */ +#endif #ifdef CONFIG_X86_64 void *iommu; /* IOMMU private data */ #endif @@ -121,9 +124,12 @@ static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq) #define arch_teardown_msi_irq x86_teardown_msi_irq #define arch_restore_msi_irqs x86_restore_msi_irqs /* implemented in arch/x86/kernel/apic/io_apic. */ +struct msi_desc; int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void native_teardown_msi_irq(unsigned int irq); void native_restore_msi_irqs(struct pci_dev *dev, int irq); +int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, + unsigned int irq_base, unsigned int irq_offset); /* default to the implementation in drivers/lib/msi.c */ #define HAVE_DEFAULT_MSI_TEARDOWN_IRQS #define HAVE_DEFAULT_MSI_RESTORE_IRQS @@ -171,4 +177,16 @@ cpumask_of_pcibus(const struct pci_bus *bus) } #endif +struct pci_setup_rom { + struct setup_data data; + uint16_t vendor; + uint16_t devid; + uint64_t pcilen; + unsigned long segment; + unsigned long bus; + unsigned long device; + unsigned long function; + uint8_t romdata[0]; +}; + #endif /* _ASM_X86_PCI_H */ diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 73e8eeff22e..fa1195dae42 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -54,7 +54,6 @@ void pcibios_set_cache_line_size(void); /* pci-pc.c */ extern int pcibios_last_bus; -extern struct pci_bus *pci_root_bus; extern struct pci_ops pci_root_ops; void pcibios_scan_specific_bus(int busn); @@ -140,11 +139,10 @@ struct pci_mmcfg_region { extern int __init pci_mmcfg_arch_init(void); extern void __init pci_mmcfg_arch_free(void); -extern int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg); +extern int pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg); extern void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg); -extern int __devinit pci_mmconfig_insert(struct device *dev, - u16 seg, u8 start, - u8 end, phys_addr_t addr); +extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end, + phys_addr_t addr); extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end); extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus); diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 1104afaba52..0da5200ee79 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -406,7 +406,6 @@ do { \ #define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) #define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) -#ifndef CONFIG_M386 #define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) #define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) #define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) @@ -421,8 +420,6 @@ do { \ #define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#endif /* !CONFIG_M386 */ - #ifdef CONFIG_X86_CMPXCHG64 #define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \ ({ \ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 4fabcdf1cfa..57cb6340221 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -29,8 +29,13 @@ #define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23) #define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL -#define AMD_PERFMON_EVENTSEL_GUESTONLY (1ULL << 40) -#define AMD_PERFMON_EVENTSEL_HOSTONLY (1ULL << 41) +#define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36) +#define AMD64_EVENTSEL_GUESTONLY (1ULL << 40) +#define AMD64_EVENTSEL_HOSTONLY (1ULL << 41) + +#define AMD64_EVENTSEL_INT_CORE_SEL_SHIFT 37 +#define AMD64_EVENTSEL_INT_CORE_SEL_MASK \ + (0xFULL << AMD64_EVENTSEL_INT_CORE_SEL_SHIFT) #define AMD64_EVENTSEL_EVENT \ (ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32)) @@ -46,8 +51,12 @@ #define AMD64_RAW_EVENT_MASK \ (X86_RAW_EVENT_MASK | \ AMD64_EVENTSEL_EVENT) +#define AMD64_RAW_EVENT_MASK_NB \ + (AMD64_EVENTSEL_EVENT | \ + ARCH_PERFMON_EVENTSEL_UMASK) #define AMD64_NUM_COUNTERS 4 #define AMD64_NUM_COUNTERS_CORE 6 +#define AMD64_NUM_COUNTERS_NB 4 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a1f780d45f7..1e672234c4f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -142,6 +142,11 @@ static inline unsigned long pmd_pfn(pmd_t pmd) return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; } +static inline unsigned long pud_pfn(pud_t pud) +{ + return (pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) @@ -390,6 +395,7 @@ pte_t *populate_extra_pte(unsigned long vaddr); #ifndef __ASSEMBLY__ #include <linux/mm_types.h> +#include <linux/log2.h> static inline int pte_none(pte_t pte) { @@ -404,7 +410,14 @@ static inline int pte_same(pte_t a, pte_t b) static inline int pte_present(pte_t a) { - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); + return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | + _PAGE_NUMA); +} + +#define pte_accessible pte_accessible +static inline int pte_accessible(pte_t a) +{ + return pte_flags(a) & _PAGE_PRESENT; } static inline int pte_hidden(pte_t pte) @@ -420,7 +433,8 @@ static inline int pmd_present(pmd_t pmd) * the _PAGE_PSE flag will remain set at all times while the * _PAGE_PRESENT bit is clear). */ - return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | + _PAGE_NUMA); } static inline int pmd_none(pmd_t pmd) @@ -479,6 +493,11 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) static inline int pmd_bad(pmd_t pmd) { +#ifdef CONFIG_NUMA_BALANCING + /* pmd_numa check */ + if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) + return 0; +#endif return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; } @@ -602,6 +621,8 @@ static inline int pgd_none(pgd_t pgd) #ifndef __ASSEMBLY__ extern int direct_gbpages; +void init_mem_mapping(void); +void early_alloc_pgt_buf(void); /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) @@ -768,6 +789,32 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) memcpy(dst, src, count * sizeof(pgd_t)); } +#define PTE_SHIFT ilog2(PTRS_PER_PTE) +static inline int page_level_shift(enum pg_level level) +{ + return (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT; +} +static inline unsigned long page_level_size(enum pg_level level) +{ + return 1UL << page_level_shift(level); +} +static inline unsigned long page_level_mask(enum pg_level level) +{ + return ~(page_level_size(level) - 1); +} + +/* + * The x86 doesn't have any external MMU info: the kernel page + * tables contain all the necessary information. + */ +static inline void update_mmu_cache(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ +} +static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmd) +{ +} #include <asm-generic/pgtable.h> #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 8faa215a503..9ee322103c6 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -66,13 +66,6 @@ do { \ __flush_tlb_one((vaddr)); \ } while (0) -/* - * The i386 doesn't have any external MMU info: the kernel page - * tables contain all the necessary information. - */ -#define update_mmu_cache(vma, address, ptep) do { } while (0) -#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) - #endif /* !__ASSEMBLY__ */ /* diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 47356f9df82..e22c1dbf7fe 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -142,9 +142,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) #define pte_unmap(pte) ((void)(pte))/* NOP */ -#define update_mmu_cache(vma, address, ptep) do { } while (0) -#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) - /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) @@ -183,6 +180,11 @@ extern void cleanup_highmap(void); #define __HAVE_ARCH_PTE_SAME +#define vmemmap ((struct page *)VMEMMAP_START) + +extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); +extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 766ea16fbbb..2d883440cb9 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_PGTABLE_64_DEFS_H #define _ASM_X86_PGTABLE_64_DEFS_H +#include <asm/sparsemem.h> + #ifndef __ASSEMBLY__ #include <linux/types.h> @@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t; #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) +#define EARLY_DYNAMIC_PAGE_TABLES 64 + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index ec8a1fc9505..567b5d0632b 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -64,6 +64,26 @@ #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +/* + * _PAGE_NUMA indicates that this page will trigger a numa hinting + * minor page fault to gather numa placement statistics (see + * pte_numa()). The bit picked (8) is within the range between + * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't + * require changes to the swp entry format because that bit is always + * zero when the pte is not present. + * + * The bit picked must be always zero when the pmd is present and not + * present, so that we don't lose information when we set it while + * atomically clearing the present bit. + * + * Because we shared the same bit (8) with _PAGE_PROTNONE this can be + * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE + * couldn't reach, like handle_mm_fault() (see access_error in + * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for + * handle_mm_fault() to be invoked). + */ +#define _PAGE_NUMA _PAGE_PROTNONE + #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ @@ -301,7 +321,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); -extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 extern void native_pagetable_init(void); #else @@ -311,7 +330,7 @@ extern void native_pagetable_init(void); struct seq_file; extern void arch_report_meminfo(struct seq_file *m); -enum { +enum pg_level { PG_LEVEL_NONE, PG_LEVEL_4K, PG_LEVEL_2M, @@ -332,6 +351,8 @@ static inline void update_page_count(int level, unsigned long pages) { } * as a pte too. */ extern pte_t *lookup_address(unsigned long address, unsigned int *level); +extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase); +extern phys_addr_t slow_virt_to_phys(void *__address); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/posix_types.h b/arch/x86/include/asm/posix_types.h index bad3665c25f..f565f6dd59d 100644 --- a/arch/x86/include/asm/posix_types.h +++ b/arch/x86/include/asm/posix_types.h @@ -1,15 +1,5 @@ -#ifdef __KERNEL__ # ifdef CONFIG_X86_32 # include <asm/posix_types_32.h> # else # include <asm/posix_types_64.h> # endif -#else -# ifdef __i386__ -# include <asm/posix_types_32.h> -# elif defined(__ILP32__) -# include <asm/posix_types_x32.h> -# else -# include <asm/posix_types_64.h> -# endif -#endif diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 680cf09ed10..39fb618e221 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -1,106 +1,11 @@ #ifndef _ASM_X86_PROCESSOR_FLAGS_H #define _ASM_X86_PROCESSOR_FLAGS_H -/* Various flags defined: can be included from assembler. */ -/* - * EFLAGS bits - */ -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ -#define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */ -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ -#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */ -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ +#include <uapi/asm/processor-flags.h> -/* - * Basic CPU control in CR0 - */ -#define X86_CR0_PE 0x00000001 /* Protection Enable */ -#define X86_CR0_MP 0x00000002 /* Monitor Coprocessor */ -#define X86_CR0_EM 0x00000004 /* Emulation */ -#define X86_CR0_TS 0x00000008 /* Task Switched */ -#define X86_CR0_ET 0x00000010 /* Extension Type */ -#define X86_CR0_NE 0x00000020 /* Numeric Error */ -#define X86_CR0_WP 0x00010000 /* Write Protect */ -#define X86_CR0_AM 0x00040000 /* Alignment Mask */ -#define X86_CR0_NW 0x20000000 /* Not Write-through */ -#define X86_CR0_CD 0x40000000 /* Cache Disable */ -#define X86_CR0_PG 0x80000000 /* Paging */ - -/* - * Paging options in CR3 - */ -#define X86_CR3_PWT 0x00000008 /* Page Write Through */ -#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ -#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */ - -/* - * Intel CPU features in CR4 - */ -#define X86_CR4_VME 0x00000001 /* enable vm86 extensions */ -#define X86_CR4_PVI 0x00000002 /* virtual interrupts flag enable */ -#define X86_CR4_TSD 0x00000004 /* disable time stamp at ipl 3 */ -#define X86_CR4_DE 0x00000008 /* enable debugging extensions */ -#define X86_CR4_PSE 0x00000010 /* enable page size extensions */ -#define X86_CR4_PAE 0x00000020 /* enable physical address extensions */ -#define X86_CR4_MCE 0x00000040 /* Machine check enable */ -#define X86_CR4_PGE 0x00000080 /* enable global pages */ -#define X86_CR4_PCE 0x00000100 /* enable performance counters at ipl 3 */ -#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */ -#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ -#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ -#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ -#define X86_CR4_PCIDE 0x00020000 /* enable PCID support */ -#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ -#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ -#define X86_CR4_SMAP 0x00200000 /* enable SMAP support */ - -/* - * x86-64 Task Priority Register, CR8 - */ -#define X86_CR8_TPR 0x0000000F /* task priority register */ - -/* - * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h> - */ - -/* - * NSC/Cyrix CPU configuration register indexes - */ -#define CX86_PCR0 0x20 -#define CX86_GCR 0xb8 -#define CX86_CCR0 0xc0 -#define CX86_CCR1 0xc1 -#define CX86_CCR2 0xc2 -#define CX86_CCR3 0xc3 -#define CX86_CCR4 0xe8 -#define CX86_CCR5 0xe9 -#define CX86_CCR6 0xea -#define CX86_CCR7 0xeb -#define CX86_PCR1 0xf0 -#define CX86_DIR0 0xfe -#define CX86_DIR1 0xff -#define CX86_ARR_BASE 0xc4 -#define CX86_RCR_BASE 0xdc - -#ifdef __KERNEL__ #ifdef CONFIG_VM86 #define X86_VM_MASK X86_EFLAGS_VM #else #define X86_VM_MASK 0 /* No VM86 support */ #endif -#endif - #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ad1fc851167..3270116b148 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -89,7 +89,6 @@ struct cpuinfo_x86 { char wp_works_ok; /* It doesn't on 386's */ /* Problems on some 486Dx4's and old 386's: */ - char hlt_works_ok; char hard_math; char rfu; char fdiv_bug; @@ -165,21 +164,10 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); extern const struct seq_operations cpuinfo_op; -static inline int hlt_works(int cpu) -{ -#ifdef CONFIG_X86_32 - return cpu_data(cpu).hlt_works_ok; -#else - return 1; -#endif -} - #define cache_line_size() (boot_cpu_data.x86_cache_alignment) extern void cpu_detect(struct cpuinfo_x86 *c); -extern struct pt_regs *idle_regs(struct pt_regs *); - extern void early_cpu_init(void); extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); @@ -187,11 +175,19 @@ extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern unsigned short num_cache_leaves; +extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); extern void detect_extended_topology(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); +#ifdef CONFIG_X86_32 +extern int have_cpuid_p(void); +#else +static inline int have_cpuid_p(void) +{ + return 1; +} +#endif static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { @@ -672,18 +668,29 @@ static inline void sync_core(void) { int tmp; -#if defined(CONFIG_M386) || defined(CONFIG_M486) - if (boot_cpu_data.x86 < 5) - /* There is no speculative execution. - * jmp is a barrier to prefetching. */ - asm volatile("jmp 1f\n1:\n" ::: "memory"); - else +#ifdef CONFIG_M486 + /* + * Do a CPUID if available, otherwise do a jump. The jump + * can conveniently enough be the jump around CPUID. + */ + asm volatile("cmpl %2,%1\n\t" + "jl 1f\n\t" + "cpuid\n" + "1:" + : "=a" (tmp) + : "rm" (boot_cpu_data.cpuid_level), "ri" (0), "0" (1) + : "ebx", "ecx", "edx", "memory"); +#else + /* + * CPUID is a barrier to speculative execution. + * Prefetched instructions are automatically + * invalidated when modified. + */ + asm volatile("cpuid" + : "=a" (tmp) + : "0" (1) + : "ebx", "ecx", "edx", "memory"); #endif - /* cpuid is a barrier to speculative execution. - * Prefetched instructions are automatically - * invalidated when modified. */ - asm volatile("cpuid" : "=a" (tmp) : "0" (1) - : "ebx", "ecx", "edx", "memory"); } static inline void __monitor(const void *eax, unsigned long ecx, @@ -716,12 +723,13 @@ extern unsigned long boot_option_idle_override; extern bool amd_e400_c1e_detected; enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, - IDLE_POLL, IDLE_FORCE_MWAIT}; + IDLE_POLL}; extern void enable_sep_cpu(void); extern int sysenter_setup(void); extern void early_trap_init(void); +void early_trap_pf_init(void); /* Defined in head.S */ extern struct desc_ptr early_gdt_descr; @@ -934,7 +942,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); -extern int amd_get_nb_id(int cpu); +extern u16 amd_get_nb_id(int cpu); struct aperfmperf { u64 aperf, mperf; @@ -989,7 +997,11 @@ extern unsigned long arch_align_stack(unsigned long sp); extern void free_init_pages(char *what, unsigned long begin, unsigned long end); void default_idle(void); -bool set_pm_idle_to_default(void); +#ifdef CONFIG_XEN +bool xen_set_default_idle(void); +#else +#define xen_set_default_idle 0 +#endif void stop_this_cpu(void *dummy); diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 6f414ed8862..6fd3fd76979 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -5,8 +5,6 @@ /* misc architecture specific prototypes */ -void early_idt_handler(void); - void system_call(void); void syscall_init(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index dcfde52979c..942a08623a1 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -1,44 +1,12 @@ #ifndef _ASM_X86_PTRACE_H #define _ASM_X86_PTRACE_H -#include <linux/compiler.h> /* For __user */ -#include <asm/ptrace-abi.h> -#include <asm/processor-flags.h> - -#ifdef __KERNEL__ #include <asm/segment.h> #include <asm/page_types.h> -#endif +#include <uapi/asm/ptrace.h> #ifndef __ASSEMBLY__ - #ifdef __i386__ -/* this struct defines the way the registers are stored on the - stack during a system call. */ - -#ifndef __KERNEL__ - -struct pt_regs { - long ebx; - long ecx; - long edx; - long esi; - long edi; - long ebp; - long eax; - int xds; - int xes; - int xfs; - int xgs; - long orig_eax; - long eip; - int xcs; - long eflags; - long esp; - int xss; -}; - -#else /* __KERNEL__ */ struct pt_regs { unsigned long bx; @@ -60,42 +28,8 @@ struct pt_regs { unsigned long ss; }; -#endif /* __KERNEL__ */ - #else /* __i386__ */ -#ifndef __KERNEL__ - -struct pt_regs { - unsigned long r15; - unsigned long r14; - unsigned long r13; - unsigned long r12; - unsigned long rbp; - unsigned long rbx; -/* arguments: non interrupts/non tracing syscalls only save up to here*/ - unsigned long r11; - unsigned long r10; - unsigned long r9; - unsigned long r8; - unsigned long rax; - unsigned long rcx; - unsigned long rdx; - unsigned long rsi; - unsigned long rdi; - unsigned long orig_rax; -/* end of arguments */ -/* cpu exception frame or undefined */ - unsigned long rip; - unsigned long cs; - unsigned long eflags; - unsigned long rsp; - unsigned long ss; -/* top of stack page */ -}; - -#else /* __KERNEL__ */ - struct pt_regs { unsigned long r15; unsigned long r14; @@ -124,12 +58,8 @@ struct pt_regs { /* top of stack page */ }; -#endif /* __KERNEL__ */ #endif /* !__i386__ */ - -#ifdef __KERNEL__ - #include <linux/init.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt_types.h> @@ -203,23 +133,23 @@ static inline bool user_64bit_mode(struct pt_regs *regs) return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; #endif } + +#define current_user_stack_pointer() this_cpu_read(old_rsp) +/* ia32 vs. x32 difference */ +#define compat_user_stack_pointer() \ + (test_thread_flag(TIF_IA32) \ + ? current_pt_regs()->sp \ + : this_cpu_read(old_rsp)) #endif -/* - * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode - * when it traps. The previous stack will be directly underneath the saved - * registers, and 'sp/ss' won't even have been saved. Thus the '®s->sp'. - * - * This is valid only for kernel mode traps. - */ -static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) -{ #ifdef CONFIG_X86_32 - return (unsigned long)(®s->sp); +extern unsigned long kernel_stack_pointer(struct pt_regs *regs); #else +static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ return regs->sp; -#endif } +#endif #define GET_IP(regs) ((regs)->ip) #define GET_FP(regs) ((regs)->bp) @@ -246,6 +176,15 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, { if (unlikely(offset > MAX_REG_OFFSET)) return 0; +#ifdef CONFIG_X86_32 + /* + * Traps from the kernel do not save sp and ss. + * Use the helper function to retrieve sp. + */ + if (offset == offsetof(struct pt_regs, sp) && + regs->cs == __KERNEL_CS) + return kernel_stack_pointer(regs); +#endif return *(unsigned long *)((unsigned long)regs + offset); } @@ -299,8 +238,5 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); -#endif /* __KERNEL__ */ - #endif /* !__ASSEMBLY__ */ - #endif /* _ASM_X86_PTRACE_H */ diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index c59cc97fe6c..109a9dd5d45 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -6,6 +6,7 @@ /* some helper functions for xen and kvm pv clock sources */ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); void pvclock_set_flags(u8 flags); unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); void pvclock_read_wallclock(struct pvclock_wall_clock *wall, @@ -56,4 +57,50 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) return product; } +static __always_inline +u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) +{ + u64 delta = __native_read_tsc() - src->tsc_timestamp; + return pvclock_scale_delta(delta, src->tsc_to_system_mul, + src->tsc_shift); +} + +static __always_inline +unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, + cycle_t *cycles, u8 *flags) +{ + unsigned version; + cycle_t ret, offset; + u8 ret_flags; + + version = src->version; + /* Note: emulated platforms which do not advertise SSE2 support + * result in kvmclock not using the necessary RDTSC barriers. + * Without barriers, it is possible that RDTSC instruction reads from + * the time stamp counter outside rdtsc_barrier protected section + * below, resulting in violation of monotonicity. + */ + rdtsc_barrier(); + offset = pvclock_get_nsec_offset(src); + ret = src->system_time + offset; + ret_flags = src->flags; + rdtsc_barrier(); + + *cycles = ret; + *flags = ret_flags; + return version; +} + +struct pvclock_vsyscall_time_info { + struct pvclock_vcpu_time_info pvti; + u32 migrate_count; +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) +#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1) + +int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, + int size); +struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu); + #endif /* _ASM_X86_PVCLOCK_H */ diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index fe1ec5bcd84..9c6b890d5e7 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -58,6 +58,7 @@ extern unsigned char boot_gdt[]; extern unsigned char secondary_startup_64[]; #endif -extern void __init setup_real_mode(void); +void reserve_real_mode(void); +void setup_real_mode(void); #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 6c7fc25f2c3..5c6e4fb370f 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -47,6 +47,12 @@ # define NEED_NOPL 0 #endif +#ifdef CONFIG_MATOM +# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31)) +#else +# define NEED_MOVBE 0 +#endif + #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT /* Paravirtualized systems may not have PSE or PGE available */ @@ -80,7 +86,7 @@ #define REQUIRED_MASK2 0 #define REQUIRED_MASK3 (NEED_NOPL) -#define REQUIRED_MASK4 0 +#define REQUIRED_MASK4 (NEED_MOVBE) #define REQUIRED_MASK5 0 #define REQUIRED_MASK6 0 #define REQUIRED_MASK7 0 diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index d0f19f9fb84..b7bf3505e1e 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -1,7 +1,8 @@ #ifndef _ASM_X86_SETUP_H #define _ASM_X86_SETUP_H -#ifdef __KERNEL__ +#include <uapi/asm/setup.h> + #define COMMAND_LINE_SIZE 2048 @@ -123,6 +124,4 @@ void __init x86_64_start_reservations(char *real_mode_data); .size .brk.name,.-1b; \ .popsection #endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ - #endif /* _ASM_X86_SETUP_H */ diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 5ca71c065ee..9dfce4e0417 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -1,104 +1,9 @@ #ifndef _ASM_X86_SIGCONTEXT_H #define _ASM_X86_SIGCONTEXT_H -#include <linux/compiler.h> -#include <linux/types.h> - -#define FP_XSTATE_MAGIC1 0x46505853U -#define FP_XSTATE_MAGIC2 0x46505845U -#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2) - -/* - * bytes 464..511 in the current 512byte layout of fxsave/fxrstor frame - * are reserved for SW usage. On cpu's supporting xsave/xrstor, these bytes - * are used to extended the fpstate pointer in the sigcontext, which now - * includes the extended state information along with fpstate information. - * - * Presence of FP_XSTATE_MAGIC1 at the beginning of this SW reserved - * area and FP_XSTATE_MAGIC2 at the end of memory layout - * (extended_size - FP_XSTATE_MAGIC2_SIZE) indicates the presence of the - * extended state information in the memory layout pointed by the fpstate - * pointer in sigcontext. - */ -struct _fpx_sw_bytes { - __u32 magic1; /* FP_XSTATE_MAGIC1 */ - __u32 extended_size; /* total size of the layout referred by - * fpstate pointer in the sigcontext. - */ - __u64 xstate_bv; - /* feature bit mask (including fp/sse/extended - * state) that is present in the memory - * layout. - */ - __u32 xstate_size; /* actual xsave state size, based on the - * features saved in the layout. - * 'extended_size' will be greater than - * 'xstate_size'. - */ - __u32 padding[7]; /* for future use. */ -}; +#include <uapi/asm/sigcontext.h> #ifdef __i386__ -/* - * As documented in the iBCS2 standard.. - * - * The first part of "struct _fpstate" is just the normal i387 - * hardware setup, the extra "status" word is used to save the - * coprocessor status word before entering the handler. - * - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * The FPU state data structure has had to grow to accommodate the - * extended FPU state required by the Streaming SIMD Extensions. - * There is no documented standard to accomplish this at the moment. - */ -struct _fpreg { - unsigned short significand[4]; - unsigned short exponent; -}; - -struct _fpxreg { - unsigned short significand[4]; - unsigned short exponent; - unsigned short padding[3]; -}; - -struct _xmmreg { - unsigned long element[4]; -}; - -struct _fpstate { - /* Regular FPU environment */ - unsigned long cw; - unsigned long sw; - unsigned long tag; - unsigned long ipoff; - unsigned long cssel; - unsigned long dataoff; - unsigned long datasel; - struct _fpreg _st[8]; - unsigned short status; - unsigned short magic; /* 0xffff = regular FPU data only */ - - /* FXSR FPU environment */ - unsigned long _fxsr_env[6]; /* FXSR FPU env is ignored */ - unsigned long mxcsr; - unsigned long reserved; - struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */ - struct _xmmreg _xmm[8]; - unsigned long padding1[44]; - - union { - unsigned long padding2[12]; - struct _fpx_sw_bytes sw_reserved; /* represents the extended - * state info */ - }; -}; - -#define X86_FXSR_MAGIC 0x0000 - -#ifdef __KERNEL__ struct sigcontext { unsigned short gs, __gsh; unsigned short fs, __fsh; @@ -131,62 +36,7 @@ struct sigcontext { unsigned long oldmask; unsigned long cr2; }; -#else /* __KERNEL__ */ -/* - * User-space might still rely on the old definition: - */ -struct sigcontext { - unsigned short gs, __gsh; - unsigned short fs, __fsh; - unsigned short es, __esh; - unsigned short ds, __dsh; - unsigned long edi; - unsigned long esi; - unsigned long ebp; - unsigned long esp; - unsigned long ebx; - unsigned long edx; - unsigned long ecx; - unsigned long eax; - unsigned long trapno; - unsigned long err; - unsigned long eip; - unsigned short cs, __csh; - unsigned long eflags; - unsigned long esp_at_signal; - unsigned short ss, __ssh; - struct _fpstate __user *fpstate; - unsigned long oldmask; - unsigned long cr2; -}; -#endif /* !__KERNEL__ */ - #else /* __i386__ */ - -/* FXSAVE frame */ -/* Note: reserved1/2 may someday contain valuable data. Always save/restore - them when you change signal frames. */ -struct _fpstate { - __u16 cwd; - __u16 swd; - __u16 twd; /* Note this is not the same as the - 32bit/x87/FSAVE twd */ - __u16 fop; - __u64 rip; - __u64 rdp; - __u32 mxcsr; - __u32 mxcsr_mask; - __u32 st_space[32]; /* 8*16 bytes for each FP-reg */ - __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg */ - __u32 reserved2[12]; - union { - __u32 reserved3[12]; - struct _fpx_sw_bytes sw_reserved; /* represents the extended - * state information */ - }; -}; - -#ifdef __KERNEL__ struct sigcontext { unsigned long r8; unsigned long r9; @@ -225,69 +75,5 @@ struct sigcontext { void __user *fpstate; /* zero when no FPU/extended context */ unsigned long reserved1[8]; }; -#else /* __KERNEL__ */ -/* - * User-space might still rely on the old definition: - */ -struct sigcontext { - __u64 r8; - __u64 r9; - __u64 r10; - __u64 r11; - __u64 r12; - __u64 r13; - __u64 r14; - __u64 r15; - __u64 rdi; - __u64 rsi; - __u64 rbp; - __u64 rbx; - __u64 rdx; - __u64 rax; - __u64 rcx; - __u64 rsp; - __u64 rip; - __u64 eflags; /* RFLAGS */ - __u16 cs; - __u16 gs; - __u16 fs; - __u16 __pad0; - __u64 err; - __u64 trapno; - __u64 oldmask; - __u64 cr2; - struct _fpstate __user *fpstate; /* zero when no FPU context */ -#ifdef __ILP32__ - __u32 __fpstate_pad; -#endif - __u64 reserved1[8]; -}; -#endif /* !__KERNEL__ */ - #endif /* !__i386__ */ - -struct _xsave_hdr { - __u64 xstate_bv; - __u64 reserved1[2]; - __u64 reserved2[5]; -}; - -struct _ymmh_state { - /* 16 * 16 bytes for each YMMH-reg */ - __u32 ymmh_space[64]; -}; - -/* - * Extended state pointed by the fpstate pointer in the sigcontext. - * In addition to the fpstate, information encoded in the xstate_hdr - * indicates the presence of other extended state information - * supported by the processor and OS. - */ -struct _xstate { - struct _fpstate fpstate; - struct _xsave_hdr xstate_hdr; - struct _ymmh_state ymmh; - /* new processor state extensions go here */ -}; - #endif /* _ASM_X86_SIGCONTEXT_H */ diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 323973f4abf..35e67a45718 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -2,14 +2,6 @@ #define _ASM_X86_SIGNAL_H #ifndef __ASSEMBLY__ -#include <linux/types.h> -#include <linux/time.h> -#include <linux/compiler.h> - -/* Avoid too many header ordering problems. */ -struct siginfo; - -#ifdef __KERNEL__ #include <linux/linkage.h> /* Most things should be clean enough to redefine this at will, if care @@ -35,159 +27,13 @@ typedef struct { typedef sigset_t compat_sigset_t; #endif -#else -/* Here we must cater to libcs that poke about in kernel headers. */ - -#define NSIG 32 -typedef unsigned long sigset_t; - -#endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ - -#define SIGHUP 1 -#define SIGINT 2 -#define SIGQUIT 3 -#define SIGILL 4 -#define SIGTRAP 5 -#define SIGABRT 6 -#define SIGIOT 6 -#define SIGBUS 7 -#define SIGFPE 8 -#define SIGKILL 9 -#define SIGUSR1 10 -#define SIGSEGV 11 -#define SIGUSR2 12 -#define SIGPIPE 13 -#define SIGALRM 14 -#define SIGTERM 15 -#define SIGSTKFLT 16 -#define SIGCHLD 17 -#define SIGCONT 18 -#define SIGSTOP 19 -#define SIGTSTP 20 -#define SIGTTIN 21 -#define SIGTTOU 22 -#define SIGURG 23 -#define SIGXCPU 24 -#define SIGXFSZ 25 -#define SIGVTALRM 26 -#define SIGPROF 27 -#define SIGWINCH 28 -#define SIGIO 29 -#define SIGPOLL SIGIO -/* -#define SIGLOST 29 -*/ -#define SIGPWR 30 -#define SIGSYS 31 -#define SIGUNUSED 31 - -/* These should not be considered constants from userland. */ -#define SIGRTMIN 32 -#define SIGRTMAX _NSIG - -/* - * SA_FLAGS values: - * - * SA_ONSTACK indicates that a registered stack_t will be used. - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. - */ -#define SA_NOCLDSTOP 0x00000001u -#define SA_NOCLDWAIT 0x00000002u -#define SA_SIGINFO 0x00000004u -#define SA_ONSTACK 0x08000000u -#define SA_RESTART 0x10000000u -#define SA_NODEFER 0x40000000u -#define SA_RESETHAND 0x80000000u - -#define SA_NOMASK SA_NODEFER -#define SA_ONESHOT SA_RESETHAND - -#define SA_RESTORER 0x04000000 - -/* - * sigaltstack controls - */ -#define SS_ONSTACK 1 -#define SS_DISABLE 2 - -#define MINSIGSTKSZ 2048 -#define SIGSTKSZ 8192 - -#include <asm-generic/signal-defs.h> - +#include <uapi/asm/signal.h> #ifndef __ASSEMBLY__ - -# ifdef __KERNEL__ extern void do_notify_resume(struct pt_regs *, void *, __u32); -# endif /* __KERNEL__ */ -#ifdef __i386__ -# ifdef __KERNEL__ -struct old_sigaction { - __sighandler_t sa_handler; - old_sigset_t sa_mask; - unsigned long sa_flags; - __sigrestore_t sa_restorer; -}; - -struct sigaction { - __sighandler_t sa_handler; - unsigned long sa_flags; - __sigrestore_t sa_restorer; - sigset_t sa_mask; /* mask last for extensibility */ -}; - -struct k_sigaction { - struct sigaction sa; -}; - -# else /* __KERNEL__ */ -/* Here we must cater to libcs that poke about in kernel headers. */ - -struct sigaction { - union { - __sighandler_t _sa_handler; - void (*_sa_sigaction)(int, struct siginfo *, void *); - } _u; - sigset_t sa_mask; - unsigned long sa_flags; - void (*sa_restorer)(void); -}; - -#define sa_handler _u._sa_handler -#define sa_sigaction _u._sa_sigaction - -# endif /* ! __KERNEL__ */ -#else /* __i386__ */ +#define __ARCH_HAS_SA_RESTORER -struct sigaction { - __sighandler_t sa_handler; - unsigned long sa_flags; - __sigrestore_t sa_restorer; - sigset_t sa_mask; /* mask last for extensibility */ -}; - -struct k_sigaction { - struct sigaction sa; -}; - -#endif /* !__i386__ */ - -typedef struct sigaltstack { - void __user *ss_sp; - int ss_flags; - size_t ss_size; -} stack_t; - -#ifdef __KERNEL__ #include <asm/sigcontext.h> #ifdef __i386__ @@ -260,9 +106,5 @@ struct pt_regs; #endif /* !__i386__ */ -#define ptrace_signal_deliver(regs, cookie) do { } while (0) - -#endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ - #endif /* _ASM_X86_SIGNAL_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4f19a152603..b073aaea747 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -166,6 +166,7 @@ void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle); +void smp_store_boot_cpu_info(void); void smp_store_cpu_info(int id); #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index cdf5674dd23..6136d99f537 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -1,134 +1,8 @@ #ifndef __SVM_H #define __SVM_H -#define SVM_EXIT_READ_CR0 0x000 -#define SVM_EXIT_READ_CR3 0x003 -#define SVM_EXIT_READ_CR4 0x004 -#define SVM_EXIT_READ_CR8 0x008 -#define SVM_EXIT_WRITE_CR0 0x010 -#define SVM_EXIT_WRITE_CR3 0x013 -#define SVM_EXIT_WRITE_CR4 0x014 -#define SVM_EXIT_WRITE_CR8 0x018 -#define SVM_EXIT_READ_DR0 0x020 -#define SVM_EXIT_READ_DR1 0x021 -#define SVM_EXIT_READ_DR2 0x022 -#define SVM_EXIT_READ_DR3 0x023 -#define SVM_EXIT_READ_DR4 0x024 -#define SVM_EXIT_READ_DR5 0x025 -#define SVM_EXIT_READ_DR6 0x026 -#define SVM_EXIT_READ_DR7 0x027 -#define SVM_EXIT_WRITE_DR0 0x030 -#define SVM_EXIT_WRITE_DR1 0x031 -#define SVM_EXIT_WRITE_DR2 0x032 -#define SVM_EXIT_WRITE_DR3 0x033 -#define SVM_EXIT_WRITE_DR4 0x034 -#define SVM_EXIT_WRITE_DR5 0x035 -#define SVM_EXIT_WRITE_DR6 0x036 -#define SVM_EXIT_WRITE_DR7 0x037 -#define SVM_EXIT_EXCP_BASE 0x040 -#define SVM_EXIT_INTR 0x060 -#define SVM_EXIT_NMI 0x061 -#define SVM_EXIT_SMI 0x062 -#define SVM_EXIT_INIT 0x063 -#define SVM_EXIT_VINTR 0x064 -#define SVM_EXIT_CR0_SEL_WRITE 0x065 -#define SVM_EXIT_IDTR_READ 0x066 -#define SVM_EXIT_GDTR_READ 0x067 -#define SVM_EXIT_LDTR_READ 0x068 -#define SVM_EXIT_TR_READ 0x069 -#define SVM_EXIT_IDTR_WRITE 0x06a -#define SVM_EXIT_GDTR_WRITE 0x06b -#define SVM_EXIT_LDTR_WRITE 0x06c -#define SVM_EXIT_TR_WRITE 0x06d -#define SVM_EXIT_RDTSC 0x06e -#define SVM_EXIT_RDPMC 0x06f -#define SVM_EXIT_PUSHF 0x070 -#define SVM_EXIT_POPF 0x071 -#define SVM_EXIT_CPUID 0x072 -#define SVM_EXIT_RSM 0x073 -#define SVM_EXIT_IRET 0x074 -#define SVM_EXIT_SWINT 0x075 -#define SVM_EXIT_INVD 0x076 -#define SVM_EXIT_PAUSE 0x077 -#define SVM_EXIT_HLT 0x078 -#define SVM_EXIT_INVLPG 0x079 -#define SVM_EXIT_INVLPGA 0x07a -#define SVM_EXIT_IOIO 0x07b -#define SVM_EXIT_MSR 0x07c -#define SVM_EXIT_TASK_SWITCH 0x07d -#define SVM_EXIT_FERR_FREEZE 0x07e -#define SVM_EXIT_SHUTDOWN 0x07f -#define SVM_EXIT_VMRUN 0x080 -#define SVM_EXIT_VMMCALL 0x081 -#define SVM_EXIT_VMLOAD 0x082 -#define SVM_EXIT_VMSAVE 0x083 -#define SVM_EXIT_STGI 0x084 -#define SVM_EXIT_CLGI 0x085 -#define SVM_EXIT_SKINIT 0x086 -#define SVM_EXIT_RDTSCP 0x087 -#define SVM_EXIT_ICEBP 0x088 -#define SVM_EXIT_WBINVD 0x089 -#define SVM_EXIT_MONITOR 0x08a -#define SVM_EXIT_MWAIT 0x08b -#define SVM_EXIT_MWAIT_COND 0x08c -#define SVM_EXIT_XSETBV 0x08d -#define SVM_EXIT_NPF 0x400 - -#define SVM_EXIT_ERR -1 - -#define SVM_EXIT_REASONS \ - { SVM_EXIT_READ_CR0, "read_cr0" }, \ - { SVM_EXIT_READ_CR3, "read_cr3" }, \ - { SVM_EXIT_READ_CR4, "read_cr4" }, \ - { SVM_EXIT_READ_CR8, "read_cr8" }, \ - { SVM_EXIT_WRITE_CR0, "write_cr0" }, \ - { SVM_EXIT_WRITE_CR3, "write_cr3" }, \ - { SVM_EXIT_WRITE_CR4, "write_cr4" }, \ - { SVM_EXIT_WRITE_CR8, "write_cr8" }, \ - { SVM_EXIT_READ_DR0, "read_dr0" }, \ - { SVM_EXIT_READ_DR1, "read_dr1" }, \ - { SVM_EXIT_READ_DR2, "read_dr2" }, \ - { SVM_EXIT_READ_DR3, "read_dr3" }, \ - { SVM_EXIT_WRITE_DR0, "write_dr0" }, \ - { SVM_EXIT_WRITE_DR1, "write_dr1" }, \ - { SVM_EXIT_WRITE_DR2, "write_dr2" }, \ - { SVM_EXIT_WRITE_DR3, "write_dr3" }, \ - { SVM_EXIT_WRITE_DR5, "write_dr5" }, \ - { SVM_EXIT_WRITE_DR7, "write_dr7" }, \ - { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \ - { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \ - { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ - { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ - { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ - { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ - { SVM_EXIT_INTR, "interrupt" }, \ - { SVM_EXIT_NMI, "nmi" }, \ - { SVM_EXIT_SMI, "smi" }, \ - { SVM_EXIT_INIT, "init" }, \ - { SVM_EXIT_VINTR, "vintr" }, \ - { SVM_EXIT_CPUID, "cpuid" }, \ - { SVM_EXIT_INVD, "invd" }, \ - { SVM_EXIT_HLT, "hlt" }, \ - { SVM_EXIT_INVLPG, "invlpg" }, \ - { SVM_EXIT_INVLPGA, "invlpga" }, \ - { SVM_EXIT_IOIO, "io" }, \ - { SVM_EXIT_MSR, "msr" }, \ - { SVM_EXIT_TASK_SWITCH, "task_switch" }, \ - { SVM_EXIT_SHUTDOWN, "shutdown" }, \ - { SVM_EXIT_VMRUN, "vmrun" }, \ - { SVM_EXIT_VMMCALL, "hypercall" }, \ - { SVM_EXIT_VMLOAD, "vmload" }, \ - { SVM_EXIT_VMSAVE, "vmsave" }, \ - { SVM_EXIT_STGI, "stgi" }, \ - { SVM_EXIT_CLGI, "clgi" }, \ - { SVM_EXIT_SKINIT, "skinit" }, \ - { SVM_EXIT_WBINVD, "wbinvd" }, \ - { SVM_EXIT_MONITOR, "monitor" }, \ - { SVM_EXIT_MWAIT, "mwait" }, \ - { SVM_EXIT_XSETBV, "xsetbv" }, \ - { SVM_EXIT_NPF, "npf" } - -#ifdef __KERNEL__ +#include <uapi/asm/svm.h> + enum { INTERCEPT_INTR, @@ -403,5 +277,3 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf" #endif - -#endif diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index a9a8cf3da49..8459efc3968 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -32,31 +32,17 @@ struct mmap_arg_struct32; asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *); asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long); -struct sigaction32; -struct old_sigaction32; -asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *, - struct sigaction32 __user *, unsigned int); -asmlinkage long sys32_sigaction(int, struct old_sigaction32 __user *, - struct old_sigaction32 __user *); asmlinkage long sys32_alarm(unsigned int); asmlinkage long sys32_waitpid(compat_pid_t, unsigned int __user *, int); asmlinkage long sys32_sysfs(int, u32, u32); -asmlinkage long sys32_sched_rr_get_interval(compat_pid_t, - struct compat_timespec __user *); -asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t); -asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *); - asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32); asmlinkage long sys32_personality(unsigned long); asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); -asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); - -long sys32_lseek(unsigned int, int, unsigned int); long sys32_kill(int, int); long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int); long sys32_vm86_warning(void); @@ -70,11 +56,8 @@ asmlinkage long sys32_fallocate(int, int, unsigned, unsigned, unsigned, unsigned); /* ia32/ia32_signal.c */ -asmlinkage long sys32_sigsuspend(int, int, old_sigset_t); -asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *, - stack_ia32_t __user *, struct pt_regs *); -asmlinkage long sys32_sigreturn(struct pt_regs *); -asmlinkage long sys32_rt_sigreturn(struct pt_regs *); +asmlinkage long sys32_sigreturn(void); +asmlinkage long sys32_rt_sigreturn(void); /* ia32/ipc32.c */ asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32); diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 2be0b880417..6cf0a9cc60c 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -18,25 +18,13 @@ /* Common in X86_32 and X86_64 */ /* kernel/ioport.c */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); -long sys_iopl(unsigned int, struct pt_regs *); - -/* kernel/process.c */ -int sys_fork(struct pt_regs *); -int sys_vfork(struct pt_regs *); -long sys_execve(const char __user *, - const char __user *const __user *, - const char __user *const __user *); -long sys_clone(unsigned long, unsigned long, void __user *, - void __user *, struct pt_regs *); +asmlinkage long sys_iopl(unsigned int); /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); /* kernel/signal.c */ -long sys_rt_sigreturn(struct pt_regs *); -long sys_sigaltstack(const stack_t __user *, stack_t __user *, - struct pt_regs *); - +long sys_rt_sigreturn(void); /* kernel/tls.c */ asmlinkage int sys_set_thread_area(struct user_desc __user *); @@ -46,14 +34,11 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *); #ifdef CONFIG_X86_32 /* kernel/signal.c */ -asmlinkage int sys_sigsuspend(int, int, old_sigset_t); -asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, - struct old_sigaction __user *); -unsigned long sys_sigreturn(struct pt_regs *); +unsigned long sys_sigreturn(void); /* kernel/vm86_32.c */ -int sys_vm86old(struct vm86_struct __user *, struct pt_regs *); -int sys_vm86(unsigned long, unsigned long, struct pt_regs *); +int sys_vm86old(struct vm86_struct __user *); +int sys_vm86(unsigned long, unsigned long); #else /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 2d946e63ee8..2cd056e3ada 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -20,7 +20,6 @@ struct task_struct; struct exec_domain; #include <asm/processor.h> -#include <asm/ftrace.h> #include <linux/atomic.h> struct thread_info { diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 74a44333545..50a7fc0f824 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -20,10 +20,20 @@ static inline void __native_flush_tlb(void) native_write_cr3(native_read_cr3()); } +static inline void __native_flush_tlb_global_irq_disabled(void) +{ + unsigned long cr4; + + cr4 = native_read_cr4(); + /* clear PGE */ + native_write_cr4(cr4 & ~X86_CR4_PGE); + /* write old PGE again and flush TLBs */ + native_write_cr4(cr4); +} + static inline void __native_flush_tlb_global(void) { unsigned long flags; - unsigned long cr4; /* * Read-modify-write to CR4 - protect it from preemption and @@ -32,11 +42,7 @@ static inline void __native_flush_tlb_global(void) */ raw_local_irq_save(flags); - cr4 = native_read_cr4(); - /* clear PGE */ - native_write_cr4(cr4 & ~X86_CR4_PGE); - /* write old PGE again and flush TLBs */ - native_write_cr4(cr4); + __native_flush_tlb_global_irq_disabled(); raw_local_irq_restore(flags); } @@ -56,10 +62,7 @@ static inline void __flush_tlb_all(void) static inline void __flush_tlb_one(unsigned long addr) { - if (cpu_has_invlpg) __flush_tlb_single(addr); - else - __flush_tlb(); } #define TLB_FLUSH_ALL -1UL diff --git a/arch/x86/include/asm/trace_clock.h b/arch/x86/include/asm/trace_clock.h new file mode 100644 index 00000000000..beab86cc282 --- /dev/null +++ b/arch/x86/include/asm/trace_clock.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_TRACE_CLOCK_H +#define _ASM_X86_TRACE_CLOCK_H + +#include <linux/compiler.h> +#include <linux/types.h> + +#ifdef CONFIG_X86_TSC + +extern u64 notrace trace_clock_x86_tsc(void); + +# define ARCH_TRACE_CLOCKS \ + { trace_clock_x86_tsc, "x86-tsc", .in_ns = 0 }, + +#else /* !CONFIG_X86_TSC */ + +#define ARCH_TRACE_CLOCKS + +#endif + +#endif /* _ASM_X86_TRACE_CLOCK_H */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 7ccf8d13153..5ee26875bae 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -125,13 +125,12 @@ extern int __get_user_4(void); extern int __get_user_8(void); extern int __get_user_bad(void); -#define __get_user_x(size, ret, x, ptr) \ - asm volatile("call __get_user_" #size \ - : "=a" (ret), "=d" (x) \ - : "0" (ptr)) \ - -/* Careful: we have to cast the result to the type of the pointer - * for sign reasons */ +/* + * This is a type: either unsigned long, if the argument fits into + * that type, or otherwise unsigned long long. + */ +#define __inttype(x) \ +__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) /** * get_user: - Get a simple variable from user space. @@ -150,38 +149,26 @@ extern int __get_user_bad(void); * Returns zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ -#ifdef CONFIG_X86_32 -#define __get_user_8(__ret_gu, __val_gu, ptr) \ - __get_user_x(X, __ret_gu, __val_gu, ptr) -#else -#define __get_user_8(__ret_gu, __val_gu, ptr) \ - __get_user_x(8, __ret_gu, __val_gu, ptr) -#endif - +/* + * Careful: we have to cast the result to the type of the pointer + * for sign reasons. + * + * The use of %edx as the register specifier is a bit of a + * simplification, as gcc only cares about it as the starting point + * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits + * (%ecx being the next register in gcc's x86 register sequence), and + * %rdx on 64 bits. + */ #define get_user(x, ptr) \ ({ \ int __ret_gu; \ - unsigned long __val_gu; \ + register __inttype(*(ptr)) __val_gu asm("%edx"); \ __chk_user_ptr(ptr); \ might_fault(); \ - switch (sizeof(*(ptr))) { \ - case 1: \ - __get_user_x(1, __ret_gu, __val_gu, ptr); \ - break; \ - case 2: \ - __get_user_x(2, __ret_gu, __val_gu, ptr); \ - break; \ - case 4: \ - __get_user_x(4, __ret_gu, __val_gu, ptr); \ - break; \ - case 8: \ - __get_user_8(__ret_gu, __val_gu, ptr); \ - break; \ - default: \ - __get_user_x(X, __ret_gu, __val_gu, ptr); \ - break; \ - } \ - (x) = (__typeof__(*(ptr)))__val_gu; \ + asm volatile("call __get_user_%P3" \ + : "=a" (__ret_gu), "=r" (__val_gu) \ + : "0" (ptr), "i" (sizeof(*(ptr)))); \ + (x) = (__typeof__(*(ptr))) __val_gu; \ __ret_gu; \ }) @@ -237,8 +224,6 @@ extern void __put_user_2(void); extern void __put_user_4(void); extern void __put_user_8(void); -#ifdef CONFIG_X86_WP_WORKS_OK - /** * put_user: - Write a simple value into user space. * @x: Value to copy to user space. @@ -326,29 +311,6 @@ do { \ } \ } while (0) -#else - -#define __put_user_size(x, ptr, size, retval, errret) \ -do { \ - __typeof__(*(ptr))__pus_tmp = x; \ - retval = 0; \ - \ - if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, size) != 0)) \ - retval = errret; \ -} while (0) - -#define put_user(x, ptr) \ -({ \ - int __ret_pu; \ - __typeof__(*(ptr))__pus_tmp = x; \ - __ret_pu = 0; \ - if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, \ - sizeof(*(ptr))) != 0)) \ - __ret_pu = -EFAULT; \ - __ret_pu; \ -}) -#endif - #ifdef CONFIG_X86_32 #define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad() #define __get_user_asm_ex_u64(x, ptr) (x) = __get_user_bad() @@ -543,29 +505,12 @@ struct __large_struct { unsigned long buf[100]; }; (x) = (__force __typeof__(*(ptr)))__gue_val; \ } while (0) -#ifdef CONFIG_X86_WP_WORKS_OK - #define put_user_try uaccess_try #define put_user_catch(err) uaccess_catch(err) #define put_user_ex(x, ptr) \ __put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -#else /* !CONFIG_X86_WP_WORKS_OK */ - -#define put_user_try do { \ - int __uaccess_err = 0; - -#define put_user_catch(err) \ - (err) |= __uaccess_err; \ -} while (0) - -#define put_user_ex(x, ptr) do { \ - __uaccess_err |= __put_user(x, ptr); \ -} while (0) - -#endif /* CONFIG_X86_WP_WORKS_OK */ - extern unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n); extern __must_check long diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 16f3fc6ebf2..3d5df1c4447 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -1,10 +1,8 @@ #ifndef _ASM_X86_UNISTD_H #define _ASM_X86_UNISTD_H 1 -/* x32 syscall flag bit */ -#define __X32_SYSCALL_BIT 0x40000000 +#include <uapi/asm/unistd.h> -#ifdef __KERNEL__ # ifdef CONFIG_X86_X32_ABI # define __SYSCALL_MASK (~(__X32_SYSCALL_BIT)) @@ -40,8 +38,6 @@ # define __ARCH_WANT_SYS_OLD_GETRLIMIT # define __ARCH_WANT_SYS_OLD_UNAME # define __ARCH_WANT_SYS_PAUSE -# define __ARCH_WANT_SYS_RT_SIGACTION -# define __ARCH_WANT_SYS_RT_SIGSUSPEND # define __ARCH_WANT_SYS_SGETMASK # define __ARCH_WANT_SYS_SIGNAL # define __ARCH_WANT_SYS_SIGPENDING @@ -50,7 +46,9 @@ # define __ARCH_WANT_SYS_TIME # define __ARCH_WANT_SYS_UTIME # define __ARCH_WANT_SYS_WAITPID -# define __ARCH_WANT_SYS_EXECVE +# define __ARCH_WANT_SYS_FORK +# define __ARCH_WANT_SYS_VFORK +# define __ARCH_WANT_SYS_CLONE /* * "Conditional" syscalls @@ -60,14 +58,4 @@ */ # define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") -#else -# ifdef __i386__ -# include <asm/unistd_32.h> -# elif defined(__ILP32__) -# include <asm/unistd_x32.h> -# else -# include <asm/unistd_64.h> -# endif -#endif - #endif /* _ASM_X86_UNISTD_H */ diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index b47c2a82ff1..062921ef34e 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -16,7 +16,7 @@ extern void uv_system_init(void); extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, - unsigned end, + unsigned long end, unsigned int cpu); #else /* X86_UV */ diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 21f7385badb..2c32df95bb7 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -5,7 +5,7 @@ * * SGI UV architectural definitions * - * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_HUB_H @@ -175,6 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); */ #define UV1_HUB_REVISION_BASE 1 #define UV2_HUB_REVISION_BASE 3 +#define UV3_HUB_REVISION_BASE 5 static inline int is_uv1_hub(void) { @@ -183,6 +184,23 @@ static inline int is_uv1_hub(void) static inline int is_uv2_hub(void) { + return ((uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE) && + (uv_hub_info->hub_revision < UV3_HUB_REVISION_BASE)); +} + +static inline int is_uv3_hub(void) +{ + return uv_hub_info->hub_revision >= UV3_HUB_REVISION_BASE; +} + +static inline int is_uv_hub(void) +{ + return uv_hub_info->hub_revision; +} + +/* code common to uv2 and uv3 only */ +static inline int is_uvx_hub(void) +{ return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE; } @@ -230,14 +248,23 @@ union uvh_apicid { #define UV2_LOCAL_MMR_SIZE (32UL * 1024 * 1024) #define UV2_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024) -#define UV_LOCAL_MMR_BASE (is_uv1_hub() ? UV1_LOCAL_MMR_BASE \ - : UV2_LOCAL_MMR_BASE) -#define UV_GLOBAL_MMR32_BASE (is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE \ - : UV2_GLOBAL_MMR32_BASE) -#define UV_LOCAL_MMR_SIZE (is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \ - UV2_LOCAL_MMR_SIZE) +#define UV3_LOCAL_MMR_BASE 0xfa000000UL +#define UV3_GLOBAL_MMR32_BASE 0xfc000000UL +#define UV3_LOCAL_MMR_SIZE (32UL * 1024 * 1024) +#define UV3_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024) + +#define UV_LOCAL_MMR_BASE (is_uv1_hub() ? UV1_LOCAL_MMR_BASE : \ + (is_uv2_hub() ? UV2_LOCAL_MMR_BASE : \ + UV3_LOCAL_MMR_BASE)) +#define UV_GLOBAL_MMR32_BASE (is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE :\ + (is_uv2_hub() ? UV2_GLOBAL_MMR32_BASE :\ + UV3_GLOBAL_MMR32_BASE)) +#define UV_LOCAL_MMR_SIZE (is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \ + (is_uv2_hub() ? UV2_LOCAL_MMR_SIZE : \ + UV3_LOCAL_MMR_SIZE)) #define UV_GLOBAL_MMR32_SIZE (is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE :\ - UV2_GLOBAL_MMR32_SIZE) + (is_uv2_hub() ? UV2_GLOBAL_MMR32_SIZE :\ + UV3_GLOBAL_MMR32_SIZE)) #define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base) #define UV_GLOBAL_GRU_MMR_BASE 0x4000000 @@ -599,6 +626,7 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) * 1 - UV1 rev 1.0 initial silicon * 2 - UV1 rev 2.0 production silicon * 3 - UV2 rev 1.0 initial silicon + * 5 - UV3 rev 1.0 initial silicon */ static inline int uv_get_min_hub_revision_id(void) { diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index cf1d73643f6..bd5f80e58a2 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -5,16 +5,25 @@ * * SGI UV MMR definitions * - * Copyright (C) 2007-2011 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_MMRS_H #define _ASM_X86_UV_UV_MMRS_H /* - * This file contains MMR definitions for both UV1 & UV2 hubs. + * This file contains MMR definitions for all UV hubs types. * - * In general, MMR addresses and structures are identical on both hubs. + * To minimize coding differences between hub types, the symbols are + * grouped by architecture types. + * + * UVH - definitions common to all UV hub types. + * UVXH - definitions common to all UV eXtended hub types (currently 2 & 3). + * UV1H - definitions specific to UV type 1 hub. + * UV2H - definitions specific to UV type 2 hub. + * UV3H - definitions specific to UV type 3 hub. + * + * So in general, MMR addresses and structures are identical on all hubs types. * These MMRs are identified as: * #define UVH_xxx <address> * union uvh_xxx { @@ -23,24 +32,36 @@ * } s; * }; * - * If the MMR exists on both hub type but has different addresses or - * contents, the MMR definition is similar to: - * #define UV1H_xxx <uv1 address> - * #define UV2H_xxx <uv2address> - * #define UVH_xxx (is_uv1_hub() ? UV1H_xxx : UV2H_xxx) + * If the MMR exists on all hub types but have different addresses: + * #define UV1Hxxx a + * #define UV2Hxxx b + * #define UV3Hxxx c + * #define UVHxxx (is_uv1_hub() ? UV1Hxxx : + * (is_uv2_hub() ? UV2Hxxx : + * UV3Hxxx)) + * + * If the MMR exists on all hub types > 1 but have different addresses: + * #define UV2Hxxx b + * #define UV3Hxxx c + * #define UVXHxxx (is_uv2_hub() ? UV2Hxxx : + * UV3Hxxx)) + * * union uvh_xxx { * unsigned long v; - * struct uv1h_int_cmpd_s { (Common fields only) + * struct uvh_xxx_s { # Common fields only * } s; - * struct uv1h_int_cmpd_s { (Full UV1 definition) + * struct uv1h_xxx_s { # Full UV1 definition (*) * } s1; - * struct uv2h_int_cmpd_s { (Full UV2 definition) + * struct uv2h_xxx_s { # Full UV2 definition (*) * } s2; + * struct uv3h_xxx_s { # Full UV3 definition (*) + * } s3; * }; + * (* - if present and different than the common struct) * - * Only essential difference are enumerated. For example, if the address is - * the same for both UV1 & UV2, only a single #define is generated. Likewise, - * if the contents is the same for both hubs, only the "s" structure is + * Only essential differences are enumerated. For example, if the address is + * the same for all UV's, only a single #define is generated. Likewise, + * if the contents is the same for all hubs, only the "s" structure is * generated. * * If the MMR exists on ONLY 1 type of hub, no generic definition is @@ -51,6 +72,8 @@ * struct uvh_int_cmpd_s { * } sn; * }; + * + * (GEN Flags: mflags_opt= undefs=0 UV23=UVXH) */ #define UV_MMR_ENABLE (1UL << 63) @@ -58,15 +81,18 @@ #define UV1_HUB_PART_NUMBER 0x88a5 #define UV2_HUB_PART_NUMBER 0x8eb8 #define UV2_HUB_PART_NUMBER_X 0x1111 +#define UV3_HUB_PART_NUMBER 0x9578 +#define UV3_HUB_PART_NUMBER_X 0x4321 -/* Compat: if this #define is present, UV headers support UV2 */ +/* Compat: Indicate which UV Hubs are supported. */ #define UV2_HUB_IS_SUPPORTED 1 +#define UV3_HUB_IS_SUPPORTED 1 /* ========================================================================= */ /* UVH_BAU_DATA_BROADCAST */ /* ========================================================================= */ -#define UVH_BAU_DATA_BROADCAST 0x61688UL -#define UVH_BAU_DATA_BROADCAST_32 0x440 +#define UVH_BAU_DATA_BROADCAST 0x61688UL +#define UVH_BAU_DATA_BROADCAST_32 0x440 #define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0 #define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL @@ -82,8 +108,8 @@ union uvh_bau_data_broadcast_u { /* ========================================================================= */ /* UVH_BAU_DATA_CONFIG */ /* ========================================================================= */ -#define UVH_BAU_DATA_CONFIG 0x61680UL -#define UVH_BAU_DATA_CONFIG_32 0x438 +#define UVH_BAU_DATA_CONFIG 0x61680UL +#define UVH_BAU_DATA_CONFIG_32 0x438 #define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 #define UVH_BAU_DATA_CONFIG_DM_SHFT 8 @@ -121,10 +147,14 @@ union uvh_bau_data_config_u { /* ========================================================================= */ /* UVH_EVENT_OCCURRED0 */ /* ========================================================================= */ -#define UVH_EVENT_OCCURRED0 0x70000UL -#define UVH_EVENT_OCCURRED0_32 0x5e8 +#define UVH_EVENT_OCCURRED0 0x70000UL +#define UVH_EVENT_OCCURRED0_32 0x5e8 + +#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0 +#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 +#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL +#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL -#define UV1H_EVENT_OCCURRED0_LB_HCERR_SHFT 0 #define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1 #define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2 #define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3 @@ -135,7 +165,6 @@ union uvh_bau_data_config_u { #define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8 #define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9 #define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10 -#define UV1H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 #define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12 #define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13 #define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14 @@ -181,7 +210,6 @@ union uvh_bau_data_config_u { #define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54 #define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55 #define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56 -#define UV1H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL #define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL #define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL #define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL @@ -192,7 +220,6 @@ union uvh_bau_data_config_u { #define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL #define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL #define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL -#define UV1H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL #define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL #define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL #define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL @@ -239,188 +266,130 @@ union uvh_bau_data_config_u { #define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL #define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL -#define UV2H_EVENT_OCCURRED0_LB_HCERR_SHFT 0 -#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1 -#define UV2H_EVENT_OCCURRED0_RH_HCERR_SHFT 2 -#define UV2H_EVENT_OCCURRED0_LH0_HCERR_SHFT 3 -#define UV2H_EVENT_OCCURRED0_LH1_HCERR_SHFT 4 -#define UV2H_EVENT_OCCURRED0_GR0_HCERR_SHFT 5 -#define UV2H_EVENT_OCCURRED0_GR1_HCERR_SHFT 6 -#define UV2H_EVENT_OCCURRED0_NI0_HCERR_SHFT 7 -#define UV2H_EVENT_OCCURRED0_NI1_HCERR_SHFT 8 -#define UV2H_EVENT_OCCURRED0_LB_AOERR0_SHFT 9 -#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 -#define UV2H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 -#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12 -#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13 -#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14 -#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15 -#define UV2H_EVENT_OCCURRED0_XB_AOERR0_SHFT 16 -#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 -#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 -#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 -#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 -#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 -#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 -#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 -#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 -#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 -#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 -#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 -#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 -#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 -#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 -#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 -#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 -#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 -#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 -#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 -#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 -#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53 -#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 -#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 -#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 -#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 -#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 -#define UV2H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL -#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL -#define UV2H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL -#define UV2H_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL -#define UV2H_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL -#define UV2H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL -#define UV2H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL -#define UV2H_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL -#define UV2H_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL -#define UV2H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL -#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL -#define UV2H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL -#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL -#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL -#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL -#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL -#define UV2H_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL -#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL -#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL -#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL -#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL -#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL -#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL -#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL -#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL -#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL -#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL -#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL -#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL -#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL -#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL -#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL -#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL -#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL -#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL -#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL -#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL -#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL -#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL -#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL -#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL -#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL -#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL -#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL +#define UVXH_EVENT_OCCURRED0_QP_HCERR_SHFT 1 +#define UVXH_EVENT_OCCURRED0_RH_HCERR_SHFT 2 +#define UVXH_EVENT_OCCURRED0_LH0_HCERR_SHFT 3 +#define UVXH_EVENT_OCCURRED0_LH1_HCERR_SHFT 4 +#define UVXH_EVENT_OCCURRED0_GR0_HCERR_SHFT 5 +#define UVXH_EVENT_OCCURRED0_GR1_HCERR_SHFT 6 +#define UVXH_EVENT_OCCURRED0_NI0_HCERR_SHFT 7 +#define UVXH_EVENT_OCCURRED0_NI1_HCERR_SHFT 8 +#define UVXH_EVENT_OCCURRED0_LB_AOERR0_SHFT 9 +#define UVXH_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 +#define UVXH_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12 +#define UVXH_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13 +#define UVXH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14 +#define UVXH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15 +#define UVXH_EVENT_OCCURRED0_XB_AOERR0_SHFT 16 +#define UVXH_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 +#define UVXH_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 +#define UVXH_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 +#define UVXH_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 +#define UVXH_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 +#define UVXH_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 +#define UVXH_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 +#define UVXH_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 +#define UVXH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 +#define UVXH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 +#define UVXH_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 +#define UVXH_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 +#define UVXH_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 +#define UVXH_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 +#define UVXH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 +#define UVXH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 +#define UVXH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 +#define UVXH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 +#define UVXH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 +#define UVXH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 +#define UVXH_EVENT_OCCURRED0_IPI_INT_SHFT 53 +#define UVXH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 +#define UVXH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 +#define UVXH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 +#define UVXH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 +#define UVXH_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 +#define UVXH_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL +#define UVXH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL +#define UVXH_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL +#define UVXH_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL +#define UVXH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL +#define UVXH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL +#define UVXH_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL +#define UVXH_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL +#define UVXH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL +#define UVXH_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL +#define UVXH_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL +#define UVXH_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL +#define UVXH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL +#define UVXH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL +#define UVXH_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL +#define UVXH_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL +#define UVXH_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL +#define UVXH_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL +#define UVXH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL +#define UVXH_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL +#define UVXH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL +#define UVXH_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL +#define UVXH_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL +#define UVXH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL +#define UVXH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL +#define UVXH_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL +#define UVXH_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL +#define UVXH_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL +#define UVXH_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL +#define UVXH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL +#define UVXH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL +#define UVXH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL +#define UVXH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL +#define UVXH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL +#define UVXH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL +#define UVXH_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL +#define UVXH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL +#define UVXH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL +#define UVXH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL +#define UVXH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL +#define UVXH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL union uvh_event_occurred0_u { unsigned long v; - struct uv1h_event_occurred0_s { + struct uvh_event_occurred0_s { unsigned long lb_hcerr:1; /* RW, W1C */ - unsigned long gr0_hcerr:1; /* RW, W1C */ - unsigned long gr1_hcerr:1; /* RW, W1C */ - unsigned long lh_hcerr:1; /* RW, W1C */ - unsigned long rh_hcerr:1; /* RW, W1C */ - unsigned long xn_hcerr:1; /* RW, W1C */ - unsigned long si_hcerr:1; /* RW, W1C */ - unsigned long lb_aoerr0:1; /* RW, W1C */ - unsigned long gr0_aoerr0:1; /* RW, W1C */ - unsigned long gr1_aoerr0:1; /* RW, W1C */ - unsigned long lh_aoerr0:1; /* RW, W1C */ + unsigned long rsvd_1_10:10; unsigned long rh_aoerr0:1; /* RW, W1C */ - unsigned long xn_aoerr0:1; /* RW, W1C */ - unsigned long si_aoerr0:1; /* RW, W1C */ - unsigned long lb_aoerr1:1; /* RW, W1C */ - unsigned long gr0_aoerr1:1; /* RW, W1C */ - unsigned long gr1_aoerr1:1; /* RW, W1C */ - unsigned long lh_aoerr1:1; /* RW, W1C */ - unsigned long rh_aoerr1:1; /* RW, W1C */ - unsigned long xn_aoerr1:1; /* RW, W1C */ - unsigned long si_aoerr1:1; /* RW, W1C */ - unsigned long rh_vpi_int:1; /* RW, W1C */ - unsigned long system_shutdown_int:1; /* RW, W1C */ - unsigned long lb_irq_int_0:1; /* RW, W1C */ - unsigned long lb_irq_int_1:1; /* RW, W1C */ - unsigned long lb_irq_int_2:1; /* RW, W1C */ - unsigned long lb_irq_int_3:1; /* RW, W1C */ - unsigned long lb_irq_int_4:1; /* RW, W1C */ - unsigned long lb_irq_int_5:1; /* RW, W1C */ - unsigned long lb_irq_int_6:1; /* RW, W1C */ - unsigned long lb_irq_int_7:1; /* RW, W1C */ - unsigned long lb_irq_int_8:1; /* RW, W1C */ - unsigned long lb_irq_int_9:1; /* RW, W1C */ - unsigned long lb_irq_int_10:1; /* RW, W1C */ - unsigned long lb_irq_int_11:1; /* RW, W1C */ - unsigned long lb_irq_int_12:1; /* RW, W1C */ - unsigned long lb_irq_int_13:1; /* RW, W1C */ - unsigned long lb_irq_int_14:1; /* RW, W1C */ - unsigned long lb_irq_int_15:1; /* RW, W1C */ - unsigned long l1_nmi_int:1; /* RW, W1C */ - unsigned long stop_clock:1; /* RW, W1C */ - unsigned long asic_to_l1:1; /* RW, W1C */ - unsigned long l1_to_asic:1; /* RW, W1C */ - unsigned long ltc_int:1; /* RW, W1C */ - unsigned long la_seq_trigger:1; /* RW, W1C */ - unsigned long ipi_int:1; /* RW, W1C */ - unsigned long extio_int0:1; /* RW, W1C */ - unsigned long extio_int1:1; /* RW, W1C */ - unsigned long extio_int2:1; /* RW, W1C */ - unsigned long extio_int3:1; /* RW, W1C */ - unsigned long profile_int:1; /* RW, W1C */ - unsigned long rtc0:1; /* RW, W1C */ - unsigned long rtc1:1; /* RW, W1C */ - unsigned long rtc2:1; /* RW, W1C */ - unsigned long rtc3:1; /* RW, W1C */ - unsigned long bau_data:1; /* RW, W1C */ - unsigned long power_management_req:1; /* RW, W1C */ - unsigned long rsvd_57_63:7; - } s1; - struct uv2h_event_occurred0_s { + unsigned long rsvd_12_63:52; + } s; + struct uvxh_event_occurred0_s { unsigned long lb_hcerr:1; /* RW */ unsigned long qp_hcerr:1; /* RW */ unsigned long rh_hcerr:1; /* RW */ @@ -481,19 +450,20 @@ union uvh_event_occurred0_u { unsigned long extio_int3:1; /* RW */ unsigned long profile_int:1; /* RW */ unsigned long rsvd_59_63:5; - } s2; + } sx; }; /* ========================================================================= */ /* UVH_EVENT_OCCURRED0_ALIAS */ /* ========================================================================= */ -#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL -#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0 +#define UVH_EVENT_OCCURRED0_ALIAS 0x70008UL +#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0 + /* ========================================================================= */ /* UVH_GR0_TLB_INT0_CONFIG */ /* ========================================================================= */ -#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL +#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL #define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0 #define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8 @@ -531,7 +501,7 @@ union uvh_gr0_tlb_int0_config_u { /* ========================================================================= */ /* UVH_GR0_TLB_INT1_CONFIG */ /* ========================================================================= */ -#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL +#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL #define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0 #define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8 @@ -571,9 +541,11 @@ union uvh_gr0_tlb_int1_config_u { /* ========================================================================= */ #define UV1H_GR0_TLB_MMR_CONTROL 0x401080UL #define UV2H_GR0_TLB_MMR_CONTROL 0xc01080UL -#define UVH_GR0_TLB_MMR_CONTROL (is_uv1_hub() ? \ - UV1H_GR0_TLB_MMR_CONTROL : \ - UV2H_GR0_TLB_MMR_CONTROL) +#define UV3H_GR0_TLB_MMR_CONTROL 0xc01080UL +#define UVH_GR0_TLB_MMR_CONTROL \ + (is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL : \ + (is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL : \ + UV3H_GR0_TLB_MMR_CONTROL)) #define UVH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 #define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 @@ -611,6 +583,21 @@ union uvh_gr0_tlb_int1_config_u { #define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL #define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL +#define UVXH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UVXH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UVXH_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UVXH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL + #define UV2H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 #define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 #define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 @@ -630,6 +617,23 @@ union uvh_gr0_tlb_int1_config_u { #define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL #define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL +#define UV3H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UV3H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV3H_GR0_TLB_MMR_CONTROL_ECC_SEL_SHFT 21 +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UV3H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UV3H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV3H_GR0_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL + union uvh_gr0_tlb_mmr_control_u { unsigned long v; struct uvh_gr0_tlb_mmr_control_s { @@ -642,7 +646,9 @@ union uvh_gr0_tlb_mmr_control_u { unsigned long rsvd_21_29:9; unsigned long mmr_write:1; /* WP */ unsigned long mmr_read:1; /* WP */ - unsigned long rsvd_32_63:32; + unsigned long rsvd_32_48:17; + unsigned long rsvd_49_51:3; + unsigned long rsvd_52_63:12; } s; struct uv1h_gr0_tlb_mmr_control_s { unsigned long index:12; /* RW */ @@ -666,6 +672,23 @@ union uvh_gr0_tlb_mmr_control_u { unsigned long mmr_inj_tlblruv:1; /* RW */ unsigned long rsvd_61_63:3; } s1; + struct uvxh_gr0_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long rsvd_48:1; + unsigned long rsvd_49_51:3; + unsigned long rsvd_52:1; + unsigned long rsvd_53_63:11; + } sx; struct uv2h_gr0_tlb_mmr_control_s { unsigned long index:12; /* RW */ unsigned long mem_sel:2; /* RW */ @@ -683,6 +706,24 @@ union uvh_gr0_tlb_mmr_control_u { unsigned long mmr_inj_tlbram:1; /* RW */ unsigned long rsvd_53_63:11; } s2; + struct uv3h_gr0_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long ecc_sel:1; /* RW */ + unsigned long rsvd_22_29:8; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long undef_48:1; /* Undefined */ + unsigned long rsvd_49_51:3; + unsigned long undef_52:1; /* Undefined */ + unsigned long rsvd_53_63:11; + } s3; }; /* ========================================================================= */ @@ -690,9 +731,11 @@ union uvh_gr0_tlb_mmr_control_u { /* ========================================================================= */ #define UV1H_GR0_TLB_MMR_READ_DATA_HI 0x4010a0UL #define UV2H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL -#define UVH_GR0_TLB_MMR_READ_DATA_HI (is_uv1_hub() ? \ - UV1H_GR0_TLB_MMR_READ_DATA_HI : \ - UV2H_GR0_TLB_MMR_READ_DATA_HI) +#define UV3H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL +#define UVH_GR0_TLB_MMR_READ_DATA_HI \ + (is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_HI : \ + (is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_HI : \ + UV3H_GR0_TLB_MMR_READ_DATA_HI)) #define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 @@ -703,6 +746,46 @@ union uvh_gr0_tlb_mmr_control_u { #define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL #define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 45 +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55 +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL + union uvh_gr0_tlb_mmr_read_data_hi_u { unsigned long v; struct uvh_gr0_tlb_mmr_read_data_hi_s { @@ -712,6 +795,36 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { unsigned long larger:1; /* RO */ unsigned long rsvd_45_63:19; } s; + struct uv1h_gr0_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } s1; + struct uvxh_gr0_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } sx; + struct uv2h_gr0_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } s2; + struct uv3h_gr0_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long aa_ext:1; /* RO */ + unsigned long undef_46_54:9; /* Undefined */ + unsigned long way_ecc:9; /* RO */ + } s3; }; /* ========================================================================= */ @@ -719,9 +832,11 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { /* ========================================================================= */ #define UV1H_GR0_TLB_MMR_READ_DATA_LO 0x4010a8UL #define UV2H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL -#define UVH_GR0_TLB_MMR_READ_DATA_LO (is_uv1_hub() ? \ - UV1H_GR0_TLB_MMR_READ_DATA_LO : \ - UV2H_GR0_TLB_MMR_READ_DATA_LO) +#define UV3H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL +#define UVH_GR0_TLB_MMR_READ_DATA_LO \ + (is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_LO : \ + (is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_LO : \ + UV3H_GR0_TLB_MMR_READ_DATA_LO)) #define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 #define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 @@ -730,6 +845,34 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { #define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL #define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL +#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV1H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV1H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UVXH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UVXH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV2H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV2H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV3H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV3H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + union uvh_gr0_tlb_mmr_read_data_lo_u { unsigned long v; struct uvh_gr0_tlb_mmr_read_data_lo_s { @@ -737,12 +880,32 @@ union uvh_gr0_tlb_mmr_read_data_lo_u { unsigned long asid:24; /* RO */ unsigned long valid:1; /* RO */ } s; + struct uv1h_gr0_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s1; + struct uvxh_gr0_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } sx; + struct uv2h_gr0_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s2; + struct uv3h_gr0_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s3; }; /* ========================================================================= */ /* UVH_GR1_TLB_INT0_CONFIG */ /* ========================================================================= */ -#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL +#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL #define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0 #define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8 @@ -780,7 +943,7 @@ union uvh_gr1_tlb_int0_config_u { /* ========================================================================= */ /* UVH_GR1_TLB_INT1_CONFIG */ /* ========================================================================= */ -#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL +#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL #define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0 #define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8 @@ -820,9 +983,11 @@ union uvh_gr1_tlb_int1_config_u { /* ========================================================================= */ #define UV1H_GR1_TLB_MMR_CONTROL 0x801080UL #define UV2H_GR1_TLB_MMR_CONTROL 0x1001080UL -#define UVH_GR1_TLB_MMR_CONTROL (is_uv1_hub() ? \ - UV1H_GR1_TLB_MMR_CONTROL : \ - UV2H_GR1_TLB_MMR_CONTROL) +#define UV3H_GR1_TLB_MMR_CONTROL 0x1001080UL +#define UVH_GR1_TLB_MMR_CONTROL \ + (is_uv1_hub() ? UV1H_GR1_TLB_MMR_CONTROL : \ + (is_uv2_hub() ? UV2H_GR1_TLB_MMR_CONTROL : \ + UV3H_GR1_TLB_MMR_CONTROL)) #define UVH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 #define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 @@ -860,6 +1025,21 @@ union uvh_gr1_tlb_int1_config_u { #define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL #define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL +#define UVXH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UVXH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UVXH_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UVXH_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL + #define UV2H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 #define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 #define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 @@ -879,6 +1059,23 @@ union uvh_gr1_tlb_int1_config_u { #define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL #define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL +#define UV3H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV3H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UV3H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV3H_GR1_TLB_MMR_CONTROL_ECC_SEL_SHFT 21 +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UV3H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UV3H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UV3H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV3H_GR1_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL + union uvh_gr1_tlb_mmr_control_u { unsigned long v; struct uvh_gr1_tlb_mmr_control_s { @@ -891,7 +1088,9 @@ union uvh_gr1_tlb_mmr_control_u { unsigned long rsvd_21_29:9; unsigned long mmr_write:1; /* WP */ unsigned long mmr_read:1; /* WP */ - unsigned long rsvd_32_63:32; + unsigned long rsvd_32_48:17; + unsigned long rsvd_49_51:3; + unsigned long rsvd_52_63:12; } s; struct uv1h_gr1_tlb_mmr_control_s { unsigned long index:12; /* RW */ @@ -915,6 +1114,23 @@ union uvh_gr1_tlb_mmr_control_u { unsigned long mmr_inj_tlblruv:1; /* RW */ unsigned long rsvd_61_63:3; } s1; + struct uvxh_gr1_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long rsvd_48:1; + unsigned long rsvd_49_51:3; + unsigned long rsvd_52:1; + unsigned long rsvd_53_63:11; + } sx; struct uv2h_gr1_tlb_mmr_control_s { unsigned long index:12; /* RW */ unsigned long mem_sel:2; /* RW */ @@ -932,6 +1148,24 @@ union uvh_gr1_tlb_mmr_control_u { unsigned long mmr_inj_tlbram:1; /* RW */ unsigned long rsvd_53_63:11; } s2; + struct uv3h_gr1_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long ecc_sel:1; /* RW */ + unsigned long rsvd_22_29:8; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long undef_48:1; /* Undefined */ + unsigned long rsvd_49_51:3; + unsigned long undef_52:1; /* Undefined */ + unsigned long rsvd_53_63:11; + } s3; }; /* ========================================================================= */ @@ -939,9 +1173,11 @@ union uvh_gr1_tlb_mmr_control_u { /* ========================================================================= */ #define UV1H_GR1_TLB_MMR_READ_DATA_HI 0x8010a0UL #define UV2H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL -#define UVH_GR1_TLB_MMR_READ_DATA_HI (is_uv1_hub() ? \ - UV1H_GR1_TLB_MMR_READ_DATA_HI : \ - UV2H_GR1_TLB_MMR_READ_DATA_HI) +#define UV3H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL +#define UVH_GR1_TLB_MMR_READ_DATA_HI \ + (is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_HI : \ + (is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_HI : \ + UV3H_GR1_TLB_MMR_READ_DATA_HI)) #define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 @@ -952,6 +1188,46 @@ union uvh_gr1_tlb_mmr_control_u { #define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL #define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 45 +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55 +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL + union uvh_gr1_tlb_mmr_read_data_hi_u { unsigned long v; struct uvh_gr1_tlb_mmr_read_data_hi_s { @@ -961,6 +1237,36 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { unsigned long larger:1; /* RO */ unsigned long rsvd_45_63:19; } s; + struct uv1h_gr1_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } s1; + struct uvxh_gr1_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } sx; + struct uv2h_gr1_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } s2; + struct uv3h_gr1_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long aa_ext:1; /* RO */ + unsigned long undef_46_54:9; /* Undefined */ + unsigned long way_ecc:9; /* RO */ + } s3; }; /* ========================================================================= */ @@ -968,9 +1274,11 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { /* ========================================================================= */ #define UV1H_GR1_TLB_MMR_READ_DATA_LO 0x8010a8UL #define UV2H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL -#define UVH_GR1_TLB_MMR_READ_DATA_LO (is_uv1_hub() ? \ - UV1H_GR1_TLB_MMR_READ_DATA_LO : \ - UV2H_GR1_TLB_MMR_READ_DATA_LO) +#define UV3H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL +#define UVH_GR1_TLB_MMR_READ_DATA_LO \ + (is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_LO : \ + (is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_LO : \ + UV3H_GR1_TLB_MMR_READ_DATA_LO)) #define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 #define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 @@ -979,6 +1287,34 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { #define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL #define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL +#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV1H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV1H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UVXH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UVXH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV2H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV2H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV3H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV3H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + union uvh_gr1_tlb_mmr_read_data_lo_u { unsigned long v; struct uvh_gr1_tlb_mmr_read_data_lo_s { @@ -986,12 +1322,32 @@ union uvh_gr1_tlb_mmr_read_data_lo_u { unsigned long asid:24; /* RO */ unsigned long valid:1; /* RO */ } s; + struct uv1h_gr1_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s1; + struct uvxh_gr1_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } sx; + struct uv2h_gr1_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s2; + struct uv3h_gr1_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s3; }; /* ========================================================================= */ /* UVH_INT_CMPB */ /* ========================================================================= */ -#define UVH_INT_CMPB 0x22080UL +#define UVH_INT_CMPB 0x22080UL #define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0 #define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL @@ -1007,10 +1363,13 @@ union uvh_int_cmpb_u { /* ========================================================================= */ /* UVH_INT_CMPC */ /* ========================================================================= */ -#define UVH_INT_CMPC 0x22100UL +#define UVH_INT_CMPC 0x22100UL + +#define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT 0 +#define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL -#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0 -#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL +#define UVXH_INT_CMPC_REAL_TIME_CMP_2_SHFT 0 +#define UVXH_INT_CMPC_REAL_TIME_CMP_2_MASK 0x00ffffffffffffffUL union uvh_int_cmpc_u { unsigned long v; @@ -1023,10 +1382,13 @@ union uvh_int_cmpc_u { /* ========================================================================= */ /* UVH_INT_CMPD */ /* ========================================================================= */ -#define UVH_INT_CMPD 0x22180UL +#define UVH_INT_CMPD 0x22180UL -#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0 -#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL +#define UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT 0 +#define UV1H_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL + +#define UVXH_INT_CMPD_REAL_TIME_CMP_3_SHFT 0 +#define UVXH_INT_CMPD_REAL_TIME_CMP_3_MASK 0x00ffffffffffffffUL union uvh_int_cmpd_u { unsigned long v; @@ -1039,8 +1401,8 @@ union uvh_int_cmpd_u { /* ========================================================================= */ /* UVH_IPI_INT */ /* ========================================================================= */ -#define UVH_IPI_INT 0x60500UL -#define UVH_IPI_INT_32 0x348 +#define UVH_IPI_INT 0x60500UL +#define UVH_IPI_INT_32 0x348 #define UVH_IPI_INT_VECTOR_SHFT 0 #define UVH_IPI_INT_DELIVERY_MODE_SHFT 8 @@ -1069,8 +1431,8 @@ union uvh_ipi_int_u { /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 @@ -1091,8 +1453,8 @@ union uvh_lb_bau_intd_payload_queue_first_u { /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL @@ -1109,8 +1471,8 @@ union uvh_lb_bau_intd_payload_queue_last_u { /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL @@ -1127,8 +1489,8 @@ union uvh_lb_bau_intd_payload_queue_tail_u { /* ========================================================================= */ /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 @@ -1189,14 +1551,21 @@ union uvh_lb_bau_intd_software_acknowledge_u { /* ========================================================================= */ /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70 + /* ========================================================================= */ /* UVH_LB_BAU_MISC_CONTROL */ /* ========================================================================= */ -#define UVH_LB_BAU_MISC_CONTROL 0x320170UL -#define UVH_LB_BAU_MISC_CONTROL_32 0xa10 +#define UVH_LB_BAU_MISC_CONTROL 0x320170UL +#define UV1H_LB_BAU_MISC_CONTROL 0x320170UL +#define UV2H_LB_BAU_MISC_CONTROL 0x320170UL +#define UV3H_LB_BAU_MISC_CONTROL 0x320170UL +#define UVH_LB_BAU_MISC_CONTROL_32 0xa10 +#define UV1H_LB_BAU_MISC_CONTROL_32 0x320170UL +#define UV2H_LB_BAU_MISC_CONTROL_32 0x320170UL +#define UV3H_LB_BAU_MISC_CONTROL_32 0x320170UL #define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 #define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 @@ -1213,6 +1582,7 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 #define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 #define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 +#define UVH_LB_BAU_MISC_CONTROL_FUN_SHFT 48 #define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL #define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL #define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL @@ -1228,6 +1598,7 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL #define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL #define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL +#define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL #define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 #define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 @@ -1262,6 +1633,53 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL #define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL +#define UVXH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 +#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 +#define UVXH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 +#define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 +#define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 +#define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 +#define UVXH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 +#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 +#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 +#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 +#define UVXH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 +#define UVXH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29 +#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30 +#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31 +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32 +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33 +#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34 +#define UVXH_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35 +#define UVXH_LB_BAU_MISC_CONTROL_FUN_SHFT 48 +#define UVXH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL +#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL +#define UVXH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL +#define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL +#define UVXH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL +#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL +#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL +#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL +#define UVXH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL +#define UVXH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL +#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL +#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL +#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL +#define UVXH_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL +#define UVXH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL + #define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 #define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 #define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 @@ -1309,6 +1727,59 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL #define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL +#define UV3H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 +#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 +#define UV3H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 +#define UV3H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 +#define UV3H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 +#define UV3H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 +#define UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 +#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 +#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 +#define UV3H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 +#define UV3H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29 +#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30 +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31 +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32 +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33 +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34 +#define UV3H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35 +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_SHFT 36 +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_SHFT 37 +#define UV3H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_SHFT 38 +#define UV3H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 +#define UV3H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL +#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL +#define UV3H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL +#define UV3H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UV3H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UV3H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL +#define UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL +#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL +#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL +#define UV3H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL +#define UV3H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL +#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL +#define UV3H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_MASK 0x0000001000000000UL +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_MASK 0x0000002000000000UL +#define UV3H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_MASK 0x00003fc000000000UL +#define UV3H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL + union uvh_lb_bau_misc_control_u { unsigned long v; struct uvh_lb_bau_misc_control_s { @@ -1327,7 +1798,8 @@ union uvh_lb_bau_misc_control_u { unsigned long programmed_initial_priority:3; /* RW */ unsigned long use_incoming_priority:1; /* RW */ unsigned long enable_programmed_initial_priority:1;/* RW */ - unsigned long rsvd_29_63:35; + unsigned long rsvd_29_47:19; + unsigned long fun:16; /* RW */ } s; struct uv1h_lb_bau_misc_control_s { unsigned long rejection_delay:8; /* RW */ @@ -1348,6 +1820,32 @@ union uvh_lb_bau_misc_control_u { unsigned long rsvd_29_47:19; unsigned long fun:16; /* RW */ } s1; + struct uvxh_lb_bau_misc_control_s { + unsigned long rejection_delay:8; /* RW */ + unsigned long apic_mode:1; /* RW */ + unsigned long force_broadcast:1; /* RW */ + unsigned long force_lock_nop:1; /* RW */ + unsigned long qpi_agent_presence_vector:3; /* RW */ + unsigned long descriptor_fetch_mode:1; /* RW */ + unsigned long enable_intd_soft_ack_mode:1; /* RW */ + unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long enable_dual_mapping_mode:1; /* RW */ + unsigned long vga_io_port_decode_enable:1; /* RW */ + unsigned long vga_io_port_16_bit_decode:1; /* RW */ + unsigned long suppress_dest_registration:1; /* RW */ + unsigned long programmed_initial_priority:3; /* RW */ + unsigned long use_incoming_priority:1; /* RW */ + unsigned long enable_programmed_initial_priority:1;/* RW */ + unsigned long enable_automatic_apic_mode_selection:1;/* RW */ + unsigned long apic_mode_status:1; /* RO */ + unsigned long suppress_interrupts_to_self:1; /* RW */ + unsigned long enable_lock_based_system_flush:1;/* RW */ + unsigned long enable_extended_sb_status:1; /* RW */ + unsigned long suppress_int_prio_udt_to_self:1;/* RW */ + unsigned long use_legacy_descriptor_formats:1;/* RW */ + unsigned long rsvd_36_47:12; + unsigned long fun:16; /* RW */ + } sx; struct uv2h_lb_bau_misc_control_s { unsigned long rejection_delay:8; /* RW */ unsigned long apic_mode:1; /* RW */ @@ -1374,13 +1872,42 @@ union uvh_lb_bau_misc_control_u { unsigned long rsvd_36_47:12; unsigned long fun:16; /* RW */ } s2; + struct uv3h_lb_bau_misc_control_s { + unsigned long rejection_delay:8; /* RW */ + unsigned long apic_mode:1; /* RW */ + unsigned long force_broadcast:1; /* RW */ + unsigned long force_lock_nop:1; /* RW */ + unsigned long qpi_agent_presence_vector:3; /* RW */ + unsigned long descriptor_fetch_mode:1; /* RW */ + unsigned long enable_intd_soft_ack_mode:1; /* RW */ + unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long enable_dual_mapping_mode:1; /* RW */ + unsigned long vga_io_port_decode_enable:1; /* RW */ + unsigned long vga_io_port_16_bit_decode:1; /* RW */ + unsigned long suppress_dest_registration:1; /* RW */ + unsigned long programmed_initial_priority:3; /* RW */ + unsigned long use_incoming_priority:1; /* RW */ + unsigned long enable_programmed_initial_priority:1;/* RW */ + unsigned long enable_automatic_apic_mode_selection:1;/* RW */ + unsigned long apic_mode_status:1; /* RO */ + unsigned long suppress_interrupts_to_self:1; /* RW */ + unsigned long enable_lock_based_system_flush:1;/* RW */ + unsigned long enable_extended_sb_status:1; /* RW */ + unsigned long suppress_int_prio_udt_to_self:1;/* RW */ + unsigned long use_legacy_descriptor_formats:1;/* RW */ + unsigned long suppress_quiesce_msgs_to_qpi:1; /* RW */ + unsigned long enable_intd_prefetch_hint:1; /* RW */ + unsigned long thread_kill_timebase:8; /* RW */ + unsigned long rsvd_46_47:2; + unsigned long fun:16; /* RW */ + } s3; }; /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62 @@ -1402,8 +1929,8 @@ union uvh_lb_bau_sb_activation_control_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL @@ -1418,8 +1945,8 @@ union uvh_lb_bau_sb_activation_status_0_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL @@ -1434,8 +1961,8 @@ union uvh_lb_bau_sb_activation_status_1_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_DESCRIPTOR_BASE */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 @@ -1456,7 +1983,10 @@ union uvh_lb_bau_sb_descriptor_base_u { /* ========================================================================= */ /* UVH_NODE_ID */ /* ========================================================================= */ -#define UVH_NODE_ID 0x0UL +#define UVH_NODE_ID 0x0UL +#define UV1H_NODE_ID 0x0UL +#define UV2H_NODE_ID 0x0UL +#define UV3H_NODE_ID 0x0UL #define UVH_NODE_ID_FORCE1_SHFT 0 #define UVH_NODE_ID_MANUFACTURER_SHFT 1 @@ -1484,6 +2014,21 @@ union uvh_lb_bau_sb_descriptor_base_u { #define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL #define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL +#define UVXH_NODE_ID_FORCE1_SHFT 0 +#define UVXH_NODE_ID_MANUFACTURER_SHFT 1 +#define UVXH_NODE_ID_PART_NUMBER_SHFT 12 +#define UVXH_NODE_ID_REVISION_SHFT 28 +#define UVXH_NODE_ID_NODE_ID_SHFT 32 +#define UVXH_NODE_ID_NODES_PER_BIT_SHFT 50 +#define UVXH_NODE_ID_NI_PORT_SHFT 57 +#define UVXH_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UVXH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UVXH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UVXH_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UVXH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UVXH_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL +#define UVXH_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL + #define UV2H_NODE_ID_FORCE1_SHFT 0 #define UV2H_NODE_ID_MANUFACTURER_SHFT 1 #define UV2H_NODE_ID_PART_NUMBER_SHFT 12 @@ -1499,6 +2044,25 @@ union uvh_lb_bau_sb_descriptor_base_u { #define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL #define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL +#define UV3H_NODE_ID_FORCE1_SHFT 0 +#define UV3H_NODE_ID_MANUFACTURER_SHFT 1 +#define UV3H_NODE_ID_PART_NUMBER_SHFT 12 +#define UV3H_NODE_ID_REVISION_SHFT 28 +#define UV3H_NODE_ID_NODE_ID_SHFT 32 +#define UV3H_NODE_ID_ROUTER_SELECT_SHFT 48 +#define UV3H_NODE_ID_RESERVED_2_SHFT 49 +#define UV3H_NODE_ID_NODES_PER_BIT_SHFT 50 +#define UV3H_NODE_ID_NI_PORT_SHFT 57 +#define UV3H_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UV3H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UV3H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UV3H_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UV3H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UV3H_NODE_ID_ROUTER_SELECT_MASK 0x0001000000000000UL +#define UV3H_NODE_ID_RESERVED_2_MASK 0x0002000000000000UL +#define UV3H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL +#define UV3H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL + union uvh_node_id_u { unsigned long v; struct uvh_node_id_s { @@ -1521,6 +2085,17 @@ union uvh_node_id_u { unsigned long ni_port:4; /* RO */ unsigned long rsvd_60_63:4; } s1; + struct uvxh_node_id_s { + unsigned long force1:1; /* RO */ + unsigned long manufacturer:11; /* RO */ + unsigned long part_number:16; /* RO */ + unsigned long revision:4; /* RO */ + unsigned long node_id:15; /* RW */ + unsigned long rsvd_47_49:3; + unsigned long nodes_per_bit:7; /* RO */ + unsigned long ni_port:5; /* RO */ + unsigned long rsvd_62_63:2; + } sx; struct uv2h_node_id_s { unsigned long force1:1; /* RO */ unsigned long manufacturer:11; /* RO */ @@ -1532,13 +2107,26 @@ union uvh_node_id_u { unsigned long ni_port:5; /* RO */ unsigned long rsvd_62_63:2; } s2; + struct uv3h_node_id_s { + unsigned long force1:1; /* RO */ + unsigned long manufacturer:11; /* RO */ + unsigned long part_number:16; /* RO */ + unsigned long revision:4; /* RO */ + unsigned long node_id:15; /* RW */ + unsigned long rsvd_47:1; + unsigned long router_select:1; /* RO */ + unsigned long rsvd_49:1; + unsigned long nodes_per_bit:7; /* RO */ + unsigned long ni_port:5; /* RO */ + unsigned long rsvd_62_63:2; + } s3; }; /* ========================================================================= */ /* UVH_NODE_PRESENT_TABLE */ /* ========================================================================= */ -#define UVH_NODE_PRESENT_TABLE 0x1400UL -#define UVH_NODE_PRESENT_TABLE_DEPTH 16 +#define UVH_NODE_PRESENT_TABLE 0x1400UL +#define UVH_NODE_PRESENT_TABLE_DEPTH 16 #define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0 #define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL @@ -1553,7 +2141,7 @@ union uvh_node_present_table_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 @@ -1577,7 +2165,7 @@ union uvh_rh_gam_alias210_overlay_config_0_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 @@ -1601,7 +2189,7 @@ union uvh_rh_gam_alias210_overlay_config_1_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 @@ -1625,7 +2213,7 @@ union uvh_rh_gam_alias210_overlay_config_2_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL @@ -1642,7 +2230,7 @@ union uvh_rh_gam_alias210_redirect_config_0_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL @@ -1659,7 +2247,7 @@ union uvh_rh_gam_alias210_redirect_config_1_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL @@ -1676,7 +2264,10 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL +#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL +#define UV1H_RH_GAM_CONFIG_MMR 0x1600000UL +#define UV2H_RH_GAM_CONFIG_MMR 0x1600000UL +#define UV3H_RH_GAM_CONFIG_MMR 0x1600000UL #define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 #define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 @@ -1690,11 +2281,21 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { #define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL #define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL +#define UVXH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 +#define UVXH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UVXH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL +#define UVXH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL + #define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 #define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 #define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL #define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL +#define UV3H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 +#define UV3H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UV3H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL +#define UV3H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL + union uvh_rh_gam_config_mmr_u { unsigned long v; struct uvh_rh_gam_config_mmr_s { @@ -1709,20 +2310,37 @@ union uvh_rh_gam_config_mmr_u { unsigned long mmiol_cfg:1; /* RW */ unsigned long rsvd_13_63:51; } s1; + struct uvxh_rh_gam_config_mmr_s { + unsigned long m_skt:6; /* RW */ + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; + } sx; struct uv2h_rh_gam_config_mmr_s { unsigned long m_skt:6; /* RW */ unsigned long n_skt:4; /* RW */ unsigned long rsvd_10_63:54; } s2; + struct uv3h_rh_gam_config_mmr_s { + unsigned long m_skt:6; /* RW */ + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; + } s3; }; /* ========================================================================= */ /* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL #define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 #define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48 @@ -1733,6 +2351,13 @@ union uvh_rh_gam_config_mmr_u { #define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL #define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + #define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 #define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 #define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 @@ -1740,12 +2365,23 @@ union uvh_rh_gam_config_mmr_u { #define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL #define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_MODE_SHFT 62 +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_MODE_MASK 0x4000000000000000UL +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_gru_overlay_config_mmr_u { unsigned long v; struct uvh_rh_gam_gru_overlay_config_mmr_s { unsigned long rsvd_0_27:28; unsigned long base:18; /* RW */ - unsigned long rsvd_46_62:17; + unsigned long rsvd_46_51:6; + unsigned long n_gru:4; /* RW */ + unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ } s; struct uv1h_rh_gam_gru_overlay_config_mmr_s { @@ -1758,6 +2394,14 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ } s1; + struct uvxh_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_27:28; + unsigned long base:18; /* RW */ + unsigned long rsvd_46_51:6; + unsigned long n_gru:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } sx; struct uv2h_rh_gam_gru_overlay_config_mmr_s { unsigned long rsvd_0_27:28; unsigned long base:18; /* RW */ @@ -1766,12 +2410,22 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ } s2; + struct uv3h_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_27:28; + unsigned long base:18; /* RW */ + unsigned long rsvd_46_51:6; + unsigned long n_gru:4; /* RW */ + unsigned long rsvd_56_61:6; + unsigned long mode:1; /* RW */ + unsigned long enable:1; /* RW */ + } s3; }; /* ========================================================================= */ /* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 @@ -1814,10 +2468,15 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL +#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL +#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL #define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 #define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46 @@ -1826,11 +2485,21 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u { #define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL #define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + #define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 #define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 #define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL #define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_mmr_overlay_config_mmr_u { unsigned long v; struct uvh_rh_gam_mmr_overlay_config_mmr_s { @@ -1846,18 +2515,30 @@ union uvh_rh_gam_mmr_overlay_config_mmr_u { unsigned long rsvd_47_62:16; unsigned long enable:1; /* RW */ } s1; + struct uvxh_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long rsvd_46_62:17; + unsigned long enable:1; /* RW */ + } sx; struct uv2h_rh_gam_mmr_overlay_config_mmr_s { unsigned long rsvd_0_25:26; unsigned long base:20; /* RW */ unsigned long rsvd_46_62:17; unsigned long enable:1; /* RW */ } s2; + struct uv3h_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long rsvd_46_62:17; + unsigned long enable:1; /* RW */ + } s3; }; /* ========================================================================= */ /* UVH_RTC */ /* ========================================================================= */ -#define UVH_RTC 0x340000UL +#define UVH_RTC 0x340000UL #define UVH_RTC_REAL_TIME_CLOCK_SHFT 0 #define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL @@ -1873,7 +2554,7 @@ union uvh_rtc_u { /* ========================================================================= */ /* UVH_RTC1_INT_CONFIG */ /* ========================================================================= */ -#define UVH_RTC1_INT_CONFIG 0x615c0UL +#define UVH_RTC1_INT_CONFIG 0x615c0UL #define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0 #define UVH_RTC1_INT_CONFIG_DM_SHFT 8 @@ -1911,8 +2592,8 @@ union uvh_rtc1_int_config_u { /* ========================================================================= */ /* UVH_SCRATCH5 */ /* ========================================================================= */ -#define UVH_SCRATCH5 0x2d0200UL -#define UVH_SCRATCH5_32 0x778 +#define UVH_SCRATCH5 0x2d0200UL +#define UVH_SCRATCH5_32 0x778 #define UVH_SCRATCH5_SCRATCH5_SHFT 0 #define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL @@ -1925,79 +2606,79 @@ union uvh_scratch5_u { }; /* ========================================================================= */ -/* UV2H_EVENT_OCCURRED2 */ -/* ========================================================================= */ -#define UV2H_EVENT_OCCURRED2 0x70100UL -#define UV2H_EVENT_OCCURRED2_32 0xb68 - -#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0 -#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1 -#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2 -#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3 -#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4 -#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5 -#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6 -#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7 -#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8 -#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9 -#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10 -#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11 -#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12 -#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13 -#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14 -#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15 -#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16 -#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17 -#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18 -#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19 -#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20 -#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21 -#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22 -#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23 -#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24 -#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25 -#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26 -#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27 -#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28 -#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29 -#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30 -#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31 -#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL -#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL -#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL -#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL -#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL -#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL -#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL -#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL -#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL -#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL -#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL -#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL -#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL -#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL -#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL -#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL -#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL -#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL -#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL -#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL -#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL -#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL -#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL -#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL -#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL -#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL -#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL -#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL -#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL -#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL -#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL -#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL - -union uv2h_event_occurred2_u { +/* UVXH_EVENT_OCCURRED2 */ +/* ========================================================================= */ +#define UVXH_EVENT_OCCURRED2 0x70100UL +#define UVXH_EVENT_OCCURRED2_32 0xb68 + +#define UVXH_EVENT_OCCURRED2_RTC_0_SHFT 0 +#define UVXH_EVENT_OCCURRED2_RTC_1_SHFT 1 +#define UVXH_EVENT_OCCURRED2_RTC_2_SHFT 2 +#define UVXH_EVENT_OCCURRED2_RTC_3_SHFT 3 +#define UVXH_EVENT_OCCURRED2_RTC_4_SHFT 4 +#define UVXH_EVENT_OCCURRED2_RTC_5_SHFT 5 +#define UVXH_EVENT_OCCURRED2_RTC_6_SHFT 6 +#define UVXH_EVENT_OCCURRED2_RTC_7_SHFT 7 +#define UVXH_EVENT_OCCURRED2_RTC_8_SHFT 8 +#define UVXH_EVENT_OCCURRED2_RTC_9_SHFT 9 +#define UVXH_EVENT_OCCURRED2_RTC_10_SHFT 10 +#define UVXH_EVENT_OCCURRED2_RTC_11_SHFT 11 +#define UVXH_EVENT_OCCURRED2_RTC_12_SHFT 12 +#define UVXH_EVENT_OCCURRED2_RTC_13_SHFT 13 +#define UVXH_EVENT_OCCURRED2_RTC_14_SHFT 14 +#define UVXH_EVENT_OCCURRED2_RTC_15_SHFT 15 +#define UVXH_EVENT_OCCURRED2_RTC_16_SHFT 16 +#define UVXH_EVENT_OCCURRED2_RTC_17_SHFT 17 +#define UVXH_EVENT_OCCURRED2_RTC_18_SHFT 18 +#define UVXH_EVENT_OCCURRED2_RTC_19_SHFT 19 +#define UVXH_EVENT_OCCURRED2_RTC_20_SHFT 20 +#define UVXH_EVENT_OCCURRED2_RTC_21_SHFT 21 +#define UVXH_EVENT_OCCURRED2_RTC_22_SHFT 22 +#define UVXH_EVENT_OCCURRED2_RTC_23_SHFT 23 +#define UVXH_EVENT_OCCURRED2_RTC_24_SHFT 24 +#define UVXH_EVENT_OCCURRED2_RTC_25_SHFT 25 +#define UVXH_EVENT_OCCURRED2_RTC_26_SHFT 26 +#define UVXH_EVENT_OCCURRED2_RTC_27_SHFT 27 +#define UVXH_EVENT_OCCURRED2_RTC_28_SHFT 28 +#define UVXH_EVENT_OCCURRED2_RTC_29_SHFT 29 +#define UVXH_EVENT_OCCURRED2_RTC_30_SHFT 30 +#define UVXH_EVENT_OCCURRED2_RTC_31_SHFT 31 +#define UVXH_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL +#define UVXH_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL +#define UVXH_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL +#define UVXH_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL +#define UVXH_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL +#define UVXH_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL +#define UVXH_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL +#define UVXH_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL +#define UVXH_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL +#define UVXH_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL +#define UVXH_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL +#define UVXH_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL +#define UVXH_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL +#define UVXH_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL +#define UVXH_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL +#define UVXH_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL +#define UVXH_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL +#define UVXH_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL +#define UVXH_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL +#define UVXH_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL +#define UVXH_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL +#define UVXH_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL +#define UVXH_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL +#define UVXH_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL +#define UVXH_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL +#define UVXH_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL +#define UVXH_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL +#define UVXH_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL +#define UVXH_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL +#define UVXH_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL +#define UVXH_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL +#define UVXH_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL + +union uvxh_event_occurred2_u { unsigned long v; - struct uv2h_event_occurred2_s { + struct uvxh_event_occurred2_s { unsigned long rtc_0:1; /* RW */ unsigned long rtc_1:1; /* RW */ unsigned long rtc_2:1; /* RW */ @@ -2031,29 +2712,46 @@ union uv2h_event_occurred2_u { unsigned long rtc_30:1; /* RW */ unsigned long rtc_31:1; /* RW */ unsigned long rsvd_32_63:32; - } s1; + } sx; }; /* ========================================================================= */ -/* UV2H_EVENT_OCCURRED2_ALIAS */ +/* UVXH_EVENT_OCCURRED2_ALIAS */ /* ========================================================================= */ -#define UV2H_EVENT_OCCURRED2_ALIAS 0x70108UL -#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70 +#define UVXH_EVENT_OCCURRED2_ALIAS 0x70108UL +#define UVXH_EVENT_OCCURRED2_ALIAS_32 0xb70 + /* ========================================================================= */ -/* UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 */ +/* UVXH_LB_BAU_SB_ACTIVATION_STATUS_2 */ /* ========================================================================= */ -#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL -#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 +#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL +#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x320130UL +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x320130UL + +#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 +#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL #define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 #define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL -union uv2h_lb_bau_sb_activation_status_2_u { +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL + +union uvxh_lb_bau_sb_activation_status_2_u { unsigned long v; + struct uvxh_lb_bau_sb_activation_status_2_s { + unsigned long aux_error:64; /* RW */ + } sx; struct uv2h_lb_bau_sb_activation_status_2_s { unsigned long aux_error:64; /* RW */ - } s1; + } s2; + struct uv3h_lb_bau_sb_activation_status_2_s { + unsigned long aux_error:64; /* RW */ + } s3; }; /* ========================================================================= */ @@ -2073,5 +2771,87 @@ union uv1h_lb_target_physical_apic_id_mask_u { } s1; }; +/* ========================================================================= */ +/* UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR */ +/* ========================================================================= */ +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x1603000UL + +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT 26 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT 46 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_SHFT 63 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK 0x000fc00000000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK 0x8000000000000000UL + +union uv3h_rh_gam_mmioh_overlay_config0_mmr_u { + unsigned long v; + struct uv3h_rh_gam_mmioh_overlay_config0_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s3; +}; + +/* ========================================================================= */ +/* UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR */ +/* ========================================================================= */ +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x1604000UL + +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_SHFT 26 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT 46 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_SHFT 63 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK 0x000fc00000000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_MASK 0x8000000000000000UL + +union uv3h_rh_gam_mmioh_overlay_config1_mmr_u { + unsigned long v; + struct uv3h_rh_gam_mmioh_overlay_config1_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s3; +}; + +/* ========================================================================= */ +/* UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR */ +/* ========================================================================= */ +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR 0x1603800UL +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH 128 + +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_SHFT 0 +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK 0x0000000000007fffUL + +union uv3h_rh_gam_mmioh_redirect_config0_mmr_u { + unsigned long v; + struct uv3h_rh_gam_mmioh_redirect_config0_mmr_s { + unsigned long nasid:15; /* RW */ + unsigned long rsvd_15_63:49; + } s3; +}; + +/* ========================================================================= */ +/* UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR */ +/* ========================================================================= */ +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR 0x1604800UL +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH 128 + +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_SHFT 0 +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK 0x0000000000007fffUL + +union uv3h_rh_gam_mmioh_redirect_config1_mmr_u { + unsigned long v; + struct uv3h_rh_gam_mmioh_redirect_config1_mmr_s { + unsigned long nasid:15; /* RW */ + unsigned long rsvd_15_63:49; + } s3; +}; + #endif /* _ASM_X86_UV_UV_MMRS_H */ diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index f9303602fbc..1d8de3f3fec 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -1,133 +1,9 @@ #ifndef _ASM_X86_VM86_H #define _ASM_X86_VM86_H -/* - * I'm guessing at the VIF/VIP flag usage, but hope that this is how - * the Pentium uses them. Linux will return from vm86 mode when both - * VIF and VIP is set. - * - * On a Pentium, we could probably optimize the virtual flags directly - * in the eflags register instead of doing it "by hand" in vflags... - * - * Linus - */ - -#include <asm/processor-flags.h> - -#define BIOSSEG 0x0f000 - -#define CPU_086 0 -#define CPU_186 1 -#define CPU_286 2 -#define CPU_386 3 -#define CPU_486 4 -#define CPU_586 5 - -/* - * Return values for the 'vm86()' system call - */ -#define VM86_TYPE(retval) ((retval) & 0xff) -#define VM86_ARG(retval) ((retval) >> 8) - -#define VM86_SIGNAL 0 /* return due to signal */ -#define VM86_UNKNOWN 1 /* unhandled GP fault - - IO-instruction or similar */ -#define VM86_INTx 2 /* int3/int x instruction (ARG = x) */ -#define VM86_STI 3 /* sti/popf/iret instruction enabled - virtual interrupts */ - -/* - * Additional return values when invoking new vm86() - */ -#define VM86_PICRETURN 4 /* return due to pending PIC request */ -#define VM86_TRAP 6 /* return due to DOS-debugger request */ - -/* - * function codes when invoking new vm86() - */ -#define VM86_PLUS_INSTALL_CHECK 0 -#define VM86_ENTER 1 -#define VM86_ENTER_NO_BYPASS 2 -#define VM86_REQUEST_IRQ 3 -#define VM86_FREE_IRQ 4 -#define VM86_GET_IRQ_BITS 5 -#define VM86_GET_AND_RESET_IRQ 6 - -/* - * This is the stack-layout seen by the user space program when we have - * done a translation of "SAVE_ALL" from vm86 mode. The real kernel layout - * is 'kernel_vm86_regs' (see below). - */ - -struct vm86_regs { -/* - * normal regs, with special meaning for the segment descriptors.. - */ - long ebx; - long ecx; - long edx; - long esi; - long edi; - long ebp; - long eax; - long __null_ds; - long __null_es; - long __null_fs; - long __null_gs; - long orig_eax; - long eip; - unsigned short cs, __csh; - long eflags; - long esp; - unsigned short ss, __ssh; -/* - * these are specific to v86 mode: - */ - unsigned short es, __esh; - unsigned short ds, __dsh; - unsigned short fs, __fsh; - unsigned short gs, __gsh; -}; - -struct revectored_struct { - unsigned long __map[8]; /* 256 bits */ -}; - -struct vm86_struct { - struct vm86_regs regs; - unsigned long flags; - unsigned long screen_bitmap; - unsigned long cpu_type; - struct revectored_struct int_revectored; - struct revectored_struct int21_revectored; -}; - -/* - * flags masks - */ -#define VM86_SCREEN_BITMAP 0x0001 - -struct vm86plus_info_struct { - unsigned long force_return_for_pic:1; - unsigned long vm86dbg_active:1; /* for debugger */ - unsigned long vm86dbg_TFpendig:1; /* for debugger */ - unsigned long unused:28; - unsigned long is_vm86pus:1; /* for vm86 internal use */ - unsigned char vm86dbg_intxxtab[32]; /* for debugger */ -}; -struct vm86plus_struct { - struct vm86_regs regs; - unsigned long flags; - unsigned long screen_bitmap; - unsigned long cpu_type; - struct revectored_struct int_revectored; - struct revectored_struct int21_revectored; - struct vm86plus_info_struct vm86plus; -}; - -#ifdef __KERNEL__ #include <asm/ptrace.h> +#include <uapi/asm/vm86.h> /* * This is the (kernel) stack-layout when we have done a "SAVE_ALL" from vm86 @@ -203,6 +79,4 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c) #endif /* CONFIG_VM86 */ -#endif /* __KERNEL__ */ - #endif /* _ASM_X86_VM86_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 36ec21c36d6..b6fbf860e39 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -1,6 +1,3 @@ -#ifndef VMX_H -#define VMX_H - /* * vmx.h: VMX Architecture related definitions * Copyright (c) 2004, Intel Corporation. @@ -24,90 +21,12 @@ * Yaniv Kamay <yaniv@qumranet.com> * */ +#ifndef VMX_H +#define VMX_H -#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 - -#define EXIT_REASON_EXCEPTION_NMI 0 -#define EXIT_REASON_EXTERNAL_INTERRUPT 1 -#define EXIT_REASON_TRIPLE_FAULT 2 - -#define EXIT_REASON_PENDING_INTERRUPT 7 -#define EXIT_REASON_NMI_WINDOW 8 -#define EXIT_REASON_TASK_SWITCH 9 -#define EXIT_REASON_CPUID 10 -#define EXIT_REASON_HLT 12 -#define EXIT_REASON_INVD 13 -#define EXIT_REASON_INVLPG 14 -#define EXIT_REASON_RDPMC 15 -#define EXIT_REASON_RDTSC 16 -#define EXIT_REASON_VMCALL 18 -#define EXIT_REASON_VMCLEAR 19 -#define EXIT_REASON_VMLAUNCH 20 -#define EXIT_REASON_VMPTRLD 21 -#define EXIT_REASON_VMPTRST 22 -#define EXIT_REASON_VMREAD 23 -#define EXIT_REASON_VMRESUME 24 -#define EXIT_REASON_VMWRITE 25 -#define EXIT_REASON_VMOFF 26 -#define EXIT_REASON_VMON 27 -#define EXIT_REASON_CR_ACCESS 28 -#define EXIT_REASON_DR_ACCESS 29 -#define EXIT_REASON_IO_INSTRUCTION 30 -#define EXIT_REASON_MSR_READ 31 -#define EXIT_REASON_MSR_WRITE 32 -#define EXIT_REASON_INVALID_STATE 33 -#define EXIT_REASON_MWAIT_INSTRUCTION 36 -#define EXIT_REASON_MONITOR_INSTRUCTION 39 -#define EXIT_REASON_PAUSE_INSTRUCTION 40 -#define EXIT_REASON_MCE_DURING_VMENTRY 41 -#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 -#define EXIT_REASON_APIC_ACCESS 44 -#define EXIT_REASON_EPT_VIOLATION 48 -#define EXIT_REASON_EPT_MISCONFIG 49 -#define EXIT_REASON_WBINVD 54 -#define EXIT_REASON_XSETBV 55 -#define EXIT_REASON_INVPCID 58 - -#define VMX_EXIT_REASONS \ - { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ - { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ - { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ - { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ - { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ - { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ - { EXIT_REASON_CPUID, "CPUID" }, \ - { EXIT_REASON_HLT, "HLT" }, \ - { EXIT_REASON_INVLPG, "INVLPG" }, \ - { EXIT_REASON_RDPMC, "RDPMC" }, \ - { EXIT_REASON_RDTSC, "RDTSC" }, \ - { EXIT_REASON_VMCALL, "VMCALL" }, \ - { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \ - { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \ - { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \ - { EXIT_REASON_VMPTRST, "VMPTRST" }, \ - { EXIT_REASON_VMREAD, "VMREAD" }, \ - { EXIT_REASON_VMRESUME, "VMRESUME" }, \ - { EXIT_REASON_VMWRITE, "VMWRITE" }, \ - { EXIT_REASON_VMOFF, "VMOFF" }, \ - { EXIT_REASON_VMON, "VMON" }, \ - { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \ - { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \ - { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ - { EXIT_REASON_MSR_READ, "MSR_READ" }, \ - { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ - { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ - { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ - { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ - { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ - { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ - { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ - { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ - { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ - { EXIT_REASON_WBINVD, "WBINVD" } - -#ifdef __KERNEL__ #include <linux/types.h> +#include <uapi/asm/vmx.h> /* * Definitions of Primary Processor-Based VM-Execution Controls. @@ -138,9 +57,12 @@ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 #define SECONDARY_EXEC_RDTSCP 0x00000008 +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 +#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 +#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 @@ -178,6 +100,7 @@ enum vmcs_field { GUEST_GS_SELECTOR = 0x0000080a, GUEST_LDTR_SELECTOR = 0x0000080c, GUEST_TR_SELECTOR = 0x0000080e, + GUEST_INTR_STATUS = 0x00000810, HOST_ES_SELECTOR = 0x00000c00, HOST_CS_SELECTOR = 0x00000c02, HOST_SS_SELECTOR = 0x00000c04, @@ -205,6 +128,14 @@ enum vmcs_field { APIC_ACCESS_ADDR_HIGH = 0x00002015, EPT_POINTER = 0x0000201a, EPT_POINTER_HIGH = 0x0000201b, + EOI_EXIT_BITMAP0 = 0x0000201c, + EOI_EXIT_BITMAP0_HIGH = 0x0000201d, + EOI_EXIT_BITMAP1 = 0x0000201e, + EOI_EXIT_BITMAP1_HIGH = 0x0000201f, + EOI_EXIT_BITMAP2 = 0x00002020, + EOI_EXIT_BITMAP2_HIGH = 0x00002021, + EOI_EXIT_BITMAP3 = 0x00002022, + EOI_EXIT_BITMAP3_HIGH = 0x00002023, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, @@ -427,9 +358,9 @@ enum vmcs_field { #define AR_RESERVD_MASK 0xfffe0f00 -#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0) -#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1) -#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2) +#define TSS_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 0) +#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 1) +#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2) #define VMX_NR_VPIDS (1 << 16) #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 @@ -445,8 +376,7 @@ enum vmcs_field { #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) -#define VMX_EPT_AD_BIT (1ull << 21) -#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) +#define VMX_EPT_AD_BIT (1ull << 21) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) @@ -527,5 +457,3 @@ enum vm_instruction_error_number { }; #endif - -#endif diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index eaea1d31f75..2a46ca720af 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -1,20 +1,8 @@ #ifndef _ASM_X86_VSYSCALL_H #define _ASM_X86_VSYSCALL_H -enum vsyscall_num { - __NR_vgettimeofday, - __NR_vtime, - __NR_vgetcpu, -}; - -#define VSYSCALL_START (-10UL << 20) -#define VSYSCALL_SIZE 1024 -#define VSYSCALL_END (-2UL << 20) -#define VSYSCALL_MAPPED_PAGES 1 -#define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr)) - -#ifdef __KERNEL__ #include <linux/seqlock.h> +#include <uapi/asm/vsyscall.h> #define VGETCPU_RDTSCP 1 #define VGETCPU_LSL 2 @@ -33,6 +21,24 @@ extern void map_vsyscall(void); */ extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); -#endif /* __KERNEL__ */ +#ifdef CONFIG_X86_64 + +#define VGETCPU_CPU_MASK 0xfff + +static inline unsigned int __getcpu(void) +{ + unsigned int p; + + if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { + /* Load per CPU data from RDTSCP */ + native_read_tscp(&p); + } else { + /* Load per CPU data from GDT */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + } + + return p; +} +#endif /* CONFIG_X86_64 */ #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 57693498519..d8d99222b36 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -69,17 +69,6 @@ struct x86_init_oem { }; /** - * struct x86_init_mapping - platform specific initial kernel pagetable setup - * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage - * - * For more details on the purpose of this hook, look in - * init_memory_mapping and the commit that added it. - */ -struct x86_init_mapping { - void (*pagetable_reserve)(u64 start, u64 end); -}; - -/** * struct x86_init_paging - platform specific paging functions * @pagetable_init: platform specific paging initialization call to setup * the kernel pagetables and prepare accessors functions. @@ -136,7 +125,6 @@ struct x86_init_ops { struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; struct x86_init_oem oem; - struct x86_init_mapping mapping; struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; @@ -181,19 +169,38 @@ struct x86_platform_ops { }; struct pci_dev; +struct msi_msg; struct x86_msi_ops { int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); + void (*compose_msi_msg)(struct pci_dev *dev, unsigned int irq, + unsigned int dest, struct msi_msg *msg, + u8 hpet_id); void (*teardown_msi_irq)(unsigned int irq); void (*teardown_msi_irqs)(struct pci_dev *dev); void (*restore_msi_irqs)(struct pci_dev *dev, int irq); + int (*setup_hpet_msi)(unsigned int irq, unsigned int id); }; +struct IO_APIC_route_entry; +struct io_apic_irq_attr; +struct irq_data; +struct cpumask; + struct x86_io_apic_ops { - void (*init) (void); - unsigned int (*read) (unsigned int apic, unsigned int reg); - void (*write) (unsigned int apic, unsigned int reg, unsigned int value); - void (*modify)(unsigned int apic, unsigned int reg, unsigned int value); + void (*init) (void); + unsigned int (*read) (unsigned int apic, unsigned int reg); + void (*write) (unsigned int apic, unsigned int reg, unsigned int value); + void (*modify) (unsigned int apic, unsigned int reg, unsigned int value); + void (*disable)(void); + void (*print_entries)(unsigned int apic, unsigned int nr_entries); + int (*set_affinity)(struct irq_data *data, + const struct cpumask *mask, + bool force); + int (*setup_entry)(int irq, struct IO_APIC_route_entry *entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr); + void (*eoi_ioapic_pin)(int apic, int pin, int vector); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index cc146d51449..ca842f2769e 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -16,4 +16,7 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) return raw_irqs_disabled_flags(regs->flags); } +/* No need for a barrier -- XCHG is a barrier on x86. */ +#define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) + #endif /* _ASM_X86_XEN_EVENTS_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 59c226d120c..c20d1ce62dc 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -359,18 +359,14 @@ HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val, return _hypercall4(int, update_va_mapping, va, new_val.pte, new_val.pte >> 32, flags); } +extern int __must_check xen_event_channel_op_compat(int, void *); static inline int HYPERVISOR_event_channel_op(int cmd, void *arg) { int rc = _hypercall2(int, event_channel_op, cmd, arg); - if (unlikely(rc == -ENOSYS)) { - struct evtchn_op op; - op.cmd = cmd; - memcpy(&op.u, arg, sizeof(op.u)); - rc = _hypercall1(int, event_channel_op_compat, &op); - memcpy(arg, &op.u, sizeof(op.u)); - } + if (unlikely(rc == -ENOSYS)) + rc = xen_event_channel_op_compat(cmd, arg); return rc; } @@ -386,17 +382,14 @@ HYPERVISOR_console_io(int cmd, int count, char *str) return _hypercall3(int, console_io, cmd, count, str); } +extern int __must_check HYPERVISOR_physdev_op_compat(int, void *); + static inline int HYPERVISOR_physdev_op(int cmd, void *arg) { int rc = _hypercall2(int, physdev_op, cmd, arg); - if (unlikely(rc == -ENOSYS)) { - struct physdev_op op; - op.cmd = cmd; - memcpy(&op.u, arg, sizeof(op.u)); - rc = _hypercall1(int, physdev_op_compat, &op); - memcpy(arg, &op.u, sizeof(op.u)); - } + if (unlikely(rc == -ENOSYS)) + rc = HYPERVISOR_physdev_op_compat(cmd, arg); return rc; } diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 54d52ff1304..fd9cb7695b5 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -63,6 +63,7 @@ DEFINE_GUEST_HANDLE(void); DEFINE_GUEST_HANDLE(uint64_t); DEFINE_GUEST_HANDLE(uint32_t); DEFINE_GUEST_HANDLE(xen_pfn_t); +DEFINE_GUEST_HANDLE(xen_ulong_t); #endif #ifndef HYPERVISOR_VIRT_START diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 472b9b78301..6aef9fbc09b 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -212,4 +212,6 @@ unsigned long arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); void make_lowmem_page_readwrite(void *vaddr); +#define xen_remap(cookie, size) ioremap((cookie), (size)); + #endif /* _ASM_X86_XEN_PAGE_H */ diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index f8fde90bc45..d8829751b3f 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h @@ -1,10 +1,499 @@ #ifdef CONFIG_KMEMCHECK /* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ # include <asm-generic/xor.h> +#elif !defined(_ASM_X86_XOR_H) +#define _ASM_X86_XOR_H + +/* + * Optimized RAID-5 checksumming functions for SSE. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * Cache avoiding checksumming functions utilizing KNI instructions + * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) + */ + +/* + * Based on + * High-speed RAID5 checksumming functions utilizing SSE instructions. + * Copyright (C) 1998 Ingo Molnar. + */ + +/* + * x86-64 changes / gcc fixes from Andi Kleen. + * Copyright 2002 Andi Kleen, SuSE Labs. + * + * This hasn't been optimized for the hammer yet, but there are likely + * no advantages to be gotten from x86-64 here anyways. + */ + +#include <asm/i387.h> + +#ifdef CONFIG_X86_32 +/* reduce register pressure */ +# define XOR_CONSTANT_CONSTRAINT "i" #else +# define XOR_CONSTANT_CONSTRAINT "re" +#endif + +#define OFFS(x) "16*("#x")" +#define PF_OFFS(x) "256+16*("#x")" +#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" +#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" +#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" +#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" +#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" +#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" +#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" +#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" +#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" +#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" +#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" +#define NOP(x) + +#define BLK64(pf, op, i) \ + pf(i) \ + op(i, 0) \ + op(i + 1, 1) \ + op(i + 2, 2) \ + op(i + 3, 3) + +static void +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +{ + unsigned long lines = bytes >> 8; + + kernel_fpu_begin(); + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + LD(i, 0) \ + LD(i + 1, 1) \ + PF1(i) \ + PF1(i + 2) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " add %[inc], %[p1] ;\n" + " add %[inc], %[p2] ;\n" + " dec %[cnt] ;\n" + " jnz 1b ;\n" + : [cnt] "+r" (lines), + [p1] "+r" (p1), [p2] "+r" (p2) + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) + : "memory"); + + kernel_fpu_end(); +} + +static void +xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2) +{ + unsigned long lines = bytes >> 8; + + kernel_fpu_begin(); + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + BLK64(PF0, LD, i) \ + BLK64(PF1, XO1, i) \ + BLK64(NOP, ST, i) \ + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " add %[inc], %[p1] ;\n" + " add %[inc], %[p2] ;\n" + " dec %[cnt] ;\n" + " jnz 1b ;\n" + : [cnt] "+r" (lines), + [p1] "+r" (p1), [p2] "+r" (p2) + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) + : "memory"); + + kernel_fpu_end(); +} + +static void +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3) +{ + unsigned long lines = bytes >> 8; + + kernel_fpu_begin(); + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " add %[inc], %[p1] ;\n" + " add %[inc], %[p2] ;\n" + " add %[inc], %[p3] ;\n" + " dec %[cnt] ;\n" + " jnz 1b ;\n" + : [cnt] "+r" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) + : "memory"); + + kernel_fpu_end(); +} + +static void +xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3) +{ + unsigned long lines = bytes >> 8; + + kernel_fpu_begin(); + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + BLK64(PF0, LD, i) \ + BLK64(PF1, XO1, i) \ + BLK64(PF2, XO2, i) \ + BLK64(NOP, ST, i) \ + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " add %[inc], %[p1] ;\n" + " add %[inc], %[p2] ;\n" + " add %[inc], %[p3] ;\n" + " dec %[cnt] ;\n" + " jnz 1b ;\n" + : [cnt] "+r" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) + : "memory"); + + kernel_fpu_end(); +} + +static void +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4) +{ + unsigned long lines = bytes >> 8; + + kernel_fpu_begin(); + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + PF3(i) \ + PF3(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + XO3(i, 0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " add %[inc], %[p1] ;\n" + " add %[inc], %[p2] ;\n" + " add %[inc], %[p3] ;\n" + " add %[inc], %[p4] ;\n" + " dec %[cnt] ;\n" + " jnz 1b ;\n" + : [cnt] "+r" (lines), [p1] "+r" (p1), + [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) + : "memory"); + + kernel_fpu_end(); +} + +static void +xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4) +{ + unsigned long lines = bytes >> 8; + + kernel_fpu_begin(); + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + BLK64(PF0, LD, i) \ + BLK64(PF1, XO1, i) \ + BLK64(PF2, XO2, i) \ + BLK64(PF3, XO3, i) \ + BLK64(NOP, ST, i) \ + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " add %[inc], %[p1] ;\n" + " add %[inc], %[p2] ;\n" + " add %[inc], %[p3] ;\n" + " add %[inc], %[p4] ;\n" + " dec %[cnt] ;\n" + " jnz 1b ;\n" + : [cnt] "+r" (lines), [p1] "+r" (p1), + [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) + : "memory"); + + kernel_fpu_end(); +} + +static void +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4, unsigned long *p5) +{ + unsigned long lines = bytes >> 8; + + kernel_fpu_begin(); + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + PF3(i) \ + PF3(i + 2) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + PF4(i) \ + PF4(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO3(i, 0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + XO4(i, 0) \ + XO4(i + 1, 1) \ + XO4(i + 2, 2) \ + XO4(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " add %[inc], %[p1] ;\n" + " add %[inc], %[p2] ;\n" + " add %[inc], %[p3] ;\n" + " add %[inc], %[p4] ;\n" + " add %[inc], %[p5] ;\n" + " dec %[cnt] ;\n" + " jnz 1b ;\n" + : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), + [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) + : "memory"); + + kernel_fpu_end(); +} + +static void +xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4, unsigned long *p5) +{ + unsigned long lines = bytes >> 8; + + kernel_fpu_begin(); + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + BLK64(PF0, LD, i) \ + BLK64(PF1, XO1, i) \ + BLK64(PF2, XO2, i) \ + BLK64(PF3, XO3, i) \ + BLK64(PF4, XO4, i) \ + BLK64(NOP, ST, i) \ + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " add %[inc], %[p1] ;\n" + " add %[inc], %[p2] ;\n" + " add %[inc], %[p3] ;\n" + " add %[inc], %[p4] ;\n" + " add %[inc], %[p5] ;\n" + " dec %[cnt] ;\n" + " jnz 1b ;\n" + : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2), + [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) + : [inc] XOR_CONSTANT_CONSTRAINT (256UL) + : "memory"); + + kernel_fpu_end(); +} + +static struct xor_block_template xor_block_sse_pf64 = { + .name = "prefetch64-sse", + .do_2 = xor_sse_2_pf64, + .do_3 = xor_sse_3_pf64, + .do_4 = xor_sse_4_pf64, + .do_5 = xor_sse_5_pf64, +}; + +#undef LD +#undef XO1 +#undef XO2 +#undef XO3 +#undef XO4 +#undef ST +#undef NOP +#undef BLK64 +#undef BLOCK + +#undef XOR_CONSTANT_CONSTRAINT + #ifdef CONFIG_X86_32 # include <asm/xor_32.h> #else # include <asm/xor_64.h> #endif -#endif + +#define XOR_SELECT_TEMPLATE(FASTEST) \ + AVX_SELECT(FASTEST) + +#endif /* _ASM_X86_XOR_H */ diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index f79cb7ec0e0..ce05722e3c6 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -2,7 +2,7 @@ #define _ASM_X86_XOR_32_H /* - * Optimized RAID-5 checksumming functions for MMX and SSE. + * Optimized RAID-5 checksumming functions for MMX. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -529,290 +529,6 @@ static struct xor_block_template xor_block_p5_mmx = { .do_5 = xor_p5_mmx_5, }; -/* - * Cache avoiding checksumming functions utilizing KNI instructions - * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) - */ - -#define OFFS(x) "16*("#x")" -#define PF_OFFS(x) "256+16*("#x")" -#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n" -#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" -#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" -#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n" -#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n" -#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n" -#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n" -#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n" -#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" -#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n" -#define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" -#define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" -#define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" - - -static void -xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) -{ - unsigned long lines = bytes >> 8; - - kernel_fpu_begin(); - - asm volatile( -#undef BLOCK -#define BLOCK(i) \ - LD(i, 0) \ - LD(i + 1, 1) \ - PF1(i) \ - PF1(i + 2) \ - LD(i + 2, 2) \ - LD(i + 3, 3) \ - PF0(i + 4) \ - PF0(i + 6) \ - XO1(i, 0) \ - XO1(i + 1, 1) \ - XO1(i + 2, 2) \ - XO1(i + 3, 3) \ - ST(i, 0) \ - ST(i + 1, 1) \ - ST(i + 2, 2) \ - ST(i + 3, 3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : "+r" (lines), - "+r" (p1), "+r" (p2) - : - : "memory"); - - kernel_fpu_end(); -} - -static void -xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) -{ - unsigned long lines = bytes >> 8; - - kernel_fpu_begin(); - - asm volatile( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i + 2) \ - LD(i,0) \ - LD(i + 1, 1) \ - LD(i + 2, 2) \ - LD(i + 3, 3) \ - PF2(i) \ - PF2(i + 2) \ - PF0(i + 4) \ - PF0(i + 6) \ - XO1(i,0) \ - XO1(i + 1, 1) \ - XO1(i + 2, 2) \ - XO1(i + 3, 3) \ - XO2(i,0) \ - XO2(i + 1, 1) \ - XO2(i + 2, 2) \ - XO2(i + 3, 3) \ - ST(i,0) \ - ST(i + 1, 1) \ - ST(i + 2, 2) \ - ST(i + 3, 3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " addl $256, %3 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : "+r" (lines), - "+r" (p1), "+r"(p2), "+r"(p3) - : - : "memory" ); - - kernel_fpu_end(); -} - -static void -xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) -{ - unsigned long lines = bytes >> 8; - - kernel_fpu_begin(); - - asm volatile( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i + 2) \ - LD(i,0) \ - LD(i + 1, 1) \ - LD(i + 2, 2) \ - LD(i + 3, 3) \ - PF2(i) \ - PF2(i + 2) \ - XO1(i,0) \ - XO1(i + 1, 1) \ - XO1(i + 2, 2) \ - XO1(i + 3, 3) \ - PF3(i) \ - PF3(i + 2) \ - PF0(i + 4) \ - PF0(i + 6) \ - XO2(i,0) \ - XO2(i + 1, 1) \ - XO2(i + 2, 2) \ - XO2(i + 3, 3) \ - XO3(i,0) \ - XO3(i + 1, 1) \ - XO3(i + 2, 2) \ - XO3(i + 3, 3) \ - ST(i,0) \ - ST(i + 1, 1) \ - ST(i + 2, 2) \ - ST(i + 3, 3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " addl $256, %3 ;\n" - " addl $256, %4 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : "+r" (lines), - "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) - : - : "memory" ); - - kernel_fpu_end(); -} - -static void -xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) -{ - unsigned long lines = bytes >> 8; - - kernel_fpu_begin(); - - /* Make sure GCC forgets anything it knows about p4 or p5, - such that it won't pass to the asm volatile below a - register that is shared with any other variable. That's - because we modify p4 and p5 there, but we can't mark them - as read/write, otherwise we'd overflow the 10-asm-operands - limit of GCC < 3.1. */ - asm("" : "+r" (p4), "+r" (p5)); - - asm volatile( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i + 2) \ - LD(i,0) \ - LD(i + 1, 1) \ - LD(i + 2, 2) \ - LD(i + 3, 3) \ - PF2(i) \ - PF2(i + 2) \ - XO1(i,0) \ - XO1(i + 1, 1) \ - XO1(i + 2, 2) \ - XO1(i + 3, 3) \ - PF3(i) \ - PF3(i + 2) \ - XO2(i,0) \ - XO2(i + 1, 1) \ - XO2(i + 2, 2) \ - XO2(i + 3, 3) \ - PF4(i) \ - PF4(i + 2) \ - PF0(i + 4) \ - PF0(i + 6) \ - XO3(i,0) \ - XO3(i + 1, 1) \ - XO3(i + 2, 2) \ - XO3(i + 3, 3) \ - XO4(i,0) \ - XO4(i + 1, 1) \ - XO4(i + 2, 2) \ - XO4(i + 3, 3) \ - ST(i,0) \ - ST(i + 1, 1) \ - ST(i + 2, 2) \ - ST(i + 3, 3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " addl $256, %3 ;\n" - " addl $256, %4 ;\n" - " addl $256, %5 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : "+r" (lines), - "+r" (p1), "+r" (p2), "+r" (p3) - : "r" (p4), "r" (p5) - : "memory"); - - /* p4 and p5 were modified, and now the variables are dead. - Clobber them just to be sure nobody does something stupid - like assuming they have some legal value. */ - asm("" : "=r" (p4), "=r" (p5)); - - kernel_fpu_end(); -} - static struct xor_block_template xor_block_pIII_sse = { .name = "pIII_sse", .do_2 = xor_sse_2, @@ -827,26 +543,25 @@ static struct xor_block_template xor_block_pIII_sse = { /* Also try the generic routines. */ #include <asm-generic/xor.h> +/* We force the use of the SSE xor block because it can write around L2. + We may also be able to load into the L1 only depending on how the cpu + deals with a load to a line that is being prefetched. */ #undef XOR_TRY_TEMPLATES #define XOR_TRY_TEMPLATES \ do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_8regs_p); \ - xor_speed(&xor_block_32regs); \ - xor_speed(&xor_block_32regs_p); \ AVX_XOR_SPEED; \ - if (cpu_has_xmm) \ + if (cpu_has_xmm) { \ xor_speed(&xor_block_pIII_sse); \ - if (cpu_has_mmx) { \ + xor_speed(&xor_block_sse_pf64); \ + } else if (cpu_has_mmx) { \ xor_speed(&xor_block_pII_mmx); \ xor_speed(&xor_block_p5_mmx); \ + } else { \ + xor_speed(&xor_block_8regs); \ + xor_speed(&xor_block_8regs_p); \ + xor_speed(&xor_block_32regs); \ + xor_speed(&xor_block_32regs_p); \ } \ } while (0) -/* We force the use of the SSE xor block because it can write around L2. - We may also be able to load into the L1 only depending on how the cpu - deals with a load to a line that is being prefetched. */ -#define XOR_SELECT_TEMPLATE(FASTEST) \ - AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) - #endif /* _ASM_X86_XOR_32_H */ diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h index 87ac522c4af..546f1e3b87c 100644 --- a/arch/x86/include/asm/xor_64.h +++ b/arch/x86/include/asm/xor_64.h @@ -1,301 +1,6 @@ #ifndef _ASM_X86_XOR_64_H #define _ASM_X86_XOR_64_H -/* - * Optimized RAID-5 checksumming functions for MMX and SSE. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - - -/* - * Cache avoiding checksumming functions utilizing KNI instructions - * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) - */ - -/* - * Based on - * High-speed RAID5 checksumming functions utilizing SSE instructions. - * Copyright (C) 1998 Ingo Molnar. - */ - -/* - * x86-64 changes / gcc fixes from Andi Kleen. - * Copyright 2002 Andi Kleen, SuSE Labs. - * - * This hasn't been optimized for the hammer yet, but there are likely - * no advantages to be gotten from x86-64 here anyways. - */ - -#include <asm/i387.h> - -#define OFFS(x) "16*("#x")" -#define PF_OFFS(x) "256+16*("#x")" -#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" -#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" -#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" -#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" -#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" -#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" -#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" -#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n" -#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" -#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" -#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" -#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" -#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n" - - -static void -xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) -{ - unsigned int lines = bytes >> 8; - - kernel_fpu_begin(); - - asm volatile( -#undef BLOCK -#define BLOCK(i) \ - LD(i, 0) \ - LD(i + 1, 1) \ - PF1(i) \ - PF1(i + 2) \ - LD(i + 2, 2) \ - LD(i + 3, 3) \ - PF0(i + 4) \ - PF0(i + 6) \ - XO1(i, 0) \ - XO1(i + 1, 1) \ - XO1(i + 2, 2) \ - XO1(i + 3, 3) \ - ST(i, 0) \ - ST(i + 1, 1) \ - ST(i + 2, 2) \ - ST(i + 3, 3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addq %[inc], %[p1] ;\n" - " addq %[inc], %[p2] ;\n" - " decl %[cnt] ; jnz 1b" - : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines) - : [inc] "r" (256UL) - : "memory"); - - kernel_fpu_end(); -} - -static void -xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) -{ - unsigned int lines = bytes >> 8; - - kernel_fpu_begin(); - asm volatile( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i + 2) \ - LD(i, 0) \ - LD(i + 1, 1) \ - LD(i + 2, 2) \ - LD(i + 3, 3) \ - PF2(i) \ - PF2(i + 2) \ - PF0(i + 4) \ - PF0(i + 6) \ - XO1(i, 0) \ - XO1(i + 1, 1) \ - XO1(i + 2, 2) \ - XO1(i + 3, 3) \ - XO2(i, 0) \ - XO2(i + 1, 1) \ - XO2(i + 2, 2) \ - XO2(i + 3, 3) \ - ST(i, 0) \ - ST(i + 1, 1) \ - ST(i + 2, 2) \ - ST(i + 3, 3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addq %[inc], %[p1] ;\n" - " addq %[inc], %[p2] ;\n" - " addq %[inc], %[p3] ;\n" - " decl %[cnt] ; jnz 1b" - : [cnt] "+r" (lines), - [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) - : [inc] "r" (256UL) - : "memory"); - kernel_fpu_end(); -} - -static void -xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) -{ - unsigned int lines = bytes >> 8; - - kernel_fpu_begin(); - - asm volatile( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i + 2) \ - LD(i, 0) \ - LD(i + 1, 1) \ - LD(i + 2, 2) \ - LD(i + 3, 3) \ - PF2(i) \ - PF2(i + 2) \ - XO1(i, 0) \ - XO1(i + 1, 1) \ - XO1(i + 2, 2) \ - XO1(i + 3, 3) \ - PF3(i) \ - PF3(i + 2) \ - PF0(i + 4) \ - PF0(i + 6) \ - XO2(i, 0) \ - XO2(i + 1, 1) \ - XO2(i + 2, 2) \ - XO2(i + 3, 3) \ - XO3(i, 0) \ - XO3(i + 1, 1) \ - XO3(i + 2, 2) \ - XO3(i + 3, 3) \ - ST(i, 0) \ - ST(i + 1, 1) \ - ST(i + 2, 2) \ - ST(i + 3, 3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addq %[inc], %[p1] ;\n" - " addq %[inc], %[p2] ;\n" - " addq %[inc], %[p3] ;\n" - " addq %[inc], %[p4] ;\n" - " decl %[cnt] ; jnz 1b" - : [cnt] "+c" (lines), - [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) - : [inc] "r" (256UL) - : "memory" ); - - kernel_fpu_end(); -} - -static void -xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) -{ - unsigned int lines = bytes >> 8; - - kernel_fpu_begin(); - - asm volatile( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i + 2) \ - LD(i, 0) \ - LD(i + 1, 1) \ - LD(i + 2, 2) \ - LD(i + 3, 3) \ - PF2(i) \ - PF2(i + 2) \ - XO1(i, 0) \ - XO1(i + 1, 1) \ - XO1(i + 2, 2) \ - XO1(i + 3, 3) \ - PF3(i) \ - PF3(i + 2) \ - XO2(i, 0) \ - XO2(i + 1, 1) \ - XO2(i + 2, 2) \ - XO2(i + 3, 3) \ - PF4(i) \ - PF4(i + 2) \ - PF0(i + 4) \ - PF0(i + 6) \ - XO3(i, 0) \ - XO3(i + 1, 1) \ - XO3(i + 2, 2) \ - XO3(i + 3, 3) \ - XO4(i, 0) \ - XO4(i + 1, 1) \ - XO4(i + 2, 2) \ - XO4(i + 3, 3) \ - ST(i, 0) \ - ST(i + 1, 1) \ - ST(i + 2, 2) \ - ST(i + 3, 3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addq %[inc], %[p1] ;\n" - " addq %[inc], %[p2] ;\n" - " addq %[inc], %[p3] ;\n" - " addq %[inc], %[p4] ;\n" - " addq %[inc], %[p5] ;\n" - " decl %[cnt] ; jnz 1b" - : [cnt] "+c" (lines), - [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), - [p5] "+r" (p5) - : [inc] "r" (256UL) - : "memory"); - - kernel_fpu_end(); -} - static struct xor_block_template xor_block_sse = { .name = "generic_sse", .do_2 = xor_sse_2, @@ -308,17 +13,15 @@ static struct xor_block_template xor_block_sse = { /* Also try the AVX routines */ #include <asm/xor_avx.h> +/* We force the use of the SSE xor block because it can write around L2. + We may also be able to load into the L1 only depending on how the cpu + deals with a load to a line that is being prefetched. */ #undef XOR_TRY_TEMPLATES #define XOR_TRY_TEMPLATES \ do { \ AVX_XOR_SPEED; \ + xor_speed(&xor_block_sse_pf64); \ xor_speed(&xor_block_sse); \ } while (0) -/* We force the use of the SSE xor block because it can write around L2. - We may also be able to load into the L1 only depending on how the cpu - deals with a load to a line that is being prefetched. */ -#define XOR_SELECT_TEMPLATE(FASTEST) \ - AVX_SELECT(&xor_block_sse) - #endif /* _ASM_X86_XOR_64_H */ diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild index 83b6e9a0dce..09409c44f9a 100644 --- a/arch/x86/include/uapi/asm/Kbuild +++ b/arch/x86/include/uapi/asm/Kbuild @@ -4,3 +4,61 @@ include include/uapi/asm-generic/Kbuild.asm genhdr-y += unistd_32.h genhdr-y += unistd_64.h genhdr-y += unistd_x32.h +header-y += a.out.h +header-y += auxvec.h +header-y += bitsperlong.h +header-y += boot.h +header-y += bootparam.h +header-y += byteorder.h +header-y += debugreg.h +header-y += e820.h +header-y += errno.h +header-y += fcntl.h +header-y += hw_breakpoint.h +header-y += hyperv.h +header-y += ioctl.h +header-y += ioctls.h +header-y += ipcbuf.h +header-y += ist.h +header-y += kvm.h +header-y += kvm_para.h +header-y += ldt.h +header-y += mce.h +header-y += mman.h +header-y += msgbuf.h +header-y += msr-index.h +header-y += msr.h +header-y += mtrr.h +header-y += param.h +header-y += perf_regs.h +header-y += poll.h +header-y += posix_types.h +header-y += posix_types_32.h +header-y += posix_types_64.h +header-y += posix_types_x32.h +header-y += prctl.h +header-y += processor-flags.h +header-y += ptrace-abi.h +header-y += ptrace.h +header-y += resource.h +header-y += sembuf.h +header-y += setup.h +header-y += shmbuf.h +header-y += sigcontext.h +header-y += sigcontext32.h +header-y += siginfo.h +header-y += signal.h +header-y += socket.h +header-y += sockios.h +header-y += stat.h +header-y += statfs.h +header-y += svm.h +header-y += swab.h +header-y += termbits.h +header-y += termios.h +header-y += types.h +header-y += ucontext.h +header-y += unistd.h +header-y += vm86.h +header-y += vmx.h +header-y += vsyscall.h diff --git a/arch/x86/include/asm/a.out.h b/arch/x86/include/uapi/asm/a.out.h index 4684f97a5bb..4684f97a5bb 100644 --- a/arch/x86/include/asm/a.out.h +++ b/arch/x86/include/uapi/asm/a.out.h diff --git a/arch/x86/include/asm/auxvec.h b/arch/x86/include/uapi/asm/auxvec.h index 77203ac352d..77203ac352d 100644 --- a/arch/x86/include/asm/auxvec.h +++ b/arch/x86/include/uapi/asm/auxvec.h diff --git a/arch/x86/include/asm/bitsperlong.h b/arch/x86/include/uapi/asm/bitsperlong.h index b0ae1c4dc79..b0ae1c4dc79 100644 --- a/arch/x86/include/asm/bitsperlong.h +++ b/arch/x86/include/uapi/asm/bitsperlong.h diff --git a/arch/x86/include/uapi/asm/boot.h b/arch/x86/include/uapi/asm/boot.h new file mode 100644 index 00000000000..94292c4c812 --- /dev/null +++ b/arch/x86/include/uapi/asm/boot.h @@ -0,0 +1,10 @@ +#ifndef _UAPI_ASM_X86_BOOT_H +#define _UAPI_ASM_X86_BOOT_H + +/* Internal svga startup constants */ +#define NORMAL_VGA 0xffff /* 80x25 mode */ +#define EXTENDED_VGA 0xfffe /* 80x50 mode */ +#define ASK_VGA 0xfffd /* ask for it at bootup */ + + +#endif /* _UAPI_ASM_X86_BOOT_H */ diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 2ad874cb661..c15ddaf9071 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -1,6 +1,31 @@ #ifndef _ASM_X86_BOOTPARAM_H #define _ASM_X86_BOOTPARAM_H +/* setup_data types */ +#define SETUP_NONE 0 +#define SETUP_E820_EXT 1 +#define SETUP_DTB 2 +#define SETUP_PCI 3 + +/* ram_size flags */ +#define RAMDISK_IMAGE_START_MASK 0x07FF +#define RAMDISK_PROMPT_FLAG 0x8000 +#define RAMDISK_LOAD_FLAG 0x4000 + +/* loadflags */ +#define LOADED_HIGH (1<<0) +#define QUIET_FLAG (1<<5) +#define KEEP_SEGMENTS (1<<6) +#define CAN_USE_HEAP (1<<7) + +/* xloadflags */ +#define XLF_KERNEL_64 (1<<0) +#define XLF_CAN_BE_LOADED_ABOVE_4G (1<<1) +#define XLF_EFI_HANDOVER_32 (1<<2) +#define XLF_EFI_HANDOVER_64 (1<<3) + +#ifndef __ASSEMBLY__ + #include <linux/types.h> #include <linux/screen_info.h> #include <linux/apm_bios.h> @@ -9,11 +34,6 @@ #include <asm/ist.h> #include <video/edid.h> -/* setup data types */ -#define SETUP_NONE 0 -#define SETUP_E820_EXT 1 -#define SETUP_DTB 2 - /* extensible setup data list node */ struct setup_data { __u64 next; @@ -27,9 +47,6 @@ struct setup_header { __u16 root_flags; __u32 syssize; __u16 ram_size; -#define RAMDISK_IMAGE_START_MASK 0x07FF -#define RAMDISK_PROMPT_FLAG 0x8000 -#define RAMDISK_LOAD_FLAG 0x4000 __u16 vid_mode; __u16 root_dev; __u16 boot_flag; @@ -41,10 +58,6 @@ struct setup_header { __u16 kernel_version; __u8 type_of_loader; __u8 loadflags; -#define LOADED_HIGH (1<<0) -#define QUIET_FLAG (1<<5) -#define KEEP_SEGMENTS (1<<6) -#define CAN_USE_HEAP (1<<7) __u16 setup_move_size; __u32 code32_start; __u32 ramdisk_image; @@ -57,7 +70,8 @@ struct setup_header { __u32 initrd_addr_max; __u32 kernel_alignment; __u8 relocatable_kernel; - __u8 _pad2[3]; + __u8 min_alignment; + __u16 xloadflags; __u32 cmdline_size; __u32 hardware_subarch; __u64 hardware_subarch_data; @@ -105,7 +119,10 @@ struct boot_params { __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ struct sys_desc_table sys_desc_table; /* 0x0a0 */ struct olpc_ofw_header olpc_ofw_header; /* 0x0b0 */ - __u8 _pad4[128]; /* 0x0c0 */ + __u32 ext_ramdisk_image; /* 0x0c0 */ + __u32 ext_ramdisk_size; /* 0x0c4 */ + __u32 ext_cmd_line_ptr; /* 0x0c8 */ + __u8 _pad4[116]; /* 0x0cc */ struct edid_info edid_info; /* 0x140 */ struct efi_info efi_info; /* 0x1c0 */ __u32 alt_mem_k; /* 0x1e0 */ @@ -114,7 +131,20 @@ struct boot_params { __u8 eddbuf_entries; /* 0x1e9 */ __u8 edd_mbr_sig_buf_entries; /* 0x1ea */ __u8 kbd_status; /* 0x1eb */ - __u8 _pad6[5]; /* 0x1ec */ + __u8 _pad5[3]; /* 0x1ec */ + /* + * The sentinel is set to a nonzero value (0xff) in header.S. + * + * A bootloader is supposed to only take setup_header and put + * it into a clean boot_params buffer. If it turns out that + * it is clumsy or too generous with the buffer, it most + * probably will pick up the sentinel variable too. The fact + * that this variable then is still 0xff will let kernel + * know that some variables in boot_params are invalid and + * kernel should zero out certain portions of boot_params. + */ + __u8 sentinel; /* 0x1ef */ + __u8 _pad6[1]; /* 0x1f0 */ struct setup_header hdr; /* setup header */ /* 0x1f1 */ __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)]; __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */ @@ -133,6 +163,6 @@ enum { X86_NR_SUBARCHS, }; - +#endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_BOOTPARAM_H */ diff --git a/arch/x86/include/asm/byteorder.h b/arch/x86/include/uapi/asm/byteorder.h index b13a7a88f3e..b13a7a88f3e 100644 --- a/arch/x86/include/asm/byteorder.h +++ b/arch/x86/include/uapi/asm/byteorder.h diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h new file mode 100644 index 00000000000..3c0874dd986 --- /dev/null +++ b/arch/x86/include/uapi/asm/debugreg.h @@ -0,0 +1,80 @@ +#ifndef _UAPI_ASM_X86_DEBUGREG_H +#define _UAPI_ASM_X86_DEBUGREG_H + + +/* Indicate the register numbers for a number of the specific + debug registers. Registers 0-3 contain the addresses we wish to trap on */ +#define DR_FIRSTADDR 0 /* u_debugreg[DR_FIRSTADDR] */ +#define DR_LASTADDR 3 /* u_debugreg[DR_LASTADDR] */ + +#define DR_STATUS 6 /* u_debugreg[DR_STATUS] */ +#define DR_CONTROL 7 /* u_debugreg[DR_CONTROL] */ + +/* Define a few things for the status register. We can use this to determine + which debugging register was responsible for the trap. The other bits + are either reserved or not of interest to us. */ + +/* Define reserved bits in DR6 which are always set to 1 */ +#define DR6_RESERVED (0xFFFF0FF0) + +#define DR_TRAP0 (0x1) /* db0 */ +#define DR_TRAP1 (0x2) /* db1 */ +#define DR_TRAP2 (0x4) /* db2 */ +#define DR_TRAP3 (0x8) /* db3 */ +#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3) + +#define DR_STEP (0x4000) /* single-step */ +#define DR_SWITCH (0x8000) /* task switch */ + +/* Now define a bunch of things for manipulating the control register. + The top two bytes of the control register consist of 4 fields of 4 + bits - each field corresponds to one of the four debug registers, + and indicates what types of access we trap on, and how large the data + field is that we are looking at */ + +#define DR_CONTROL_SHIFT 16 /* Skip this many bits in ctl register */ +#define DR_CONTROL_SIZE 4 /* 4 control bits per register */ + +#define DR_RW_EXECUTE (0x0) /* Settings for the access types to trap on */ +#define DR_RW_WRITE (0x1) +#define DR_RW_READ (0x3) + +#define DR_LEN_1 (0x0) /* Settings for data length to trap on */ +#define DR_LEN_2 (0x4) +#define DR_LEN_4 (0xC) +#define DR_LEN_8 (0x8) + +/* The low byte to the control register determine which registers are + enabled. There are 4 fields of two bits. One bit is "local", meaning + that the processor will reset the bit after a task switch and the other + is global meaning that we have to explicitly reset the bit. With linux, + you can use either one, since we explicitly zero the register when we enter + kernel mode. */ + +#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */ +#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */ +#define DR_LOCAL_ENABLE (0x1) /* Local enable for reg 0 */ +#define DR_GLOBAL_ENABLE (0x2) /* Global enable for reg 0 */ +#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */ + +#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */ +#define DR_GLOBAL_ENABLE_MASK (0xAA) /* Set global bits for all 4 regs */ + +/* The second byte to the control register has a few special things. + We can slow the instruction pipeline for instructions coming via the + gdt or the ldt if we want to. I am not sure why this is an advantage */ + +#ifdef __i386__ +#define DR_CONTROL_RESERVED (0xFC00) /* Reserved by Intel */ +#else +#define DR_CONTROL_RESERVED (0xFFFFFFFF0000FC00UL) /* Reserved */ +#endif + +#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */ +#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */ + +/* + * HW breakpoint additions + */ + +#endif /* _UAPI_ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h new file mode 100644 index 00000000000..bbae0247070 --- /dev/null +++ b/arch/x86/include/uapi/asm/e820.h @@ -0,0 +1,75 @@ +#ifndef _UAPI_ASM_X86_E820_H +#define _UAPI_ASM_X86_E820_H +#define E820MAP 0x2d0 /* our map */ +#define E820MAX 128 /* number of entries in E820MAP */ + +/* + * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the + * constrained space in the zeropage. If we have more nodes than + * that, and if we've booted off EFI firmware, then the EFI tables + * passed us from the EFI firmware can list more nodes. Size our + * internal memory map tables to have room for these additional + * nodes, based on up to three entries per node for which the + * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT), + * plus E820MAX, allowing space for the possible duplicate E820 + * entries that might need room in the same arrays, prior to the + * call to sanitize_e820_map() to remove duplicates. The allowance + * of three memory map entries per node is "enough" entries for + * the initial hardware platform motivating this mechanism to make + * use of additional EFI map entries. Future platforms may want + * to allow more than three entries per node or otherwise refine + * this size. + */ + +/* + * Odd: 'make headers_check' complains about numa.h if I try + * to collapse the next two #ifdef lines to a single line: + * #if defined(__KERNEL__) && defined(CONFIG_EFI) + */ +#ifndef __KERNEL__ +#define E820_X_MAX E820MAX +#endif + +#define E820NR 0x1e8 /* # entries in E820MAP */ + +#define E820_RAM 1 +#define E820_RESERVED 2 +#define E820_ACPI 3 +#define E820_NVS 4 +#define E820_UNUSABLE 5 + + +/* + * reserved RAM used by kernel itself + * if CONFIG_INTEL_TXT is enabled, memory of this type will be + * included in the S3 integrity calculation and so should not include + * any memory that BIOS might alter over the S3 transition + */ +#define E820_RESERVED_KERN 128 + +#ifndef __ASSEMBLY__ +#include <linux/types.h> +struct e820entry { + __u64 addr; /* start of memory segment */ + __u64 size; /* size of memory segment */ + __u32 type; /* type of memory segment */ +} __attribute__((packed)); + +struct e820map { + __u32 nr_map; + struct e820entry map[E820_X_MAX]; +}; + +#define ISA_START_ADDRESS 0xa0000 +#define ISA_END_ADDRESS 0x100000 + +#define BIOS_BEGIN 0x000a0000 +#define BIOS_END 0x00100000 + +#define BIOS_ROM_BASE 0xffe00000 +#define BIOS_ROM_END 0xffffffff + +#endif /* __ASSEMBLY__ */ + + +#endif /* _UAPI_ASM_X86_E820_H */ diff --git a/arch/x86/include/asm/errno.h b/arch/x86/include/uapi/asm/errno.h index 4c82b503d92..4c82b503d92 100644 --- a/arch/x86/include/asm/errno.h +++ b/arch/x86/include/uapi/asm/errno.h diff --git a/arch/x86/include/asm/fcntl.h b/arch/x86/include/uapi/asm/fcntl.h index 46ab12db573..46ab12db573 100644 --- a/arch/x86/include/asm/fcntl.h +++ b/arch/x86/include/uapi/asm/fcntl.h diff --git a/arch/x86/include/uapi/asm/hw_breakpoint.h b/arch/x86/include/uapi/asm/hw_breakpoint.h new file mode 100644 index 00000000000..79a9626b550 --- /dev/null +++ b/arch/x86/include/uapi/asm/hw_breakpoint.h @@ -0,0 +1 @@ +/* */ diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index b80420bcd09..b80420bcd09 100644 --- a/arch/x86/include/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h diff --git a/arch/x86/include/asm/ioctl.h b/arch/x86/include/uapi/asm/ioctl.h index b279fe06dfe..b279fe06dfe 100644 --- a/arch/x86/include/asm/ioctl.h +++ b/arch/x86/include/uapi/asm/ioctl.h diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/uapi/asm/ioctls.h index ec34c760665..ec34c760665 100644 --- a/arch/x86/include/asm/ioctls.h +++ b/arch/x86/include/uapi/asm/ioctls.h diff --git a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/uapi/asm/ipcbuf.h index 84c7e51cb6d..84c7e51cb6d 100644 --- a/arch/x86/include/asm/ipcbuf.h +++ b/arch/x86/include/uapi/asm/ipcbuf.h diff --git a/arch/x86/include/uapi/asm/ist.h b/arch/x86/include/uapi/asm/ist.h new file mode 100644 index 00000000000..bad9f5ea407 --- /dev/null +++ b/arch/x86/include/uapi/asm/ist.h @@ -0,0 +1,29 @@ +/* + * Include file for the interface to IST BIOS + * Copyright 2002 Andy Grover <andrew.grover@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef _UAPI_ASM_X86_IST_H +#define _UAPI_ASM_X86_IST_H + + + +#include <linux/types.h> + +struct ist_info { + __u32 signature; + __u32 command; + __u32 event; + __u32 perf_level; +}; + +#endif /* _UAPI_ASM_X86_IST_H */ diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index a65ec29e6ff..a65ec29e6ff 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h new file mode 100644 index 00000000000..06fdbd987e9 --- /dev/null +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -0,0 +1,100 @@ +#ifndef _UAPI_ASM_X86_KVM_PARA_H +#define _UAPI_ASM_X86_KVM_PARA_H + +#include <linux/types.h> +#include <asm/hyperv.h> + +/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It + * should be used to determine that a VM is running under KVM. + */ +#define KVM_CPUID_SIGNATURE 0x40000000 + +/* This CPUID returns a feature bitmap in eax. Before enabling a particular + * paravirtualization, the appropriate feature bit should be checked. + */ +#define KVM_CPUID_FEATURES 0x40000001 +#define KVM_FEATURE_CLOCKSOURCE 0 +#define KVM_FEATURE_NOP_IO_DELAY 1 +#define KVM_FEATURE_MMU_OP 2 +/* This indicates that the new set of kvmclock msrs + * are available. The use of 0x11 and 0x12 is deprecated + */ +#define KVM_FEATURE_CLOCKSOURCE2 3 +#define KVM_FEATURE_ASYNC_PF 4 +#define KVM_FEATURE_STEAL_TIME 5 +#define KVM_FEATURE_PV_EOI 6 + +/* The last 8 bits are used to indicate how to interpret the flags field + * in pvclock structure. If no bits are set, all flags are ignored. + */ +#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 + +#define MSR_KVM_WALL_CLOCK 0x11 +#define MSR_KVM_SYSTEM_TIME 0x12 + +#define KVM_MSR_ENABLED 1 +/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ +#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 +#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 +#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 +#define MSR_KVM_STEAL_TIME 0x4b564d03 +#define MSR_KVM_PV_EOI_EN 0x4b564d04 + +struct kvm_steal_time { + __u64 steal; + __u32 version; + __u32 flags; + __u32 pad[12]; +}; + +#define KVM_STEAL_ALIGNMENT_BITS 5 +#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) +#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) + +#define KVM_MAX_MMU_OP_BATCH 32 + +#define KVM_ASYNC_PF_ENABLED (1 << 0) +#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) + +/* Operations for KVM_HC_MMU_OP */ +#define KVM_MMU_OP_WRITE_PTE 1 +#define KVM_MMU_OP_FLUSH_TLB 2 +#define KVM_MMU_OP_RELEASE_PT 3 + +/* Payload for KVM_HC_MMU_OP */ +struct kvm_mmu_op_header { + __u32 op; + __u32 pad; +}; + +struct kvm_mmu_op_write_pte { + struct kvm_mmu_op_header header; + __u64 pte_phys; + __u64 pte_val; +}; + +struct kvm_mmu_op_flush_tlb { + struct kvm_mmu_op_header header; +}; + +struct kvm_mmu_op_release_pt { + struct kvm_mmu_op_header header; + __u64 pt_phys; +}; + +#define KVM_PV_REASON_PAGE_NOT_PRESENT 1 +#define KVM_PV_REASON_PAGE_READY 2 + +struct kvm_vcpu_pv_apf_data { + __u32 reason; + __u8 pad[60]; + __u32 enabled; +}; + +#define KVM_PV_EOI_BIT 0 +#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT) +#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK +#define KVM_PV_EOI_DISABLED 0x0 + + +#endif /* _UAPI_ASM_X86_KVM_PARA_H */ diff --git a/arch/x86/include/asm/ldt.h b/arch/x86/include/uapi/asm/ldt.h index 46727eb37bf..46727eb37bf 100644 --- a/arch/x86/include/asm/ldt.h +++ b/arch/x86/include/uapi/asm/ldt.h diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h new file mode 100644 index 00000000000..a0eab85ce7b --- /dev/null +++ b/arch/x86/include/uapi/asm/mce.h @@ -0,0 +1,34 @@ +#ifndef _UAPI_ASM_X86_MCE_H +#define _UAPI_ASM_X86_MCE_H + +#include <linux/types.h> +#include <asm/ioctls.h> + +/* Fields are zero when not available */ +struct mce { + __u64 status; + __u64 misc; + __u64 addr; + __u64 mcgstatus; + __u64 ip; + __u64 tsc; /* cpu time stamp counter */ + __u64 time; /* wall time_t when error was detected */ + __u8 cpuvendor; /* cpu vendor as encoded in system.h */ + __u8 inject_flags; /* software inject flags */ + __u16 pad; + __u32 cpuid; /* CPUID 1 EAX */ + __u8 cs; /* code segment */ + __u8 bank; /* machine check bank */ + __u8 cpu; /* cpu number; obsolete; use extcpu now */ + __u8 finished; /* entry is valid */ + __u32 extcpu; /* linux cpu number that detected the error */ + __u32 socketid; /* CPU socket ID */ + __u32 apicid; /* CPU initial apic ID */ + __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ +}; + +#define MCE_GET_RECORD_LEN _IOR('M', 1, int) +#define MCE_GET_LOG_LEN _IOR('M', 2, int) +#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int) + +#endif /* _UAPI_ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 593e51d4643..513b05f15bb 100644 --- a/arch/x86/include/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -3,6 +3,9 @@ #define MAP_32BIT 0x40 /* only give out 32bit addresses */ +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) +#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) + #include <asm-generic/mman.h> #endif /* _ASM_X86_MMAN_H */ diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/uapi/asm/msgbuf.h index 809134c644a..809134c644a 100644 --- a/arch/x86/include/asm/msgbuf.h +++ b/arch/x86/include/uapi/asm/msgbuf.h diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 7f0edceb756..892ce40a747 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -35,11 +35,14 @@ #define MSR_IA32_PERFCTR0 0x000000c1 #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd +#define MSR_NHM_PLATFORM_INFO 0x000000ce #define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) #define NHM_C1_AUTO_DEMOTE (1UL << 26) #define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25) +#define SNB_C1_AUTO_UNDEMOTE (1UL << 27) +#define SNB_C3_AUTO_UNDEMOTE (1UL << 28) #define MSR_MTRRcap 0x000000fe #define MSR_IA32_BBL_CR_CTL 0x00000119 @@ -55,6 +58,8 @@ #define MSR_OFFCORE_RSP_0 0x000001a6 #define MSR_OFFCORE_RSP_1 0x000001a7 +#define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad +#define MSR_IVT_TURBO_RATIO_LIMIT 0x000001ae #define MSR_LBR_SELECT 0x000001c8 #define MSR_LBR_TOS 0x000001c9 @@ -98,11 +103,45 @@ #define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) +#define MSR_IA32_POWER_CTL 0x000001fc + #define MSR_IA32_MC0_CTL 0x00000400 #define MSR_IA32_MC0_STATUS 0x00000401 #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +/* C-state Residency Counters */ +#define MSR_PKG_C3_RESIDENCY 0x000003f8 +#define MSR_PKG_C6_RESIDENCY 0x000003f9 +#define MSR_PKG_C7_RESIDENCY 0x000003fa +#define MSR_CORE_C3_RESIDENCY 0x000003fc +#define MSR_CORE_C6_RESIDENCY 0x000003fd +#define MSR_CORE_C7_RESIDENCY 0x000003fe +#define MSR_PKG_C2_RESIDENCY 0x0000060d + +/* Run Time Average Power Limiting (RAPL) Interface */ + +#define MSR_RAPL_POWER_UNIT 0x00000606 + +#define MSR_PKG_POWER_LIMIT 0x00000610 +#define MSR_PKG_ENERGY_STATUS 0x00000611 +#define MSR_PKG_PERF_STATUS 0x00000613 +#define MSR_PKG_POWER_INFO 0x00000614 + +#define MSR_DRAM_POWER_LIMIT 0x00000618 +#define MSR_DRAM_ENERGY_STATUS 0x00000619 +#define MSR_DRAM_PERF_STATUS 0x0000061b +#define MSR_DRAM_POWER_INFO 0x0000061c + +#define MSR_PP0_POWER_LIMIT 0x00000638 +#define MSR_PP0_ENERGY_STATUS 0x00000639 +#define MSR_PP0_POLICY 0x0000063a +#define MSR_PP0_PERF_STATUS 0x0000063b + +#define MSR_PP1_POWER_LIMIT 0x00000640 +#define MSR_PP1_ENERGY_STATUS 0x00000641 +#define MSR_PP1_POLICY 0x00000642 + #define MSR_AMD64_MC0_MASK 0xc0010044 #define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) @@ -136,6 +175,7 @@ #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 #define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD64_DC_CFG 0xc0011022 +#define MSR_AMD64_BU_CFG2 0xc001102a #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 @@ -157,6 +197,8 @@ /* Fam 15h MSRs */ #define MSR_F15H_PERF_CTL 0xc0010200 #define MSR_F15H_PERF_CTR 0xc0010201 +#define MSR_F15H_NB_PERF_CTL 0xc0010240 +#define MSR_F15H_NB_PERF_CTR 0xc0010241 /* Fam 10h MSRs */ #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 @@ -235,7 +277,9 @@ #define MSR_IA32_PLATFORM_ID 0x00000017 #define MSR_IA32_EBL_CR_POWERON 0x0000002a #define MSR_EBC_FREQUENCY_ID 0x0000002c +#define MSR_SMI_COUNT 0x00000034 #define MSR_IA32_FEATURE_CONTROL 0x0000003a +#define MSR_IA32_TSC_ADJUST 0x0000003b #define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) @@ -337,6 +381,8 @@ #define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38) #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39) +#define MSR_IA32_TSC_DEADLINE 0x000006E0 + /* P4/Xeon+ specific */ #define MSR_IA32_MCG_EAX 0x00000180 #define MSR_IA32_MCG_EBX 0x00000181 diff --git a/arch/x86/include/uapi/asm/msr.h b/arch/x86/include/uapi/asm/msr.h new file mode 100644 index 00000000000..155e51048fa --- /dev/null +++ b/arch/x86/include/uapi/asm/msr.h @@ -0,0 +1,15 @@ +#ifndef _UAPI_ASM_X86_MSR_H +#define _UAPI_ASM_X86_MSR_H + +#include <asm/msr-index.h> + +#ifndef __ASSEMBLY__ + +#include <linux/types.h> +#include <linux/ioctl.h> + +#define X86_IOC_RDMSR_REGS _IOWR('c', 0xA0, __u32[8]) +#define X86_IOC_WRMSR_REGS _IOWR('c', 0xA1, __u32[8]) + +#endif /* __ASSEMBLY__ */ +#endif /* _UAPI_ASM_X86_MSR_H */ diff --git a/arch/x86/include/uapi/asm/mtrr.h b/arch/x86/include/uapi/asm/mtrr.h new file mode 100644 index 00000000000..d0acb658c8f --- /dev/null +++ b/arch/x86/include/uapi/asm/mtrr.h @@ -0,0 +1,117 @@ +/* Generic MTRR (Memory Type Range Register) ioctls. + + Copyright (C) 1997-1999 Richard Gooch + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with this library; if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Richard Gooch may be reached by email at rgooch@atnf.csiro.au + The postal address is: + Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia. +*/ +#ifndef _UAPI_ASM_X86_MTRR_H +#define _UAPI_ASM_X86_MTRR_H + +#include <linux/types.h> +#include <linux/ioctl.h> +#include <linux/errno.h> + +#define MTRR_IOCTL_BASE 'M' + +/* Warning: this structure has a different order from i386 + on x86-64. The 32bit emulation code takes care of that. + But you need to use this for 64bit, otherwise your X server + will break. */ + +#ifdef __i386__ +struct mtrr_sentry { + unsigned long base; /* Base address */ + unsigned int size; /* Size of region */ + unsigned int type; /* Type of region */ +}; + +struct mtrr_gentry { + unsigned int regnum; /* Register number */ + unsigned long base; /* Base address */ + unsigned int size; /* Size of region */ + unsigned int type; /* Type of region */ +}; + +#else /* __i386__ */ + +struct mtrr_sentry { + __u64 base; /* Base address */ + __u32 size; /* Size of region */ + __u32 type; /* Type of region */ +}; + +struct mtrr_gentry { + __u64 base; /* Base address */ + __u32 size; /* Size of region */ + __u32 regnum; /* Register number */ + __u32 type; /* Type of region */ + __u32 _pad; /* Unused */ +}; + +#endif /* !__i386__ */ + +struct mtrr_var_range { + __u32 base_lo; + __u32 base_hi; + __u32 mask_lo; + __u32 mask_hi; +}; + +/* In the Intel processor's MTRR interface, the MTRR type is always held in + an 8 bit field: */ +typedef __u8 mtrr_type; + +#define MTRR_NUM_FIXED_RANGES 88 +#define MTRR_MAX_VAR_RANGES 256 + +struct mtrr_state_type { + struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES]; + mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES]; + unsigned char enabled; + unsigned char have_fixed; + mtrr_type def_type; +}; + +#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) +#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) + +/* These are the various ioctls */ +#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry) +#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry) +#define MTRRIOC_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry) +#define MTRRIOC_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry) +#define MTRRIOC_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry) +#define MTRRIOC_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry) +#define MTRRIOC_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry) +#define MTRRIOC_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry) +#define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry) +#define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry) + +/* These are the region types */ +#define MTRR_TYPE_UNCACHABLE 0 +#define MTRR_TYPE_WRCOMB 1 +/*#define MTRR_TYPE_ 2*/ +/*#define MTRR_TYPE_ 3*/ +#define MTRR_TYPE_WRTHROUGH 4 +#define MTRR_TYPE_WRPROT 5 +#define MTRR_TYPE_WRBACK 6 +#define MTRR_NUM_TYPES 7 + + +#endif /* _UAPI_ASM_X86_MTRR_H */ diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/uapi/asm/param.h index 965d4542797..965d4542797 100644 --- a/arch/x86/include/asm/param.h +++ b/arch/x86/include/uapi/asm/param.h diff --git a/arch/x86/include/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h index 3f2207bfd17..3f2207bfd17 100644 --- a/arch/x86/include/asm/perf_regs.h +++ b/arch/x86/include/uapi/asm/perf_regs.h diff --git a/arch/x86/include/asm/poll.h b/arch/x86/include/uapi/asm/poll.h index c98509d3149..c98509d3149 100644 --- a/arch/x86/include/asm/poll.h +++ b/arch/x86/include/uapi/asm/poll.h diff --git a/arch/x86/include/uapi/asm/posix_types.h b/arch/x86/include/uapi/asm/posix_types.h new file mode 100644 index 00000000000..85506b38362 --- /dev/null +++ b/arch/x86/include/uapi/asm/posix_types.h @@ -0,0 +1,9 @@ +#ifndef __KERNEL__ +# ifdef __i386__ +# include <asm/posix_types_32.h> +# elif defined(__ILP32__) +# include <asm/posix_types_x32.h> +# else +# include <asm/posix_types_64.h> +# endif +#endif diff --git a/arch/x86/include/asm/posix_types_32.h b/arch/x86/include/uapi/asm/posix_types_32.h index 8e525059e7d..8e525059e7d 100644 --- a/arch/x86/include/asm/posix_types_32.h +++ b/arch/x86/include/uapi/asm/posix_types_32.h diff --git a/arch/x86/include/asm/posix_types_64.h b/arch/x86/include/uapi/asm/posix_types_64.h index cba0c1ead16..cba0c1ead16 100644 --- a/arch/x86/include/asm/posix_types_64.h +++ b/arch/x86/include/uapi/asm/posix_types_64.h diff --git a/arch/x86/include/asm/posix_types_x32.h b/arch/x86/include/uapi/asm/posix_types_x32.h index 85f9bdafa93..85f9bdafa93 100644 --- a/arch/x86/include/asm/posix_types_x32.h +++ b/arch/x86/include/uapi/asm/posix_types_x32.h diff --git a/arch/x86/include/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 3ac5032fae0..3ac5032fae0 100644 --- a/arch/x86/include/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h new file mode 100644 index 00000000000..54991a74604 --- /dev/null +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -0,0 +1,99 @@ +#ifndef _UAPI_ASM_X86_PROCESSOR_FLAGS_H +#define _UAPI_ASM_X86_PROCESSOR_FLAGS_H +/* Various flags defined: can be included from assembler. */ + +/* + * EFLAGS bits + */ +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ +#define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */ +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ +#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */ +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ + +/* + * Basic CPU control in CR0 + */ +#define X86_CR0_PE 0x00000001 /* Protection Enable */ +#define X86_CR0_MP 0x00000002 /* Monitor Coprocessor */ +#define X86_CR0_EM 0x00000004 /* Emulation */ +#define X86_CR0_TS 0x00000008 /* Task Switched */ +#define X86_CR0_ET 0x00000010 /* Extension Type */ +#define X86_CR0_NE 0x00000020 /* Numeric Error */ +#define X86_CR0_WP 0x00010000 /* Write Protect */ +#define X86_CR0_AM 0x00040000 /* Alignment Mask */ +#define X86_CR0_NW 0x20000000 /* Not Write-through */ +#define X86_CR0_CD 0x40000000 /* Cache Disable */ +#define X86_CR0_PG 0x80000000 /* Paging */ + +/* + * Paging options in CR3 + */ +#define X86_CR3_PWT 0x00000008 /* Page Write Through */ +#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ +#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */ + +/* + * Intel CPU features in CR4 + */ +#define X86_CR4_VME 0x00000001 /* enable vm86 extensions */ +#define X86_CR4_PVI 0x00000002 /* virtual interrupts flag enable */ +#define X86_CR4_TSD 0x00000004 /* disable time stamp at ipl 3 */ +#define X86_CR4_DE 0x00000008 /* enable debugging extensions */ +#define X86_CR4_PSE 0x00000010 /* enable page size extensions */ +#define X86_CR4_PAE 0x00000020 /* enable physical address extensions */ +#define X86_CR4_MCE 0x00000040 /* Machine check enable */ +#define X86_CR4_PGE 0x00000080 /* enable global pages */ +#define X86_CR4_PCE 0x00000100 /* enable performance counters at ipl 3 */ +#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */ +#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ +#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ +#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ +#define X86_CR4_PCIDE 0x00020000 /* enable PCID support */ +#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ +#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ +#define X86_CR4_SMAP 0x00200000 /* enable SMAP support */ + +/* + * x86-64 Task Priority Register, CR8 + */ +#define X86_CR8_TPR 0x0000000F /* task priority register */ + +/* + * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h> + */ + +/* + * NSC/Cyrix CPU configuration register indexes + */ +#define CX86_PCR0 0x20 +#define CX86_GCR 0xb8 +#define CX86_CCR0 0xc0 +#define CX86_CCR1 0xc1 +#define CX86_CCR2 0xc2 +#define CX86_CCR3 0xc3 +#define CX86_CCR4 0xe8 +#define CX86_CCR5 0xe9 +#define CX86_CCR6 0xea +#define CX86_CCR7 0xeb +#define CX86_PCR1 0xf0 +#define CX86_DIR0 0xfe +#define CX86_DIR1 0xff +#define CX86_ARR_BASE 0xc4 +#define CX86_RCR_BASE 0xdc + + +#endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h index 7b0a55a8885..7b0a55a8885 100644 --- a/arch/x86/include/asm/ptrace-abi.h +++ b/arch/x86/include/uapi/asm/ptrace-abi.h diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h new file mode 100644 index 00000000000..ac4b9aa4d99 --- /dev/null +++ b/arch/x86/include/uapi/asm/ptrace.h @@ -0,0 +1,78 @@ +#ifndef _UAPI_ASM_X86_PTRACE_H +#define _UAPI_ASM_X86_PTRACE_H + +#include <linux/compiler.h> /* For __user */ +#include <asm/ptrace-abi.h> +#include <asm/processor-flags.h> + + +#ifndef __ASSEMBLY__ + +#ifdef __i386__ +/* this struct defines the way the registers are stored on the + stack during a system call. */ + +#ifndef __KERNEL__ + +struct pt_regs { + long ebx; + long ecx; + long edx; + long esi; + long edi; + long ebp; + long eax; + int xds; + int xes; + int xfs; + int xgs; + long orig_eax; + long eip; + int xcs; + long eflags; + long esp; + int xss; +}; + +#endif /* __KERNEL__ */ + +#else /* __i386__ */ + +#ifndef __KERNEL__ + +struct pt_regs { + unsigned long r15; + unsigned long r14; + unsigned long r13; + unsigned long r12; + unsigned long rbp; + unsigned long rbx; +/* arguments: non interrupts/non tracing syscalls only save up to here*/ + unsigned long r11; + unsigned long r10; + unsigned long r9; + unsigned long r8; + unsigned long rax; + unsigned long rcx; + unsigned long rdx; + unsigned long rsi; + unsigned long rdi; + unsigned long orig_rax; +/* end of arguments */ +/* cpu exception frame or undefined */ + unsigned long rip; + unsigned long cs; + unsigned long eflags; + unsigned long rsp; + unsigned long ss; +/* top of stack page */ +}; + +#endif /* __KERNEL__ */ +#endif /* !__i386__ */ + + + +#endif /* !__ASSEMBLY__ */ + +#endif /* _UAPI_ASM_X86_PTRACE_H */ diff --git a/arch/x86/include/asm/resource.h b/arch/x86/include/uapi/asm/resource.h index 04bc4db8921..04bc4db8921 100644 --- a/arch/x86/include/asm/resource.h +++ b/arch/x86/include/uapi/asm/resource.h diff --git a/arch/x86/include/asm/sembuf.h b/arch/x86/include/uapi/asm/sembuf.h index ee50c801f7b..ee50c801f7b 100644 --- a/arch/x86/include/asm/sembuf.h +++ b/arch/x86/include/uapi/asm/sembuf.h diff --git a/arch/x86/include/uapi/asm/setup.h b/arch/x86/include/uapi/asm/setup.h new file mode 100644 index 00000000000..79a9626b550 --- /dev/null +++ b/arch/x86/include/uapi/asm/setup.h @@ -0,0 +1 @@ +/* */ diff --git a/arch/x86/include/asm/shmbuf.h b/arch/x86/include/uapi/asm/shmbuf.h index 83c05fc2de3..83c05fc2de3 100644 --- a/arch/x86/include/asm/shmbuf.h +++ b/arch/x86/include/uapi/asm/shmbuf.h diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h new file mode 100644 index 00000000000..d8b9f9081e8 --- /dev/null +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -0,0 +1,221 @@ +#ifndef _UAPI_ASM_X86_SIGCONTEXT_H +#define _UAPI_ASM_X86_SIGCONTEXT_H + +#include <linux/compiler.h> +#include <linux/types.h> + +#define FP_XSTATE_MAGIC1 0x46505853U +#define FP_XSTATE_MAGIC2 0x46505845U +#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2) + +/* + * bytes 464..511 in the current 512byte layout of fxsave/fxrstor frame + * are reserved for SW usage. On cpu's supporting xsave/xrstor, these bytes + * are used to extended the fpstate pointer in the sigcontext, which now + * includes the extended state information along with fpstate information. + * + * Presence of FP_XSTATE_MAGIC1 at the beginning of this SW reserved + * area and FP_XSTATE_MAGIC2 at the end of memory layout + * (extended_size - FP_XSTATE_MAGIC2_SIZE) indicates the presence of the + * extended state information in the memory layout pointed by the fpstate + * pointer in sigcontext. + */ +struct _fpx_sw_bytes { + __u32 magic1; /* FP_XSTATE_MAGIC1 */ + __u32 extended_size; /* total size of the layout referred by + * fpstate pointer in the sigcontext. + */ + __u64 xstate_bv; + /* feature bit mask (including fp/sse/extended + * state) that is present in the memory + * layout. + */ + __u32 xstate_size; /* actual xsave state size, based on the + * features saved in the layout. + * 'extended_size' will be greater than + * 'xstate_size'. + */ + __u32 padding[7]; /* for future use. */ +}; + +#ifdef __i386__ +/* + * As documented in the iBCS2 standard.. + * + * The first part of "struct _fpstate" is just the normal i387 + * hardware setup, the extra "status" word is used to save the + * coprocessor status word before entering the handler. + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + * + * The FPU state data structure has had to grow to accommodate the + * extended FPU state required by the Streaming SIMD Extensions. + * There is no documented standard to accomplish this at the moment. + */ +struct _fpreg { + unsigned short significand[4]; + unsigned short exponent; +}; + +struct _fpxreg { + unsigned short significand[4]; + unsigned short exponent; + unsigned short padding[3]; +}; + +struct _xmmreg { + unsigned long element[4]; +}; + +struct _fpstate { + /* Regular FPU environment */ + unsigned long cw; + unsigned long sw; + unsigned long tag; + unsigned long ipoff; + unsigned long cssel; + unsigned long dataoff; + unsigned long datasel; + struct _fpreg _st[8]; + unsigned short status; + unsigned short magic; /* 0xffff = regular FPU data only */ + + /* FXSR FPU environment */ + unsigned long _fxsr_env[6]; /* FXSR FPU env is ignored */ + unsigned long mxcsr; + unsigned long reserved; + struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */ + struct _xmmreg _xmm[8]; + unsigned long padding1[44]; + + union { + unsigned long padding2[12]; + struct _fpx_sw_bytes sw_reserved; /* represents the extended + * state info */ + }; +}; + +#define X86_FXSR_MAGIC 0x0000 + +#ifndef __KERNEL__ +/* + * User-space might still rely on the old definition: + */ +struct sigcontext { + unsigned short gs, __gsh; + unsigned short fs, __fsh; + unsigned short es, __esh; + unsigned short ds, __dsh; + unsigned long edi; + unsigned long esi; + unsigned long ebp; + unsigned long esp; + unsigned long ebx; + unsigned long edx; + unsigned long ecx; + unsigned long eax; + unsigned long trapno; + unsigned long err; + unsigned long eip; + unsigned short cs, __csh; + unsigned long eflags; + unsigned long esp_at_signal; + unsigned short ss, __ssh; + struct _fpstate __user *fpstate; + unsigned long oldmask; + unsigned long cr2; +}; +#endif /* !__KERNEL__ */ + +#else /* __i386__ */ + +/* FXSAVE frame */ +/* Note: reserved1/2 may someday contain valuable data. Always save/restore + them when you change signal frames. */ +struct _fpstate { + __u16 cwd; + __u16 swd; + __u16 twd; /* Note this is not the same as the + 32bit/x87/FSAVE twd */ + __u16 fop; + __u64 rip; + __u64 rdp; + __u32 mxcsr; + __u32 mxcsr_mask; + __u32 st_space[32]; /* 8*16 bytes for each FP-reg */ + __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg */ + __u32 reserved2[12]; + union { + __u32 reserved3[12]; + struct _fpx_sw_bytes sw_reserved; /* represents the extended + * state information */ + }; +}; + +#ifndef __KERNEL__ +/* + * User-space might still rely on the old definition: + */ +struct sigcontext { + __u64 r8; + __u64 r9; + __u64 r10; + __u64 r11; + __u64 r12; + __u64 r13; + __u64 r14; + __u64 r15; + __u64 rdi; + __u64 rsi; + __u64 rbp; + __u64 rbx; + __u64 rdx; + __u64 rax; + __u64 rcx; + __u64 rsp; + __u64 rip; + __u64 eflags; /* RFLAGS */ + __u16 cs; + __u16 gs; + __u16 fs; + __u16 __pad0; + __u64 err; + __u64 trapno; + __u64 oldmask; + __u64 cr2; + struct _fpstate __user *fpstate; /* zero when no FPU context */ +#ifdef __ILP32__ + __u32 __fpstate_pad; +#endif + __u64 reserved1[8]; +}; +#endif /* !__KERNEL__ */ + +#endif /* !__i386__ */ + +struct _xsave_hdr { + __u64 xstate_bv; + __u64 reserved1[2]; + __u64 reserved2[5]; +}; + +struct _ymmh_state { + /* 16 * 16 bytes for each YMMH-reg */ + __u32 ymmh_space[64]; +}; + +/* + * Extended state pointed by the fpstate pointer in the sigcontext. + * In addition to the fpstate, information encoded in the xstate_hdr + * indicates the presence of other extended state information + * supported by the processor and OS. + */ +struct _xstate { + struct _fpstate fpstate; + struct _xsave_hdr xstate_hdr; + struct _ymmh_state ymmh; + /* new processor state extensions go here */ +}; + +#endif /* _UAPI_ASM_X86_SIGCONTEXT_H */ diff --git a/arch/x86/include/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h index ad1478c4ae1..ad1478c4ae1 100644 --- a/arch/x86/include/asm/sigcontext32.h +++ b/arch/x86/include/uapi/asm/sigcontext32.h diff --git a/arch/x86/include/asm/siginfo.h b/arch/x86/include/uapi/asm/siginfo.h index 34c47b3341c..34c47b3341c 100644 --- a/arch/x86/include/asm/siginfo.h +++ b/arch/x86/include/uapi/asm/siginfo.h diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h new file mode 100644 index 00000000000..8264f47cf53 --- /dev/null +++ b/arch/x86/include/uapi/asm/signal.h @@ -0,0 +1,135 @@ +#ifndef _UAPI_ASM_X86_SIGNAL_H +#define _UAPI_ASM_X86_SIGNAL_H + +#ifndef __ASSEMBLY__ +#include <linux/types.h> +#include <linux/time.h> +#include <linux/compiler.h> + +/* Avoid too many header ordering problems. */ +struct siginfo; + +#ifndef __KERNEL__ +/* Here we must cater to libcs that poke about in kernel headers. */ + +#define NSIG 32 +typedef unsigned long sigset_t; + +#endif /* __KERNEL__ */ +#endif /* __ASSEMBLY__ */ + + +#define SIGHUP 1 +#define SIGINT 2 +#define SIGQUIT 3 +#define SIGILL 4 +#define SIGTRAP 5 +#define SIGABRT 6 +#define SIGIOT 6 +#define SIGBUS 7 +#define SIGFPE 8 +#define SIGKILL 9 +#define SIGUSR1 10 +#define SIGSEGV 11 +#define SIGUSR2 12 +#define SIGPIPE 13 +#define SIGALRM 14 +#define SIGTERM 15 +#define SIGSTKFLT 16 +#define SIGCHLD 17 +#define SIGCONT 18 +#define SIGSTOP 19 +#define SIGTSTP 20 +#define SIGTTIN 21 +#define SIGTTOU 22 +#define SIGURG 23 +#define SIGXCPU 24 +#define SIGXFSZ 25 +#define SIGVTALRM 26 +#define SIGPROF 27 +#define SIGWINCH 28 +#define SIGIO 29 +#define SIGPOLL SIGIO +/* +#define SIGLOST 29 +*/ +#define SIGPWR 30 +#define SIGSYS 31 +#define SIGUNUSED 31 + +/* These should not be considered constants from userland. */ +#define SIGRTMIN 32 +#define SIGRTMAX _NSIG + +/* + * SA_FLAGS values: + * + * SA_ONSTACK indicates that a registered stack_t will be used. + * SA_RESTART flag to get restarting signals (which were the default long ago) + * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. + * SA_RESETHAND clears the handler when the signal is delivered. + * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. + * SA_NODEFER prevents the current signal from being masked in the handler. + * + * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single + * Unix names RESETHAND and NODEFER respectively. + */ +#define SA_NOCLDSTOP 0x00000001u +#define SA_NOCLDWAIT 0x00000002u +#define SA_SIGINFO 0x00000004u +#define SA_ONSTACK 0x08000000u +#define SA_RESTART 0x10000000u +#define SA_NODEFER 0x40000000u +#define SA_RESETHAND 0x80000000u + +#define SA_NOMASK SA_NODEFER +#define SA_ONESHOT SA_RESETHAND + +#define SA_RESTORER 0x04000000 + +#define MINSIGSTKSZ 2048 +#define SIGSTKSZ 8192 + +#include <asm-generic/signal-defs.h> + +#ifndef __ASSEMBLY__ + + +# ifndef __KERNEL__ +/* Here we must cater to libcs that poke about in kernel headers. */ +#ifdef __i386__ + +struct sigaction { + union { + __sighandler_t _sa_handler; + void (*_sa_sigaction)(int, struct siginfo *, void *); + } _u; + sigset_t sa_mask; + unsigned long sa_flags; + void (*sa_restorer)(void); +}; + +#define sa_handler _u._sa_handler +#define sa_sigaction _u._sa_sigaction + +#else /* __i386__ */ + +struct sigaction { + __sighandler_t sa_handler; + unsigned long sa_flags; + __sigrestore_t sa_restorer; + sigset_t sa_mask; /* mask last for extensibility */ +}; + +#endif /* !__i386__ */ +# endif /* ! __KERNEL__ */ + +typedef struct sigaltstack { + void __user *ss_sp; + int ss_flags; + size_t ss_size; +} stack_t; + +#endif /* __ASSEMBLY__ */ + +#endif /* _UAPI_ASM_X86_SIGNAL_H */ diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/uapi/asm/socket.h index 6b71384b9d8..6b71384b9d8 100644 --- a/arch/x86/include/asm/socket.h +++ b/arch/x86/include/uapi/asm/socket.h diff --git a/arch/x86/include/asm/sockios.h b/arch/x86/include/uapi/asm/sockios.h index def6d4746ee..def6d4746ee 100644 --- a/arch/x86/include/asm/sockios.h +++ b/arch/x86/include/uapi/asm/sockios.h diff --git a/arch/x86/include/asm/stat.h b/arch/x86/include/uapi/asm/stat.h index 7b3ddc34858..7b3ddc34858 100644 --- a/arch/x86/include/asm/stat.h +++ b/arch/x86/include/uapi/asm/stat.h diff --git a/arch/x86/include/asm/statfs.h b/arch/x86/include/uapi/asm/statfs.h index 2d0adbf99a8..2d0adbf99a8 100644 --- a/arch/x86/include/asm/statfs.h +++ b/arch/x86/include/uapi/asm/statfs.h diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h new file mode 100644 index 00000000000..b5d7640abc5 --- /dev/null +++ b/arch/x86/include/uapi/asm/svm.h @@ -0,0 +1,132 @@ +#ifndef _UAPI__SVM_H +#define _UAPI__SVM_H + +#define SVM_EXIT_READ_CR0 0x000 +#define SVM_EXIT_READ_CR3 0x003 +#define SVM_EXIT_READ_CR4 0x004 +#define SVM_EXIT_READ_CR8 0x008 +#define SVM_EXIT_WRITE_CR0 0x010 +#define SVM_EXIT_WRITE_CR3 0x013 +#define SVM_EXIT_WRITE_CR4 0x014 +#define SVM_EXIT_WRITE_CR8 0x018 +#define SVM_EXIT_READ_DR0 0x020 +#define SVM_EXIT_READ_DR1 0x021 +#define SVM_EXIT_READ_DR2 0x022 +#define SVM_EXIT_READ_DR3 0x023 +#define SVM_EXIT_READ_DR4 0x024 +#define SVM_EXIT_READ_DR5 0x025 +#define SVM_EXIT_READ_DR6 0x026 +#define SVM_EXIT_READ_DR7 0x027 +#define SVM_EXIT_WRITE_DR0 0x030 +#define SVM_EXIT_WRITE_DR1 0x031 +#define SVM_EXIT_WRITE_DR2 0x032 +#define SVM_EXIT_WRITE_DR3 0x033 +#define SVM_EXIT_WRITE_DR4 0x034 +#define SVM_EXIT_WRITE_DR5 0x035 +#define SVM_EXIT_WRITE_DR6 0x036 +#define SVM_EXIT_WRITE_DR7 0x037 +#define SVM_EXIT_EXCP_BASE 0x040 +#define SVM_EXIT_INTR 0x060 +#define SVM_EXIT_NMI 0x061 +#define SVM_EXIT_SMI 0x062 +#define SVM_EXIT_INIT 0x063 +#define SVM_EXIT_VINTR 0x064 +#define SVM_EXIT_CR0_SEL_WRITE 0x065 +#define SVM_EXIT_IDTR_READ 0x066 +#define SVM_EXIT_GDTR_READ 0x067 +#define SVM_EXIT_LDTR_READ 0x068 +#define SVM_EXIT_TR_READ 0x069 +#define SVM_EXIT_IDTR_WRITE 0x06a +#define SVM_EXIT_GDTR_WRITE 0x06b +#define SVM_EXIT_LDTR_WRITE 0x06c +#define SVM_EXIT_TR_WRITE 0x06d +#define SVM_EXIT_RDTSC 0x06e +#define SVM_EXIT_RDPMC 0x06f +#define SVM_EXIT_PUSHF 0x070 +#define SVM_EXIT_POPF 0x071 +#define SVM_EXIT_CPUID 0x072 +#define SVM_EXIT_RSM 0x073 +#define SVM_EXIT_IRET 0x074 +#define SVM_EXIT_SWINT 0x075 +#define SVM_EXIT_INVD 0x076 +#define SVM_EXIT_PAUSE 0x077 +#define SVM_EXIT_HLT 0x078 +#define SVM_EXIT_INVLPG 0x079 +#define SVM_EXIT_INVLPGA 0x07a +#define SVM_EXIT_IOIO 0x07b +#define SVM_EXIT_MSR 0x07c +#define SVM_EXIT_TASK_SWITCH 0x07d +#define SVM_EXIT_FERR_FREEZE 0x07e +#define SVM_EXIT_SHUTDOWN 0x07f +#define SVM_EXIT_VMRUN 0x080 +#define SVM_EXIT_VMMCALL 0x081 +#define SVM_EXIT_VMLOAD 0x082 +#define SVM_EXIT_VMSAVE 0x083 +#define SVM_EXIT_STGI 0x084 +#define SVM_EXIT_CLGI 0x085 +#define SVM_EXIT_SKINIT 0x086 +#define SVM_EXIT_RDTSCP 0x087 +#define SVM_EXIT_ICEBP 0x088 +#define SVM_EXIT_WBINVD 0x089 +#define SVM_EXIT_MONITOR 0x08a +#define SVM_EXIT_MWAIT 0x08b +#define SVM_EXIT_MWAIT_COND 0x08c +#define SVM_EXIT_XSETBV 0x08d +#define SVM_EXIT_NPF 0x400 + +#define SVM_EXIT_ERR -1 + +#define SVM_EXIT_REASONS \ + { SVM_EXIT_READ_CR0, "read_cr0" }, \ + { SVM_EXIT_READ_CR3, "read_cr3" }, \ + { SVM_EXIT_READ_CR4, "read_cr4" }, \ + { SVM_EXIT_READ_CR8, "read_cr8" }, \ + { SVM_EXIT_WRITE_CR0, "write_cr0" }, \ + { SVM_EXIT_WRITE_CR3, "write_cr3" }, \ + { SVM_EXIT_WRITE_CR4, "write_cr4" }, \ + { SVM_EXIT_WRITE_CR8, "write_cr8" }, \ + { SVM_EXIT_READ_DR0, "read_dr0" }, \ + { SVM_EXIT_READ_DR1, "read_dr1" }, \ + { SVM_EXIT_READ_DR2, "read_dr2" }, \ + { SVM_EXIT_READ_DR3, "read_dr3" }, \ + { SVM_EXIT_WRITE_DR0, "write_dr0" }, \ + { SVM_EXIT_WRITE_DR1, "write_dr1" }, \ + { SVM_EXIT_WRITE_DR2, "write_dr2" }, \ + { SVM_EXIT_WRITE_DR3, "write_dr3" }, \ + { SVM_EXIT_WRITE_DR5, "write_dr5" }, \ + { SVM_EXIT_WRITE_DR7, "write_dr7" }, \ + { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \ + { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \ + { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ + { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ + { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ + { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ + { SVM_EXIT_INTR, "interrupt" }, \ + { SVM_EXIT_NMI, "nmi" }, \ + { SVM_EXIT_SMI, "smi" }, \ + { SVM_EXIT_INIT, "init" }, \ + { SVM_EXIT_VINTR, "vintr" }, \ + { SVM_EXIT_CPUID, "cpuid" }, \ + { SVM_EXIT_INVD, "invd" }, \ + { SVM_EXIT_HLT, "hlt" }, \ + { SVM_EXIT_INVLPG, "invlpg" }, \ + { SVM_EXIT_INVLPGA, "invlpga" }, \ + { SVM_EXIT_IOIO, "io" }, \ + { SVM_EXIT_MSR, "msr" }, \ + { SVM_EXIT_TASK_SWITCH, "task_switch" }, \ + { SVM_EXIT_SHUTDOWN, "shutdown" }, \ + { SVM_EXIT_VMRUN, "vmrun" }, \ + { SVM_EXIT_VMMCALL, "hypercall" }, \ + { SVM_EXIT_VMLOAD, "vmload" }, \ + { SVM_EXIT_VMSAVE, "vmsave" }, \ + { SVM_EXIT_STGI, "stgi" }, \ + { SVM_EXIT_CLGI, "clgi" }, \ + { SVM_EXIT_SKINIT, "skinit" }, \ + { SVM_EXIT_WBINVD, "wbinvd" }, \ + { SVM_EXIT_MONITOR, "monitor" }, \ + { SVM_EXIT_MWAIT, "mwait" }, \ + { SVM_EXIT_XSETBV, "xsetbv" }, \ + { SVM_EXIT_NPF, "npf" } + + +#endif /* _UAPI__SVM_H */ diff --git a/arch/x86/include/asm/swab.h b/arch/x86/include/uapi/asm/swab.h index 557cd9f0066..7f235c7105c 100644 --- a/arch/x86/include/asm/swab.h +++ b/arch/x86/include/uapi/asm/swab.h @@ -6,22 +6,7 @@ static inline __attribute_const__ __u32 __arch_swab32(__u32 val) { -#ifdef __i386__ -# ifdef CONFIG_X86_BSWAP - asm("bswap %0" : "=r" (val) : "0" (val)); -# else - asm("xchgb %b0,%h0\n\t" /* swap lower bytes */ - "rorl $16,%0\n\t" /* swap words */ - "xchgb %b0,%h0" /* swap higher bytes */ - : "=q" (val) - : "0" (val)); -# endif - -#else /* __i386__ */ - asm("bswapl %0" - : "=r" (val) - : "0" (val)); -#endif + asm("bswapl %0" : "=r" (val) : "0" (val)); return val; } #define __arch_swab32 __arch_swab32 @@ -37,22 +22,12 @@ static inline __attribute_const__ __u64 __arch_swab64(__u64 val) __u64 u; } v; v.u = val; -# ifdef CONFIG_X86_BSWAP asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1" : "=r" (v.s.a), "=r" (v.s.b) : "0" (v.s.a), "1" (v.s.b)); -# else - v.s.a = __arch_swab32(v.s.a); - v.s.b = __arch_swab32(v.s.b); - asm("xchgl %0,%1" - : "=r" (v.s.a), "=r" (v.s.b) - : "0" (v.s.a), "1" (v.s.b)); -# endif return v.u; #else /* __i386__ */ - asm("bswapq %0" - : "=r" (val) - : "0" (val)); + asm("bswapq %0" : "=r" (val) : "0" (val)); return val; #endif } diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/uapi/asm/termbits.h index 3935b106de7..3935b106de7 100644 --- a/arch/x86/include/asm/termbits.h +++ b/arch/x86/include/uapi/asm/termbits.h diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/uapi/asm/termios.h index 280d78a9d96..280d78a9d96 100644 --- a/arch/x86/include/asm/termios.h +++ b/arch/x86/include/uapi/asm/termios.h diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/uapi/asm/types.h index 8e8c23fef08..8e8c23fef08 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/uapi/asm/types.h diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/uapi/asm/ucontext.h index b7c29c8017f..b7c29c8017f 100644 --- a/arch/x86/include/asm/ucontext.h +++ b/arch/x86/include/uapi/asm/ucontext.h diff --git a/arch/x86/include/uapi/asm/unistd.h b/arch/x86/include/uapi/asm/unistd.h new file mode 100644 index 00000000000..a26df0d75cd --- /dev/null +++ b/arch/x86/include/uapi/asm/unistd.h @@ -0,0 +1,17 @@ +#ifndef _UAPI_ASM_X86_UNISTD_H +#define _UAPI_ASM_X86_UNISTD_H + +/* x32 syscall flag bit */ +#define __X32_SYSCALL_BIT 0x40000000 + +#ifndef __KERNEL__ +# ifdef __i386__ +# include <asm/unistd_32.h> +# elif defined(__ILP32__) +# include <asm/unistd_x32.h> +# else +# include <asm/unistd_64.h> +# endif +#endif + +#endif /* _UAPI_ASM_X86_UNISTD_H */ diff --git a/arch/x86/include/uapi/asm/vm86.h b/arch/x86/include/uapi/asm/vm86.h new file mode 100644 index 00000000000..e0b243e9d85 --- /dev/null +++ b/arch/x86/include/uapi/asm/vm86.h @@ -0,0 +1,129 @@ +#ifndef _UAPI_ASM_X86_VM86_H +#define _UAPI_ASM_X86_VM86_H + +/* + * I'm guessing at the VIF/VIP flag usage, but hope that this is how + * the Pentium uses them. Linux will return from vm86 mode when both + * VIF and VIP is set. + * + * On a Pentium, we could probably optimize the virtual flags directly + * in the eflags register instead of doing it "by hand" in vflags... + * + * Linus + */ + +#include <asm/processor-flags.h> + +#define BIOSSEG 0x0f000 + +#define CPU_086 0 +#define CPU_186 1 +#define CPU_286 2 +#define CPU_386 3 +#define CPU_486 4 +#define CPU_586 5 + +/* + * Return values for the 'vm86()' system call + */ +#define VM86_TYPE(retval) ((retval) & 0xff) +#define VM86_ARG(retval) ((retval) >> 8) + +#define VM86_SIGNAL 0 /* return due to signal */ +#define VM86_UNKNOWN 1 /* unhandled GP fault + - IO-instruction or similar */ +#define VM86_INTx 2 /* int3/int x instruction (ARG = x) */ +#define VM86_STI 3 /* sti/popf/iret instruction enabled + virtual interrupts */ + +/* + * Additional return values when invoking new vm86() + */ +#define VM86_PICRETURN 4 /* return due to pending PIC request */ +#define VM86_TRAP 6 /* return due to DOS-debugger request */ + +/* + * function codes when invoking new vm86() + */ +#define VM86_PLUS_INSTALL_CHECK 0 +#define VM86_ENTER 1 +#define VM86_ENTER_NO_BYPASS 2 +#define VM86_REQUEST_IRQ 3 +#define VM86_FREE_IRQ 4 +#define VM86_GET_IRQ_BITS 5 +#define VM86_GET_AND_RESET_IRQ 6 + +/* + * This is the stack-layout seen by the user space program when we have + * done a translation of "SAVE_ALL" from vm86 mode. The real kernel layout + * is 'kernel_vm86_regs' (see below). + */ + +struct vm86_regs { +/* + * normal regs, with special meaning for the segment descriptors.. + */ + long ebx; + long ecx; + long edx; + long esi; + long edi; + long ebp; + long eax; + long __null_ds; + long __null_es; + long __null_fs; + long __null_gs; + long orig_eax; + long eip; + unsigned short cs, __csh; + long eflags; + long esp; + unsigned short ss, __ssh; +/* + * these are specific to v86 mode: + */ + unsigned short es, __esh; + unsigned short ds, __dsh; + unsigned short fs, __fsh; + unsigned short gs, __gsh; +}; + +struct revectored_struct { + unsigned long __map[8]; /* 256 bits */ +}; + +struct vm86_struct { + struct vm86_regs regs; + unsigned long flags; + unsigned long screen_bitmap; + unsigned long cpu_type; + struct revectored_struct int_revectored; + struct revectored_struct int21_revectored; +}; + +/* + * flags masks + */ +#define VM86_SCREEN_BITMAP 0x0001 + +struct vm86plus_info_struct { + unsigned long force_return_for_pic:1; + unsigned long vm86dbg_active:1; /* for debugger */ + unsigned long vm86dbg_TFpendig:1; /* for debugger */ + unsigned long unused:28; + unsigned long is_vm86pus:1; /* for vm86 internal use */ + unsigned char vm86dbg_intxxtab[32]; /* for debugger */ +}; +struct vm86plus_struct { + struct vm86_regs regs; + unsigned long flags; + unsigned long screen_bitmap; + unsigned long cpu_type; + struct revectored_struct int_revectored; + struct revectored_struct int21_revectored; + struct vm86plus_info_struct vm86plus; +}; + + +#endif /* _UAPI_ASM_X86_VM86_H */ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h new file mode 100644 index 00000000000..2871fccfee6 --- /dev/null +++ b/arch/x86/include/uapi/asm/vmx.h @@ -0,0 +1,116 @@ +/* + * vmx.h: VMX Architecture related definitions + * Copyright (c) 2004, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * A few random additions are: + * Copyright (C) 2006 Qumranet + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + */ +#ifndef _UAPIVMX_H +#define _UAPIVMX_H + + +#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 + +#define EXIT_REASON_EXCEPTION_NMI 0 +#define EXIT_REASON_EXTERNAL_INTERRUPT 1 +#define EXIT_REASON_TRIPLE_FAULT 2 + +#define EXIT_REASON_PENDING_INTERRUPT 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMOFF 26 +#define EXIT_REASON_VMON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_IO_INSTRUCTION 30 +#define EXIT_REASON_MSR_READ 31 +#define EXIT_REASON_MSR_WRITE 32 +#define EXIT_REASON_INVALID_STATE 33 +#define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_INSTRUCTION 39 +#define EXIT_REASON_PAUSE_INSTRUCTION 40 +#define EXIT_REASON_MCE_DURING_VMENTRY 41 +#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_EOI_INDUCED 45 +#define EXIT_REASON_EPT_VIOLATION 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_APIC_WRITE 56 +#define EXIT_REASON_INVPCID 58 + +#define VMX_EXIT_REASONS \ + { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ + { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ + { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ + { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ + { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ + { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ + { EXIT_REASON_CPUID, "CPUID" }, \ + { EXIT_REASON_HLT, "HLT" }, \ + { EXIT_REASON_INVLPG, "INVLPG" }, \ + { EXIT_REASON_RDPMC, "RDPMC" }, \ + { EXIT_REASON_RDTSC, "RDTSC" }, \ + { EXIT_REASON_VMCALL, "VMCALL" }, \ + { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \ + { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \ + { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \ + { EXIT_REASON_VMPTRST, "VMPTRST" }, \ + { EXIT_REASON_VMREAD, "VMREAD" }, \ + { EXIT_REASON_VMRESUME, "VMRESUME" }, \ + { EXIT_REASON_VMWRITE, "VMWRITE" }, \ + { EXIT_REASON_VMOFF, "VMOFF" }, \ + { EXIT_REASON_VMON, "VMON" }, \ + { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \ + { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \ + { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ + { EXIT_REASON_MSR_READ, "MSR_READ" }, \ + { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ + { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ + { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ + { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ + { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ + { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ + { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ + { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ + { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ + { EXIT_REASON_WBINVD, "WBINVD" }, \ + { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ + { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ + { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ + { EXIT_REASON_INVD, "INVD" }, \ + { EXIT_REASON_INVPCID, "INVPCID" } + + +#endif /* _UAPIVMX_H */ diff --git a/arch/x86/include/uapi/asm/vsyscall.h b/arch/x86/include/uapi/asm/vsyscall.h new file mode 100644 index 00000000000..85dc1b3825a --- /dev/null +++ b/arch/x86/include/uapi/asm/vsyscall.h @@ -0,0 +1,17 @@ +#ifndef _UAPI_ASM_X86_VSYSCALL_H +#define _UAPI_ASM_X86_VSYSCALL_H + +enum vsyscall_num { + __NR_vgettimeofday, + __NR_vtime, + __NR_vgetcpu, +}; + +#define VSYSCALL_START (-10UL << 20) +#define VSYSCALL_SIZE 1024 +#define VSYSCALL_END (-2UL << 20) +#define VSYSCALL_MAPPED_PAGES 1 +#define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr)) + + +#endif /* _UAPI_ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 91ce48f05f9..7bd3bd31010 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -9,7 +9,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_tsc.o = -pg -CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_pvclock.o = -pg CFLAGS_REMOVE_kvmclock.o = -pg @@ -62,11 +61,11 @@ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o +obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o -obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_OPTPROBES) += kprobes-opt.o +obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o @@ -88,6 +87,9 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o +obj-$(CONFIG_MICROCODE_EARLY) += microcode_core_early.o +obj-$(CONFIG_MICROCODE_INTEL_EARLY) += microcode_intel_early.o +obj-$(CONFIG_MICROCODE_INTEL_LIB) += microcode_intel_lib.o microcode-y := microcode_core.o microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e651f7a589a..230c8ea878e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 # include <asm/proto.h> -# include <asm/numa_64.h> #endif /* X86 */ #define BAD_MADT_ENTRY(entry, end) ( \ @@ -574,6 +573,12 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) return irq; } +EXPORT_SYMBOL_GPL(acpi_register_gsi); + +void acpi_unregister_gsi(u32 gsi) +{ +} +EXPORT_SYMBOL_GPL(acpi_unregister_gsi); void __init acpi_set_irq_model_pic(void) { @@ -691,6 +696,10 @@ EXPORT_SYMBOL(acpi_map_lsapic); int acpi_unmap_lsapic(int cpu) { +#ifdef CONFIG_ACPI_NUMA + set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); +#endif + per_cpu(x86_cpu_to_apicid, cpu) = -1; set_cpu_present(cpu, false); num_processors--; @@ -1700,3 +1709,9 @@ int __acpi_release_global_lock(unsigned int *lock) } while (unlikely (val != old)); return old & 0x1; } + +void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) +{ + e820_add_region(addr, size, E820_ACPI); + update_e820(); +} diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 11676cf65ae..0532f5d6e4e 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -69,7 +69,7 @@ int acpi_suspend_lowlevel(void) #ifndef CONFIG_64BIT header->pmode_entry = (u32)&wakeup_pmode_return; - header->pmode_cr3 = (u32)__pa(&initial_page_table); + header->pmode_cr3 = (u32)__pa_symbol(initial_page_table); saved_magic = 0x12345678; #else /* CONFIG_64BIT */ #ifdef CONFIG_SMP @@ -101,6 +101,8 @@ static int __init acpi_sleep_setup(char *str) #endif if (strncmp(str, "nonvs", 5) == 0) acpi_nvs_nosave(); + if (strncmp(str, "nonvs_s3", 8) == 0) + acpi_nvs_nosave_s3(); if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); str = strchr(str, ','); diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index e66311200cb..b574b295a2f 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -768,10 +768,9 @@ int __init gart_iommu_init(void) aper_base = info.aper_base; end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); - if (end_pfn > max_low_pfn_mapped) { - start_pfn = (aper_base>>PAGE_SHIFT); + start_pfn = PFN_DOWN(aper_base); + if (!pfn_range_is_mapped(start_pfn, end_pfn)) init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); - } pr_info("PCI-DMA: using GART IOMMU.\n"); iommu_size = check_iommu_size(info.aper_base, aper_size); diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index afdc3f756de..c9876efecaf 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -240,7 +240,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n, dw_apb_clockevent_pause(adev->timer); if (system_state == SYSTEM_RUNNING) { pr_debug("skipping APBT CPU %lu offline\n", cpu); - } else if (adev) { + } else { pr_debug("APBT clockevent for cpu %lu offline\n", cpu); dw_apb_clockevent_stop(adev->timer); } @@ -311,7 +311,6 @@ void __init apbt_time_init(void) #ifdef CONFIG_SMP int i; struct sfi_timer_table_entry *p_mtmr; - unsigned int percpu_timer; struct apbt_dev *adev; #endif @@ -346,13 +345,10 @@ void __init apbt_time_init(void) return; } pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); - if (num_possible_cpus() <= sfi_mtimer_num) { - percpu_timer = 1; + if (num_possible_cpus() <= sfi_mtimer_num) apbt_num_timers_used = num_possible_cpus(); - } else { - percpu_timer = 0; + else apbt_num_timers_used = 1; - } pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); /* here we set up per CPU timer data structure */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b17416e72fb..904611bf0e5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -90,21 +90,6 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); -/* - * Knob to control our willingness to enable the local APIC. - * - * +1=force-enable - */ -static int force_enable_local_apic __initdata; -/* - * APIC command line parameters - */ -static int __init parse_lapic(char *arg) -{ - force_enable_local_apic = 1; - return 0; -} -early_param("lapic", parse_lapic); /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase; @@ -133,6 +118,25 @@ static inline void imcr_apic_to_pic(void) } #endif +/* + * Knob to control our willingness to enable the local APIC. + * + * +1=force-enable + */ +static int force_enable_local_apic __initdata; +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + if (config_enabled(CONFIG_X86_32) && !arg) + force_enable_local_apic = 1; + else if (arg && !strncmp(arg, "notscdeadline", 13)) + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return 0; +} +early_param("lapic", parse_lapic); + #ifdef CONFIG_X86_64 static int apic_calibrate_pmtmr __initdata; static __init int setup_apicpmtimer(char *s) @@ -315,6 +319,7 @@ int lapic_get_maxlvt(void) /* Clock divisor */ #define APIC_DIVISOR 16 +#define TSC_DIVISOR 32 /* * This function sets up the local APIC timer, with a timeout of @@ -333,6 +338,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) lvtt_value = LOCAL_TIMER_VECTOR; if (!oneshot) lvtt_value |= APIC_LVT_TIMER_PERIODIC; + else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE; + if (!lapic_is_integrated()) lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); @@ -341,6 +349,11 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) apic_write(APIC_LVTT, lvtt_value); + if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) { + printk_once(KERN_DEBUG "TSC deadline timer enabled\n"); + return; + } + /* * Divide PICLK by 16 */ @@ -453,6 +466,16 @@ static int lapic_next_event(unsigned long delta, return 0; } +static int lapic_next_deadline(unsigned long delta, + struct clock_event_device *evt) +{ + u64 tsc; + + rdtscll(tsc); + wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); + return 0; +} + /* * Setup the lapic timer in periodic or oneshot mode */ @@ -533,7 +556,15 @@ static void __cpuinit setup_APIC_timer(void) memcpy(levt, &lapic_clockevent, sizeof(*levt)); levt->cpumask = cpumask_of(smp_processor_id()); - clockevents_register_device(levt); + if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_DUMMY); + levt->set_next_event = lapic_next_deadline; + clockevents_config_and_register(levt, + (tsc_khz / TSC_DIVISOR) * 1000, + 0xF, ~0UL); + } else + clockevents_register_device(levt); } /* @@ -661,7 +692,9 @@ static int __init calibrate_APIC_clock(void) * in the clockevent structure and return. */ - if (lapic_timer_frequency) { + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + return 0; + } else if (lapic_timer_frequency) { apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", lapic_timer_frequency); lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, @@ -674,6 +707,9 @@ static int __init calibrate_APIC_clock(void) return 0; } + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + local_irq_disable(); /* Replace the global interrupt handler */ @@ -811,9 +847,6 @@ void __init setup_boot_APIC_clock(void) return; } - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); - if (calibrate_APIC_clock()) { /* No broadcast on UP ! */ if (num_possible_cpus() > 1) @@ -1444,8 +1477,7 @@ void __init bsp_end_local_APIC_setup(void) * Now that local APIC setup is completed for BP, configure the fault * handling for interrupt remapping. */ - if (irq_remapping_enabled) - irq_remap_enable_fault_handling(); + irq_remap_enable_fault_handling(); } @@ -2218,8 +2250,7 @@ static int lapic_suspend(void) local_irq_save(flags); disable_local_APIC(); - if (irq_remapping_enabled) - irq_remapping_disable(); + irq_remapping_disable(); local_irq_restore(flags); return 0; @@ -2235,16 +2266,15 @@ static void lapic_resume(void) return; local_irq_save(flags); - if (irq_remapping_enabled) { - /* - * IO-APIC and PIC have their own resume routines. - * We just mask them here to make sure the interrupt - * subsystem is completely quiet while we enable x2apic - * and interrupt-remapping. - */ - mask_ioapic_entries(); - legacy_pic->mask_all(); - } + + /* + * IO-APIC and PIC have their own resume routines. + * We just mask them here to make sure the interrupt + * subsystem is completely quiet while we enable x2apic + * and interrupt-remapping. + */ + mask_ioapic_entries(); + legacy_pic->mask_all(); if (x2apic_mode) enable_x2apic(); @@ -2287,8 +2317,7 @@ static void lapic_resume(void) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - if (irq_remapping_enabled) - irq_remapping_reenable(x2apic_mode); + irq_remapping_reenable(x2apic_mode); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index a65829ac2b9..9a9110918ca 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -22,11 +22,13 @@ #include <linux/hardirq.h> #include <linux/delay.h> +#include <asm/numachip/numachip.h> #include <asm/numachip/numachip_csr.h> #include <asm/smp.h> #include <asm/apic.h> #include <asm/ipi.h> #include <asm/apic_flat_64.h> +#include <asm/pgtable.h> static int numachip_system __read_mostly; @@ -179,6 +181,7 @@ static int __init numachip_system_init(void) return 0; x86_cpuinit.fixup_cpu_id = fixup_cpu_id; + x86_init.pci.arch_init = pci_numachip_init; map_csrs(); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1817fa91102..9ed796ccc32 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -68,22 +68,6 @@ #define for_each_irq_pin(entry, head) \ for (entry = head; entry; entry = entry->next) -#ifdef CONFIG_IRQ_REMAP -static void irq_remap_modify_chip_defaults(struct irq_chip *chip); -static inline bool irq_remapped(struct irq_cfg *cfg) -{ - return cfg->irq_2_iommu.iommu != NULL; -} -#else -static inline bool irq_remapped(struct irq_cfg *cfg) -{ - return false; -} -static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip) -{ -} -#endif - /* * Is the SiS APIC rmw bug present ? * -1 = don't know, 0 = no, 1 = yes @@ -234,11 +218,11 @@ int __init arch_early_irq_init(void) zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); /* * For legacy IRQ's, start with assigning irq0 to irq15 to - * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. + * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. */ if (i < legacy_pic->nr_legacy_irqs) { cfg[i].vector = IRQ0_VECTOR + i; - cpumask_set_cpu(0, cfg[i].domain); + cpumask_setall(cfg[i].domain); } } @@ -300,9 +284,9 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) return cfg; } -static int alloc_irq_from(unsigned int from, int node) +static int alloc_irqs_from(unsigned int from, unsigned int count, int node) { - return irq_alloc_desc_from(from, node); + return irq_alloc_descs_from(from, count, node); } static void free_irq_at(unsigned int at, struct irq_cfg *cfg) @@ -326,7 +310,7 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) + (mpc_ioapic_addr(idx) & ~PAGE_MASK); } -static inline void io_apic_eoi(unsigned int apic, unsigned int vector) +void io_apic_eoi(unsigned int apic, unsigned int vector) { struct io_apic __iomem *io_apic = io_apic_base(apic); writel(vector, &io_apic->eoi); @@ -573,19 +557,10 @@ static void unmask_ioapic_irq(struct irq_data *data) * Otherwise, we simulate the EOI message manually by changing the trigger * mode to edge and then back to level, with RTE being masked during this. */ -static void __eoi_ioapic_pin(int apic, int pin, int vector, struct irq_cfg *cfg) +void native_eoi_ioapic_pin(int apic, int pin, int vector) { if (mpc_ioapic_ver(apic) >= 0x20) { - /* - * Intr-remapping uses pin number as the virtual vector - * in the RTE. Actual vector is programmed in - * intr-remapping table entry. Hence for the io-apic - * EOI we use the pin number. - */ - if (cfg && irq_remapped(cfg)) - io_apic_eoi(apic, pin); - else - io_apic_eoi(apic, vector); + io_apic_eoi(apic, vector); } else { struct IO_APIC_route_entry entry, entry1; @@ -606,14 +581,15 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector, struct irq_cfg *cfg) } } -static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { struct irq_pin_list *entry; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) - __eoi_ioapic_pin(entry->apic, entry->pin, cfg->vector, cfg); + x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin, + cfg->vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -650,7 +626,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) } raw_spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_pin(apic, pin, entry.vector, NULL); + x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1141,7 +1117,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) * allocation for the members that are not used anymore. */ cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); - cfg->move_in_progress = 1; + cfg->move_in_progress = + cpumask_intersects(cfg->old_domain, cpu_online_mask); cpumask_and(cfg->domain, cfg->domain, tmp_mask); break; } @@ -1172,8 +1149,9 @@ next: current_vector = vector; current_offset = offset; if (cfg->vector) { - cfg->move_in_progress = 1; cpumask_copy(cfg->old_domain, cfg->domain); + cfg->move_in_progress = + cpumask_intersects(cfg->old_domain, cpu_online_mask); } for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; @@ -1241,12 +1219,6 @@ void __setup_vector_irq(int cpu) cfg = irq_get_chip_data(irq); if (!cfg) continue; - /* - * If it is a legacy IRQ handled by the legacy PIC, this cpu - * will be part of the irq_cfg's domain. - */ - if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq)) - cpumask_set_cpu(cpu, cfg->domain); if (!cpumask_test_cpu(cpu, cfg->domain)) continue; @@ -1308,25 +1280,18 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, fasteoi = false; } - if (irq_remapped(cfg)) { - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - irq_remap_modify_chip_defaults(chip); + if (setup_remapped_irq(irq, cfg, chip)) fasteoi = trigger != 0; - } hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; irq_set_chip_and_handler_name(irq, chip, hdl, fasteoi ? "fasteoi" : "edge"); } -static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) +int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr) { - if (irq_remapping_enabled) - return setup_ioapic_remapped_entry(irq, entry, destination, - vector, attr); - memset(entry, 0, sizeof(*entry)); entry->delivery_mode = apic->irq_delivery_mode; @@ -1356,16 +1321,6 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, if (!IO_APIC_IRQ(irq)) return; - /* - * For legacy irqs, cfg->domain starts with cpu 0. Now that IO-APIC - * can handle this irq and the apic driver is finialized at this point, - * update the cfg->domain. - */ - if (irq < legacy_pic->nr_legacy_irqs && - cpumask_equal(cfg->domain, cpumask_of(0))) - apic->vector_allocation_domain(0, cfg->domain, - apic->target_cpus()); - if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; @@ -1384,8 +1339,8 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin, cfg->vector, irq, attr->trigger, attr->polarity, dest); - if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) { - pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", + if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) { + pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); __clear_irq_vector(irq, cfg); @@ -1493,9 +1448,6 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, struct IO_APIC_route_entry entry; unsigned int dest; - if (irq_remapping_enabled) - return; - memset(&entry, 0, sizeof(entry)); /* @@ -1527,9 +1479,63 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, ioapic_write_entry(ioapic_idx, pin, entry); } -__apicdebuginit(void) print_IO_APIC(int ioapic_idx) +void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries) { int i; + + pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n"); + + for (i = 0; i <= nr_entries; i++) { + struct IO_APIC_route_entry entry; + + entry = ioapic_read_entry(apic, i); + + pr_debug(" %02x %02X ", i, entry.dest); + pr_cont("%1d %1d %1d %1d %1d " + "%1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector); + } +} + +void intel_ir_io_apic_print_entries(unsigned int apic, + unsigned int nr_entries) +{ + int i; + + pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n"); + + for (i = 0; i <= nr_entries; i++) { + struct IR_IO_APIC_route_entry *ir_entry; + struct IO_APIC_route_entry entry; + + entry = ioapic_read_entry(apic, i); + + ir_entry = (struct IR_IO_APIC_route_entry *)&entry; + + pr_debug(" %02x %04X ", i, ir_entry->index); + pr_cont("%1d %1d %1d %1d %1d " + "%1d %1d %X %02X\n", + ir_entry->format, + ir_entry->mask, + ir_entry->trigger, + ir_entry->irr, + ir_entry->polarity, + ir_entry->delivery_status, + ir_entry->index2, + ir_entry->zero, + ir_entry->vector); + } +} + +__apicdebuginit(void) print_IO_APIC(int ioapic_idx) +{ union IO_APIC_reg_00 reg_00; union IO_APIC_reg_01 reg_01; union IO_APIC_reg_02 reg_02; @@ -1582,58 +1588,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) printk(KERN_DEBUG ".... IRQ redirection table:\n"); - if (irq_remapping_enabled) { - printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR" - " Pol Stat Indx2 Zero Vect:\n"); - } else { - printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" - " Stat Dmod Deli Vect:\n"); - } - - for (i = 0; i <= reg_01.bits.entries; i++) { - if (irq_remapping_enabled) { - struct IO_APIC_route_entry entry; - struct IR_IO_APIC_route_entry *ir_entry; - - entry = ioapic_read_entry(ioapic_idx, i); - ir_entry = (struct IR_IO_APIC_route_entry *) &entry; - printk(KERN_DEBUG " %02x %04X ", - i, - ir_entry->index - ); - pr_cont("%1d %1d %1d %1d %1d " - "%1d %1d %X %02X\n", - ir_entry->format, - ir_entry->mask, - ir_entry->trigger, - ir_entry->irr, - ir_entry->polarity, - ir_entry->delivery_status, - ir_entry->index2, - ir_entry->zero, - ir_entry->vector - ); - } else { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(ioapic_idx, i); - printk(KERN_DEBUG " %02x %02X ", - i, - entry.dest - ); - pr_cont("%1d %1d %1d %1d %1d " - "%1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector - ); - } - } + x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries); } __apicdebuginit(void) print_IO_APICs(void) @@ -1935,30 +1890,14 @@ void __init enable_IO_APIC(void) clear_IO_APIC(); } -/* - * Not an __init, needed by the reboot code - */ -void disable_IO_APIC(void) +void native_disable_io_apic(void) { /* - * Clear the IO-APIC before rebooting: - */ - clear_IO_APIC(); - - if (!legacy_pic->nr_legacy_irqs) - return; - - /* * If the i8259 is routed through an IOAPIC * Put that IOAPIC in virtual wire mode * so legacy interrupts can be delivered. - * - * With interrupt-remapping, for now we will use virtual wire A mode, - * as virtual wire B is little complex (need to configure both - * IOAPIC RTE as well as interrupt-remapping table entry). - * As this gets called during crash dump, keep this simple for now. */ - if (ioapic_i8259.pin != -1 && !irq_remapping_enabled) { + if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); @@ -1978,12 +1917,25 @@ void disable_IO_APIC(void) ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } + if (cpu_has_apic || apic_from_smp_config()) + disconnect_bsp_APIC(ioapic_i8259.pin != -1); + +} + +/* + * Not an __init, needed by the reboot code + */ +void disable_IO_APIC(void) +{ /* - * Use virtual wire A mode when interrupt remapping is enabled. + * Clear the IO-APIC before rebooting: */ - if (cpu_has_apic || apic_from_smp_config()) - disconnect_bsp_APIC(!irq_remapping_enabled && - ioapic_i8259.pin != -1); + clear_IO_APIC(); + + if (!legacy_pic->nr_legacy_irqs) + return; + + x86_io_apic_ops.disable(); } #ifdef CONFIG_X86_32 @@ -2199,9 +2151,11 @@ static int ioapic_retrigger_irq(struct irq_data *data) { struct irq_cfg *cfg = data->chip_data; unsigned long flags; + int cpu; raw_spin_lock_irqsave(&vector_lock, flags); - apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); + cpu = cpumask_first_and(cfg->domain, cpu_online_mask); + apic->send_IPI_mask(cpumask_of(cpu), cfg->vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -2334,12 +2288,8 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq apic = entry->apic; pin = entry->pin; - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. - */ - if (!irq_remapped(cfg)) - io_apic_write(apic, 0x11 + pin*2, dest); + + io_apic_write(apic, 0x11 + pin*2, dest); reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; @@ -2381,9 +2331,10 @@ int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, return 0; } -static int -ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) + +int native_ioapic_set_affinity(struct irq_data *data, + const struct cpumask *mask, + bool force) { unsigned int dest, irq = data->irq; unsigned long flags; @@ -2560,33 +2511,6 @@ static void ack_apic_level(struct irq_data *data) ioapic_irqd_unmask(data, cfg, masked); } -#ifdef CONFIG_IRQ_REMAP -static void ir_ack_apic_edge(struct irq_data *data) -{ - ack_APIC_irq(); -} - -static void ir_ack_apic_level(struct irq_data *data) -{ - ack_APIC_irq(); - eoi_ioapic_irq(data->irq, data->chip_data); -} - -static void ir_print_prefix(struct irq_data *data, struct seq_file *p) -{ - seq_printf(p, " IR-%s", data->chip->name); -} - -static void irq_remap_modify_chip_defaults(struct irq_chip *chip) -{ - chip->irq_print_chip = ir_print_prefix; - chip->irq_ack = ir_ack_apic_edge; - chip->irq_eoi = ir_ack_apic_level; - - chip->irq_set_affinity = set_remapped_irq_affinity; -} -#endif /* CONFIG_IRQ_REMAP */ - static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", .irq_startup = startup_ioapic_irq, @@ -2594,7 +2518,7 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_unmask = unmask_ioapic_irq, .irq_ack = ack_apic_edge, .irq_eoi = ack_apic_level, - .irq_set_affinity = ioapic_set_affinity, + .irq_set_affinity = native_ioapic_set_affinity, .irq_retrigger = ioapic_retrigger_irq, }; @@ -2793,8 +2717,7 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { - if (irq_remapping_enabled) - panic("BIOS bug: timer not connected to IO-APIC"); + panic_if_irq_remap("BIOS bug: timer not connected to IO-APIC"); pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -2826,8 +2749,7 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } - if (irq_remapping_enabled) - panic("timer doesn't work through Interrupt-remapped IO-APIC"); + panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC"); local_irq_disable(); clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) @@ -2994,37 +2916,58 @@ device_initcall(ioapic_init_ops); /* * Dynamic irq allocate and deallocation */ -unsigned int create_irq_nr(unsigned int from, int node) +unsigned int __create_irqs(unsigned int from, unsigned int count, int node) { - struct irq_cfg *cfg; + struct irq_cfg **cfg; unsigned long flags; - unsigned int ret = 0; - int irq; + int irq, i; if (from < nr_irqs_gsi) from = nr_irqs_gsi; - irq = alloc_irq_from(from, node); - if (irq < 0) - return 0; - cfg = alloc_irq_cfg(irq, node); - if (!cfg) { - free_irq_at(irq, NULL); + cfg = kzalloc_node(count * sizeof(cfg[0]), GFP_KERNEL, node); + if (!cfg) return 0; + + irq = alloc_irqs_from(from, count, node); + if (irq < 0) + goto out_cfgs; + + for (i = 0; i < count; i++) { + cfg[i] = alloc_irq_cfg(irq + i, node); + if (!cfg[i]) + goto out_irqs; } raw_spin_lock_irqsave(&vector_lock, flags); - if (!__assign_irq_vector(irq, cfg, apic->target_cpus())) - ret = irq; + for (i = 0; i < count; i++) + if (__assign_irq_vector(irq + i, cfg[i], apic->target_cpus())) + goto out_vecs; raw_spin_unlock_irqrestore(&vector_lock, flags); - if (ret) { - irq_set_chip_data(irq, cfg); - irq_clear_status_flags(irq, IRQ_NOREQUEST); - } else { - free_irq_at(irq, cfg); + for (i = 0; i < count; i++) { + irq_set_chip_data(irq + i, cfg[i]); + irq_clear_status_flags(irq + i, IRQ_NOREQUEST); } - return ret; + + kfree(cfg); + return irq; + +out_vecs: + for (i--; i >= 0; i--) + __clear_irq_vector(irq + i, cfg[i]); + raw_spin_unlock_irqrestore(&vector_lock, flags); +out_irqs: + for (i = 0; i < count; i++) + free_irq_at(irq + i, cfg[i]); +out_cfgs: + kfree(cfg); + return 0; +} + +unsigned int create_irq_nr(unsigned int from, int node) +{ + return __create_irqs(from, 1, node); } int create_irq(void) @@ -3049,48 +2992,35 @@ void destroy_irq(unsigned int irq) irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); - if (irq_remapped(cfg)) - free_remapped_irq(irq); + free_remapped_irq(irq); + raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); raw_spin_unlock_irqrestore(&vector_lock, flags); free_irq_at(irq, cfg); } +void destroy_irqs(unsigned int irq, unsigned int count) +{ + unsigned int i; + + for (i = 0; i < count; i++) + destroy_irq(irq + i); +} + /* * MSI message composition */ -#ifdef CONFIG_PCI_MSI -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, - struct msi_msg *msg, u8 hpet_id) +void native_compose_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id) { - struct irq_cfg *cfg; - int err; - unsigned dest; - - if (disable_apic) - return -ENXIO; + struct irq_cfg *cfg = irq_cfg(irq); - cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; - - if (irq_remapped(cfg)) { - compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id); - return err; - } + msg->address_hi = MSI_ADDR_BASE_HI; if (x2apic_enabled()) - msg->address_hi = MSI_ADDR_BASE_HI | - MSI_ADDR_EXT_DEST_ID(dest); - else - msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest); msg->address_lo = MSI_ADDR_BASE_LO | @@ -3109,8 +3039,32 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, MSI_DATA_DELIVERY_FIXED: MSI_DATA_DELIVERY_LOWPRI) | MSI_DATA_VECTOR(cfg->vector); +} - return err; +#ifdef CONFIG_PCI_MSI +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, + struct msi_msg *msg, u8 hpet_id) +{ + struct irq_cfg *cfg; + int err; + unsigned dest; + + if (disable_apic) + return -ENXIO; + + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, apic->target_cpus()); + if (err) + return err; + + err = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus(), &dest); + if (err) + return err; + + x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id); + + return 0; } static int @@ -3148,23 +3102,28 @@ static struct irq_chip msi_chip = { .irq_retrigger = ioapic_retrigger_irq, }; -static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) +int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, + unsigned int irq_base, unsigned int irq_offset) { struct irq_chip *chip = &msi_chip; struct msi_msg msg; + unsigned int irq = irq_base + irq_offset; int ret; ret = msi_compose_msg(dev, irq, &msg, -1); if (ret < 0) return ret; - irq_set_msi_desc(irq, msidesc); - write_msi_msg(irq, &msg); + irq_set_msi_desc_off(irq_base, irq_offset, msidesc); - if (irq_remapped(irq_get_chip_data(irq))) { - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - irq_remap_modify_chip_defaults(chip); - } + /* + * MSI-X message is written per-IRQ, the offset is always 0. + * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. + */ + if (!irq_offset) + write_msi_msg(irq, &msg); + + setup_remapped_irq(irq, irq_get_chip_data(irq), chip); irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); @@ -3175,46 +3134,26 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - int node, ret, sub_handle, index = 0; unsigned int irq, irq_want; struct msi_desc *msidesc; + int node, ret; - /* x86 doesn't support multiple MSI yet */ + /* Multiple MSI vectors only supported with interrupt remapping */ if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; node = dev_to_node(&dev->dev); irq_want = nr_irqs_gsi; - sub_handle = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { irq = create_irq_nr(irq_want, node); if (irq == 0) - return -1; + return -ENOSPC; + irq_want = irq + 1; - if (!irq_remapping_enabled) - goto no_ir; - if (!sub_handle) { - /* - * allocate the consecutive block of IRTE's - * for 'nvec' - */ - index = msi_alloc_remapped_irq(dev, irq, nvec); - if (index < 0) { - ret = index; - goto error; - } - } else { - ret = msi_setup_remapped_irq(dev, irq, index, - sub_handle); - if (ret < 0) - goto error; - } -no_ir: - ret = setup_msi_irq(dev, msidesc, irq); + ret = setup_msi_irq(dev, msidesc, irq, 0); if (ret < 0) goto error; - sub_handle++; } return 0; @@ -3310,25 +3249,19 @@ static struct irq_chip hpet_msi_type = { .irq_retrigger = ioapic_retrigger_irq, }; -int arch_setup_hpet_msi(unsigned int irq, unsigned int id) +int default_setup_hpet_msi(unsigned int irq, unsigned int id) { struct irq_chip *chip = &hpet_msi_type; struct msi_msg msg; int ret; - if (irq_remapping_enabled) { - if (!setup_hpet_msi_remapped(irq, id)) - return -1; - } - ret = msi_compose_msg(NULL, irq, &msg, id); if (ret < 0) return ret; hpet_msi_write(irq_get_handler_data(irq), &msg); irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - if (irq_remapped(irq_get_chip_data(irq))) - irq_remap_modify_chip_defaults(chip); + setup_remapped_irq(irq, irq_get_chip_data(irq), chip); irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); return 0; @@ -3694,10 +3627,7 @@ void __init setup_ioapic_dest(void) else mask = apic->target_cpus(); - if (irq_remapping_enabled) - set_remapped_irq_affinity(idata, mask, false); - else - ioapic_set_affinity(idata, mask, false); + x86_io_apic_ops.set_affinity(idata, mask, false); } } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index cce91bf2667..7434d8556d0 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -106,7 +106,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) unsigned long mask = cpumask_bits(cpumask)[0]; unsigned long flags; - if (WARN_ONCE(!mask, "empty IPI mask")) + if (!mask) return; local_irq_save(flags); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index e03a1e180e8..562a76d433c 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -20,18 +20,19 @@ static int set_x2apic_phys_mode(char *arg) } early_param("x2apic_phys", set_x2apic_phys_mode); -static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static bool x2apic_fadt_phys(void) { - if (x2apic_phys) - return x2apic_enabled(); - else if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) && - (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) && - x2apic_enabled()) { + if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { printk(KERN_DEBUG "System requires x2apic physical mode\n"); - return 1; + return true; } - else - return 0; + return false; +} + +static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys()); } static void @@ -82,7 +83,7 @@ static void init_x2apic_ldr(void) static int x2apic_phys_probe(void) { - if (x2apic_mode && x2apic_phys) + if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys())) return 1; return apic == &apic_x2apic_phys; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 8cfade9510a..794f6eb54cd 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved. */ #include <linux/cpumask.h> #include <linux/hardirq.h> @@ -91,10 +91,16 @@ static int __init early_get_pnodeid(void) m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR); uv_min_hub_revision_id = node_id.s.revision; - if (node_id.s.part_number == UV2_HUB_PART_NUMBER) - uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; - if (node_id.s.part_number == UV2_HUB_PART_NUMBER_X) + switch (node_id.s.part_number) { + case UV2_HUB_PART_NUMBER: + case UV2_HUB_PART_NUMBER_X: uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; + break; + case UV3_HUB_PART_NUMBER: + case UV3_HUB_PART_NUMBER_X: + uv_min_hub_revision_id += UV3_HUB_REVISION_BASE - 1; + break; + } uv_hub_info->hub_revision = uv_min_hub_revision_id; pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); @@ -130,13 +136,16 @@ static void __init uv_set_apicid_hibit(void) static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - int pnodeid, is_uv1, is_uv2; + int pnodeid, is_uv1, is_uv2, is_uv3; is_uv1 = !strcmp(oem_id, "SGI"); is_uv2 = !strcmp(oem_id, "SGI2"); - if (is_uv1 || is_uv2) { + is_uv3 = !strncmp(oem_id, "SGI3", 4); /* there are varieties of UV3 */ + if (is_uv1 || is_uv2 || is_uv3) { uv_hub_info->hub_revision = - is_uv1 ? UV1_HUB_REVISION_BASE : UV2_HUB_REVISION_BASE; + (is_uv1 ? UV1_HUB_REVISION_BASE : + (is_uv2 ? UV2_HUB_REVISION_BASE : + UV3_HUB_REVISION_BASE)); pnodeid = early_get_pnodeid(); early_get_apic_pnode_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; @@ -450,14 +459,17 @@ static __init void map_high(char *id, unsigned long base, int pshift, paddr = base << pshift; bytes = (1UL << bshift) * (max_pnode + 1); - printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, - paddr + bytes); + if (!paddr) { + pr_info("UV: Map %s_HI base address NULL\n", id); + return; + } + pr_info("UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); if (map_type == map_uc) init_extra_mapping_uc(paddr, bytes); else init_extra_mapping_wb(paddr, bytes); - } + static __init void map_gru_high(int max_pnode) { union uvh_rh_gam_gru_overlay_config_mmr_u gru; @@ -468,7 +480,8 @@ static __init void map_gru_high(int max_pnode) map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb); gru_start_paddr = ((u64)gru.s.base << shift); gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); - + } else { + pr_info("UV: GRU disabled\n"); } } @@ -480,23 +493,146 @@ static __init void map_mmr_high(int max_pnode) mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); if (mmr.s.enable) map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc); + else + pr_info("UV: MMR disabled\n"); +} + +/* + * This commonality works because both 0 & 1 versions of the MMIOH OVERLAY + * and REDIRECT MMR regs are exactly the same on UV3. + */ +struct mmioh_config { + unsigned long overlay; + unsigned long redirect; + char *id; +}; + +static __initdata struct mmioh_config mmiohs[] = { + { + UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR, + UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR, + "MMIOH0" + }, + { + UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR, + UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR, + "MMIOH1" + }, +}; + +static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) +{ + union uv3h_rh_gam_mmioh_overlay_config0_mmr_u overlay; + unsigned long mmr; + unsigned long base; + int i, n, shift, m_io, max_io; + int nasid, lnasid, fi, li; + char *id; + + id = mmiohs[index].id; + overlay.v = uv_read_local_mmr(mmiohs[index].overlay); + pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", + id, overlay.v, overlay.s3.base, overlay.s3.m_io); + if (!overlay.s3.enable) { + pr_info("UV: %s disabled\n", id); + return; + } + + shift = UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT; + base = (unsigned long)overlay.s3.base; + m_io = overlay.s3.m_io; + mmr = mmiohs[index].redirect; + n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH; + min_pnode *= 2; /* convert to NASID */ + max_pnode *= 2; + max_io = lnasid = fi = li = -1; + + for (i = 0; i < n; i++) { + union uv3h_rh_gam_mmioh_redirect_config0_mmr_u redirect; + + redirect.v = uv_read_local_mmr(mmr + i * 8); + nasid = redirect.s3.nasid; + if (nasid < min_pnode || max_pnode < nasid) + nasid = -1; /* invalid NASID */ + + if (nasid == lnasid) { + li = i; + if (i != n-1) /* last entry check */ + continue; + } + + /* check if we have a cached (or last) redirect to print */ + if (lnasid != -1 || (i == n-1 && nasid != -1)) { + unsigned long addr1, addr2; + int f, l; + + if (lnasid == -1) { + f = l = i; + lnasid = nasid; + } else { + f = fi; + l = li; + } + addr1 = (base << shift) + + f * (unsigned long)(1 << m_io); + addr2 = (base << shift) + + (l + 1) * (unsigned long)(1 << m_io); + pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", + id, fi, li, lnasid, addr1, addr2); + if (max_io < l) + max_io = l; + } + fi = li = i; + lnasid = nasid; + } + + pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", + id, base, shift, m_io, max_io); + + if (max_io >= 0) + map_high(id, base, shift, m_io, max_io, map_uc); } -static __init void map_mmioh_high(int max_pnode) +static __init void map_mmioh_high(int min_pnode, int max_pnode) { union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; - int shift; + unsigned long mmr, base; + int shift, enable, m_io, n_io; - mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); - if (is_uv1_hub() && mmioh.s1.enable) { - shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; - map_high("MMIOH", mmioh.s1.base, shift, mmioh.s1.m_io, - max_pnode, map_uc); + if (is_uv3_hub()) { + /* Map both MMIOH Regions */ + map_mmioh_high_uv3(0, min_pnode, max_pnode); + map_mmioh_high_uv3(1, min_pnode, max_pnode); + return; } - if (is_uv2_hub() && mmioh.s2.enable) { + + if (is_uv1_hub()) { + mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; + shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + mmioh.v = uv_read_local_mmr(mmr); + enable = !!mmioh.s1.enable; + base = mmioh.s1.base; + m_io = mmioh.s1.m_io; + n_io = mmioh.s1.n_io; + } else if (is_uv2_hub()) { + mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; - map_high("MMIOH", mmioh.s2.base, shift, mmioh.s2.m_io, - max_pnode, map_uc); + mmioh.v = uv_read_local_mmr(mmr); + enable = !!mmioh.s2.enable; + base = mmioh.s2.base; + m_io = mmioh.s2.m_io; + n_io = mmioh.s2.n_io; + } else + return; + + if (enable) { + max_pnode &= (1 << n_io) - 1; + pr_info( + "UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", + base, shift, m_io, n_io, max_pnode); + map_high("MMIOH", base, shift, m_io, max_pnode, map_uc); + } else { + pr_info("UV: MMIOH disabled\n"); } } @@ -724,42 +860,41 @@ void uv_nmi_init(void) void __init uv_system_init(void) { union uvh_rh_gam_config_mmr_u m_n_config; - union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; - int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io; - int gnode_extra, max_pnode = 0; + int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; + int gnode_extra, min_pnode = 999999, max_pnode = -1; unsigned long mmr_base, present, paddr; - unsigned short pnode_mask, pnode_io_mask; + unsigned short pnode_mask; + char *hub = (is_uv1_hub() ? "UV1" : + (is_uv2_hub() ? "UV2" : + "UV3")); - printk(KERN_INFO "UV: Found %s hub\n", is_uv1_hub() ? "UV1" : "UV2"); + pr_info("UV: Found %s hub\n", hub); map_low_mmrs(); m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; - mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); - n_io = is_uv1_hub() ? mmioh.s1.n_io : mmioh.s2.n_io; + pnode_mask = (1 << n_val) - 1; mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; - pnode_mask = (1 << n_val) - 1; - pnode_io_mask = (1 << n_io) - 1; node_id.v = uv_read_local_mmr(UVH_NODE_ID); gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; gnode_upper = ((unsigned long)gnode_extra << m_val); - printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n", - n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask); + pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x\n", + n_val, m_val, pnode_mask, gnode_upper, gnode_extra); - printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); + pr_info("UV: global MMR base 0x%lx\n", mmr_base); for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) uv_possible_blades += hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); /* uv_num_possible_blades() is really the hub count */ - printk(KERN_INFO "UV: Found %d blades, %d hubs\n", + pr_info("UV: Found %d blades, %d hubs\n", is_uv1_hub() ? uv_num_possible_blades() : (uv_num_possible_blades() + 1) / 2, uv_num_possible_blades()); @@ -794,6 +929,7 @@ void __init uv_system_init(void) uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; spin_lock_init(&uv_blade_info[blade].nmi_lock); + min_pnode = min(pnode, min_pnode); max_pnode = max(pnode, max_pnode); blade++; } @@ -856,7 +992,7 @@ void __init uv_system_init(void) map_gru_high(max_pnode); map_mmr_high(max_pnode); - map_mmioh_high(max_pnode & pnode_io_mask); + map_mmioh_high(min_pnode, max_pnode); uv_cpu_init(); uv_scir_register_cpu_notifier(); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index d65464e4350..66b5faffe14 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -232,6 +232,7 @@ #include <linux/acpi.h> #include <linux/syscore_ops.h> #include <linux/i8253.h> +#include <linux/cpuidle.h> #include <asm/uaccess.h> #include <asm/desc.h> @@ -360,13 +361,35 @@ struct apm_user { * idle percentage above which bios idle calls are done */ #ifdef CONFIG_APM_CPU_IDLE -#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012 #define DEFAULT_IDLE_THRESHOLD 95 #else #define DEFAULT_IDLE_THRESHOLD 100 #endif #define DEFAULT_IDLE_PERIOD (100 / 3) +static int apm_cpu_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index); + +static struct cpuidle_driver apm_idle_driver = { + .name = "apm_idle", + .owner = THIS_MODULE, + .en_core_tk_irqen = 1, + .states = { + { /* entry 0 is for polling */ }, + { /* entry 1 is for APM idle */ + .name = "APM", + .desc = "APM idle", + .flags = CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 250, /* WAG */ + .target_residency = 500, /* WAG */ + .enter = &apm_cpu_idle + }, + }, + .state_count = 2, +}; + +static struct cpuidle_device apm_cpuidle_device; + /* * Local variables */ @@ -377,7 +400,6 @@ static struct { static int clock_slowed; static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; -static int set_pm_idle; static int suspends_pending; static int standbys_pending; static int ignore_sys_suspend; @@ -884,8 +906,6 @@ static void apm_do_busy(void) #define IDLE_CALC_LIMIT (HZ * 100) #define IDLE_LEAKY_MAX 16 -static void (*original_pm_idle)(void) __read_mostly; - /** * apm_cpu_idle - cpu idling for APM capable Linux * @@ -894,35 +914,36 @@ static void (*original_pm_idle)(void) __read_mostly; * Furthermore it calls the system default idle routine. */ -static void apm_cpu_idle(void) +static int apm_cpu_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) { static int use_apm_idle; /* = 0 */ static unsigned int last_jiffies; /* = 0 */ static unsigned int last_stime; /* = 0 */ + cputime_t stime; int apm_idle_done = 0; unsigned int jiffies_since_last_check = jiffies - last_jiffies; unsigned int bucket; - WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012"); recalc: + task_cputime(current, NULL, &stime); if (jiffies_since_last_check > IDLE_CALC_LIMIT) { use_apm_idle = 0; - last_jiffies = jiffies; - last_stime = current->stime; } else if (jiffies_since_last_check > idle_period) { unsigned int idle_percentage; - idle_percentage = current->stime - last_stime; + idle_percentage = stime - last_stime; idle_percentage *= 100; idle_percentage /= jiffies_since_last_check; use_apm_idle = (idle_percentage > idle_threshold); if (apm_info.forbid_idle) use_apm_idle = 0; - last_jiffies = jiffies; - last_stime = current->stime; } + last_jiffies = jiffies; + last_stime = stime; + bucket = IDLE_LEAKY_MAX; while (!need_resched()) { @@ -950,10 +971,7 @@ recalc: break; } } - if (original_pm_idle) - original_pm_idle(); - else - default_idle(); + default_idle(); local_irq_disable(); jiffies_since_last_check = jiffies - last_jiffies; if (jiffies_since_last_check > idle_period) @@ -963,7 +981,7 @@ recalc: if (apm_idle_done) apm_do_busy(); - local_irq_enable(); + return index; } /** @@ -2381,9 +2399,9 @@ static int __init apm_init(void) if (HZ != 100) idle_period = (idle_period * HZ) / 100; if (idle_threshold < 100) { - original_pm_idle = pm_idle; - pm_idle = apm_cpu_idle; - set_pm_idle = 1; + if (!cpuidle_register_driver(&apm_idle_driver)) + if (cpuidle_register_device(&apm_cpuidle_device)) + cpuidle_unregister_driver(&apm_idle_driver); } return 0; @@ -2393,15 +2411,9 @@ static void __exit apm_exit(void) { int error; - if (set_pm_idle) { - pm_idle = original_pm_idle; - /* - * We are about to unload the current idle thread pm callback - * (pm_idle), Wait for all processors to update cached/local - * copies of pm_idle before proceeding. - */ - kick_all_cpus_sync(); - } + cpuidle_unregister_device(&apm_cpuidle_device); + cpuidle_unregister_driver(&apm_idle_driver); + if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) && (apm_info.connection_version > 0x0100)) { error = apm_engage_power_management(APM_DEVICE_ALL, 0); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f7e98a2c0d1..fa96eb0d02f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -12,7 +12,6 @@ #include <asm/pci-direct.h> #ifdef CONFIG_X86_64 -# include <asm/numa_64.h> # include <asm/mmconfig.h> # include <asm/cacheflush.h> #endif @@ -220,8 +219,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) */ WARN_ONCE(1, "WARNING: This combination of AMD" " processors is not suitable for SMP.\n"); - if (!test_taint(TAINT_UNSAFE_SMP)) - add_taint(TAINT_UNSAFE_SMP); + add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE); valid_k7: ; @@ -304,7 +302,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) int cpu = smp_processor_id(); /* get information required for multi-node processors */ - if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + if (cpu_has_topoext) { u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); @@ -364,9 +362,9 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) #endif } -int amd_get_nb_id(int cpu) +u16 amd_get_nb_id(int cpu) { - int id = 0; + u16 id = 0; #ifdef CONFIG_SMP id = per_cpu(cpu_llc_id, cpu); #endif @@ -518,10 +516,9 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) static void __cpuinit init_amd(struct cpuinfo_x86 *c) { u32 dummy; - -#ifdef CONFIG_SMP unsigned long long value; +#ifdef CONFIG_SMP /* * Disable TLB flush filter by setting HWCR.FFDIS on K8 * bit 6 of msr C001_0015 @@ -559,12 +556,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * (AMD Erratum #110, docId: 25759). */ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { - u64 val; - clear_cpu_cap(c, X86_FEATURE_LAHF_LM); - if (!rdmsrl_amd_safe(0xc001100d, &val)) { - val &= ~(1ULL << 32); - wrmsrl_amd_safe(0xc001100d, val); + if (!rdmsrl_amd_safe(0xc001100d, &value)) { + value &= ~(1ULL << 32); + wrmsrl_amd_safe(0xc001100d, value); } } @@ -617,13 +612,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if ((c->x86 == 0x15) && (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && !cpu_has(c, X86_FEATURE_TOPOEXT)) { - u64 val; - if (!rdmsrl_safe(0xc0011005, &val)) { - val |= 1ULL << 54; - wrmsrl_safe(0xc0011005, val); - rdmsrl(0xc0011005, val); - if (val & (1ULL << 54)) { + if (!rdmsrl_safe(0xc0011005, &value)) { + value |= 1ULL << 54; + wrmsrl_safe(0xc0011005, value); + rdmsrl(0xc0011005, value); + if (value & (1ULL << 54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); printk(KERN_INFO FW_INFO "CPU: Re-enabling " "disabled Topology Extensions Support\n"); @@ -631,6 +625,19 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } + /* + * The way access filter has a performance penalty on some workloads. + * Disable it on the affected CPUs. + */ + if ((c->x86 == 0x15) && + (c->x86_model >= 0x02) && (c->x86_model < 0x20)) { + + if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) { + value |= 0x1E; + wrmsrl_safe(0xc0011021, value); + } + } + cpu_detect_cache_sizes(c); /* Multi core CPU? */ @@ -643,12 +650,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) detect_ht(c); #endif - if (c->extended_cpuid_level >= 0x80000006) { - if (cpuid_edx(0x80000006) & 0xf000) - num_cache_leaves = 4; - else - num_cache_leaves = 3; - } + init_amd_cacheinfo(c); if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); @@ -676,12 +678,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * benefit in doing so. */ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + unsigned long pfn = tseg >> PAGE_SHIFT; + printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if ((tseg>>PMD_SHIFT) < - (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || - ((tseg>>PMD_SHIFT) < - (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && - (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) + if (pfn_range_is_mapped(pfn, pfn + 1)) set_memory_4k((unsigned long)__va(tseg), 1); } } @@ -694,13 +694,11 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 > 0x11) set_cpu_cap(c, X86_FEATURE_ARAT); - /* - * Disable GART TLB Walk Errors on Fam10h. We do this here - * because this is always needed when GART is enabled, even in a - * kernel which has no MCE support built in. - */ if (c->x86 == 0x10) { /* + * Disable GART TLB Walk Errors on Fam10h. We do this here + * because this is always needed when GART is enabled, even in a + * kernel which has no MCE support built in. * BIOS should disable GartTlbWlk Errors themself. If * it doesn't do it here as suggested by the BKDG. * @@ -714,6 +712,21 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) mask |= (1 << 10); wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask); } + + /* + * On family 10h BIOS may not have properly enabled WC+ support, + * causing it to be converted to CD memtype. This may result in + * performance degradation for certain nested-paging guests. + * Prevent this conversion by clearing bit 24 in + * MSR_AMD64_BU_CFG2. + * + * NOTE: we want to use the _safe accessors so as not to #GP kvm + * guests on older kvm hosts. + */ + + rdmsrl_safe(MSR_AMD64_BU_CFG2, &value); + value &= ~(1ULL << 24); + wrmsrl_safe(MSR_AMD64_BU_CFG2, value); } rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); @@ -739,9 +752,6 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) { - if (!cpu_has_invlpg) - return; - tlb_flushall_shift = 5; if (c->x86 <= 0x11) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d0e910da16c..af6455e3fcc 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -17,15 +17,6 @@ #include <asm/paravirt.h> #include <asm/alternative.h> -static int __init no_halt(char *s) -{ - WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n"); - boot_cpu_data.hlt_works_ok = 0; - return 1; -} - -__setup("no-hlt", no_halt); - static int __init no_387(char *s) { boot_cpu_data.hard_math = 0; @@ -89,71 +80,18 @@ static void __init check_fpu(void) pr_warn("Hmm, FPU with FDIV bug\n"); } -static void __init check_hlt(void) -{ - if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) - return; - - pr_info("Checking 'hlt' instruction... "); - if (!boot_cpu_data.hlt_works_ok) { - pr_cont("disabled\n"); - return; - } - halt(); - halt(); - halt(); - halt(); - pr_cont("OK\n"); -} - -/* - * Most 386 processors have a bug where a POPAD can lock the - * machine even from user space. - */ - -static void __init check_popad(void) -{ -#ifndef CONFIG_X86_POPAD_OK - int res, inp = (int) &res; - - pr_info("Checking for popad bug... "); - __asm__ __volatile__( - "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " - : "=&a" (res) - : "d" (inp) - : "ecx", "edi"); - /* - * If this fails, it means that any user program may lock the - * CPU hard. Too bad. - */ - if (res != 12345678) - pr_cont("Buggy\n"); - else - pr_cont("OK\n"); -#endif -} - /* * Check whether we are able to run this kernel safely on SMP. * - * - In order to run on a i386, we need to be compiled for i386 - * (for due to lack of "invlpg" and working WP on a i386) + * - i386 is no longer supported. * - In order to run on anything without a TSC, we need to be * compiled for a i486. */ static void __init check_config(void) { -/* - * We'd better not be a i386 if we're configured to use some - * i486+ only features! (WP works in supervisor mode and the - * new "invlpg" and "bswap" instructions) - */ -#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \ - defined(CONFIG_X86_BSWAP) - if (boot_cpu_data.x86 == 3) + if (boot_cpu_data.x86 < 4) panic("Kernel requires i486+ for 'invlpg' and other features"); -#endif } @@ -165,8 +103,6 @@ void __init check_bugs(void) print_cpu_info(&boot_cpu_data); #endif check_config(); - check_hlt(); - check_popad(); init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); alternative_instructions(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7505f7b13e7..d814772c5be 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -37,6 +37,8 @@ #include <asm/mce.h> #include <asm/msr.h> #include <asm/pat.h> +#include <asm/microcode.h> +#include <asm/microcode_intel.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> @@ -213,7 +215,7 @@ static inline int flag_is_changeable_p(u32 flag) } /* Probe for the CPUID instruction */ -static int __cpuinit have_cpuid_p(void) +int __cpuinit have_cpuid_p(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } @@ -249,11 +251,6 @@ static inline int flag_is_changeable_p(u32 flag) { return 1; } -/* Probe for the CPUID instruction */ -static inline int have_cpuid_p(void) -{ - return 1; -} static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { } @@ -1173,15 +1170,6 @@ DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif -/* Make sure %fs and %gs are initialized properly in idle threads */ -struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) -{ - memset(regs, 0, sizeof(struct pt_regs)); - regs->fs = __KERNEL_PERCPU; - regs->gs = __KERNEL_STACK_CANARY; - - return regs; -} #endif /* CONFIG_X86_64 */ /* @@ -1232,12 +1220,18 @@ void __cpuinit cpu_init(void) int cpu; int i; + /* + * Load microcode on this cpu if a valid microcode is available. + * This is early microcode loading procedure. + */ + load_ucode_ap(); + cpu = stack_smp_processor_id(); t = &per_cpu(init_tss, cpu); oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA - if (cpu != 0 && this_cpu_read(numa_node) == 0 && + if (this_cpu_read(numa_node) == 0 && early_cpu_to_node(cpu) != NUMA_NO_NODE) set_numa_node(early_cpu_to_node(cpu)); #endif @@ -1269,8 +1263,7 @@ void __cpuinit cpu_init(void) barrier(); x86_configure_nx(); - if (cpu != 0) - enable_x2apic(); + enable_x2apic(); /* * set up and load the per-CPU TSS @@ -1324,6 +1317,8 @@ void __cpuinit cpu_init(void) struct tss_struct *t = &per_cpu(init_tss, cpu); struct thread_struct *thread = &curr->thread; + show_ucode_info_early(); + if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); for (;;) diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index a8f8fa9769d..1e7e84a02eb 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -79,3 +79,10 @@ void __init init_hypervisor_platform(void) if (x86_hyper->init_platform) x86_hyper->init_platform(); } + +bool __init hypervisor_x2apic_available(void) +{ + return x86_hyper && + x86_hyper->x2apic_available && + x86_hyper->x2apic_available(); +} diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 198e019a531..1905ce98bee 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -17,7 +17,6 @@ #ifdef CONFIG_X86_64 #include <linux/topology.h> -#include <asm/numa_64.h> #endif #include "cpu.h" @@ -168,7 +167,7 @@ int __cpuinit ppro_with_ram_bug(void) #ifdef CONFIG_X86_F00F_BUG static void __cpuinit trap_init_f00f_bug(void) { - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + __set_fixmap(FIX_F00F_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO); /* * Update the IDT descriptor and reload the IDT so that @@ -612,10 +611,6 @@ static void __cpuinit intel_tlb_lookup(const unsigned char desc) static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) { - if (!cpu_has_invlpg) { - tlb_flushall_shift = -1; - return; - } switch ((c->x86 << 8) + c->x86_model) { case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 93c5451bdd5..7c6f7d548c0 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -298,8 +298,7 @@ struct _cache_attr { unsigned int); }; -#ifdef CONFIG_AMD_NB - +#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) /* * L3 cache descriptors */ @@ -524,9 +523,9 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, static struct _cache_attr subcaches = __ATTR(subcaches, 0644, show_subcaches, store_subcaches); -#else /* CONFIG_AMD_NB */ +#else #define amd_init_l3_cache(x, y) -#endif /* CONFIG_AMD_NB */ +#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ static int __cpuinit cpuid4_cache_lookup_regs(int index, @@ -538,7 +537,11 @@ __cpuinit cpuid4_cache_lookup_regs(int index, unsigned edx; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - amd_cpuid4(index, &eax, &ebx, &ecx); + if (cpu_has_topoext) + cpuid_count(0x8000001d, index, &eax.full, + &ebx.full, &ecx.full, &edx); + else + amd_cpuid4(index, &eax, &ebx, &ecx); amd_init_l3_cache(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); @@ -557,21 +560,39 @@ __cpuinit cpuid4_cache_lookup_regs(int index, return 0; } -static int __cpuinit find_num_cache_leaves(void) +static int __cpuinit find_num_cache_leaves(struct cpuinfo_x86 *c) { - unsigned int eax, ebx, ecx, edx; + unsigned int eax, ebx, ecx, edx, op; union _cpuid4_leaf_eax cache_eax; int i = -1; + if (c->x86_vendor == X86_VENDOR_AMD) + op = 0x8000001d; + else + op = 4; + do { ++i; - /* Do cpuid(4) loop to find out num_cache_leaves */ - cpuid_count(4, i, &eax, &ebx, &ecx, &edx); + /* Do cpuid(op) loop to find out num_cache_leaves */ + cpuid_count(op, i, &eax, &ebx, &ecx, &edx); cache_eax.full = eax; } while (cache_eax.split.type != CACHE_TYPE_NULL); return i; } +void __cpuinit init_amd_cacheinfo(struct cpuinfo_x86 *c) +{ + + if (cpu_has_topoext) { + num_cache_leaves = find_num_cache_leaves(c); + } else if (c->extended_cpuid_level >= 0x80000006) { + if (cpuid_edx(0x80000006) & 0xf000) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + } +} + unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) { /* Cache sizes */ @@ -588,7 +609,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) if (is_initialized == 0) { /* Init num_cache_leaves from boot CPU */ - num_cache_leaves = find_num_cache_leaves(); + num_cache_leaves = find_num_cache_leaves(c); is_initialized++; } @@ -728,37 +749,50 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf; - int ret, i, sibling; - struct cpuinfo_x86 *c = &cpu_data(cpu); + int i, sibling; - ret = 0; - if (index == 3) { - ret = 1; - for_each_cpu(i, cpu_llc_shared_mask(cpu)) { + if (cpu_has_topoext) { + unsigned int apicid, nshared, first, last; + + if (!per_cpu(ici_cpuid4_info, cpu)) + return 0; + + this_leaf = CPUID4_INFO_IDX(cpu, index); + nshared = this_leaf->base.eax.split.num_threads_sharing + 1; + apicid = cpu_data(cpu).apicid; + first = apicid - (apicid % nshared); + last = first + nshared - 1; + + for_each_online_cpu(i) { + apicid = cpu_data(i).apicid; + if ((apicid < first) || (apicid > last)) + continue; if (!per_cpu(ici_cpuid4_info, i)) continue; this_leaf = CPUID4_INFO_IDX(i, index); - for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { - if (!cpu_online(sibling)) + + for_each_online_cpu(sibling) { + apicid = cpu_data(sibling).apicid; + if ((apicid < first) || (apicid > last)) continue; set_bit(sibling, this_leaf->shared_cpu_map); } } - } else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) { - ret = 1; - for_each_cpu(i, cpu_sibling_mask(cpu)) { + } else if (index == 3) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { if (!per_cpu(ici_cpuid4_info, i)) continue; this_leaf = CPUID4_INFO_IDX(i, index); - for_each_cpu(sibling, cpu_sibling_mask(cpu)) { + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { if (!cpu_online(sibling)) continue; set_bit(sibling, this_leaf->shared_cpu_map); } } - } + } else + return 0; - return ret; + return 1; } static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) @@ -1192,7 +1226,7 @@ static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = { .notifier_call = cacheinfo_cpu_callback, }; -static int __cpuinit cache_sysfs_init(void) +static int __init cache_sysfs_init(void) { int i; diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 6a05c1d327a..5b7d4fa5d3b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -24,8 +24,6 @@ struct mce_bank { int mce_severity(struct mce *a, int tolerant, char **msg); struct dentry *mce_get_debugfs_dir(void); -extern int mce_ser; - extern struct mce_bank *mce_banks; #ifdef CONFIG_X86_MCE_INTEL diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 13017626f9a..beb1f1689e5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -193,9 +193,9 @@ int mce_severity(struct mce *m, int tolerant, char **msg) continue; if ((m->mcgstatus & s->mcgmask) != s->mcgres) continue; - if (s->ser == SER_REQUIRED && !mce_ser) + if (s->ser == SER_REQUIRED && !mca_cfg.ser) continue; - if (s->ser == NO_SER && mce_ser) + if (s->ser == NO_SER && mca_cfg.ser) continue; if (s->context && ctx != s->context) continue; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 46cbf868969..7bc126346ac 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -58,34 +58,26 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); #define CREATE_TRACE_POINTS #include <trace/events/mce.h> -int mce_disabled __read_mostly; - #define SPINUNIT 100 /* 100ns */ atomic_t mce_entry; DEFINE_PER_CPU(unsigned, mce_exception_count); -/* - * Tolerant levels: - * 0: always panic on uncorrected errors, log corrected errors - * 1: panic or SIGBUS on uncorrected errors, log corrected errors - * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors - * 3: never panic or SIGBUS, log all errors (for testing only) - */ -static int tolerant __read_mostly = 1; -static int banks __read_mostly; -static int rip_msr __read_mostly; -static int mce_bootlog __read_mostly = -1; -static int monarch_timeout __read_mostly = -1; -static int mce_panic_timeout __read_mostly; -static int mce_dont_log_ce __read_mostly; -int mce_cmci_disabled __read_mostly; -int mce_ignore_ce __read_mostly; -int mce_ser __read_mostly; -int mce_bios_cmci_threshold __read_mostly; - -struct mce_bank *mce_banks __read_mostly; +struct mce_bank *mce_banks __read_mostly; + +struct mca_config mca_cfg __read_mostly = { + .bootlog = -1, + /* + * Tolerant levels: + * 0: always panic on uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ + .tolerant = 1, + .monarch_timeout = -1 +}; /* User mode helper program triggered by machine check event */ static unsigned long mce_need_notify; @@ -302,7 +294,7 @@ static void wait_for_panic(void) while (timeout-- > 0) udelay(1); if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; + panic_timeout = mca_cfg.panic_timeout; panic("Panicing machine check CPU died"); } @@ -360,7 +352,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); if (!fake_panic) { if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; + panic_timeout = mca_cfg.panic_timeout; panic(msg); } else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); @@ -372,7 +364,7 @@ static int msr_to_offset(u32 msr) { unsigned bank = __this_cpu_read(injectm.bank); - if (msr == rip_msr) + if (msr == mca_cfg.rip_msr) return offsetof(struct mce, ip); if (msr == MSR_IA32_MCx_STATUS(bank)) return offsetof(struct mce, status); @@ -451,8 +443,8 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) m->cs |= 3; } /* Use accurate RIP reporting if available. */ - if (rip_msr) - m->ip = mce_rdmsrl(rip_msr); + if (mca_cfg.rip_msr) + m->ip = mce_rdmsrl(mca_cfg.rip_msr); } } @@ -513,18 +505,15 @@ static int mce_ring_add(unsigned long pfn) int mce_available(struct cpuinfo_x86 *c) { - if (mce_disabled) + if (mca_cfg.disabled) return 0; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } static void mce_schedule_work(void) { - if (!mce_ring_empty()) { - struct work_struct *work = &__get_cpu_var(mce_work); - if (!work_pending(work)) - schedule_work(work); - } + if (!mce_ring_empty()) + schedule_work(&__get_cpu_var(mce_work)); } DEFINE_PER_CPU(struct irq_work, mce_irq_work); @@ -565,7 +554,7 @@ static void mce_read_aux(struct mce *m, int i) /* * Mask the reported address by the reported granularity. */ - if (mce_ser && (m->status & MCI_STATUS_MISCV)) { + if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) { u8 shift = MCI_MISC_ADDR_LSB(m->misc); m->addr >>= shift; m->addr <<= shift; @@ -599,7 +588,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_gather_info(&m, NULL); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; @@ -620,7 +609,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * TBD do the same check for MCI_STATUS_EN here? */ if (!(flags & MCP_UC) && - (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) + (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) continue; mce_read_aux(&m, i); @@ -631,7 +620,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) + if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) mce_log(&m); /* @@ -658,14 +647,14 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, { int i, ret = 0; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (m->status & MCI_STATUS_VAL) { __set_bit(i, validp); if (quirk_no_way_out) quirk_no_way_out(i, m, regs); } - if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) + if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) ret = 1; } return ret; @@ -696,11 +685,11 @@ static int mce_timed_out(u64 *t) rmb(); if (atomic_read(&mce_paniced)) wait_for_panic(); - if (!monarch_timeout) + if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { /* CHECKME: Make panic default for 1 too? */ - if (tolerant < 1) + if (mca_cfg.tolerant < 1) mce_panic("Timeout synchronizing machine check over CPUs", NULL, NULL); cpu_missing = 1; @@ -750,7 +739,8 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, + int severity = mce_severity(&per_cpu(mces_seen, cpu), + mca_cfg.tolerant, &nmsg); if (severity > global_worst) { msg = nmsg; @@ -764,7 +754,7 @@ static void mce_reign(void) * This dumps all the mces in the log buffer and stops the * other CPUs. */ - if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) + if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) mce_panic("Fatal Machine check", m, msg); /* @@ -777,7 +767,7 @@ static void mce_reign(void) * No machine check event found. Must be some external * source or one CPU is hung. Panic. */ - if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) + if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) mce_panic("Machine check from unknown source", NULL, NULL); /* @@ -801,7 +791,7 @@ static int mce_start(int *no_way_out) { int order; int cpus = num_online_cpus(); - u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; if (!timeout) return -1; @@ -865,7 +855,7 @@ static int mce_start(int *no_way_out) static int mce_end(int order) { int ret = -1; - u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; if (!timeout) goto reset; @@ -946,7 +936,7 @@ static void mce_clear_state(unsigned long *toclear) { int i; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { if (test_bit(i, toclear)) mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } @@ -1011,6 +1001,7 @@ static void mce_clear_info(struct mce_info *mi) */ void do_machine_check(struct pt_regs *regs, long error_code) { + struct mca_config *cfg = &mca_cfg; struct mce m, *final; int i; int worst = 0; @@ -1022,7 +1013,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) int order; /* * If no_way_out gets set, there is no safe way to recover from this - * MCE. If tolerant is cranked up, we'll try anyway. + * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. */ int no_way_out = 0; /* @@ -1038,7 +1029,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) this_cpu_inc(mce_exception_count); - if (!banks) + if (!cfg->banks) goto out; mce_gather_info(&m, regs); @@ -1065,7 +1056,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * because the first one to see it will clear it. */ order = mce_start(&no_way_out); - for (i = 0; i < banks; i++) { + for (i = 0; i < cfg->banks; i++) { __clear_bit(i, toclear); if (!test_bit(i, valid_banks)) continue; @@ -1084,16 +1075,16 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Non uncorrected or non signaled errors are handled by * machine_check_poll. Leave them alone, unless this panics. */ - if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && !no_way_out) continue; /* * Set taint even when machine check was not enabled. */ - add_taint(TAINT_MACHINE_CHECK); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - severity = mce_severity(&m, tolerant, NULL); + severity = mce_severity(&m, cfg->tolerant, NULL); /* * When machine check was for corrected handler don't touch, @@ -1117,7 +1108,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * When the ring overflows we just ignore the AO error. * RED-PEN add some logging mechanism when * usable_address or mce_add_ring fails. - * RED-PEN don't ignore overflow for tolerant == 0 + * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0 */ if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) mce_ring_add(m.addr >> PAGE_SHIFT); @@ -1149,7 +1140,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * issues we try to recover, or limit damage to the current * process. */ - if (tolerant < 3) { + if (cfg->tolerant < 3) { if (no_way_out) mce_panic("Fatal machine check on current CPU", &m, msg); if (worst == MCE_AR_SEVERITY) { @@ -1357,12 +1348,7 @@ int mce_notify_irq(void) /* wake processes polling /dev/mcelog */ wake_up_interruptible(&mce_chrdev_wait); - /* - * There is no risk of missing notifications because - * work_pending is always cleared before the function is - * executed. - */ - if (mce_helper[0] && !work_pending(&mce_trigger_work)) + if (mce_helper[0]) schedule_work(&mce_trigger_work); if (__ratelimit(&ratelimit)) @@ -1377,11 +1363,13 @@ EXPORT_SYMBOL_GPL(mce_notify_irq); static int __cpuinit __mcheck_cpu_mce_banks_init(void) { int i; + u8 num_banks = mca_cfg.banks; - mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); + mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL); if (!mce_banks) return -ENOMEM; - for (i = 0; i < banks; i++) { + + for (i = 0; i < num_banks; i++) { struct mce_bank *b = &mce_banks[i]; b->ctl = -1ULL; @@ -1401,7 +1389,7 @@ static int __cpuinit __mcheck_cpu_cap_init(void) rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - if (!banks) + if (!mca_cfg.banks) pr_info("CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { @@ -1411,8 +1399,9 @@ static int __cpuinit __mcheck_cpu_cap_init(void) } /* Don't support asymmetric configurations today */ - WARN_ON(banks != 0 && b != banks); - banks = b; + WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks); + mca_cfg.banks = b; + if (!mce_banks) { int err = __mcheck_cpu_mce_banks_init(); @@ -1422,25 +1411,29 @@ static int __cpuinit __mcheck_cpu_cap_init(void) /* Use accurate RIP reporting if available. */ if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) - rip_msr = MSR_IA32_MCG_EIP; + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; if (cap & MCG_SER_P) - mce_ser = 1; + mca_cfg.ser = true; return 0; } static void __mcheck_cpu_init_generic(void) { + enum mcp_flags m_fl = 0; mce_banks_t all_banks; u64 cap; int i; + if (!mca_cfg.bootlog) + m_fl = MCP_DONTLOG; + /* * Log the machine checks left over from the previous reset. */ bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); + machine_check_poll(MCP_UC | m_fl, &all_banks); set_in_cr4(X86_CR4_MCE); @@ -1448,7 +1441,7 @@ static void __mcheck_cpu_init_generic(void) if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (!b->init) @@ -1489,6 +1482,8 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) /* Add per CPU specific workarounds here */ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { + struct mca_config *cfg = &mca_cfg; + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { pr_info("unknown CPU type - not enabling MCE support\n"); return -EOPNOTSUPP; @@ -1496,7 +1491,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && banks > 4) { + if (c->x86 == 15 && cfg->banks > 4) { /* * disable GART TBL walk error reporting, which * trips off incorrectly with the IOMMU & 3ware @@ -1504,18 +1499,18 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } - if (c->x86 <= 17 && mce_bootlog < 0) { + if (c->x86 <= 17 && cfg->bootlog < 0) { /* * Lots of broken BIOS around that don't clear them * by default and leave crap in there. Don't log: */ - mce_bootlog = 0; + cfg->bootlog = 0; } /* * Various K7s with broken bank 0 around. Always disable * by default. */ - if (c->x86 == 6 && banks > 0) + if (c->x86 == 6 && cfg->banks > 0) mce_banks[0].ctl = 0; /* @@ -1566,7 +1561,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) + if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0) mce_banks[0].init = 0; /* @@ -1574,23 +1569,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) * synchronization with a one second timeout. */ if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && - monarch_timeout < 0) - monarch_timeout = USEC_PER_SEC; + cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; /* * There are also broken BIOSes on some Pentium M and * earlier systems: */ - if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) - mce_bootlog = 0; + if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) + cfg->bootlog = 0; if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; } - if (monarch_timeout < 0) - monarch_timeout = 0; - if (mce_bootlog != 0) - mce_panic_timeout = 30; + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = 0; + if (cfg->bootlog != 0) + cfg->panic_timeout = 30; return 0; } @@ -1635,7 +1630,7 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t) __this_cpu_write(mce_next_interval, iv); - if (mce_ignore_ce || !iv) + if (mca_cfg.ignore_ce || !iv) return; t->expires = round_jiffies(jiffies + iv); @@ -1668,7 +1663,7 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = */ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) { - if (mce_disabled) + if (mca_cfg.disabled) return; if (__mcheck_cpu_ancient_init(c)) @@ -1678,7 +1673,7 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) return; if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { - mce_disabled = 1; + mca_cfg.disabled = true; return; } @@ -1951,6 +1946,8 @@ static struct miscdevice mce_chrdev_device = { */ static int __init mcheck_enable(char *str) { + struct mca_config *cfg = &mca_cfg; + if (*str == 0) { enable_p5_mce(); return 1; @@ -1958,22 +1955,22 @@ static int __init mcheck_enable(char *str) if (*str == '=') str++; if (!strcmp(str, "off")) - mce_disabled = 1; + cfg->disabled = true; else if (!strcmp(str, "no_cmci")) - mce_cmci_disabled = 1; + cfg->cmci_disabled = true; else if (!strcmp(str, "dont_log_ce")) - mce_dont_log_ce = 1; + cfg->dont_log_ce = true; else if (!strcmp(str, "ignore_ce")) - mce_ignore_ce = 1; + cfg->ignore_ce = true; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) - mce_bootlog = (str[0] == 'b'); + cfg->bootlog = (str[0] == 'b'); else if (!strcmp(str, "bios_cmci_threshold")) - mce_bios_cmci_threshold = 1; + cfg->bios_cmci_threshold = true; else if (isdigit(str[0])) { - get_option(&str, &tolerant); + get_option(&str, &(cfg->tolerant)); if (*str == ',') { ++str; - get_option(&str, &monarch_timeout); + get_option(&str, &(cfg->monarch_timeout)); } } else { pr_info("mce argument %s ignored. Please use /sys\n", str); @@ -2002,7 +1999,7 @@ static int mce_disable_error_reporting(void) { int i; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -2142,15 +2139,15 @@ static ssize_t set_ignore_ce(struct device *s, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - if (mce_ignore_ce ^ !!new) { + if (mca_cfg.ignore_ce ^ !!new) { if (new) { /* disable ce features */ mce_timer_delete_all(); on_each_cpu(mce_disable_cmci, NULL, 1); - mce_ignore_ce = 1; + mca_cfg.ignore_ce = true; } else { /* enable ce features */ - mce_ignore_ce = 0; + mca_cfg.ignore_ce = false; on_each_cpu(mce_enable_ce, (void *)1, 1); } } @@ -2166,14 +2163,14 @@ static ssize_t set_cmci_disabled(struct device *s, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - if (mce_cmci_disabled ^ !!new) { + if (mca_cfg.cmci_disabled ^ !!new) { if (new) { /* disable cmci */ on_each_cpu(mce_disable_cmci, NULL, 1); - mce_cmci_disabled = 1; + mca_cfg.cmci_disabled = true; } else { /* enable cmci */ - mce_cmci_disabled = 0; + mca_cfg.cmci_disabled = false; on_each_cpu(mce_enable_ce, NULL, 1); } } @@ -2190,9 +2187,9 @@ static ssize_t store_int_with_restart(struct device *s, } static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); -static DEVICE_INT_ATTR(tolerant, 0644, tolerant); -static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout); -static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); +static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); +static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); +static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); static struct dev_ext_attribute dev_attr_check_interval = { __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), @@ -2200,13 +2197,13 @@ static struct dev_ext_attribute dev_attr_check_interval = { }; static struct dev_ext_attribute dev_attr_ignore_ce = { - __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce), - &mce_ignore_ce + __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce), + &mca_cfg.ignore_ce }; static struct dev_ext_attribute dev_attr_cmci_disabled = { - __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled), - &mce_cmci_disabled + __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled), + &mca_cfg.cmci_disabled }; static struct device_attribute *mce_device_attrs[] = { @@ -2253,7 +2250,7 @@ static __cpuinit int mce_device_create(unsigned int cpu) if (err) goto error; } - for (j = 0; j < banks; j++) { + for (j = 0; j < mca_cfg.banks; j++) { err = device_create_file(dev, &mce_banks[j].attr); if (err) goto error2; @@ -2285,7 +2282,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu) for (i = 0; mce_device_attrs[i]; i++) device_remove_file(dev, mce_device_attrs[i]); - for (i = 0; i < banks; i++) + for (i = 0; i < mca_cfg.banks; i++) device_remove_file(dev, &mce_banks[i].attr); device_unregister(dev); @@ -2304,7 +2301,7 @@ static void __cpuinit mce_disable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -2322,7 +2319,7 @@ static void __cpuinit mce_reenable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -2375,7 +2372,7 @@ static __init void mce_init_banks(void) { int i; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; struct device_attribute *a = &b->attr; @@ -2426,7 +2423,7 @@ device_initcall_sync(mcheck_init_device); */ static int __init mcheck_disable(char *str) { - mce_disabled = 1; + mca_cfg.disabled = true; return 1; } __setup("nomce", mcheck_disable); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 698b6ec12e0..1ac581f38df 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -6,7 +6,7 @@ * * Written by Jacob Shin - AMD, Inc. * - * Support: borislav.petkov@amd.com + * Maintained by: Borislav Petkov <bp@alien8.de> * * April 2006 * - added support for AMD Family 0x10 processors diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 5f88abf07e9..402c454fbff 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -53,7 +53,7 @@ static int cmci_supported(int *banks) { u64 cap; - if (mce_cmci_disabled || mce_ignore_ce) + if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce) return 0; /* @@ -200,7 +200,7 @@ static void cmci_discover(int banks) continue; } - if (!mce_bios_cmci_threshold) { + if (!mca_cfg.bios_cmci_threshold) { val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; val |= CMCI_THRESHOLD; } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { @@ -227,7 +227,7 @@ static void cmci_discover(int banks) * set the thresholds properly or does not work with * this boot option. Note down now and report later. */ - if (mce_bios_cmci_threshold && bios_zero_thresh && + if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) bios_wrong_thresh = 1; } else { @@ -235,7 +235,7 @@ static void cmci_discover(int banks) } } raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); - if (mce_bios_cmci_threshold && bios_wrong_thresh) { + if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { pr_info_once( "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); pr_info_once( @@ -285,34 +285,39 @@ void cmci_clear(void) raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } +static long cmci_rediscover_work_func(void *arg) +{ + int banks; + + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks); + + return 0; +} + /* * After a CPU went down cycle through all the others and rediscover * Must run in process context. */ void cmci_rediscover(int dying) { - int banks; - int cpu; - cpumask_var_t old; + int cpu, banks; if (!cmci_supported(&banks)) return; - if (!alloc_cpumask_var(&old, GFP_KERNEL)) - return; - cpumask_copy(old, ¤t->cpus_allowed); for_each_online_cpu(cpu) { if (cpu == dying) continue; - if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) + + if (cpu == smp_processor_id()) { + cmci_rediscover_work_func(NULL); continue; - /* Recheck banks in case CPUs don't all have the same */ - if (cmci_supported(&banks)) - cmci_discover(banks); - } + } - set_cpus_allowed_ptr(current, old); - free_cpumask_var(old); + work_on_cpu(cpu, cmci_rediscover_work_func, NULL); + } } /* diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 2d5454cd2c4..1c044b1ccc5 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -33,7 +33,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) smp_processor_id()); } - add_taint(TAINT_MACHINE_CHECK); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 2d7998fb628..e9a701aecaa 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -15,7 +15,7 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code) { printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); - add_taint(TAINT_MACHINE_CHECK); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); } /* Set up machine check reporting on the Winchip C6 series */ diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 0a630dd4b62..a7d26d83fb7 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -14,10 +14,15 @@ #include <linux/time.h> #include <linux/clocksource.h> #include <linux/module.h> +#include <linux/hardirq.h> +#include <linux/interrupt.h> #include <asm/processor.h> #include <asm/hypervisor.h> #include <asm/hyperv.h> #include <asm/mshyperv.h> +#include <asm/desc.h> +#include <asm/idle.h> +#include <asm/irq_regs.h> struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); @@ -30,6 +35,13 @@ static bool __init ms_hyperv_platform(void) if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) return false; + /* + * Xen emulates Hyper-V to support enlightened Windows. + * Check to see first if we are on a Xen Hypervisor. + */ + if (xen_cpuid_base()) + return false; + cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); @@ -68,7 +80,14 @@ static void __init ms_hyperv_init_platform(void) printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); - clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); + if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) + clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); +#if IS_ENABLED(CONFIG_HYPERV) + /* + * Setup the IDT for hypervisor callback. + */ + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); +#endif } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { @@ -77,3 +96,36 @@ const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .init_platform = ms_hyperv_init_platform, }; EXPORT_SYMBOL(x86_hyper_ms_hyperv); + +#if IS_ENABLED(CONFIG_HYPERV) +static int vmbus_irq = -1; +static irq_handler_t vmbus_isr; + +void hv_register_vmbus_handler(int irq, irq_handler_t handler) +{ + vmbus_irq = irq; + vmbus_isr = handler; +} + +void hyperv_vector_handler(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + struct irq_desc *desc; + + irq_enter(); + exit_idle(); + + desc = irq_to_desc(vmbus_irq); + + if (desc) + generic_handle_irq_desc(vmbus_irq, desc); + + irq_exit(); + set_irq_regs(old_regs); +} +#else +void hv_register_vmbus_handler(int irq, irq_handler_t handler) +{ +} +#endif +EXPORT_SYMBOL_GPL(hv_register_vmbus_handler); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index e9fe907cd24..fa72a39e5d4 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -542,7 +542,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, if (tmp != mask_lo) { printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); - add_taint(TAINT_FIRMWARE_WORKAROUND); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); mask_lo = tmp; } } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 6b96110bb0c..726bf963c22 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -606,7 +606,7 @@ void __init mtrr_bp_init(void) /* * This is an AMD specific MSR, but we assume(hope?) that - * Intel will implement it to when they extend the address + * Intel will implement it too when they extend the address * bus of the Xeon. */ if (cpuid_eax(0x80000000) >= 0x80000008) { @@ -695,11 +695,16 @@ void mtrr_ap_init(void) } /** - * Save current fixed-range MTRR state of the BSP + * Save current fixed-range MTRR state of the first cpu in cpu_online_mask. */ void mtrr_save_state(void) { - smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); + int first_cpu; + + get_online_cpus(); + first_cpu = cpumask_first(cpu_online_mask); + smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); + put_online_cpus(); } void set_mtrr_aps_delayed_init(void) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4a3374e61a9..bf0f01aea99 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -340,9 +340,6 @@ int x86_setup_perfctr(struct perf_event *event) /* BTS is currently only allowed for user-mode. */ if (!attr->exclude_kernel) return -EOPNOTSUPP; - - if (!attr->exclude_guest) - return -EOPNOTSUPP; } hwc->config |= config; @@ -385,9 +382,6 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip) { int precise = 0; - if (!event->attr.exclude_guest) - return -EOPNOTSUPP; - /* Support for constant skid */ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { precise++; @@ -835,7 +829,7 @@ static inline void x86_assign_hw_event(struct perf_event *event, } else { hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); - hwc->event_base_rdpmc = hwc->idx; + hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); } } @@ -1316,6 +1310,114 @@ static struct attribute_group x86_pmu_format_group = { .attrs = NULL, }; +/* + * Remove all undefined events (x86_pmu.event_map(id) == 0) + * out of events_attr attributes. + */ +static void __init filter_events(struct attribute **attrs) +{ + int i, j; + + for (i = 0; attrs[i]; i++) { + if (x86_pmu.event_map(i)) + continue; + + for (j = i; attrs[j]; j++) + attrs[j] = attrs[j + 1]; + + /* Check the shifted attr. */ + i--; + } +} + +static ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = \ + container_of(attr, struct perf_pmu_events_attr, attr); + + u64 config = x86_pmu.event_map(pmu_attr->id); + return x86_pmu.events_sysfs_show(page, config); +} + +#define EVENT_VAR(_id) event_attr_##_id +#define EVENT_PTR(_id) &event_attr_##_id.attr.attr + +#define EVENT_ATTR(_name, _id) \ + PMU_EVENT_ATTR(_name, EVENT_VAR(_id), PERF_COUNT_HW_##_id, \ + events_sysfs_show) + +EVENT_ATTR(cpu-cycles, CPU_CYCLES ); +EVENT_ATTR(instructions, INSTRUCTIONS ); +EVENT_ATTR(cache-references, CACHE_REFERENCES ); +EVENT_ATTR(cache-misses, CACHE_MISSES ); +EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); +EVENT_ATTR(branch-misses, BRANCH_MISSES ); +EVENT_ATTR(bus-cycles, BUS_CYCLES ); +EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); +EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); +EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); + +static struct attribute *empty_attrs; + +static struct attribute *events_attr[] = { + EVENT_PTR(CPU_CYCLES), + EVENT_PTR(INSTRUCTIONS), + EVENT_PTR(CACHE_REFERENCES), + EVENT_PTR(CACHE_MISSES), + EVENT_PTR(BRANCH_INSTRUCTIONS), + EVENT_PTR(BRANCH_MISSES), + EVENT_PTR(BUS_CYCLES), + EVENT_PTR(STALLED_CYCLES_FRONTEND), + EVENT_PTR(STALLED_CYCLES_BACKEND), + EVENT_PTR(REF_CPU_CYCLES), + NULL, +}; + +static struct attribute_group x86_pmu_events_group = { + .name = "events", + .attrs = events_attr, +}; + +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) +{ + u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; + u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; + bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); + bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); + bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); + bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); + ssize_t ret; + + /* + * We have whole page size to spend and just little data + * to write, so we can safely use sprintf. + */ + ret = sprintf(page, "event=0x%02llx", event); + + if (umask) + ret += sprintf(page + ret, ",umask=0x%02llx", umask); + + if (edge) + ret += sprintf(page + ret, ",edge"); + + if (pc) + ret += sprintf(page + ret, ",pc"); + + if (any) + ret += sprintf(page + ret, ",any"); + + if (inv) + ret += sprintf(page + ret, ",inv"); + + if (cmask) + ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); + + ret += sprintf(page + ret, "\n"); + + return ret; +} + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -1362,6 +1464,11 @@ static int __init init_hw_perf_events(void) x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ x86_pmu_format_group.attrs = x86_pmu.format_attrs; + if (!x86_pmu.events_sysfs_show) + x86_pmu_events_group.attrs = &empty_attrs; + else + filter_events(x86_pmu_events_group.attrs); + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... generic registers: %d\n", x86_pmu.num_counters); @@ -1651,6 +1758,7 @@ static struct attribute_group x86_pmu_attr_group = { static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, &x86_pmu_format_group, + &x86_pmu_events_group, NULL, }; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 271d2570029..7f5c75c2afd 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -325,6 +325,8 @@ struct x86_pmu { int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; unsigned perfctr; + int (*addr_offset)(int index, bool eventsel); + int (*rdpmc_index)(int index); u64 (*event_map)(int); int max_events; int num_counters; @@ -354,6 +356,8 @@ struct x86_pmu { int attr_rdpmc; struct attribute **format_attrs; + ssize_t (*events_sysfs_show)(char *page, u64 config); + /* * CPU Hotplug hooks */ @@ -444,28 +448,21 @@ extern u64 __read_mostly hw_cache_extra_regs u64 x86_perf_event_update(struct perf_event *event); -static inline int x86_pmu_addr_offset(int index) +static inline unsigned int x86_pmu_config_addr(int index) { - int offset; - - /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */ - alternative_io(ASM_NOP2, - "shll $1, %%eax", - X86_FEATURE_PERFCTR_CORE, - "=a" (offset), - "a" (index)); - - return offset; + return x86_pmu.eventsel + (x86_pmu.addr_offset ? + x86_pmu.addr_offset(index, true) : index); } -static inline unsigned int x86_pmu_config_addr(int index) +static inline unsigned int x86_pmu_event_addr(int index) { - return x86_pmu.eventsel + x86_pmu_addr_offset(index); + return x86_pmu.perfctr + (x86_pmu.addr_offset ? + x86_pmu.addr_offset(index, false) : index); } -static inline unsigned int x86_pmu_event_addr(int index) +static inline int x86_pmu_rdpmc_index(int index) { - return x86_pmu.perfctr + x86_pmu_addr_offset(index); + return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; } int x86_setup_perfctr(struct perf_event *event); @@ -536,6 +533,9 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); +ssize_t intel_event_sysfs_show(char *page, u64 config); + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 4528ae7b6ec..dfdab42aed2 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -132,21 +132,102 @@ static u64 amd_pmu_event_map(int hw_event) return amd_perfmon_event_map[hw_event]; } -static int amd_pmu_hw_config(struct perf_event *event) +static struct event_constraint *amd_nb_event_constraint; + +/* + * Previously calculated offsets + */ +static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly; +static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly; +static unsigned int rdpmc_indexes[X86_PMC_IDX_MAX] __read_mostly; + +/* + * Legacy CPUs: + * 4 counters starting at 0xc0010000 each offset by 1 + * + * CPUs with core performance counter extensions: + * 6 counters starting at 0xc0010200 each offset by 2 + * + * CPUs with north bridge performance counter extensions: + * 4 additional counters starting at 0xc0010240 each offset by 2 + * (indexed right above either one of the above core counters) + */ +static inline int amd_pmu_addr_offset(int index, bool eventsel) { - int ret; + int offset, first, base; - /* pass precise event sampling to ibs: */ - if (event->attr.precise_ip && get_ibs_caps()) - return -ENOENT; + if (!index) + return index; + + if (eventsel) + offset = event_offsets[index]; + else + offset = count_offsets[index]; + + if (offset) + return offset; + + if (amd_nb_event_constraint && + test_bit(index, amd_nb_event_constraint->idxmsk)) { + /* + * calculate the offset of NB counters with respect to + * base eventsel or perfctr + */ + + first = find_first_bit(amd_nb_event_constraint->idxmsk, + X86_PMC_IDX_MAX); + + if (eventsel) + base = MSR_F15H_NB_PERF_CTL - x86_pmu.eventsel; + else + base = MSR_F15H_NB_PERF_CTR - x86_pmu.perfctr; + + offset = base + ((index - first) << 1); + } else if (!cpu_has_perfctr_core) + offset = index; + else + offset = index << 1; + + if (eventsel) + event_offsets[index] = offset; + else + count_offsets[index] = offset; + + return offset; +} + +static inline int amd_pmu_rdpmc_index(int index) +{ + int ret, first; + + if (!index) + return index; + + ret = rdpmc_indexes[index]; - ret = x86_pmu_hw_config(event); if (ret) return ret; - if (has_branch_stack(event)) - return -EOPNOTSUPP; + if (amd_nb_event_constraint && + test_bit(index, amd_nb_event_constraint->idxmsk)) { + /* + * according to the mnual, ECX value of the NB counters is + * the index of the NB counter (0, 1, 2 or 3) plus 6 + */ + first = find_first_bit(amd_nb_event_constraint->idxmsk, + X86_PMC_IDX_MAX); + ret = index - first + 6; + } else + ret = index; + + rdpmc_indexes[index] = ret; + + return ret; +} + +static int amd_core_hw_config(struct perf_event *event) +{ if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 @@ -156,14 +237,37 @@ static int amd_pmu_hw_config(struct perf_event *event) event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_OS); else if (event->attr.exclude_host) - event->hw.config |= AMD_PERFMON_EVENTSEL_GUESTONLY; + event->hw.config |= AMD64_EVENTSEL_GUESTONLY; else if (event->attr.exclude_guest) - event->hw.config |= AMD_PERFMON_EVENTSEL_HOSTONLY; + event->hw.config |= AMD64_EVENTSEL_HOSTONLY; - if (event->attr.type != PERF_TYPE_RAW) - return 0; + return 0; +} - event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; +/* + * NB counters do not support the following event select bits: + * Host/Guest only + * Counter mask + * Invert counter mask + * Edge detect + * OS/User mode + */ +static int amd_nb_hw_config(struct perf_event *event) +{ + /* for NB, we only allow system wide counting mode */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EINVAL; + + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_host || event->attr.exclude_guest) + return -EINVAL; + + event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR | + ARCH_PERFMON_EVENTSEL_OS); + + if (event->hw.config & ~(AMD64_RAW_EVENT_MASK_NB | + ARCH_PERFMON_EVENTSEL_INT)) + return -EINVAL; return 0; } @@ -181,6 +285,11 @@ static inline int amd_is_nb_event(struct hw_perf_event *hwc) return (hwc->config & 0xe0) == 0xe0; } +static inline int amd_is_perfctr_nb_event(struct hw_perf_event *hwc) +{ + return amd_nb_event_constraint && amd_is_nb_event(hwc); +} + static inline int amd_has_nb(struct cpu_hw_events *cpuc) { struct amd_nb *nb = cpuc->amd_nb; @@ -188,20 +297,37 @@ static inline int amd_has_nb(struct cpu_hw_events *cpuc) return nb && nb->nb_id != -1; } -static void amd_put_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) +static int amd_pmu_hw_config(struct perf_event *event) +{ + int ret; + + /* pass precise event sampling to ibs: */ + if (event->attr.precise_ip && get_ibs_caps()) + return -ENOENT; + + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + ret = x86_pmu_hw_config(event); + if (ret) + return ret; + + if (event->attr.type == PERF_TYPE_RAW) + event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; + + if (amd_is_perfctr_nb_event(&event->hw)) + return amd_nb_hw_config(event); + + return amd_core_hw_config(event); +} + +static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) { - struct hw_perf_event *hwc = &event->hw; struct amd_nb *nb = cpuc->amd_nb; int i; /* - * only care about NB events - */ - if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) - return; - - /* * need to scan whole list because event may not have * been assigned during scheduling * @@ -215,6 +341,19 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, } } +static void amd_nb_interrupt_hw_config(struct hw_perf_event *hwc) +{ + int core_id = cpu_data(smp_processor_id()).cpu_core_id; + + /* deliver interrupts only to this core */ + if (hwc->config & ARCH_PERFMON_EVENTSEL_INT) { + hwc->config |= AMD64_EVENTSEL_INT_CORE_ENABLE; + hwc->config &= ~AMD64_EVENTSEL_INT_CORE_SEL_MASK; + hwc->config |= (u64)(core_id) << + AMD64_EVENTSEL_INT_CORE_SEL_SHIFT; + } +} + /* * AMD64 NorthBridge events need special treatment because * counter access needs to be synchronized across all cores @@ -247,24 +386,24 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, * * Given that resources are allocated (cmpxchg), they must be * eventually freed for others to use. This is accomplished by - * calling amd_put_event_constraints(). + * calling __amd_put_nb_event_constraints() * * Non NB events are not impacted by this restriction. */ static struct event_constraint * -amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, + struct event_constraint *c) { struct hw_perf_event *hwc = &event->hw; struct amd_nb *nb = cpuc->amd_nb; - struct perf_event *old = NULL; - int max = x86_pmu.num_counters; - int i, j, k = -1; + struct perf_event *old; + int idx, new = -1; - /* - * if not NB event or no NB, then no constraints - */ - if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) - return &unconstrained; + if (!c) + c = &unconstrained; + + if (cpuc->is_fake) + return c; /* * detect if already present, if so reuse @@ -276,48 +415,36 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) * because of successive calls to x86_schedule_events() from * hw_perf_group_sched_in() without hw_perf_enable() */ - for (i = 0; i < max; i++) { - /* - * keep track of first free slot - */ - if (k == -1 && !nb->owners[i]) - k = i; + for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) { + if (new == -1 || hwc->idx == idx) + /* assign free slot, prefer hwc->idx */ + old = cmpxchg(nb->owners + idx, NULL, event); + else if (nb->owners[idx] == event) + /* event already present */ + old = event; + else + continue; + + if (old && old != event) + continue; + + /* reassign to this slot */ + if (new != -1) + cmpxchg(nb->owners + new, event, NULL); + new = idx; /* already present, reuse */ - if (nb->owners[i] == event) - goto done; - } - /* - * not present, so grab a new slot - * starting either at: - */ - if (hwc->idx != -1) { - /* previous assignment */ - i = hwc->idx; - } else if (k != -1) { - /* start from free slot found */ - i = k; - } else { - /* - * event not found, no slot found in - * first pass, try again from the - * beginning - */ - i = 0; - } - j = i; - do { - old = cmpxchg(nb->owners+i, NULL, event); - if (!old) + if (old == event) break; - if (++i == max) - i = 0; - } while (i != j); -done: - if (!old) - return &nb->event_constraints[i]; - - return &emptyconstraint; + } + + if (new == -1) + return &emptyconstraint; + + if (amd_is_perfctr_nb_event(hwc)) + amd_nb_interrupt_hw_config(hwc); + + return &nb->event_constraints[new]; } static struct amd_nb *amd_alloc_nb(int cpu) @@ -364,7 +491,7 @@ static void amd_pmu_cpu_starting(int cpu) struct amd_nb *nb; int i, nb_id; - cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; + cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; if (boot_cpu_data.x86_max_cores < 2) return; @@ -407,6 +534,26 @@ static void amd_pmu_cpu_dead(int cpu) } } +static struct event_constraint * +amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + /* + * if not NB event or no NB, then no constraints + */ + if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))) + return &unconstrained; + + return __amd_get_nb_event_constraints(cpuc, event, + amd_nb_event_constraint); +} + +static void amd_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)) + __amd_put_nb_event_constraints(cpuc, event); +} + PMU_FORMAT_ATTR(event, "config:0-7,32-35"); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, "config:18" ); @@ -496,6 +643,9 @@ static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); +static struct event_constraint amd_NBPMC96 = EVENT_CONSTRAINT(0, 0x3C0, 0); +static struct event_constraint amd_NBPMC74 = EVENT_CONSTRAINT(0, 0xF0, 0); + static struct event_constraint * amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) { @@ -561,13 +711,21 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev return &amd_f15_PMC20; } case AMD_EVENT_NB: - /* not yet implemented */ - return &emptyconstraint; + return __amd_get_nb_event_constraints(cpuc, event, + amd_nb_event_constraint); default: return &emptyconstraint; } } +static ssize_t amd_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | + (config & AMD64_EVENTSEL_EVENT) >> 24; + + return x86_event_sysfs_show(page, config, event); +} + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = x86_pmu_handle_irq, @@ -579,6 +737,8 @@ static __initconst const struct x86_pmu amd_pmu = { .schedule_events = x86_schedule_events, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, + .addr_offset = amd_pmu_addr_offset, + .rdpmc_index = amd_pmu_rdpmc_index, .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), .num_counters = AMD64_NUM_COUNTERS, @@ -591,6 +751,7 @@ static __initconst const struct x86_pmu amd_pmu = { .put_event_constraints = amd_put_event_constraints, .format_attrs = amd_format_attr, + .events_sysfs_show = amd_event_sysfs_show, .cpu_prepare = amd_pmu_cpu_prepare, .cpu_starting = amd_pmu_cpu_starting, @@ -599,7 +760,7 @@ static __initconst const struct x86_pmu amd_pmu = { static int setup_event_constraints(void) { - if (boot_cpu_data.x86 >= 0x15) + if (boot_cpu_data.x86 == 0x15) x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; return 0; } @@ -629,6 +790,23 @@ static int setup_perfctr_core(void) return 0; } +static int setup_perfctr_nb(void) +{ + if (!cpu_has_perfctr_nb) + return -ENODEV; + + x86_pmu.num_counters += AMD64_NUM_COUNTERS_NB; + + if (cpu_has_perfctr_core) + amd_nb_event_constraint = &amd_NBPMC96; + else + amd_nb_event_constraint = &amd_NBPMC74; + + printk(KERN_INFO "perf: AMD northbridge performance counters detected\n"); + + return 0; +} + __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ @@ -639,6 +817,7 @@ __init int amd_pmu_init(void) setup_event_constraints(); setup_perfctr_core(); + setup_perfctr_nb(); /* Events are common for all AMDs */ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, @@ -669,7 +848,7 @@ void amd_pmu_disable_virt(void) * SVM is disabled the Guest-only bits still gets set and the counter * will not count anything. */ - cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; + cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; /* Reload all events */ x86_pmu_disable_all(); diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 6336bcbd061..5f0581e713c 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -528,7 +528,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) if (!test_bit(IBS_STARTED, pcpu->state)) { /* * Catch spurious interrupts after stopping IBS: After - * disabling IBS there could be still incomming NMIs + * disabling IBS there could be still incoming NMIs * with samples that even have the valid bit cleared. * Mark all this NMIs as handled. */ diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 324bb523d9d..529c8931fc0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -107,6 +107,27 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +static struct event_constraint intel_ivb_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */ + INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */ + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ + INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + EVENT_CONSTRAINT_END +}; + static struct extra_reg intel_westmere_extra_regs[] __read_mostly = { INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), @@ -1603,6 +1624,13 @@ static struct attribute *intel_arch_formats_attr[] = { NULL, }; +ssize_t intel_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); + + return x86_event_sysfs_show(page, config, event); +} + static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, @@ -1628,6 +1656,7 @@ static __initconst const struct x86_pmu core_pmu = { .event_constraints = intel_core_event_constraints, .guest_get_msrs = core_guest_get_msrs, .format_attrs = intel_arch_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, }; struct intel_shared_regs *allocate_shared_regs(int cpu) @@ -1766,6 +1795,7 @@ static __initconst const struct x86_pmu intel_pmu = { .pebs_aliases = intel_pebs_aliases_core2, .format_attrs = intel_arch3_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, @@ -2010,7 +2040,10 @@ __init int intel_pmu_init(void) break; case 28: /* Atom */ - case 54: /* Cedariew */ + case 38: /* Lincroft */ + case 39: /* Penwell */ + case 53: /* Cloverview */ + case 54: /* Cedarview */ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -2075,6 +2108,7 @@ __init int intel_pmu_init(void) pr_cont("SandyBridge events, "); break; case 58: /* IvyBridge */ + case 62: /* IvyBridge EP */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, @@ -2082,7 +2116,7 @@ __init int intel_pmu_init(void) intel_pmu_lbr_init_snb(); - x86_pmu.event_constraints = intel_snb_event_constraints; + x86_pmu.event_constraints = intel_ivb_event_constraints; x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; x86_pmu.pebs_aliases = intel_pebs_aliases_snb; x86_pmu.extra_regs = intel_snb_extra_regs; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 826054a4f2e..b05a575d56f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -729,3 +729,13 @@ void intel_ds_init(void) } } } + +void perf_restore_debug_store(void) +{ + struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); + + if (!x86_pmu.bts && !x86_pmu.pebs) + return; + + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds); +} diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 3cf3d97cce3..b43200dbfe7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2500,7 +2500,7 @@ static bool pcidrv_registered; /* * add a pci uncore device */ -static int __devinit uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) +static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) { struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; @@ -2571,8 +2571,8 @@ static void uncore_pci_remove(struct pci_dev *pdev) kfree(box); } -static int __devinit uncore_pci_probe(struct pci_dev *pdev, - const struct pci_device_id *id) +static int uncore_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) { struct intel_uncore_type *type; diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 7d0270bd793..4820c232a0b 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -19,7 +19,7 @@ static const u64 p6_perfmon_event_map[] = }; -static __initconst u64 p6_hw_cache_event_ids +static u64 p6_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -227,6 +227,8 @@ static __initconst const struct x86_pmu p6_pmu = { .event_constraints = p6_event_constraints, .format_attrs = intel_p6_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + }; __init int p6_pmu_init(void) diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index fbd89556229..e280253f6f9 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -26,14 +26,8 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, #ifdef CONFIG_X86_32 static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) { - /* - * We use exception 16 if we have hardware math and we've either seen - * it or the CPU claims it is internal - */ - int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu); seq_printf(m, "fdiv_bug\t: %s\n" - "hlt_bug\t\t: %s\n" "f00f_bug\t: %s\n" "coma_bug\t: %s\n" "fpu\t\t: %s\n" @@ -41,11 +35,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) "cpuid level\t: %d\n" "wp\t\t: %s\n", c->fdiv_bug ? "yes" : "no", - c->hlt_works_ok ? "no" : "yes", c->f00f_bug ? "yes" : "no", c->coma_bug ? "yes" : "no", c->hard_math ? "yes" : "no", - fpu_exception ? "yes" : "no", + c->hard_math ? "yes" : "no", c->cpuid_level, c->wp_works_ok ? "yes" : "no"); } diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index d22d0c4edcf..03a36321ec5 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -33,6 +33,9 @@ #define VMWARE_PORT_CMD_GETVERSION 10 #define VMWARE_PORT_CMD_GETHZ 45 +#define VMWARE_PORT_CMD_GETVCPU_INFO 68 +#define VMWARE_PORT_CMD_LEGACY_X2APIC 3 +#define VMWARE_PORT_CMD_VCPU_RESERVED 31 #define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ __asm__("inl (%%dx)" : \ @@ -125,10 +128,20 @@ static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); } +/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */ +static bool __init vmware_legacy_x2apic_available(void) +{ + uint32_t eax, ebx, ecx, edx; + VMWARE_PORT(GETVCPU_INFO, eax, ebx, ecx, edx); + return (eax & (1 << VMWARE_PORT_CMD_VCPU_RESERVED)) == 0 && + (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0; +} + const __refconst struct hypervisor_x86 x86_hyper_vmware = { .name = "VMware", .detect = vmware_platform, .set_cpu_features = vmware_set_cpu_features, .init_platform = vmware_platform_setup, + .x2apic_available = vmware_legacy_x2apic_available, }; EXPORT_SYMBOL(x86_hyper_vmware); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 60c78917190..1e4dbcfe6d3 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -85,7 +85,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, { char __user *tmp = buf; struct cpuid_regs cmd; - int cpu = iminor(file->f_path.dentry->d_inode); + int cpu = iminor(file_inode(file)); u64 pos = *ppos; ssize_t bytes = 0; int err = 0; @@ -116,7 +116,7 @@ static int cpuid_open(struct inode *inode, struct file *file) unsigned int cpu; struct cpuinfo_x86 *c; - cpu = iminor(file->f_path.dentry->d_inode); + cpu = iminor(file_inode(file)); if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; /* No such CPU */ diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 13ad89971d4..74467feb4dc 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -16,6 +16,7 @@ #include <linux/delay.h> #include <linux/elf.h> #include <linux/elfcore.h> +#include <linux/module.h> #include <asm/processor.h> #include <asm/hardirq.h> @@ -30,6 +31,27 @@ int in_crash_kexec; +/* + * This is used to VMCLEAR all VMCSs loaded on the + * processor. And when loading kvm_intel module, the + * callback function pointer will be assigned. + * + * protected by rcu. + */ +crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; +EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); + +static inline void cpu_crash_vmclear_loaded_vmcss(void) +{ + crash_vmclear_fn *do_vmclear_operation = NULL; + + rcu_read_lock(); + do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); + if (do_vmclear_operation) + do_vmclear_operation(); + rcu_read_unlock(); +} + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) @@ -46,6 +68,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) #endif crash_save_cpu(regs, cpu); + /* + * VMCLEAR VMCSs loaded on all cpus if needed. + */ + cpu_crash_vmclear_loaded_vmcss(); + /* Disable VMX or SVM if needed. * * We need to disable virtualization on all CPUs. @@ -88,6 +115,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs) kdump_nmi_shootdown_cpus(); + /* + * VMCLEAR VMCSs loaded on this cpu if needed. + */ + cpu_crash_vmclear_loaded_vmcss(); + /* Booting kdump kernel with VMX or SVM enabled won't work, * because (among other limitations) we can't disable paging * with the virt flags. diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ae42418bc50..c8797d55b24 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -232,7 +232,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) bust_spinlocks(0); die_owner = -1; - add_taint(TAINT_DIE); + add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); die_nest_count--; if (!die_nest_count) /* Nest count reaches zero, release the lock. */ diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index df06ade26be..d32abeabbda 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -835,7 +835,7 @@ static int __init parse_memopt(char *p) } early_param("mem", parse_memopt); -static int __init parse_memmap_opt(char *p) +static int __init parse_memmap_one(char *p) { char *oldp; u64 start_at, mem_size; @@ -877,6 +877,20 @@ static int __init parse_memmap_opt(char *p) return *p == '\0' ? 0 : -EINVAL; } +static int __init parse_memmap_opt(char *str) +{ + while (str) { + char *k = strchr(str, ','); + + if (k) + *k++ = 0; + + parse_memmap_one(str); + str = k; + } + + return 0; +} early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 88b725aa1d5..8f3e2dec1df 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -699,70 +699,6 @@ END(syscall_badsys) */ .popsection -/* - * System calls that need a pt_regs pointer. - */ -#define PTREGSCALL0(name) \ -ENTRY(ptregs_##name) ; \ - leal 4(%esp),%eax; \ - jmp sys_##name; \ -ENDPROC(ptregs_##name) - -#define PTREGSCALL1(name) \ -ENTRY(ptregs_##name) ; \ - leal 4(%esp),%edx; \ - movl (PT_EBX+4)(%esp),%eax; \ - jmp sys_##name; \ -ENDPROC(ptregs_##name) - -#define PTREGSCALL2(name) \ -ENTRY(ptregs_##name) ; \ - leal 4(%esp),%ecx; \ - movl (PT_ECX+4)(%esp),%edx; \ - movl (PT_EBX+4)(%esp),%eax; \ - jmp sys_##name; \ -ENDPROC(ptregs_##name) - -#define PTREGSCALL3(name) \ -ENTRY(ptregs_##name) ; \ - CFI_STARTPROC; \ - leal 4(%esp),%eax; \ - pushl_cfi %eax; \ - movl PT_EDX(%eax),%ecx; \ - movl PT_ECX(%eax),%edx; \ - movl PT_EBX(%eax),%eax; \ - call sys_##name; \ - addl $4,%esp; \ - CFI_ADJUST_CFA_OFFSET -4; \ - ret; \ - CFI_ENDPROC; \ -ENDPROC(ptregs_##name) - -PTREGSCALL1(iopl) -PTREGSCALL0(fork) -PTREGSCALL0(vfork) -PTREGSCALL2(sigaltstack) -PTREGSCALL0(sigreturn) -PTREGSCALL0(rt_sigreturn) -PTREGSCALL2(vm86) -PTREGSCALL1(vm86old) - -/* Clone is an oddball. The 4th arg is in %edi */ -ENTRY(ptregs_clone) - CFI_STARTPROC - leal 4(%esp),%eax - pushl_cfi %eax - pushl_cfi PT_EDI(%eax) - movl PT_EDX(%eax),%ecx - movl PT_ECX(%eax),%edx - movl PT_EBX(%eax),%eax - call sys_clone - addl $8,%esp - CFI_ADJUST_CFA_OFFSET -8 - ret - CFI_ENDPROC -ENDPROC(ptregs_clone) - .macro FIXUP_ESPFIX_STACK /* * Switch back for ESPFIX stack to the normal zerobased stack @@ -1084,7 +1020,6 @@ ENTRY(xen_failsafe_callback) lea 16(%esp),%esp CFI_ADJUST_CFA_OFFSET -16 jz 5f - addl $16,%esp jmp iret_exc 5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL @@ -1111,11 +1046,18 @@ ENTRY(xen_failsafe_callback) _ASM_EXTABLE(4b,9b) ENDPROC(xen_failsafe_callback) -BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, +BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, xen_evtchn_do_upcall) #endif /* CONFIG_XEN */ +#if IS_ENABLED(CONFIG_HYPERV) + +BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + hyperv_vector_handler) + +#endif /* CONFIG_HYPERV */ + #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b51b2c7ee51..c1d01e6ca79 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -56,7 +56,7 @@ #include <asm/ftrace.h> #include <asm/percpu.h> #include <asm/asm.h> -#include <asm/rcu.h> +#include <asm/context_tracking.h> #include <asm/smap.h> #include <linux/err.h> @@ -828,28 +828,38 @@ int_restore_rest: CFI_ENDPROC END(system_call) -/* - * Certain special system calls that need to save a complete full stack frame. - */ - .macro PTREGSCALL label,func,arg -ENTRY(\label) - PARTIAL_FRAME 1 8 /* offset 8: return address */ - subq $REST_SKIP, %rsp - CFI_ADJUST_CFA_OFFSET REST_SKIP - call save_rest + .macro FORK_LIKE func +ENTRY(stub_\func) + CFI_STARTPROC + popq %r11 /* save return address */ + PARTIAL_FRAME 0 + SAVE_REST + pushq %r11 /* put it back on stack */ + FIXUP_TOP_OF_STACK %r11, 8 DEFAULT_FRAME 0 8 /* offset 8: return address */ - leaq 8(%rsp), \arg /* pt_regs pointer */ + call sys_\func + RESTORE_TOP_OF_STACK %r11, 8 + ret $REST_SKIP /* pop extended registers */ + CFI_ENDPROC +END(stub_\func) + .endm + + .macro FIXED_FRAME label,func +ENTRY(\label) + CFI_STARTPROC + PARTIAL_FRAME 0 8 /* offset 8: return address */ + FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET call \func - jmp ptregscall_common + RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET + ret CFI_ENDPROC END(\label) .endm - PTREGSCALL stub_clone, sys_clone, %r8 - PTREGSCALL stub_fork, sys_fork, %rdi - PTREGSCALL stub_vfork, sys_vfork, %rdi - PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx - PTREGSCALL stub_iopl, sys_iopl, %rsi + FORK_LIKE clone + FORK_LIKE fork + FORK_LIKE vfork + FIXED_FRAME stub_iopl, sys_iopl ENTRY(ptregscall_common) DEFAULT_FRAME 1 8 /* offset 8: return address */ @@ -871,7 +881,6 @@ ENTRY(stub_execve) SAVE_REST FIXUP_TOP_OF_STACK %r11 call sys_execve - RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) RESTORE_REST jmp int_ret_from_sys_call @@ -887,7 +896,6 @@ ENTRY(stub_rt_sigreturn) addq $8, %rsp PARTIAL_FRAME 0 SAVE_REST - movq %rsp,%rdi FIXUP_TOP_OF_STACK %r11 call sys_rt_sigreturn movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer @@ -897,14 +905,11 @@ ENTRY(stub_rt_sigreturn) END(stub_rt_sigreturn) #ifdef CONFIG_X86_X32_ABI - PTREGSCALL stub_x32_sigaltstack, sys32_sigaltstack, %rdx - ENTRY(stub_x32_rt_sigreturn) CFI_STARTPROC addq $8, %rsp PARTIAL_FRAME 0 SAVE_REST - movq %rsp,%rdi FIXUP_TOP_OF_STACK %r11 call sys32_x32_rt_sigreturn movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer @@ -995,8 +1000,8 @@ END(interrupt) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: - ASM_CLAC XCPT_FRAME + ASM_CLAC addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ /* 0(%rsp): old_rsp-ARGOFFSET */ @@ -1135,8 +1140,8 @@ END(common_interrupt) */ .macro apicinterrupt num sym do_sym ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC pushq_cfi $~(\num) .Lcommon_\sym: interrupt \do_sym @@ -1190,8 +1195,8 @@ apicinterrupt IRQ_WORK_VECTOR \ */ .macro zeroentry sym do_sym ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp @@ -1208,8 +1213,8 @@ END(\sym) .macro paranoidzeroentry sym do_sym ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp @@ -1227,8 +1232,8 @@ END(\sym) #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) .macro paranoidzeroentry_ist sym do_sym ist ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp @@ -1247,8 +1252,8 @@ END(\sym) .macro errorentry sym do_sym ENTRY(\sym) - ASM_CLAC XCPT_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1266,8 +1271,8 @@ END(\sym) /* error code is on the stack already */ .macro paranoiderrorentry sym do_sym ENTRY(\sym) - ASM_CLAC XCPT_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1441,11 +1446,16 @@ ENTRY(xen_failsafe_callback) CFI_ENDPROC END(xen_failsafe_callback) -apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ +apicinterrupt HYPERVISOR_CALLBACK_VECTOR \ xen_hvm_callback_vector xen_evtchn_do_upcall #endif /* CONFIG_XEN */ +#if IS_ENABLED(CONFIG_HYPERV) +apicinterrupt HYPERVISOR_CALLBACK_VECTOR \ + hyperv_callback_vector hyperv_vector_handler +#endif /* CONFIG_HYPERV */ + /* * Some functions should be protected against kprobes */ @@ -1699,9 +1709,10 @@ nested_nmi: 1: /* Set up the interrupted NMIs stack to jump to repeat_nmi */ - leaq -6*8(%rsp), %rdx + leaq -1*8(%rsp), %rdx movq %rdx, %rsp - CFI_ADJUST_CFA_OFFSET 6*8 + CFI_ADJUST_CFA_OFFSET 1*8 + leaq -10*8(%rsp), %rdx pushq_cfi $__KERNEL_DS pushq_cfi %rdx pushfq_cfi @@ -1709,8 +1720,8 @@ nested_nmi: pushq_cfi $repeat_nmi /* Put stack back */ - addq $(11*8), %rsp - CFI_ADJUST_CFA_OFFSET -11*8 + addq $(6*8), %rsp + CFI_ADJUST_CFA_OFFSET -6*8 nested_nmi_out: popq_cfi %rdx @@ -1736,18 +1747,18 @@ first_nmi: * +-------------------------+ * | NMI executing variable | * +-------------------------+ - * | Saved SS | - * | Saved Return RSP | - * | Saved RFLAGS | - * | Saved CS | - * | Saved RIP | - * +-------------------------+ * | copied SS | * | copied Return RSP | * | copied RFLAGS | * | copied CS | * | copied RIP | * +-------------------------+ + * | Saved SS | + * | Saved Return RSP | + * | Saved RFLAGS | + * | Saved CS | + * | Saved RIP | + * +-------------------------+ * | pt_regs | * +-------------------------+ * @@ -1763,9 +1774,15 @@ first_nmi: /* Set the NMI executing variable on the stack. */ pushq_cfi $1 + /* + * Leave room for the "copied" frame + */ + subq $(5*8), %rsp + CFI_ADJUST_CFA_OFFSET 5*8 + /* Copy the stack frame to the Saved frame */ .rept 5 - pushq_cfi 6*8(%rsp) + pushq_cfi 11*8(%rsp) .endr CFI_DEF_CFA_OFFSET SS+8-RIP @@ -1786,12 +1803,15 @@ repeat_nmi: * is benign for the non-repeat case, where 1 was pushed just above * to this very stack slot). */ - movq $1, 5*8(%rsp) + movq $1, 10*8(%rsp) /* Make another copy, this one may be modified by nested NMIs */ + addq $(10*8), %rsp + CFI_ADJUST_CFA_OFFSET -10*8 .rept 5 - pushq_cfi 4*8(%rsp) + pushq_cfi -6*8(%rsp) .endr + subq $(5*8), %rsp CFI_DEF_CFA_OFFSET SS+8-RIP end_repeat_nmi: @@ -1841,9 +1861,11 @@ end_repeat_nmi: nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: - RESTORE_ALL 8 + /* Pop the extra iret frame at once */ + RESTORE_ALL 6*8 + /* Clear the NMI executing stack variable */ - movq $0, 10*8(%rsp) + movq $0, 5*8(%rsp) jmp irq_return CFI_ENDPROC END(nmi) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1d414029f1d..42a392a9fd0 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -89,7 +89,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code) * kernel identity mapping to modify code. */ if (within(ip, (unsigned long)_text, (unsigned long)_etext)) - ip = (unsigned long)__va(__pa(ip)); + ip = (unsigned long)__va(__pa_symbol(ip)); return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); } @@ -279,7 +279,7 @@ static int ftrace_write(unsigned long ip, const char *val, int size) * kernel identity mapping to modify code. */ if (within(ip, (unsigned long)_text, (unsigned long)_etext)) - ip = (unsigned long)__va(__pa(ip)); + ip = (unsigned long)__va(__pa_symbol(ip)); return probe_kernel_write((void *)ip, val, size); } diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 48d9d4ea102..992f442ca15 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -5,8 +5,6 @@ #include <asm/setup.h> #include <asm/bios_ebda.h> -#define BIOS_LOWMEM_KILOBYTES 0x413 - /* * The BIOS places the EBDA/XBDA at the top of conventional * memory, and usually decreases the reported amount of @@ -16,17 +14,30 @@ * chipset: reserve a page before VGA to prevent PCI prefetch * into it (errata #56). Usually the page is reserved anyways, * unless you have no PS/2 mouse plugged in. + * + * This functions is deliberately very conservative. Losing + * memory in the bottom megabyte is rarely a problem, as long + * as we have enough memory to install the trampoline. Using + * memory that is in use by the BIOS or by some DMA device + * the BIOS didn't shut down *is* a big problem. */ + +#define BIOS_LOWMEM_KILOBYTES 0x413 +#define LOWMEM_CAP 0x9f000U /* Absolute maximum */ +#define INSANE_CUTOFF 0x20000U /* Less than this = insane */ + void __init reserve_ebda_region(void) { unsigned int lowmem, ebda_addr; - /* To determine the position of the EBDA and the */ - /* end of conventional memory, we need to look at */ - /* the BIOS data area. In a paravirtual environment */ - /* that area is absent. We'll just have to assume */ - /* that the paravirt case can handle memory setup */ - /* correctly, without our help. */ + /* + * To determine the position of the EBDA and the + * end of conventional memory, we need to look at + * the BIOS data area. In a paravirtual environment + * that area is absent. We'll just have to assume + * that the paravirt case can handle memory setup + * correctly, without our help. + */ if (paravirt_enabled()) return; @@ -37,19 +48,23 @@ void __init reserve_ebda_region(void) /* start of EBDA area */ ebda_addr = get_bios_ebda(); - /* Fixup: bios puts an EBDA in the top 64K segment */ - /* of conventional memory, but does not adjust lowmem. */ - if ((lowmem - ebda_addr) <= 0x10000) - lowmem = ebda_addr; + /* + * Note: some old Dells seem to need 4k EBDA without + * reporting so, so just consider the memory above 0x9f000 + * to be off limits (bugzilla 2990). + */ + + /* If the EBDA address is below 128K, assume it is bogus */ + if (ebda_addr < INSANE_CUTOFF) + ebda_addr = LOWMEM_CAP; - /* Fixup: bios does not report an EBDA at all. */ - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ - if ((ebda_addr == 0) && (lowmem >= 0x9f000)) - lowmem = 0x9f000; + /* If lowmem is less than 128K, assume it is bogus */ + if (lowmem < INSANE_CUTOFF) + lowmem = LOWMEM_CAP; - /* Paranoia: should never happen, but... */ - if ((lowmem == 0) || (lowmem >= 0x100000)) - lowmem = 0x9f000; + /* Use the lower of the lowmem and EBDA markers as the cutoff */ + lowmem = min(lowmem, ebda_addr); + lowmem = min(lowmem, LOWMEM_CAP); /* Absolute cap */ /* reserve all memory between lowmem and the 1MB mark */ memblock_reserve(lowmem, 0x100000 - lowmem); diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index c18f59d1010..138463a2487 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -18,6 +18,7 @@ #include <asm/io_apic.h> #include <asm/bios_ebda.h> #include <asm/tlbflush.h> +#include <asm/bootparam_utils.h> static void __init i386_default_early_setup(void) { @@ -30,19 +31,7 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { - memblock_reserve(__pa_symbol(&_text), - __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); - -#ifdef CONFIG_BLK_DEV_INITRD - /* Reserve INITRD */ - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - /* Assume only end is not page aligned */ - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); - } -#endif + sanitize_boot_params(&boot_params); /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { @@ -57,11 +46,5 @@ void __init i386_start_kernel(void) break; } - /* - * At this point everything still needed from the boot loader - * or BIOS or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - start_kernel(); } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 037df57a99a..c5e403f6d86 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -25,12 +25,84 @@ #include <asm/kdebug.h> #include <asm/e820.h> #include <asm/bios_ebda.h> +#include <asm/bootparam_utils.h> +#include <asm/microcode.h> -static void __init zap_identity_mappings(void) +/* + * Manage page tables very early on. + */ +extern pgd_t early_level4_pgt[PTRS_PER_PGD]; +extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; +static unsigned int __initdata next_early_pgt = 2; + +/* Wipe all early page tables except for the kernel symbol map */ +static void __init reset_early_page_tables(void) { - pgd_t *pgd = pgd_offset_k(0UL); - pgd_clear(pgd); - __flush_tlb_all(); + unsigned long i; + + for (i = 0; i < PTRS_PER_PGD-1; i++) + early_level4_pgt[i].pgd = 0; + + next_early_pgt = 0; + + write_cr3(__pa(early_level4_pgt)); +} + +/* Create a new PMD entry */ +int __init early_make_pgtable(unsigned long address) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + unsigned long i; + pgdval_t pgd, *pgd_p; + pudval_t pud, *pud_p; + pmdval_t pmd, *pmd_p; + + /* Invalid address or early pgt is done ? */ + if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt)) + return -1; + +again: + pgd_p = &early_level4_pgt[pgd_index(address)].pgd; + pgd = *pgd_p; + + /* + * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is + * critical -- __PAGE_OFFSET would point us back into the dynamic + * range and we might end up looping forever... + */ + if (pgd) + pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; + for (i = 0; i < PTRS_PER_PUD; i++) + pud_p[i] = 0; + *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + pud_p += pud_index(address); + pud = *pud_p; + + if (pud) + pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; + for (i = 0; i < PTRS_PER_PMD; i++) + pmd_p[i] = 0; + *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + pmd = (physaddr & PMD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL); + pmd_p[pmd_index(address)] = pmd; + + return 0; } /* Don't add a printk in there. printk relies on the PDA which is not initialized @@ -41,13 +113,25 @@ static void __init clear_bss(void) (unsigned long) __bss_stop - (unsigned long) __bss_start); } +static unsigned long get_cmd_line_ptr(void) +{ + unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + + cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32; + + return cmd_line_ptr; +} + static void __init copy_bootdata(char *real_mode_data) { char * command_line; + unsigned long cmd_line_ptr; memcpy(&boot_params, real_mode_data, sizeof boot_params); - if (boot_params.hdr.cmd_line_ptr) { - command_line = __va(boot_params.hdr.cmd_line_ptr); + sanitize_boot_params(&boot_params); + cmd_line_ptr = get_cmd_line_ptr(); + if (cmd_line_ptr) { + command_line = __va(cmd_line_ptr); memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } } @@ -70,54 +154,40 @@ void __init x86_64_start_kernel(char * real_mode_data) (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + /* Kill off the identity-map trampoline */ + reset_early_page_tables(); + /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); - /* Make NULL pointers segfault */ - zap_identity_mappings(); - - max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; - - for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { -#ifdef CONFIG_EARLY_PRINTK + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) set_intr_gate(i, &early_idt_handlers[i]); -#else - set_intr_gate(i, early_idt_handler); -#endif - } load_idt((const struct desc_ptr *)&idt_descr); + copy_bootdata(__va(real_mode_data)); + + /* + * Load microcode early on BSP. + */ + load_ucode_bsp(); + if (console_loglevel == 10) early_printk("Kernel alive\n"); + clear_page(init_level4_pgt); + /* set init_level4_pgt kernel high mapping*/ + init_level4_pgt[511] = early_level4_pgt[511]; + x86_64_start_reservations(real_mode_data); } void __init x86_64_start_reservations(char *real_mode_data) { - copy_bootdata(__va(real_mode_data)); - - memblock_reserve(__pa_symbol(&_text), - __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); - -#ifdef CONFIG_BLK_DEV_INITRD - /* Reserve INITRD */ - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - /* Assume only end is not page aligned */ - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); - } -#endif + /* version is always not zero if it is copied */ + if (!boot_params.hdr.version) + copy_bootdata(__va(real_mode_data)); reserve_ebda_region(); - /* - * At this point everything still needed from the boot loader - * or BIOS or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - start_kernel(); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 957a47aec64..73afd11799c 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -144,6 +144,11 @@ ENTRY(startup_32) movl %eax, pa(olpc_ofw_pgd) #endif +#ifdef CONFIG_MICROCODE_EARLY + /* Early load ucode on BSP. */ + call load_ucode_bsp +#endif + /* * Initialize page tables. This creates a PDE and a set of page * tables, which are located immediately beyond __brk_base. The variable @@ -266,6 +271,19 @@ num_subarch_entries = (. - subarch_entries) / 4 jmp default_entry #endif /* CONFIG_PARAVIRT */ +#ifdef CONFIG_HOTPLUG_CPU +/* + * Boot CPU0 entry point. It's called from play_dead(). Everything has been set + * up already except stack. We just set up stack here. Then call + * start_secondary(). + */ +ENTRY(start_cpu0) + movl stack_start, %ecx + movl %ecx, %esp + jmp *(initial_code) +ENDPROC(start_cpu0) +#endif + /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but @@ -286,33 +304,59 @@ ENTRY(startup_32_smp) movl %eax,%ss leal -__PAGE_OFFSET(%ecx),%esp +#ifdef CONFIG_MICROCODE_EARLY + /* Early load ucode on AP. */ + call load_ucode_ap +#endif + + default_entry: +#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ + X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ + X86_CR0_PG) + movl $(CR0_STATE & ~X86_CR0_PG),%eax + movl %eax,%cr0 + +/* + * We want to start out with EFLAGS unambiguously cleared. Some BIOSes leave + * bits like NT set. This would confuse the debugger if this code is traced. So + * initialize them properly now before switching to protected mode. That means + * DF in particular (even though we have cleared it earlier after copying the + * command line) because GCC expects it. + */ + pushl $0 + popfl + /* - * New page tables may be in 4Mbyte page mode and may - * be using the global pages. + * New page tables may be in 4Mbyte page mode and may be using the global pages. * - * NOTE! If we are on a 486 we may have no cr4 at all! - * Specifically, cr4 exists if and only if CPUID exists, - * which in turn exists if and only if EFLAGS.ID exists. + * NOTE! If we are on a 486 we may have no cr4 at all! Specifically, cr4 exists + * if and only if CPUID exists and has flags other than the FPU flag set. */ + movl $-1,pa(X86_CPUID) # preset CPUID level movl $X86_EFLAGS_ID,%ecx pushl %ecx - popfl + popfl # set EFLAGS=ID pushfl - popl %eax - pushl $0 - popfl - pushfl - popl %edx - xorl %edx,%eax - testl %ecx,%eax - jz 6f # No ID flag = no CPUID = no CR4 + popl %eax # get EFLAGS + testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remained set? + jz enable_paging # hw disallowed setting of ID bit + # which means no CPUID and no CR4 + + xorl %eax,%eax + cpuid + movl %eax,pa(X86_CPUID) # save largest std CPUID function + + movl $1,%eax + cpuid + andl $~1,%edx # Ignore CPUID.FPU + jz enable_paging # No flags or only CPUID.FPU = no CR4 movl pa(mmu_cr4_features),%eax movl %eax,%cr4 testb $X86_CR4_PAE, %al # check if PAE is enabled - jz 6f + jz enable_paging /* Check if extended functions are implemented */ movl $0x80000000, %eax @@ -320,7 +364,7 @@ default_entry: /* Value must be in the range 0x80000001 to 0x8000ffff */ subl $0x80000001, %eax cmpl $(0x8000ffff-0x80000001), %eax - ja 6f + ja enable_paging /* Clear bogus XD_DISABLE bits */ call verify_cpu @@ -329,7 +373,7 @@ default_entry: cpuid /* Execute Disable bit supported? */ btl $(X86_FEATURE_NX & 31), %edx - jnc 6f + jnc enable_paging /* Setup EFER (Extended Feature Enable Register) */ movl $MSR_EFER, %ecx @@ -339,15 +383,14 @@ default_entry: /* Make changes effective */ wrmsr -6: +enable_paging: /* * Enable paging */ movl $pa(initial_page_table), %eax movl %eax,%cr3 /* set the page table pointer.. */ - movl %cr0,%eax - orl $X86_CR0_PG,%eax + movl $CR0_STATE,%eax movl %eax,%cr0 /* ..and set paging (PG) bit */ ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ 1: @@ -355,14 +398,6 @@ default_entry: addl $__PAGE_OFFSET, %esp /* - * Initialize eflags. Some BIOS's leave bits like NT set. This would - * confuse the debugger if this code is traced. - * XXX - best to initialize before switching to protected mode. - */ - pushl $0 - popfl - -/* * start system 32-bit setup. We need to re-do some of the things done * in 16-bit mode for the "real" operations. */ @@ -371,31 +406,11 @@ default_entry: jz 1f # Did we do this already? call *%eax 1: - -/* check if it is 486 or 386. */ + /* - * XXX - this does a lot of unnecessary setup. Alignment checks don't - * apply at our cpl of 0 and the stack ought to be aligned already, and - * we don't need to preserve eflags. + * Check if it is 486 */ - movl $-1,X86_CPUID # -1 for no CPUID initially - movb $3,X86 # at least 386 - pushfl # push EFLAGS - popl %eax # get EFLAGS - movl %eax,%ecx # save original EFLAGS - xorl $0x240000,%eax # flip AC and ID bits in EFLAGS - pushl %eax # copy to EFLAGS - popfl # set EFLAGS - pushfl # get new EFLAGS - popl %eax # put it in eax - xorl %ecx,%eax # change in flags - pushl %ecx # restore original EFLAGS - popfl - testl $0x40000,%eax # check if AC bit changed - je is386 - - movb $4,X86 # at least 486 - testl $0x200000,%eax # check if ID bit changed + cmpl $-1,X86_CPUID je is486 /* get vendor info */ @@ -421,11 +436,10 @@ default_entry: movb %cl,X86_MASK movl %edx,X86_CAPABILITY -is486: movl $0x50022,%ecx # set AM, WP, NE and MP - jmp 2f - -is386: movl $2,%ecx # set MP -2: movl %cr0,%eax +is486: + movb $4,X86 + movl $0x50022,%ecx # set AM, WP, NE and MP + movl %cr0,%eax andl $0x80000011,%eax # Save PG,PE,ET orl %ecx,%eax movl %eax,%cr0 @@ -450,7 +464,6 @@ is386: movl $2,%ecx # set MP xorl %eax,%eax # Clear LDT lldt %ax - cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder jmp *(initial_code) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 94bf9cc2c7e..6859e962644 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .code64 .globl startup_64 startup_64: - /* - * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded an identity mapped page table * for us. These identity mapped page tables map all of the * kernel pages and possibly all of memory. * - * %esi holds a physical pointer to real_mode_data. + * %rsi holds a physical pointer to real_mode_data. * * We come here either directly from a 64bit bootloader, or from * arch/x86_64/boot/compressed/head.S. @@ -66,7 +65,8 @@ startup_64: * tables and then reload them. */ - /* Compute the delta between the address I am compiled to run at and the + /* + * Compute the delta between the address I am compiled to run at and the * address I am actually running at. */ leaq _text(%rip), %rbp @@ -78,45 +78,62 @@ startup_64: testl %eax, %eax jnz bad_address - /* Is the address too large? */ - leaq _text(%rip), %rdx - movq $PGDIR_SIZE, %rax - cmpq %rax, %rdx - jae bad_address - - /* Fixup the physical addresses in the page table + /* + * Is the address too large? */ - addq %rbp, init_level4_pgt + 0(%rip) - addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip) - addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip) + leaq _text(%rip), %rax + shrq $MAX_PHYSMEM_BITS, %rax + jnz bad_address - addq %rbp, level3_ident_pgt + 0(%rip) + /* + * Fixup the physical addresses in the page table + */ + addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip) addq %rbp, level3_kernel_pgt + (510*8)(%rip) addq %rbp, level3_kernel_pgt + (511*8)(%rip) addq %rbp, level2_fixmap_pgt + (506*8)(%rip) - /* Add an Identity mapping if I am above 1G */ + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. + */ leaq _text(%rip), %rdi - andq $PMD_PAGE_MASK, %rdi + leaq early_level4_pgt(%rip), %rbx movq %rdi, %rax - shrq $PUD_SHIFT, %rax - andq $(PTRS_PER_PUD - 1), %rax - jz ident_complete + shrq $PGDIR_SHIFT, %rax - leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx - leaq level3_ident_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) + leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx + movq %rdx, 0(%rbx,%rax,8) + movq %rdx, 8(%rbx,%rax,8) + addq $4096, %rdx movq %rdi, %rax - shrq $PMD_SHIFT, %rax - andq $(PTRS_PER_PMD - 1), %rax - leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx - leaq level2_spare_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) -ident_complete: + shrq $PUD_SHIFT, %rax + andl $(PTRS_PER_PUD-1), %eax + movq %rdx, (4096+0)(%rbx,%rax,8) + movq %rdx, (4096+8)(%rbx,%rax,8) + + addq $8192, %rbx + movq %rdi, %rax + shrq $PMD_SHIFT, %rdi + addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax + leaq (_end - 1)(%rip), %rcx + shrq $PMD_SHIFT, %rcx + subq %rdi, %rcx + incl %ecx + +1: + andq $(PTRS_PER_PMD - 1), %rdi + movq %rax, (%rbx,%rdi,8) + incq %rdi + addq $PMD_SIZE, %rax + decl %ecx + jnz 1b /* * Fixup the kernel text+data virtual addresses. Note that @@ -124,7 +141,6 @@ ident_complete: * cleanup_highmap() fixes this up along with the mappings * beyond _end. */ - leaq level2_kernel_pgt(%rip), %rdi leaq 4096(%rdi), %r8 /* See if it is a valid page table entry */ @@ -139,17 +155,14 @@ ident_complete: /* Fixup phys_base */ addq %rbp, phys_base(%rip) - /* Due to ENTRY(), sometimes the empty space gets filled with - * zeros. Better take a jmp than relying on empty space being - * filled with 0x90 (nop) - */ - jmp secondary_startup_64 + movq $(early_level4_pgt - __START_KERNEL_map), %rax + jmp 1f ENTRY(secondary_startup_64) /* - * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded a mapped page table. * - * %esi holds a physical pointer to real_mode_data. + * %rsi holds a physical pointer to real_mode_data. * * We come here either from startup_64 (using physical addresses) * or from trampoline.S (using virtual addresses). @@ -159,12 +172,14 @@ ENTRY(secondary_startup_64) * after the boot processor executes this code. */ + movq $(init_level4_pgt - __START_KERNEL_map), %rax +1: + /* Enable PAE mode and PGE */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %eax - movq %rax, %cr4 + movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + movq %rcx, %cr4 /* Setup early boot stage 4 level pagetables. */ - movq $(init_level4_pgt - __START_KERNEL_map), %rax addq phys_base(%rip), %rax movq %rax, %cr3 @@ -196,7 +211,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr0 /* Setup a boot time stack */ - movq stack_start(%rip),%rsp + movq stack_start(%rip), %rsp /* zero EFLAGS after setting rsp */ pushq $0 @@ -236,15 +251,33 @@ ENTRY(secondary_startup_64) movl initial_gs+4(%rip),%edx wrmsr - /* esi is pointer to real mode structure with interesting info. + /* rsi is pointer to real mode structure with interesting info. pass it to C */ - movl %esi, %edi + movq %rsi, %rdi /* Finally jump to run C code and to be on real kernel address * Since we are running on identity-mapped space we have to jump * to the full 64bit address, this is only possible as indirect * jump. In addition we need to ensure %cs is set so we make this * a far return. + * + * Note: do not change to far jump indirect with 64bit offset. + * + * AMD does not support far jump indirect with 64bit offset. + * AMD64 Architecture Programmer's Manual, Volume 3: states only + * JMP FAR mem16:16 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * JMP FAR mem16:32 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * + * Intel64 does support 64bit offset. + * Software Developer Manual Vol 2: states: + * FF /5 JMP m16:16 Jump far, absolute indirect, + * address given in m16:16 + * FF /5 JMP m16:32 Jump far, absolute indirect, + * address given in m16:32. + * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, + * address given in m16:64. */ movq initial_code(%rip),%rax pushq $0 # fake return address to stop unwinder @@ -252,15 +285,31 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq +#ifdef CONFIG_HOTPLUG_CPU +/* + * Boot CPU0 entry point. It's called from play_dead(). Everything has been set + * up already except stack. We just set up stack here. Then call + * start_secondary(). + */ +ENTRY(start_cpu0) + movq stack_start(%rip),%rsp + movq initial_code(%rip),%rax + pushq $0 # fake return address to stop unwinder + pushq $__KERNEL_CS # set correct cs + pushq %rax # target address in negative space + lretq +ENDPROC(start_cpu0) +#endif + /* SMP bootup changes these two */ __REFDATA - .align 8 - ENTRY(initial_code) + .balign 8 + GLOBAL(initial_code) .quad x86_64_start_kernel - ENTRY(initial_gs) + GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - ENTRY(stack_start) + GLOBAL(stack_start) .quad init_thread_union+THREAD_SIZE-8 .word 0 __FINITDATA @@ -268,7 +317,7 @@ ENTRY(secondary_startup_64) bad_address: jmp bad_address - .section ".init.text","ax" + __INIT .globl early_idt_handlers early_idt_handlers: # 104(%rsp) %rflags @@ -287,6 +336,7 @@ early_idt_handlers: i = i + 1 .endr +/* This is global to keep gas from relaxing the jumps */ ENTRY(early_idt_handler) cld @@ -305,14 +355,22 @@ ENTRY(early_idt_handler) pushq %r11 # 0(%rsp) cmpl $__KERNEL_CS,96(%rsp) - jne 10f + jne 11f + + cmpl $14,72(%rsp) # Page fault? + jnz 10f + GET_CR2_INTO(%rdi) # can clobber any volatile register if pv + call early_make_pgtable + andl %eax,%eax + jz 20f # All good +10: leaq 88(%rsp),%rdi # Pointer to %rip call early_fixup_exception andl %eax,%eax jnz 20f # Found an exception entry -10: +11: #ifdef CONFIG_EARLY_PRINTK GET_CR2_INTO(%r9) # can clobber any volatile register if pv movl 80(%rsp),%r8d # error code @@ -334,7 +392,7 @@ ENTRY(early_idt_handler) 1: hlt jmp 1b -20: # Exception table entry found +20: # Exception table entry found or page table generated popq %r11 popq %r10 popq %r9 @@ -347,6 +405,9 @@ ENTRY(early_idt_handler) addq $16,%rsp # drop vector number and error code decl early_recursion_flag(%rip) INTERRUPT_RETURN +ENDPROC(early_idt_handler) + + __INITDATA .balign 4 early_recursion_flag: @@ -358,11 +419,10 @@ early_idt_msg: early_idt_ripmsg: .asciz "RIP %s\n" #endif /* CONFIG_EARLY_PRINTK */ - .previous #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ -ENTRY(name) +GLOBAL(name) /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ @@ -372,24 +432,37 @@ ENTRY(name) i = i + 1 ; \ .endr + __INITDATA +NEXT_PAGE(early_level4_pgt) + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + +NEXT_PAGE(early_dynamic_pgts) + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 + .data - /* - * This default setting generates an ident mapping at address 0x100000 - * and a mapping for the kernel that precisely maps virtual address - * 0xffffffff80000000 to physical address 0x000000. (always using - * 2Mbyte large pages provided by PAE mode) - */ + +#ifndef CONFIG_XEN NEXT_PAGE(init_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_START_KERNEL*8, 0 + .fill 512,8,0 +#else +NEXT_PAGE(init_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 511,8,0 + .fill 511, 8, 0 +NEXT_PAGE(level2_ident_pgt) + /* Since I easily can, map the first 1G. + * Don't set NX because code runs from these pages. + */ + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +#endif NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 @@ -397,21 +470,6 @@ NEXT_PAGE(level3_kernel_pgt) .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE -NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 - -NEXT_PAGE(level1_fixmap_pgt) - .fill 512,8,0 - -NEXT_PAGE(level2_ident_pgt) - /* Since I easily can, map the first 1G. - * Don't set NX because code runs from these pages. - */ - PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) - NEXT_PAGE(level2_kernel_pgt) /* * 512 MB kernel mapping. We spend a full page on this pagetable @@ -426,11 +484,16 @@ NEXT_PAGE(level2_kernel_pgt) PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) -NEXT_PAGE(level2_spare_pgt) - .fill 512, 8, 0 +NEXT_PAGE(level2_fixmap_pgt) + .fill 506,8,0 + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ + .fill 5,8,0 + +NEXT_PAGE(level1_fixmap_pgt) + .fill 512,8,0 #undef PMDS -#undef NEXT_PAGE .data .align 16 @@ -456,6 +519,5 @@ ENTRY(nmi_idt_table) .skip IDT_ENTRIES * 16 __PAGE_ALIGNED_BSS - .align PAGE_SIZE -ENTRY(empty_zero_page) +NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 1460a5df92f..da85a8e830a 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -434,7 +434,7 @@ void hpet_msi_unmask(struct irq_data *data) /* unmask it */ cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); - cfg |= HPET_TN_FSB; + cfg |= HPET_TN_ENABLE | HPET_TN_FSB; hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); } @@ -445,7 +445,7 @@ void hpet_msi_mask(struct irq_data *data) /* mask it */ cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); - cfg &= ~HPET_TN_FSB; + cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB); hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); } @@ -478,7 +478,7 @@ static int hpet_msi_next_event(unsigned long delta, static int hpet_setup_msi_irq(unsigned int irq) { - if (arch_setup_hpet_msi(irq, hpet_blockid)) { + if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) { destroy_irq(irq); return -EINVAL; } diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 9c3bd4a2050..0fa69127209 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -26,6 +26,7 @@ EXPORT_SYMBOL(csum_partial_copy_generic); EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); EXPORT_SYMBOL(__get_user_4); +EXPORT_SYMBOL(__get_user_8); EXPORT_SYMBOL(__put_user_1); EXPORT_SYMBOL(__put_user_2); diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 675a0501244..245a71db401 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -175,7 +175,11 @@ void __cpuinit fpu_init(void) cr0 |= X86_CR0_EM; write_cr0(cr0); - if (!smp_processor_id()) + /* + * init_thread_xstate is only called once to avoid overriding + * xstate_size during boot time or during CPU hotplug. + */ + if (xstate_size == 0) init_thread_xstate(); mxcsr_feature_mask_init(); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 8c968974253..4ddaf66ea35 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -93,8 +93,9 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * on system-call entry - see also fork() and the signal handling * code. */ -long sys_iopl(unsigned int level, struct pt_regs *regs) +SYSCALL_DEFINE1(iopl, unsigned int, level) { + struct pt_regs *regs = current_pt_regs(); unsigned int old = (regs->flags >> 12) & 3; struct thread_struct *t = ¤t->thread; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 6e03b0d6913..7dc4e459c2b 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -42,39 +42,6 @@ * (these are usually mapped into the 0x30-0xff vector range) */ -#ifdef CONFIG_X86_32 -/* - * Note that on a 486, we don't want to do a SIGFPE on an irq13 - * as the irq is unreliable, and exception 16 works correctly - * (ie as explained in the intel literature). On a 386, you - * can't use exception 16 due to bad IBM design, so we have to - * rely on the less exact irq13. - * - * Careful.. Not only is IRQ13 unreliable, but it is also - * leads to races. IBM designers who came up with it should - * be shot. - */ - -static irqreturn_t math_error_irq(int cpl, void *dev_id) -{ - outb(0, 0xF0); - if (ignore_fpu_irq || !boot_cpu_data.hard_math) - return IRQ_NONE; - math_error(get_irq_regs(), 0, X86_TRAP_MF); - return IRQ_HANDLED; -} - -/* - * New motherboards sometimes make IRQ 13 be a PCI interrupt, - * so allow interrupt sharing. - */ -static struct irqaction fpu_irq = { - .handler = math_error_irq, - .name = "fpu", - .flags = IRQF_NO_THREAD, -}; -#endif - /* * IRQ2 is cascade interrupt to second interrupt controller */ @@ -242,13 +209,6 @@ void __init native_init_IRQ(void) setup_irq(2, &irq2); #ifdef CONFIG_X86_32 - /* - * External FPU? Set up irq13 if so, for - * original braindamaged IBM FERR coupling. - */ - if (boot_cpu_data.hard_math && !cpu_has_fpu) - setup_irq(FPU_IRQ, &fpu_irq); - irq_ctx_init(smp_processor_id()); #endif } diff --git a/arch/x86/kernel/kprobes/Makefile b/arch/x86/kernel/kprobes/Makefile new file mode 100644 index 00000000000..0d33169cc1a --- /dev/null +++ b/arch/x86/kernel/kprobes/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for kernel probes +# + +obj-$(CONFIG_KPROBES) += core.o +obj-$(CONFIG_OPTPROBES) += opt.o +obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes/common.h index 3230b68ef29..2e9d4b5af03 100644 --- a/arch/x86/kernel/kprobes-common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -99,4 +99,15 @@ static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsig return addr; } #endif + +#ifdef CONFIG_KPROBES_ON_FTRACE +extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb); +#else +static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + return 0; +} +#endif #endif diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes/core.c index 57916c0d3cf..3f06e614998 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes/core.c @@ -58,7 +58,7 @@ #include <asm/insn.h> #include <asm/debugreg.h> -#include "kprobes-common.h" +#include "common.h" void jprobe_return_end(void); @@ -78,7 +78,7 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); * Groups, and some special opcodes can not boost. * This is non-const and volatile to keep gcc from statically * optimizing it out, as variable_test_bit makes gcc think only - * *(unsigned long*) is used. + * *(unsigned long*) is used. */ static volatile u32 twobyte_is_boostable[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ @@ -117,7 +117,7 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) struct __arch_relative_insn { u8 op; s32 raddr; - } __attribute__((packed)) *insn; + } __packed *insn; insn = (struct __arch_relative_insn *)from; insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); @@ -541,23 +541,6 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb return 1; } -#ifdef KPROBES_CAN_USE_FTRACE -static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - /* - * Emulate singlestep (and also recover regs->ip) - * as if there is a 5byte nop - */ - regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; - if (unlikely(p->post_handler)) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - p->post_handler(p, regs, 0); - } - __this_cpu_write(current_kprobe, NULL); -} -#endif - /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled throughout this function. @@ -616,13 +599,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) } else if (kprobe_running()) { p = __this_cpu_read(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { -#ifdef KPROBES_CAN_USE_FTRACE - if (kprobe_ftrace(p)) { - skip_singlestep(p, regs, kcb); - return 1; - } -#endif - setup_singlestep(p, regs, kcb, 0); + if (!skip_singlestep(p, regs, kcb)) + setup_singlestep(p, regs, kcb, 0); return 1; } } /* else: not a kprobe fault; let the kernel handle it */ @@ -674,7 +652,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) { struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; - struct hlist_node *node, *tmp; + struct hlist_node *tmp; unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; kprobe_opcode_t *correct_ret_addr = NULL; @@ -704,7 +682,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) * will be the real return address, and all the rest will * point to kretprobe_trampoline. */ - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + hlist_for_each_entry_safe(ri, tmp, head, hlist) { if (ri->task != current) /* another task is sharing our hash bucket */ continue; @@ -723,7 +701,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) kretprobe_assert(ri, orig_ret_address, trampoline_address); correct_ret_addr = ri->ret_addr; - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + hlist_for_each_entry_safe(ri, tmp, head, hlist) { if (ri->task != current) /* another task is sharing our hash bucket */ continue; @@ -750,7 +728,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) kretprobe_hash_unlock(current, &flags); - hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); } @@ -1075,50 +1053,6 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } -#ifdef KPROBES_CAN_USE_FTRACE -/* Ftrace callback handler for kprobes */ -void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) -{ - struct kprobe *p; - struct kprobe_ctlblk *kcb; - unsigned long flags; - - /* Disable irq for emulating a breakpoint and avoiding preempt */ - local_irq_save(flags); - - p = get_kprobe((kprobe_opcode_t *)ip); - if (unlikely(!p) || kprobe_disabled(p)) - goto end; - - kcb = get_kprobe_ctlblk(); - if (kprobe_running()) { - kprobes_inc_nmissed_count(p); - } else { - /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ - regs->ip = ip + sizeof(kprobe_opcode_t); - - __this_cpu_write(current_kprobe, p); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (!p->pre_handler || !p->pre_handler(p, regs)) - skip_singlestep(p, regs, kcb); - /* - * If pre_handler returns !0, it sets regs->ip and - * resets current kprobe. - */ - } -end: - local_irq_restore(flags); -} - -int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p) -{ - p->ainsn.insn = NULL; - p->ainsn.boostable = -1; - return 0; -} -#endif - int __init arch_init_kprobes(void) { return arch_init_optprobes(); diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c new file mode 100644 index 00000000000..23ef5c556f0 --- /dev/null +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -0,0 +1,93 @@ +/* + * Dynamic Ftrace based Kprobes Optimization + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) Hitachi Ltd., 2012 + */ +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/hardirq.h> +#include <linux/preempt.h> +#include <linux/ftrace.h> + +#include "common.h" + +static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + __this_cpu_write(current_kprobe, NULL); + return 1; +} + +int __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + if (kprobe_ftrace(p)) + return __skip_singlestep(p, regs, kcb); + else + return 0; +} + +/* Ftrace callback handler for kprobes */ +void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + struct kprobe *p; + struct kprobe_ctlblk *kcb; + unsigned long flags; + + /* Disable irq for emulating a breakpoint and avoiding preempt */ + local_irq_save(flags); + + p = get_kprobe((kprobe_opcode_t *)ip); + if (unlikely(!p) || kprobe_disabled(p)) + goto end; + + kcb = get_kprobe_ctlblk(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(p); + } else { + /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ + regs->ip = ip + sizeof(kprobe_opcode_t); + + __this_cpu_write(current_kprobe, p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + if (!p->pre_handler || !p->pre_handler(p, regs)) + __skip_singlestep(p, regs, kcb); + /* + * If pre_handler returns !0, it sets regs->ip and + * resets current kprobe. + */ + } +end: + local_irq_restore(flags); +} + +int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + p->ainsn.insn = NULL; + p->ainsn.boostable = -1; + return 0; +} diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes/opt.c index c5e410eed40..76dc6f09572 100644 --- a/arch/x86/kernel/kprobes-opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -37,7 +37,7 @@ #include <asm/insn.h> #include <asm/debugreg.h> -#include "kprobes-common.h" +#include "common.h" unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 4180a874c76..b686a904d7c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -42,6 +42,8 @@ #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> +#include <asm/kvm_guest.h> +#include <asm/context_tracking.h> static int kvmapf = 1; @@ -62,6 +64,15 @@ static int parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); +static int kvmclock_vsyscall = 1; +static int parse_no_kvmclock_vsyscall(char *arg) +{ + kvmclock_vsyscall = 0; + return 0; +} + +early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); + static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; @@ -110,11 +121,8 @@ void kvm_async_pf_task_wait(u32 token) struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; struct kvm_task_sleep_node n, *e; DEFINE_WAIT(wait); - int cpu, idle; - cpu = get_cpu(); - idle = idle_cpu(cpu); - put_cpu(); + rcu_irq_enter(); spin_lock(&b->lock); e = _find_apf_task(b, token); @@ -123,12 +131,14 @@ void kvm_async_pf_task_wait(u32 token) hlist_del(&e->link); kfree(e); spin_unlock(&b->lock); + + rcu_irq_exit(); return; } n.token = token; n.cpu = smp_processor_id(); - n.halted = idle || preempt_count() > 1; + n.halted = is_idle_task(current) || preempt_count() > 1; init_waitqueue_head(&n.wq); hlist_add_head(&n.link, &b->list); spin_unlock(&b->lock); @@ -147,13 +157,16 @@ void kvm_async_pf_task_wait(u32 token) /* * We cannot reschedule. So halt. */ + rcu_irq_exit(); native_safe_halt(); + rcu_irq_enter(); local_irq_disable(); } } if (!n.halted) finish_wait(&n.wq, &wait); + rcu_irq_exit(); return; } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); @@ -247,10 +260,10 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ - rcu_irq_enter(); + exception_enter(regs); exit_idle(); kvm_async_pf_task_wait((u32)read_cr2()); - rcu_irq_exit(); + exception_exit(regs); break; case KVM_PV_REASON_PAGE_READY: rcu_irq_enter(); @@ -284,9 +297,9 @@ static void kvm_register_steal_time(void) memset(st, 0, sizeof(*st)); - wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED)); - printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n", - cpu, __pa(st)); + wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); + pr_info("kvm-stealtime: cpu %d, msr %llx\n", + cpu, (unsigned long long) slow_virt_to_phys(st)); } static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; @@ -311,7 +324,7 @@ void __cpuinit kvm_guest_cpu_init(void) return; if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { - u64 pa = __pa(&__get_cpu_var(apf_reason)); + u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason)); #ifdef CONFIG_PREEMPT pa |= KVM_ASYNC_PF_SEND_ALWAYS; @@ -327,7 +340,8 @@ void __cpuinit kvm_guest_cpu_init(void) /* Size alignment is implied but just to make it explicit. */ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); __get_cpu_var(kvm_apic_eoi) = 0; - pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; + pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi)) + | KVM_MSR_ENABLED; wrmsrl(MSR_KVM_PV_EOI_EN, pa); } @@ -471,6 +485,9 @@ void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); + if (kvmclock_vsyscall) + kvm_setup_vsyscall_timeinfo(); + #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; register_cpu_notifier(&kvm_cpu_notifier); @@ -489,6 +506,7 @@ static bool __init kvm_detect(void) const struct hypervisor_x86 x86_hyper_kvm __refconst = { .name = "KVM", .detect = kvm_detect, + .x2apic_available = kvm_para_available, }; EXPORT_SYMBOL_GPL(x86_hyper_kvm); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f1b42b3a186..0732f0089a3 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -23,6 +23,7 @@ #include <asm/apic.h> #include <linux/percpu.h> #include <linux/hardirq.h> +#include <linux/memblock.h> #include <asm/x86_init.h> #include <asm/reboot.h> @@ -39,7 +40,7 @@ static int parse_no_kvmclock(char *arg) early_param("no-kvmclock", parse_no_kvmclock); /* The hypervisor will put information about time periodically here */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); +static struct pvclock_vsyscall_time_info *hv_clock; static struct pvclock_wall_clock wall_clock; /* @@ -52,15 +53,20 @@ static unsigned long kvm_get_wallclock(void) struct pvclock_vcpu_time_info *vcpu_time; struct timespec ts; int low, high; + int cpu; low = (int)__pa_symbol(&wall_clock); high = ((u64)__pa_symbol(&wall_clock) >> 32); native_write_msr(msr_kvm_wall_clock, low, high); - vcpu_time = &get_cpu_var(hv_clock); + preempt_disable(); + cpu = smp_processor_id(); + + vcpu_time = &hv_clock[cpu].pvti; pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); - put_cpu_var(hv_clock); + + preempt_enable(); return ts.tv_sec; } @@ -74,9 +80,11 @@ static cycle_t kvm_clock_read(void) { struct pvclock_vcpu_time_info *src; cycle_t ret; + int cpu; preempt_disable_notrace(); - src = &__get_cpu_var(hv_clock); + cpu = smp_processor_id(); + src = &hv_clock[cpu].pvti; ret = pvclock_clocksource_read(src); preempt_enable_notrace(); return ret; @@ -99,8 +107,15 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs) static unsigned long kvm_get_tsc_khz(void) { struct pvclock_vcpu_time_info *src; - src = &per_cpu(hv_clock, 0); - return pvclock_tsc_khz(src); + int cpu; + unsigned long tsc_khz; + + preempt_disable(); + cpu = smp_processor_id(); + src = &hv_clock[cpu].pvti; + tsc_khz = pvclock_tsc_khz(src); + preempt_enable(); + return tsc_khz; } static void kvm_get_preset_lpj(void) @@ -119,10 +134,14 @@ bool kvm_check_and_clear_guest_paused(void) { bool ret = false; struct pvclock_vcpu_time_info *src; + int cpu = smp_processor_id(); - src = &__get_cpu_var(hv_clock); + if (!hv_clock) + return ret; + + src = &hv_clock[cpu].pvti; if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { - __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); + src->flags &= ~PVCLOCK_GUEST_STOPPED; ret = true; } @@ -141,9 +160,10 @@ int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); int low, high, ret; + struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; - low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; - high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); + low = (int)slow_virt_to_phys(src) | 1; + high = ((u64)slow_virt_to_phys(src) >> 32); ret = native_write_msr_safe(msr_kvm_system_time, low, high); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); @@ -197,6 +217,11 @@ static void kvm_shutdown(void) void __init kvmclock_init(void) { + unsigned long mem; + int size; + + size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); + if (!kvm_para_available()) return; @@ -209,8 +234,16 @@ void __init kvmclock_init(void) printk(KERN_INFO "kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); - if (kvm_register_clock("boot clock")) + mem = memblock_alloc(size, PAGE_SIZE); + if (!mem) return; + hv_clock = __va(mem); + + if (kvm_register_clock("boot clock")) { + hv_clock = NULL; + memblock_free(mem, size); + return; + } pv_time_ops.sched_clock = kvm_clock_read; x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; @@ -233,3 +266,37 @@ void __init kvmclock_init(void) if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); } + +int __init kvm_setup_vsyscall_timeinfo(void) +{ +#ifdef CONFIG_X86_64 + int cpu; + int ret; + u8 flags; + struct pvclock_vcpu_time_info *vcpu_time; + unsigned int size; + + size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); + + preempt_disable(); + cpu = smp_processor_id(); + + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) { + preempt_enable(); + return 1; + } + + if ((ret = pvclock_init_vsyscall(hv_clock, size))) { + preempt_enable(); + return ret; + } + + preempt_enable(); + + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; +#endif + return 0; +} diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index b3ea9db39db..4eabc160696 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -16,125 +16,12 @@ #include <linux/io.h> #include <linux/suspend.h> +#include <asm/init.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/debugreg.h> -static int init_one_level2_page(struct kimage *image, pgd_t *pgd, - unsigned long addr) -{ - pud_t *pud; - pmd_t *pmd; - struct page *page; - int result = -ENOMEM; - - addr &= PMD_MASK; - pgd += pgd_index(addr); - if (!pgd_present(*pgd)) { - page = kimage_alloc_control_pages(image, 0); - if (!page) - goto out; - pud = (pud_t *)page_address(page); - clear_page(pud); - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); - } - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) { - page = kimage_alloc_control_pages(image, 0); - if (!page) - goto out; - pmd = (pmd_t *)page_address(page); - clear_page(pmd); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - } - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); - result = 0; -out: - return result; -} - -static void init_level2_page(pmd_t *level2p, unsigned long addr) -{ - unsigned long end_addr; - - addr &= PAGE_MASK; - end_addr = addr + PUD_SIZE; - while (addr < end_addr) { - set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); - addr += PMD_SIZE; - } -} - -static int init_level3_page(struct kimage *image, pud_t *level3p, - unsigned long addr, unsigned long last_addr) -{ - unsigned long end_addr; - int result; - - result = 0; - addr &= PAGE_MASK; - end_addr = addr + PGDIR_SIZE; - while ((addr < last_addr) && (addr < end_addr)) { - struct page *page; - pmd_t *level2p; - - page = kimage_alloc_control_pages(image, 0); - if (!page) { - result = -ENOMEM; - goto out; - } - level2p = (pmd_t *)page_address(page); - init_level2_page(level2p, addr); - set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); - addr += PUD_SIZE; - } - /* clear the unused entries */ - while (addr < end_addr) { - pud_clear(level3p++); - addr += PUD_SIZE; - } -out: - return result; -} - - -static int init_level4_page(struct kimage *image, pgd_t *level4p, - unsigned long addr, unsigned long last_addr) -{ - unsigned long end_addr; - int result; - - result = 0; - addr &= PAGE_MASK; - end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE); - while ((addr < last_addr) && (addr < end_addr)) { - struct page *page; - pud_t *level3p; - - page = kimage_alloc_control_pages(image, 0); - if (!page) { - result = -ENOMEM; - goto out; - } - level3p = (pud_t *)page_address(page); - result = init_level3_page(image, level3p, addr, last_addr); - if (result) - goto out; - set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); - addr += PGDIR_SIZE; - } - /* clear the unused entries */ - while (addr < end_addr) { - pgd_clear(level4p++); - addr += PGDIR_SIZE; - } -out: - return result; -} - static void free_transition_pgtable(struct kimage *image) { free_page((unsigned long)image->arch.pud); @@ -184,22 +71,62 @@ err: return result; } +static void *alloc_pgt_page(void *data) +{ + struct kimage *image = (struct kimage *)data; + struct page *page; + void *p = NULL; + + page = kimage_alloc_control_pages(image, 0); + if (page) { + p = page_address(page); + clear_page(p); + } + + return p; +} static int init_pgtable(struct kimage *image, unsigned long start_pgtable) { + struct x86_mapping_info info = { + .alloc_pgt_page = alloc_pgt_page, + .context = image, + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + }; + unsigned long mstart, mend; pgd_t *level4p; int result; + int i; + level4p = (pgd_t *)__va(start_pgtable); - result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); - if (result) - return result; + clear_page(level4p); + for (i = 0; i < nr_pfn_mapped; i++) { + mstart = pfn_mapped[i].start << PAGE_SHIFT; + mend = pfn_mapped[i].end << PAGE_SHIFT; + + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); + if (result) + return result; + } + /* - * image->start may be outside 0 ~ max_pfn, for example when - * jump back to original kernel from kexeced kernel + * segments's mem ranges could be outside 0 ~ max_pfn, + * for example when jump back to original kernel from kexeced kernel. + * or first kernel is booted with user mem map, and second kernel + * could be loaded out of that range. */ - result = init_one_level2_page(image, level4p, image->start); - if (result) - return result; + for (i = 0; i < image->nr_segments; i++) { + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); + + if (result) + return result; + } + return init_transition_pgtable(image, level4p); } diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 7720ff5a9ee..efdec7cd8e0 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -8,8 +8,8 @@ * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> * * Maintainers: - * Andreas Herrmann <andreas.herrmann3@amd.com> - * Borislav Petkov <borislav.petkov@amd.com> + * Andreas Herrmann <herrmann.der.user@googlemail.com> + * Borislav Petkov <bp@alien8.de> * * This driver allows to upgrade microcode on F10h AMD * CPUs and later. @@ -190,6 +190,7 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, #define F1XH_MPB_MAX_SIZE 2048 #define F14H_MPB_MAX_SIZE 1824 #define F15H_MPB_MAX_SIZE 4096 +#define F16H_MPB_MAX_SIZE 3458 switch (c->x86) { case 0x14: @@ -198,6 +199,9 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, case 0x15: max_size = F15H_MPB_MAX_SIZE; break; + case 0x16: + max_size = F16H_MPB_MAX_SIZE; + break; default: max_size = F1XH_MPB_MAX_SIZE; break; diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 3a04b224d0c..22db92bbdf1 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -364,10 +364,7 @@ static struct attribute_group mc_attr_group = { static void microcode_fini_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - microcode_ops->microcode_fini_cpu(cpu); - uci->valid = 0; } static enum ucode_state microcode_resume_cpu(int cpu) @@ -383,6 +380,10 @@ static enum ucode_state microcode_resume_cpu(int cpu) static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) { enum ucode_state ustate; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (uci && uci->valid) + return UCODE_OK; if (collect_cpu_info(cpu)) return UCODE_ERROR; diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/microcode_core_early.c new file mode 100644 index 00000000000..577db8417d1 --- /dev/null +++ b/arch/x86/kernel/microcode_core_early.c @@ -0,0 +1,76 @@ +/* + * X86 CPU microcode early update for Linux + * + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> + * + * This driver allows to early upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture + * Software Developer's Manual. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/module.h> +#include <asm/microcode_intel.h> +#include <asm/processor.h> + +#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) +#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') +#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') +#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') +#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') +#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') +#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') + +#define CPUID_IS(a, b, c, ebx, ecx, edx) \ + (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) + +/* + * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. + * x86_vendor() gets vendor id for BSP. + * + * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify + * coding, we still use x86_vendor() to get vendor id for AP. + * + * x86_vendor() gets vendor information directly through cpuid. + */ +static int __cpuinit x86_vendor(void) +{ + u32 eax = 0x00000000; + u32 ebx, ecx = 0, edx; + + if (!have_cpuid_p()) + return X86_VENDOR_UNKNOWN; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) + return X86_VENDOR_INTEL; + + if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) + return X86_VENDOR_AMD; + + return X86_VENDOR_UNKNOWN; +} + +void __init load_ucode_bsp(void) +{ + int vendor = x86_vendor(); + + if (vendor == X86_VENDOR_INTEL) + load_ucode_intel_bsp(); +} + +void __cpuinit load_ucode_ap(void) +{ + int vendor = x86_vendor(); + + if (vendor == X86_VENDOR_INTEL) + load_ucode_intel_ap(); +} diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 3544aed3933..5fb2cebf556 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -79,7 +79,7 @@ #include <linux/module.h> #include <linux/vmalloc.h> -#include <asm/microcode.h> +#include <asm/microcode_intel.h> #include <asm/processor.h> #include <asm/msr.h> @@ -87,59 +87,6 @@ MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); MODULE_LICENSE("GPL"); -struct microcode_header_intel { - unsigned int hdrver; - unsigned int rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int datasize; - unsigned int totalsize; - unsigned int reserved[3]; -}; - -struct microcode_intel { - struct microcode_header_intel hdr; - unsigned int bits[0]; -}; - -/* microcode format is extended from prescott processors */ -struct extended_signature { - unsigned int sig; - unsigned int pf; - unsigned int cksum; -}; - -struct extended_sigtable { - unsigned int count; - unsigned int cksum; - unsigned int reserved[3]; - struct extended_signature sigs[0]; -}; - -#define DEFAULT_UCODE_DATASIZE (2000) -#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) -#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) -#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) -#define DWSIZE (sizeof(u32)) - -#define get_totalsize(mc) \ - (((struct microcode_intel *)mc)->hdr.totalsize ? \ - ((struct microcode_intel *)mc)->hdr.totalsize : \ - DEFAULT_UCODE_TOTALSIZE) - -#define get_datasize(mc) \ - (((struct microcode_intel *)mc)->hdr.datasize ? \ - ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) - -#define sigmatch(s1, s2, p1, p2) \ - (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) - -#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) - static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); @@ -162,128 +109,25 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) return 0; } -static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf) -{ - return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1; -} - -static inline int -update_match_revision(struct microcode_header_intel *mc_header, int rev) -{ - return (mc_header->rev <= rev) ? 0 : 1; -} - -static int microcode_sanity_check(void *mc) -{ - unsigned long total_size, data_size, ext_table_size; - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header = NULL; - int sum, orig_sum, ext_sigcount = 0, i; - struct extended_signature *ext_sig; - - total_size = get_totalsize(mc_header); - data_size = get_datasize(mc_header); - - if (data_size + MC_HEADER_SIZE > total_size) { - pr_err("error! Bad data size in microcode data file\n"); - return -EINVAL; - } - - if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - pr_err("error! Unknown microcode update format\n"); - return -EINVAL; - } - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - if ((ext_table_size < EXT_HEADER_SIZE) - || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - pr_err("error! Small exttable size in microcode data file\n"); - return -EINVAL; - } - ext_header = mc + MC_HEADER_SIZE + data_size; - if (ext_table_size != exttable_size(ext_header)) { - pr_err("error! Bad exttable size in microcode data file\n"); - return -EFAULT; - } - ext_sigcount = ext_header->count; - } - - /* check extended table checksum */ - if (ext_table_size) { - int ext_table_sum = 0; - int *ext_tablep = (int *)ext_header; - - i = ext_table_size / DWSIZE; - while (i--) - ext_table_sum += ext_tablep[i]; - if (ext_table_sum) { - pr_warning("aborting, bad extended signature table checksum\n"); - return -EINVAL; - } - } - - /* calculate the checksum */ - orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / DWSIZE; - while (i--) - orig_sum += ((int *)mc)[i]; - if (orig_sum) { - pr_err("aborting, bad checksum\n"); - return -EINVAL; - } - if (!ext_table_size) - return 0; - /* check extended signature checksum */ - for (i = 0; i < ext_sigcount; i++) { - ext_sig = (void *)ext_header + EXT_HEADER_SIZE + - EXT_SIGNATURE_SIZE * i; - sum = orig_sum - - (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); - if (sum) { - pr_err("aborting, bad checksum\n"); - return -EINVAL; - } - } - return 0; -} - /* * return 0 - no update found * return 1 - found update */ -static int -get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) +static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) { - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header; - unsigned long total_size = get_totalsize(mc_header); - int ext_sigcount, i; - struct extended_signature *ext_sig; - - if (!update_match_revision(mc_header, rev)) - return 0; - - if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf)) - return 1; + struct cpu_signature cpu_sig; + unsigned int csig, cpf, crev; - /* Look for ext. headers: */ - if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) - return 0; + collect_cpu_info(cpu, &cpu_sig); - ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + csig = cpu_sig.sig; + cpf = cpu_sig.pf; + crev = cpu_sig.rev; - for (i = 0; i < ext_sigcount; i++) { - if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf)) - return 1; - ext_sig++; - } - return 0; + return get_matching_microcode(csig, cpf, mc_intel, crev); } -static int apply_microcode(int cpu) +int apply_microcode(int cpu) { struct microcode_intel *mc_intel; struct ucode_cpu_info *uci; @@ -300,6 +144,14 @@ static int apply_microcode(int cpu) if (mc_intel == NULL) return 0; + /* + * Microcode on this CPU could be updated earlier. Only apply the + * microcode patch in mc_intel when it is newer than the one on this + * CPU. + */ + if (get_matching_mc(mc_intel, cpu) == 0) + return 0; + /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, (unsigned long) mc_intel->bits, @@ -338,6 +190,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, unsigned int leftover = size; enum ucode_state state = UCODE_OK; unsigned int curr_mc_size = 0; + unsigned int csig, cpf; while (leftover) { struct microcode_header_intel mc_header; @@ -362,11 +215,13 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, } if (get_ucode_data(mc, ucode_ptr, mc_size) || - microcode_sanity_check(mc) < 0) { + microcode_sanity_check(mc, 1) < 0) { break; } - if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { + csig = uci->cpu_sig.sig; + cpf = uci->cpu_sig.pf; + if (get_matching_microcode(csig, cpf, mc, new_rev)) { vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; @@ -393,6 +248,13 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; + /* + * If early loading microcode is supported, save this mc into + * permanent memory. So it will be loaded early when a CPU is hot added + * or resumes. + */ + save_mc_for_early(new_mc); + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); out: diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/microcode_intel_early.c new file mode 100644 index 00000000000..7890bc83895 --- /dev/null +++ b/arch/x86/kernel/microcode_intel_early.c @@ -0,0 +1,796 @@ +/* + * Intel CPU microcode early update for Linux + * + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> + * + * This allows to early upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture + * Software Developer's Manual. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/earlycpio.h> +#include <linux/initrd.h> +#include <linux/cpu.h> +#include <asm/msr.h> +#include <asm/microcode_intel.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/setup.h> + +unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +struct mc_saved_data { + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; +} mc_saved_data; + +static enum ucode_state __cpuinit +generic_load_microcode_early(struct microcode_intel **mc_saved_p, + unsigned int mc_saved_count, + struct ucode_cpu_info *uci) +{ + struct microcode_intel *ucode_ptr, *new_mc = NULL; + int new_rev = uci->cpu_sig.rev; + enum ucode_state state = UCODE_OK; + unsigned int mc_size; + struct microcode_header_intel *mc_header; + unsigned int csig = uci->cpu_sig.sig; + unsigned int cpf = uci->cpu_sig.pf; + int i; + + for (i = 0; i < mc_saved_count; i++) { + ucode_ptr = mc_saved_p[i]; + + mc_header = (struct microcode_header_intel *)ucode_ptr; + mc_size = get_totalsize(mc_header); + if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) { + new_rev = mc_header->rev; + new_mc = ucode_ptr; + } + } + + if (!new_mc) { + state = UCODE_NFOUND; + goto out; + } + + uci->mc = (struct microcode_intel *)new_mc; +out: + return state; +} + +static void __cpuinit +microcode_pointer(struct microcode_intel **mc_saved, + unsigned long *mc_saved_in_initrd, + unsigned long initrd_start, int mc_saved_count) +{ + int i; + + for (i = 0; i < mc_saved_count; i++) + mc_saved[i] = (struct microcode_intel *) + (mc_saved_in_initrd[i] + initrd_start); +} + +#ifdef CONFIG_X86_32 +static void __cpuinit +microcode_phys(struct microcode_intel **mc_saved_tmp, + struct mc_saved_data *mc_saved_data) +{ + int i; + struct microcode_intel ***mc_saved; + + mc_saved = (struct microcode_intel ***) + __pa_symbol(&mc_saved_data->mc_saved); + for (i = 0; i < mc_saved_data->mc_saved_count; i++) { + struct microcode_intel *p; + + p = *(struct microcode_intel **) + __pa(mc_saved_data->mc_saved + i); + mc_saved_tmp[i] = (struct microcode_intel *)__pa(p); + } +} +#endif + +static enum ucode_state __cpuinit +load_microcode(struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + unsigned long initrd_start, + struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int count = mc_saved_data->mc_saved_count; + + if (!mc_saved_data->mc_saved) { + microcode_pointer(mc_saved_tmp, mc_saved_in_initrd, + initrd_start, count); + + return generic_load_microcode_early(mc_saved_tmp, count, uci); + } else { +#ifdef CONFIG_X86_32 + microcode_phys(mc_saved_tmp, mc_saved_data); + return generic_load_microcode_early(mc_saved_tmp, count, uci); +#else + return generic_load_microcode_early(mc_saved_data->mc_saved, + count, uci); +#endif + } +} + +static u8 get_x86_family(unsigned long sig) +{ + u8 x86; + + x86 = (sig >> 8) & 0xf; + + if (x86 == 0xf) + x86 += (sig >> 20) & 0xff; + + return x86; +} + +static u8 get_x86_model(unsigned long sig) +{ + u8 x86, x86_model; + + x86 = get_x86_family(sig); + x86_model = (sig >> 4) & 0xf; + + if (x86 == 0x6 || x86 == 0xf) + x86_model += ((sig >> 16) & 0xf) << 4; + + return x86_model; +} + +/* + * Given CPU signature and a microcode patch, this function finds if the + * microcode patch has matching family and model with the CPU. + */ +static enum ucode_state +matching_model_microcode(struct microcode_header_intel *mc_header, + unsigned long sig) +{ + u8 x86, x86_model; + u8 x86_ucode, x86_model_ucode; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + unsigned long data_size = get_datasize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + + x86 = get_x86_family(sig); + x86_model = get_x86_model(sig); + + x86_ucode = get_x86_family(mc_header->sig); + x86_model_ucode = get_x86_model(mc_header->sig); + + if (x86 == x86_ucode && x86_model == x86_model_ucode) + return UCODE_OK; + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + return UCODE_NFOUND; + + ext_header = (struct extended_sigtable *) + mc_header + data_size + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (i = 0; i < ext_sigcount; i++) { + x86_ucode = get_x86_family(ext_sig->sig); + x86_model_ucode = get_x86_model(ext_sig->sig); + + if (x86 == x86_ucode && x86_model == x86_model_ucode) + return UCODE_OK; + + ext_sig++; + } + + return UCODE_NFOUND; +} + +static int +save_microcode(struct mc_saved_data *mc_saved_data, + struct microcode_intel **mc_saved_src, + unsigned int mc_saved_count) +{ + int i, j; + struct microcode_intel **mc_saved_p; + int ret; + + if (!mc_saved_count) + return -EINVAL; + + /* + * Copy new microcode data. + */ + mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *), + GFP_KERNEL); + if (!mc_saved_p) + return -ENOMEM; + + for (i = 0; i < mc_saved_count; i++) { + struct microcode_intel *mc = mc_saved_src[i]; + struct microcode_header_intel *mc_header = &mc->hdr; + unsigned long mc_size = get_totalsize(mc_header); + mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL); + if (!mc_saved_p[i]) { + ret = -ENOMEM; + goto err; + } + if (!mc_saved_src[i]) { + ret = -EINVAL; + goto err; + } + memcpy(mc_saved_p[i], mc, mc_size); + } + + /* + * Point to newly saved microcode. + */ + mc_saved_data->mc_saved = mc_saved_p; + mc_saved_data->mc_saved_count = mc_saved_count; + + return 0; + +err: + for (j = 0; j <= i; j++) + kfree(mc_saved_p[j]); + kfree(mc_saved_p); + + return ret; +} + +/* + * A microcode patch in ucode_ptr is saved into mc_saved + * - if it has matching signature and newer revision compared to an existing + * patch mc_saved. + * - or if it is a newly discovered microcode patch. + * + * The microcode patch should have matching model with CPU. + */ +static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr, + unsigned int *mc_saved_count_p) +{ + int i; + int found = 0; + unsigned int mc_saved_count = *mc_saved_count_p; + struct microcode_header_intel *mc_header; + + mc_header = (struct microcode_header_intel *)ucode_ptr; + for (i = 0; i < mc_saved_count; i++) { + unsigned int sig, pf; + unsigned int new_rev; + struct microcode_header_intel *mc_saved_header = + (struct microcode_header_intel *)mc_saved[i]; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + new_rev = mc_header->rev; + + if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) { + found = 1; + if (update_match_revision(mc_header, new_rev)) { + /* + * Found an older ucode saved before. + * Replace the older one with this newer + * one. + */ + mc_saved[i] = + (struct microcode_intel *)ucode_ptr; + break; + } + } + } + if (i >= mc_saved_count && !found) + /* + * This ucode is first time discovered in ucode file. + * Save it to memory. + */ + mc_saved[mc_saved_count++] = + (struct microcode_intel *)ucode_ptr; + + *mc_saved_count_p = mc_saved_count; +} + +/* + * Get microcode matching with BSP's model. Only CPUs with the same model as + * BSP can stay in the platform. + */ +static enum ucode_state __init +get_matching_model_microcode(int cpu, unsigned long start, + void *data, size_t size, + struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + struct ucode_cpu_info *uci) +{ + u8 *ucode_ptr = data; + unsigned int leftover = size; + enum ucode_state state = UCODE_OK; + unsigned int mc_size; + struct microcode_header_intel *mc_header; + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count = mc_saved_data->mc_saved_count; + int i; + + while (leftover) { + mc_header = (struct microcode_header_intel *)ucode_ptr; + + mc_size = get_totalsize(mc_header); + if (!mc_size || mc_size > leftover || + microcode_sanity_check(ucode_ptr, 0) < 0) + break; + + leftover -= mc_size; + + /* + * Since APs with same family and model as the BSP may boot in + * the platform, we need to find and save microcode patches + * with the same family and model as the BSP. + */ + if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != + UCODE_OK) { + ucode_ptr += mc_size; + continue; + } + + _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count); + + ucode_ptr += mc_size; + } + + if (leftover) { + state = UCODE_ERROR; + goto out; + } + + if (mc_saved_count == 0) { + state = UCODE_NFOUND; + goto out; + } + + for (i = 0; i < mc_saved_count; i++) + mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; + + mc_saved_data->mc_saved_count = mc_saved_count; +out: + return state; +} + +#define native_rdmsr(msr, val1, val2) \ +do { \ + u64 __val = native_read_msr((msr)); \ + (void)((val1) = (u32)__val); \ + (void)((val2) = (u32)(__val >> 32)); \ +} while (0) + +#define native_wrmsr(msr, low, high) \ + native_write_msr(msr, low, high); + +static int __cpuinit collect_cpu_info_early(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + u8 x86, x86_model; + struct cpu_signature csig; + unsigned int eax, ebx, ecx, edx; + + csig.sig = 0; + csig.pf = 0; + csig.rev = 0; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + x86 = get_x86_family(csig.sig); + x86_model = get_x86_model(csig.sig); + + if ((x86_model >= 5) || (x86 > 6)) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + csig.rev = val[1]; + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} + +#ifdef DEBUG +static void __ref show_saved_mc(void) +{ + int i, j; + unsigned int sig, pf, rev, total_size, data_size, date; + struct ucode_cpu_info uci; + + if (mc_saved_data.mc_saved_count == 0) { + pr_debug("no micorcode data saved.\n"); + return; + } + pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); + + collect_cpu_info_early(&uci); + + sig = uci.cpu_sig.sig; + pf = uci.cpu_sig.pf; + rev = uci.cpu_sig.rev; + pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n", + smp_processor_id(), sig, pf, rev); + + for (i = 0; i < mc_saved_data.mc_saved_count; i++) { + struct microcode_header_intel *mc_saved_header; + struct extended_sigtable *ext_header; + int ext_sigcount; + struct extended_signature *ext_sig; + + mc_saved_header = (struct microcode_header_intel *) + mc_saved_data.mc_saved[i]; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + rev = mc_saved_header->rev; + total_size = get_totalsize(mc_saved_header); + data_size = get_datasize(mc_saved_header); + date = mc_saved_header->date; + + pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n", + i, sig, pf, rev, total_size, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + continue; + + ext_header = (struct extended_sigtable *) + mc_saved_header + data_size + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (j = 0; j < ext_sigcount; j++) { + sig = ext_sig->sig; + pf = ext_sig->pf; + + pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", + j, sig, pf); + + ext_sig++; + } + + } +} +#else +static inline void show_saved_mc(void) +{ +} +#endif + +#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) +/* + * Save this mc into mc_saved_data. So it will be loaded early when a CPU is + * hot added or resumes. + * + * Please make sure this mc should be a valid microcode patch before calling + * this function. + */ +int save_mc_for_early(u8 *mc) +{ + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count_init; + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; + int ret = 0; + int i; + + /* + * Hold hotplug lock so mc_saved_data is not accessed by a CPU in + * hotplug. + */ + cpu_hotplug_driver_lock(); + + mc_saved_count_init = mc_saved_data.mc_saved_count; + mc_saved_count = mc_saved_data.mc_saved_count; + mc_saved = mc_saved_data.mc_saved; + + if (mc_saved && mc_saved_count) + memcpy(mc_saved_tmp, mc_saved, + mc_saved_count * sizeof(struct mirocode_intel *)); + /* + * Save the microcode patch mc in mc_save_tmp structure if it's a newer + * version. + */ + + _save_mc(mc_saved_tmp, mc, &mc_saved_count); + + /* + * Save the mc_save_tmp in global mc_saved_data. + */ + ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); + if (ret) { + pr_err("Can not save microcode patch.\n"); + goto out; + } + + show_saved_mc(); + + /* + * Free old saved microcod data. + */ + if (mc_saved) { + for (i = 0; i < mc_saved_count_init; i++) + kfree(mc_saved[i]); + kfree(mc_saved); + } + +out: + cpu_hotplug_driver_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(save_mc_for_early); +#endif + +static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; +static __init enum ucode_state +scan_microcode(unsigned long start, unsigned long end, + struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + struct ucode_cpu_info *uci) +{ + unsigned int size = end - start + 1; + struct cpio_data cd; + long offset = 0; +#ifdef CONFIG_X86_32 + char *p = (char *)__pa_symbol(ucode_name); +#else + char *p = ucode_name; +#endif + + cd.data = NULL; + cd.size = 0; + + cd = find_cpio_data(p, (void *)start, size, &offset); + if (!cd.data) + return UCODE_ERROR; + + + return get_matching_model_microcode(0, start, cd.data, cd.size, + mc_saved_data, mc_saved_in_initrd, + uci); +} + +/* + * Print ucode update info. + */ +static void __cpuinit +print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) +{ + int cpu = smp_processor_id(); + + pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", + cpu, + uci->cpu_sig.rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); +} + +#ifdef CONFIG_X86_32 + +static int delay_ucode_info; +static int current_mc_date; + +/* + * Print early updated ucode info after printk works. This is delayed info dump. + */ +void __cpuinit show_ucode_info_early(void) +{ + struct ucode_cpu_info uci; + + if (delay_ucode_info) { + collect_cpu_info_early(&uci); + print_ucode_info(&uci, current_mc_date); + delay_ucode_info = 0; + } +} + +/* + * At this point, we can not call printk() yet. Keep microcode patch number in + * mc_saved_data.mc_saved and delay printing microcode info in + * show_ucode_info_early() until printk() works. + */ +static void __cpuinit print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + int *delay_ucode_info_p; + int *current_mc_date_p; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + delay_ucode_info_p = (int *)__pa_symbol(&delay_ucode_info); + current_mc_date_p = (int *)__pa_symbol(¤t_mc_date); + + *delay_ucode_info_p = 1; + *current_mc_date_p = mc_intel->hdr.date; +} +#else + +/* + * Flush global tlb. We only do this in x86_64 where paging has been enabled + * already and PGE should be enabled as well. + */ +static inline void __cpuinit flush_tlb_early(void) +{ + __native_flush_tlb_global_irq_disabled(); +} + +static inline void __cpuinit print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + print_ucode_info(uci, mc_intel->hdr.date); +} +#endif + +static int apply_microcode_early(struct mc_saved_data *mc_saved_data, + struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + unsigned int val[2]; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return 0; + + /* write microcode via MSR 0x79 */ + native_wrmsr(MSR_IA32_UCODE_WRITE, + (unsigned long) mc_intel->bits, + (unsigned long) mc_intel->bits >> 16 >> 16); + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + if (val[1] != mc_intel->hdr.rev) + return -1; + +#ifdef CONFIG_X86_64 + /* Flush global tlb. This is precaution. */ + flush_tlb_early(); +#endif + uci->cpu_sig.rev = val[1]; + + print_ucode(uci); + + return 0; +} + +/* + * This function converts microcode patch offsets previously stored in + * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. + */ +int __init save_microcode_in_initrd(void) +{ + unsigned int count = mc_saved_data.mc_saved_count; + struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; + int ret = 0; + + if (count == 0) + return ret; + + microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count); + ret = save_microcode(&mc_saved_data, mc_saved, count); + if (ret) + pr_err("Can not save microcod patches from initrd"); + + show_saved_mc(); + + return ret; +} + +static void __init +_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + unsigned long initrd_start_early, + unsigned long initrd_end_early, + struct ucode_cpu_info *uci) +{ + collect_cpu_info_early(uci); + scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data, + mc_saved_in_initrd, uci); + load_microcode(mc_saved_data, mc_saved_in_initrd, + initrd_start_early, uci); + apply_microcode_early(mc_saved_data, uci); +} + +void __init +load_ucode_intel_bsp(void) +{ + u64 ramdisk_image, ramdisk_size; + unsigned long initrd_start_early, initrd_end_early; + struct ucode_cpu_info uci; +#ifdef CONFIG_X86_32 + struct boot_params *boot_params_p; + + boot_params_p = (struct boot_params *)__pa_symbol(&boot_params); + ramdisk_image = boot_params_p->hdr.ramdisk_image; + ramdisk_size = boot_params_p->hdr.ramdisk_size; + initrd_start_early = ramdisk_image; + initrd_end_early = initrd_start_early + ramdisk_size; + + _load_ucode_intel_bsp( + (struct mc_saved_data *)__pa_symbol(&mc_saved_data), + (unsigned long *)__pa_symbol(&mc_saved_in_initrd), + initrd_start_early, initrd_end_early, &uci); +#else + ramdisk_image = boot_params.hdr.ramdisk_image; + ramdisk_size = boot_params.hdr.ramdisk_size; + initrd_start_early = ramdisk_image + PAGE_OFFSET; + initrd_end_early = initrd_start_early + ramdisk_size; + + _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, + initrd_start_early, initrd_end_early, &uci); +#endif +} + +void __cpuinit load_ucode_intel_ap(void) +{ + struct mc_saved_data *mc_saved_data_p; + struct ucode_cpu_info uci; + unsigned long *mc_saved_in_initrd_p; + unsigned long initrd_start_addr; +#ifdef CONFIG_X86_32 + unsigned long *initrd_start_p; + + mc_saved_in_initrd_p = + (unsigned long *)__pa_symbol(mc_saved_in_initrd); + mc_saved_data_p = (struct mc_saved_data *)__pa_symbol(&mc_saved_data); + initrd_start_p = (unsigned long *)__pa_symbol(&initrd_start); + initrd_start_addr = (unsigned long)__pa_symbol(*initrd_start_p); +#else + mc_saved_data_p = &mc_saved_data; + mc_saved_in_initrd_p = mc_saved_in_initrd; + initrd_start_addr = initrd_start; +#endif + + /* + * If there is no valid ucode previously saved in memory, no need to + * update ucode on this AP. + */ + if (mc_saved_data_p->mc_saved_count == 0) + return; + + collect_cpu_info_early(&uci); + load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, + initrd_start_addr, &uci); + apply_microcode_early(mc_saved_data_p, &uci); +} diff --git a/arch/x86/kernel/microcode_intel_lib.c b/arch/x86/kernel/microcode_intel_lib.c new file mode 100644 index 00000000000..ce69320d017 --- /dev/null +++ b/arch/x86/kernel/microcode_intel_lib.c @@ -0,0 +1,174 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/Assets/PDF/manual/253668.pdf + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ +#include <linux/firmware.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/microcode_intel.h> +#include <asm/processor.h> +#include <asm/msr.h> + +static inline int +update_match_cpu(unsigned int csig, unsigned int cpf, + unsigned int sig, unsigned int pf) +{ + return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; +} + +int +update_match_revision(struct microcode_header_intel *mc_header, int rev) +{ + return (mc_header->rev <= rev) ? 0 : 1; +} + +int microcode_sanity_check(void *mc, int print_err) +{ + unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + int sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; + + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); + + if (data_size + MC_HEADER_SIZE > total_size) { + if (print_err) + pr_err("error! Bad data size in microcode data file\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + if (print_err) + pr_err("error! Unknown microcode update format\n"); + return -EINVAL; + } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + if (print_err) + pr_err("error! Small exttable size in microcode data file\n"); + return -EINVAL; + } + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + if (print_err) + pr_err("error! Bad exttable size in microcode data file\n"); + return -EFAULT; + } + ext_sigcount = ext_header->count; + } + + /* check extended table checksum */ + if (ext_table_size) { + int ext_table_sum = 0; + int *ext_tablep = (int *)ext_header; + + i = ext_table_size / DWSIZE; + while (i--) + ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { + if (print_err) + pr_warn("aborting, bad extended signature table checksum\n"); + return -EINVAL; + } + } + + /* calculate the checksum */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / DWSIZE; + while (i--) + orig_sum += ((int *)mc)[i]; + if (orig_sum) { + if (print_err) + pr_err("aborting, bad checksum\n"); + return -EINVAL; + } + if (!ext_table_size) + return 0; + /* check extended signature checksum */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + sum = orig_sum + - (mc_header->sig + mc_header->pf + mc_header->cksum) + + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + if (print_err) + pr_err("aborting, bad checksum\n"); + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL_GPL(microcode_sanity_check); + +/* + * return 0 - no update found + * return 1 - found update + */ +int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev) +{ + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + + if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf)) + return 1; + + /* Look for ext. headers: */ + if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) + return 0; + + ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (i = 0; i < ext_sigcount; i++) { + if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; + } + return 0; +} + +/* + * return 0 - no update found + * return 1 - found update + */ +int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev) +{ + struct microcode_header_intel *mc_header = mc; + + if (!update_match_revision(mc_header, rev)) + return 0; + + return get_matching_sig(csig, cpf, mc, rev); +} +EXPORT_SYMBOL_GPL(get_matching_microcode); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index a7c5661f849..ce130493b80 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -71,7 +71,7 @@ static ssize_t msr_read(struct file *file, char __user *buf, u32 __user *tmp = (u32 __user *) buf; u32 data[2]; u32 reg = *ppos; - int cpu = iminor(file->f_path.dentry->d_inode); + int cpu = iminor(file_inode(file)); int err = 0; ssize_t bytes = 0; @@ -99,7 +99,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf, const u32 __user *tmp = (const u32 __user *)buf; u32 data[2]; u32 reg = *ppos; - int cpu = iminor(file->f_path.dentry->d_inode); + int cpu = iminor(file_inode(file)); int err = 0; ssize_t bytes = 0; @@ -125,7 +125,7 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) { u32 __user *uregs = (u32 __user *)arg; u32 regs[8]; - int cpu = iminor(file->f_path.dentry->d_inode); + int cpu = iminor(file_inode(file)); int err; switch (ioc) { @@ -171,10 +171,12 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) static int msr_open(struct inode *inode, struct file *file) { - unsigned int cpu; + unsigned int cpu = iminor(file_inode(file)); struct cpuinfo_x86 *c; - cpu = iminor(file->f_path.dentry->d_inode); + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; /* No such CPU */ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index f84f5c57de3..60308053fdb 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -509,3 +509,4 @@ void local_touch_nmi(void) { __this_cpu_write(last_nmi_rip, 0); } +EXPORT_SYMBOL_GPL(local_touch_nmi); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index de2b7ad7027..872079a67e4 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -56,7 +56,7 @@ struct device x86_dma_fallback_dev = { EXPORT_SYMBOL(x86_dma_fallback_dev); /* Number of entries preallocated for DMA-API debugging */ -#define PREALLOC_DMA_DEBUG_ENTRIES 32768 +#define PREALLOC_DMA_DEBUG_ENTRIES 65536 int dma_set_mask(struct device *dev, u64 mask) { @@ -265,7 +265,7 @@ rootfs_initcall(pci_iommu_init); #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ -static __devinit void via_no_dac(struct pci_dev *dev) +static void via_no_dac(struct pci_dev *dev) { if (forbid_dac == 0) { dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b644e1c765d..14ae10031ff 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -262,54 +262,13 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, propagate_user_return_notify(prev_p, next_p); } -int sys_fork(struct pt_regs *regs) -{ - return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); -} - -/* - * This is trivial, and on the face of it looks like it - * could equally well be done in user mode. - * - * Not so, for quite unobvious reasons - register pressure. - * In user mode vfork() cannot have a stack frame, and if - * done by calling the "clone()" system call directly, you - * do not have enough call-clobbered registers to hold all - * the information you need. - */ -int sys_vfork(struct pt_regs *regs) -{ - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, - NULL, NULL); -} - -long -sys_clone(unsigned long clone_flags, unsigned long newsp, - void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) -{ - if (!newsp) - newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); -} - /* * Idle related variables and functions */ unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; EXPORT_SYMBOL(boot_option_idle_override); -/* - * Powermanagement idle function, if any.. - */ -void (*pm_idle)(void); -#ifdef CONFIG_APM_MODULE -EXPORT_SYMBOL(pm_idle); -#endif - -static inline int hlt_use_halt(void) -{ - return 1; -} +static void (*x86_idle)(void); #ifndef CONFIG_SMP static inline void play_dead(void) @@ -386,7 +345,7 @@ void cpu_idle(void) rcu_idle_enter(); if (cpuidle_idle_call()) - pm_idle(); + x86_idle(); rcu_idle_exit(); start_critical_timings(); @@ -410,41 +369,35 @@ void cpu_idle(void) */ void default_idle(void) { - if (hlt_use_halt()) { - trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); - trace_cpu_idle_rcuidle(1, smp_processor_id()); - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); + trace_cpu_idle_rcuidle(1, smp_processor_id()); + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we + * test NEED_RESCHED: + */ + smp_mb(); - if (!need_resched()) - safe_halt(); /* enables interrupts racelessly */ - else - local_irq_enable(); - current_thread_info()->status |= TS_POLLING; - trace_power_end_rcuidle(smp_processor_id()); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); - } else { + if (!need_resched()) + safe_halt(); /* enables interrupts racelessly */ + else local_irq_enable(); - /* loop is done by the caller */ - cpu_relax(); - } + current_thread_info()->status |= TS_POLLING; + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(default_idle); #endif -bool set_pm_idle_to_default(void) +#ifdef CONFIG_XEN +bool xen_set_default_idle(void) { - bool ret = !!pm_idle; + bool ret = !!x86_idle; - pm_idle = default_idle; + x86_idle = default_idle; return ret; } +#endif void stop_this_cpu(void *dummy) { local_irq_disable(); @@ -454,31 +407,8 @@ void stop_this_cpu(void *dummy) set_cpu_online(smp_processor_id(), false); disable_local_APIC(); - for (;;) { - if (hlt_works(smp_processor_id())) - halt(); - } -} - -/* Default MONITOR/MWAIT with no hints, used for default C1 state */ -static void mwait_idle(void) -{ - if (!need_resched()) { - trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); - trace_cpu_idle_rcuidle(1, smp_processor_id()); - if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) - clflush((void *)¤t_thread_info()->flags); - - __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); - if (!need_resched()) - __sti_mwait(0, 0); - else - local_irq_enable(); - trace_power_end_rcuidle(smp_processor_id()); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); - } else - local_irq_enable(); + for (;;) + halt(); } /* @@ -488,62 +418,13 @@ static void mwait_idle(void) */ static void poll_idle(void) { - trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id()); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); - trace_power_end_rcuidle(smp_processor_id()); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } -/* - * mwait selection logic: - * - * It depends on the CPU. For AMD CPUs that support MWAIT this is - * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings - * then depend on a clock divisor and current Pstate of the core. If - * all cores of a processor are in halt state (C1) the processor can - * enter the C1E (C1 enhanced) state. If mwait is used this will never - * happen. - * - * idle=mwait overrides this decision and forces the usage of mwait. - */ - -#define MWAIT_INFO 0x05 -#define MWAIT_ECX_EXTENDED_INFO 0x01 -#define MWAIT_EDX_C1 0xf0 - -int mwait_usable(const struct cpuinfo_x86 *c) -{ - u32 eax, ebx, ecx, edx; - - /* Use mwait if idle=mwait boot option is given */ - if (boot_option_idle_override == IDLE_FORCE_MWAIT) - return 1; - - /* - * Any idle= boot option other than idle=mwait means that we must not - * use mwait. Eg: idle=halt or idle=poll or idle=nomwait - */ - if (boot_option_idle_override != IDLE_NO_OVERRIDE) - return 0; - - if (c->cpuid_level < MWAIT_INFO) - return 0; - - cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx); - /* Check, whether EDX has extended info about MWAIT */ - if (!(ecx & MWAIT_ECX_EXTENDED_INFO)) - return 1; - - /* - * edx enumeratios MONITOR/MWAIT extensions. Check, whether - * C1 supports MWAIT - */ - return (edx & MWAIT_EDX_C1); -} - bool amd_e400_c1e_detected; EXPORT_SYMBOL(amd_e400_c1e_detected); @@ -608,31 +489,24 @@ static void amd_e400_idle(void) void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - if (pm_idle == poll_idle && smp_num_siblings > 1) { + if (x86_idle == poll_idle && smp_num_siblings > 1) pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); - } #endif - if (pm_idle) + if (x86_idle) return; - if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { - /* - * One CPU supports mwait => All CPUs supports mwait - */ - pr_info("using mwait in idle threads\n"); - pm_idle = mwait_idle; - } else if (cpu_has_amd_erratum(amd_erratum_400)) { + if (cpu_has_amd_erratum(amd_erratum_400)) { /* E400: APIC timer interrupt does not wake up CPU from C1e */ pr_info("using AMD E400 aware idle routine\n"); - pm_idle = amd_e400_idle; + x86_idle = amd_e400_idle; } else - pm_idle = default_idle; + x86_idle = default_idle; } void __init init_amd_e400_c1e_mask(void) { /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */ - if (pm_idle == amd_e400_idle) + if (x86_idle == amd_e400_idle) zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL); } @@ -643,11 +517,8 @@ static int __init idle_setup(char *str) if (!strcmp(str, "poll")) { pr_info("using polling idle threads\n"); - pm_idle = poll_idle; + x86_idle = poll_idle; boot_option_idle_override = IDLE_POLL; - } else if (!strcmp(str, "mwait")) { - boot_option_idle_override = IDLE_FORCE_MWAIT; - WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n"); } else if (!strcmp(str, "halt")) { /* * When the boot option of idle=halt is added, halt is @@ -656,7 +527,7 @@ static int __init idle_setup(char *str) * To continue to load the CPU idle driver, don't touch * the boot_option_idle_override. */ - pm_idle = default_idle; + x86_idle = default_idle; boot_option_idle_override = IDLE_HALT; } else if (!strcmp(str, "nomwait")) { /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 44e0bff38e7..b5a8905785e 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -128,8 +128,7 @@ void release_thread(struct task_struct *dead_task) } int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, - struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); struct task_struct *tsk; @@ -138,7 +137,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.sp = (unsigned long) childregs; p->thread.sp0 = (unsigned long) (childregs+1); - if (unlikely(!regs)) { + if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); p->thread.ip = (unsigned long) ret_from_kernel_thread; @@ -156,12 +155,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); return 0; } - *childregs = *regs; + *childregs = *current_pt_regs(); childregs->ax = 0; - childregs->sp = sp; + if (sp) + childregs->sp = sp; p->thread.ip = (unsigned long) ret_from_fork; - task_user_gs(p) = get_user_gs(regs); + task_user_gs(p) = get_user_gs(current_pt_regs()); p->fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 16c6365e2b8..0f49677da51 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -117,7 +117,7 @@ void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { if (dead_task->mm->context.size) { - pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n", + pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", dead_task->comm, dead_task->mm->context.ldt, dead_task->mm->context.size); @@ -146,8 +146,7 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls) } int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, - struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { int err; struct pt_regs *childregs; @@ -169,7 +168,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, savesegment(ds, p->thread.ds); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); - if (unlikely(!regs)) { + if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->sp = (unsigned long)childregs; @@ -181,10 +180,11 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1; return 0; } - *childregs = *regs; + *childregs = *current_pt_regs(); childregs->ax = 0; - childregs->sp = sp; + if (sp) + childregs->sp = sp; err = -ENOMEM; memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b00b33a1839..29a8120e6fe 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,8 @@ #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> #include <linux/rcupdate.h> +#include <linux/export.h> +#include <linux/context_tracking.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -166,6 +168,35 @@ static inline bool invalid_selector(u16 value) #define FLAG_MASK FLAG_MASK_32 +/* + * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode + * when it traps. The previous stack will be directly underneath the saved + * registers, and 'sp/ss' won't even have been saved. Thus the '®s->sp'. + * + * Now, if the stack is empty, '®s->sp' is out of range. In this + * case we try to take the previous stack. To always return a non-null + * stack pointer we fall back to regs as stack if no previous stack + * exists. + * + * This is valid only for kernel mode traps. + */ +unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ + unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1); + unsigned long sp = (unsigned long)®s->sp; + struct thread_info *tinfo; + + if (context == (sp & ~(THREAD_SIZE - 1))) + return sp; + + tinfo = (struct thread_info *)context; + if (tinfo->previous_esp) + return tinfo->previous_esp; + + return (unsigned long)regs; +} +EXPORT_SYMBOL_GPL(kernel_stack_pointer); + static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); @@ -1461,7 +1492,7 @@ long syscall_trace_enter(struct pt_regs *regs) { long ret = 0; - rcu_user_exit(); + user_exit(); /* * If we stepped into a sysenter/syscall insn, it trapped in @@ -1511,6 +1542,13 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; + /* + * We may come here right after calling schedule_user() + * or do_notify_resume(), in which case we can be in RCU + * user mode. + */ + user_exit(); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) @@ -1527,5 +1565,5 @@ void syscall_trace_leave(struct pt_regs *regs) if (step || test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, step); - rcu_user_enter(); + user_enter(); } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 42eb3300dfc..2cb9470ea85 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -17,23 +17,13 @@ #include <linux/kernel.h> #include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/sched.h> +#include <linux/gfp.h> +#include <linux/bootmem.h> +#include <asm/fixmap.h> #include <asm/pvclock.h> -/* - * These are perodically updated - * xen: magic shared_info page - * kvm: gpa registered via msr - * and then copied here. - */ -struct pvclock_shadow_time { - u64 tsc_timestamp; /* TSC at last update of time vals. */ - u64 system_timestamp; /* Time, in nanosecs, since boot. */ - u32 tsc_to_nsec_mul; - int tsc_shift; - u32 version; - u8 flags; -}; - static u8 valid_flags __read_mostly = 0; void pvclock_set_flags(u8 flags) @@ -41,34 +31,6 @@ void pvclock_set_flags(u8 flags) valid_flags = flags; } -static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) -{ - u64 delta = native_read_tsc() - shadow->tsc_timestamp; - return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul, - shadow->tsc_shift); -} - -/* - * Reads a consistent set of time-base values from hypervisor, - * into a shadow data area. - */ -static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, - struct pvclock_vcpu_time_info *src) -{ - do { - dst->version = src->version; - rmb(); /* fetch version before data */ - dst->tsc_timestamp = src->tsc_timestamp; - dst->system_timestamp = src->system_time; - dst->tsc_to_nsec_mul = src->tsc_to_system_mul; - dst->tsc_shift = src->tsc_shift; - dst->flags = src->flags; - rmb(); /* test version after fetching data */ - } while ((src->version & 1) || (dst->version != src->version)); - - return dst->version; -} - unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) { u64 pv_tsc_khz = 1000000ULL << 32; @@ -88,23 +50,32 @@ void pvclock_resume(void) atomic64_set(&last_value, 0); } +u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) +{ + unsigned version; + cycle_t ret; + u8 flags; + + do { + version = __pvclock_read_cycles(src, &ret, &flags); + } while ((src->version & 1) || version != src->version); + + return flags & valid_flags; +} + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { - struct pvclock_shadow_time shadow; unsigned version; - cycle_t ret, offset; + cycle_t ret; u64 last; + u8 flags; do { - version = pvclock_get_time_values(&shadow, src); - barrier(); - offset = pvclock_get_nsec_offset(&shadow); - ret = shadow.system_timestamp + offset; - barrier(); - } while (version != src->version); + version = __pvclock_read_cycles(src, &ret, &flags); + } while ((src->version & 1) || version != src->version); if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && - (shadow.flags & PVCLOCK_TSC_STABLE_BIT)) + (flags & PVCLOCK_TSC_STABLE_BIT)) return ret; /* @@ -156,3 +127,71 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } + +static struct pvclock_vsyscall_time_info *pvclock_vdso_info; + +static struct pvclock_vsyscall_time_info * +pvclock_get_vsyscall_user_time_info(int cpu) +{ + if (!pvclock_vdso_info) { + BUG(); + return NULL; + } + + return &pvclock_vdso_info[cpu]; +} + +struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) +{ + return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; +} + +#ifdef CONFIG_X86_64 +static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, + void *v) +{ + struct task_migration_notifier *mn = v; + struct pvclock_vsyscall_time_info *pvti; + + pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); + + /* this is NULL when pvclock vsyscall is not initialized */ + if (unlikely(pvti == NULL)) + return NOTIFY_DONE; + + pvti->migrate_count++; + + return NOTIFY_DONE; +} + +static struct notifier_block pvclock_migrate = { + .notifier_call = pvclock_task_migrate, +}; + +/* + * Initialize the generic pvclock vsyscall state. This will allocate + * a/some page(s) for the per-vcpu pvclock information, set up a + * fixmap mapping for the page(s) + */ + +int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, + int size) +{ + int idx; + + WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); + + pvclock_vdso_info = i; + + for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { + __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, + __pa(i) + (idx*PAGE_SIZE), + PAGE_KERNEL_VVAR); + } + + + register_task_migration_notifier(&pvclock_migrate); + + return 0; +} +#endif diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 1b27de56356..26ee48a33dc 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -8,7 +8,7 @@ #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) -static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) +static void quirk_intel_irqbalance(struct pci_dev *dev) { u8 config; u16 word; @@ -512,7 +512,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, #if defined(CONFIG_PCI) && defined(CONFIG_NUMA) /* Set correct numa_node information for AMD NB functions */ -static void __devinit quirk_amd_nb_node(struct pci_dev *dev) +static void quirk_amd_nb_node(struct pci_dev *dev) { struct pci_dev *nb_ht; unsigned int devfn; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 4e8ba39eaf0..76fa1e9a2b3 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -584,7 +584,7 @@ static void native_machine_emergency_restart(void) break; case BOOT_EFI: - if (efi_enabled) + if (efi_enabled(EFI_RUNTIME_SERVICES)) efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD, diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 4929c1be0ac..2e8f3d3b564 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -149,7 +149,6 @@ unsigned long mach_get_cmos_time(void) if (century) { century = bcd2bin(century); year += century * 100; - printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); } else year += CMOS_YEARS_OFFS; @@ -195,12 +194,6 @@ void read_persistent_clock(struct timespec *ts) ts->tv_nsec = 0; } -unsigned long long native_read_tsc(void) -{ - return __native_read_tsc(); -} -EXPORT_SYMBOL(native_read_tsc); - static struct resource rtc_resources[] = { [0] = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ca45696f30f..90d8cc930f5 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -108,17 +108,16 @@ #include <asm/topology.h> #include <asm/apicdef.h> #include <asm/amd_nb.h> -#ifdef CONFIG_X86_64 -#include <asm/numa_64.h> -#endif #include <asm/mce.h> #include <asm/alternative.h> #include <asm/prom.h> /* - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. - * The direct mapping extends to max_pfn_mapped, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. + * max_low_pfn_mapped: highest direct mapped pfn under 4GB + * max_pfn_mapped: highest direct mapped pfn over 4GB + * + * The direct mapping only covers E820_RAM regions, so the ranges and gaps are + * represented by pfn_mapped */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; @@ -143,11 +142,7 @@ int default_check_phys_apicid_present(int phys_apicid) } #endif -#ifndef CONFIG_DEBUG_BOOT_PARAMS -struct boot_params __initdata boot_params; -#else struct boot_params boot_params; -#endif /* * Machine setup.. @@ -176,9 +171,15 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 /* cpu data as detected by the assembly code in head.S */ -struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; +struct cpuinfo_x86 new_cpu_data __cpuinitdata = { + .wp_works_ok = -1, + .fdiv_bug = -1, +}; /* common cpu data for all cpus */ -struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1}; +struct cpuinfo_x86 boot_cpu_data __read_mostly = { + .wp_works_ok = -1, + .fdiv_bug = -1, +}; EXPORT_SYMBOL(boot_cpu_data); unsigned int def_to_bigsmp; @@ -280,18 +281,7 @@ void * __init extend_brk(size_t size, size_t align) return ret; } -#ifdef CONFIG_X86_64 -static void __init init_gbpages(void) -{ - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; -} -#else -static inline void init_gbpages(void) -{ -} +#ifdef CONFIG_X86_32 static void __init cleanup_highmap(void) { } @@ -300,8 +290,8 @@ static void __init cleanup_highmap(void) static void __init reserve_brk(void) { if (_brk_end > _brk_start) - memblock_reserve(__pa(_brk_start), - __pa(_brk_end) - __pa(_brk_start)); + memblock_reserve(__pa_symbol(_brk_start), + _brk_end - _brk_start); /* Mark brk area as locked down and no longer taking any new allocations */ @@ -310,27 +300,43 @@ static void __init reserve_brk(void) #ifdef CONFIG_BLK_DEV_INITRD +static u64 __init get_ramdisk_image(void) +{ + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + + ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32; + + return ramdisk_image; +} +static u64 __init get_ramdisk_size(void) +{ + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + + ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32; + + return ramdisk_size; +} + #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { /* Assume only end is not page aligned */ - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); u64 area_size = PAGE_ALIGN(ramdisk_size); - u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; - /* We need to move the initrd down into lowmem */ - ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, - PAGE_SIZE); + /* We need to move the initrd down into directly mapped mem */ + ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), + area_size, PAGE_SIZE); if (!ramdisk_here) panic("Cannot find place for new RAMDISK of size %lld\n", ramdisk_size); - /* Note: this includes all the lowmem currently occupied by + /* Note: this includes all the mem currently occupied by the initrd, we rely on that fact to keep the data intact. */ memblock_reserve(ramdisk_here, area_size); initrd_start = ramdisk_here + PAGE_OFFSET; @@ -340,17 +346,7 @@ static void __init relocate_initrd(void) q = (char *)initrd_start; - /* Copy any lowmem portion of the initrd */ - if (ramdisk_image < end_of_lowmem) { - clen = end_of_lowmem - ramdisk_image; - p = (char *)__va(ramdisk_image); - memcpy(q, p, clen); - q += clen; - ramdisk_image += clen; - ramdisk_size -= clen; - } - - /* Copy the highmem portion of the initrd */ + /* Copy the initrd */ while (ramdisk_size) { slop = ramdisk_image & ~PAGE_MASK; clen = ramdisk_size; @@ -364,22 +360,35 @@ static void __init relocate_initrd(void) ramdisk_image += clen; ramdisk_size -= clen; } - /* high pages is not converted by early_res_to_bootmem */ - ramdisk_image = boot_params.hdr.ramdisk_image; - ramdisk_size = boot_params.hdr.ramdisk_size; + + ramdisk_image = get_ramdisk_image(); + ramdisk_size = get_ramdisk_size(); printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" " [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); } +static void __init early_reserve_initrd(void) +{ + /* Assume only end is not page aligned */ + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); + + if (!boot_params.hdr.type_of_loader || + !ramdisk_image || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); +} static void __init reserve_initrd(void) { /* Assume only end is not page aligned */ - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; + u64 mapped_size; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -387,22 +396,18 @@ static void __init reserve_initrd(void) initrd_start = 0; - if (ramdisk_size >= (end_of_lowmem>>1)) { + mapped_size = memblock_mem_size(max_pfn_mapped); + if (ramdisk_size >= (mapped_size>>1)) panic("initrd too large to handle, " "disabling initrd (%lld needed, %lld available)\n", - ramdisk_size, end_of_lowmem>>1); - } + ramdisk_size, mapped_size>>1); printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_end - 1); - - if (ramdisk_end <= end_of_lowmem) { - /* All in lowmem, easy case */ - /* - * don't need to reserve again, already reserved early - * in i386_start_kernel - */ + if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image), + PFN_DOWN(ramdisk_end))) { + /* All are mapped, easy case */ initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; return; @@ -413,6 +418,9 @@ static void __init reserve_initrd(void) memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); } #else +static void __init early_reserve_initrd(void) +{ +} static void __init reserve_initrd(void) { } @@ -423,8 +431,6 @@ static void __init parse_setup_data(void) struct setup_data *data; u64 pa_data; - if (boot_params.hdr.version < 0x0209) - return; pa_data = boot_params.hdr.setup_data; while (pa_data) { u32 data_len, map_len; @@ -460,8 +466,6 @@ static void __init e820_reserve_setup_data(void) u64 pa_data; int found = 0; - if (boot_params.hdr.version < 0x0209) - return; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); @@ -485,8 +489,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) struct setup_data *data; u64 pa_data; - if (boot_params.hdr.version < 0x0209) - return; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); @@ -505,17 +507,51 @@ static void __init memblock_x86_reserve_range_setup_data(void) /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. - * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this - * limit once kexec-tools are fixed. */ #ifdef CONFIG_X86_32 # define CRASH_KERNEL_ADDR_MAX (512 << 20) #else -# define CRASH_KERNEL_ADDR_MAX (896 << 20) +# define CRASH_KERNEL_ADDR_MAX MAXMEM #endif +static void __init reserve_crashkernel_low(void) +{ +#ifdef CONFIG_X86_64 + const unsigned long long alignment = 16<<20; /* 16M */ + unsigned long long low_base = 0, low_size = 0; + unsigned long total_low_mem; + unsigned long long base; + int ret; + + total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); + ret = parse_crashkernel_low(boot_command_line, total_low_mem, + &low_size, &base); + if (ret != 0 || low_size <= 0) + return; + + low_base = memblock_find_in_range(low_size, (1ULL<<32), + low_size, alignment); + + if (!low_base) { + pr_info("crashkernel low reservation failed - No suitable area found.\n"); + + return; + } + + memblock_reserve(low_base, low_size); + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", + (unsigned long)(low_size >> 20), + (unsigned long)(low_base >> 20), + (unsigned long)(total_low_mem >> 20)); + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + insert_resource(&iomem_resource, &crashk_low_res); +#endif +} + static void __init reserve_crashkernel(void) { + const unsigned long long alignment = 16<<20; /* 16M */ unsigned long long total_mem; unsigned long long crash_size, crash_base; int ret; @@ -529,8 +565,6 @@ static void __init reserve_crashkernel(void) /* 0 means: find the address automatically */ if (crash_base <= 0) { - const unsigned long long alignment = 16<<20; /* 16M */ - /* * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX */ @@ -541,6 +575,7 @@ static void __init reserve_crashkernel(void) pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } + } else { unsigned long long start; @@ -562,6 +597,9 @@ static void __init reserve_crashkernel(void) crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; insert_resource(&iomem_resource, &crashk_res); + + if (crash_base >= (1ULL<<32)) + reserve_crashkernel_low(); } #else static void __init reserve_crashkernel(void) @@ -612,7 +650,82 @@ static __init void reserve_ibft_region(void) memblock_reserve(addr, size); } -static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; +static bool __init snb_gfx_workaround_needed(void) +{ +#ifdef CONFIG_PCI + int i; + u16 vendor, devid; + static const __initconst u16 snb_ids[] = { + 0x0102, + 0x0112, + 0x0122, + 0x0106, + 0x0116, + 0x0126, + 0x010a, + }; + + /* Assume no if something weird is going on with PCI */ + if (!early_pci_allowed()) + return false; + + vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID); + if (vendor != 0x8086) + return false; + + devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID); + for (i = 0; i < ARRAY_SIZE(snb_ids); i++) + if (devid == snb_ids[i]) + return true; +#endif + + return false; +} + +/* + * Sandy Bridge graphics has trouble with certain ranges, exclude + * them from allocation. + */ +static void __init trim_snb_memory(void) +{ + static const __initconst unsigned long bad_pages[] = { + 0x20050000, + 0x20110000, + 0x20130000, + 0x20138000, + 0x40004000, + }; + int i; + + if (!snb_gfx_workaround_needed()) + return; + + printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n"); + + /* + * Reserve all memory below the 1 MB mark that has not + * already been reserved. + */ + memblock_reserve(0, 1<<20); + + for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { + if (memblock_reserve(bad_pages[i], PAGE_SIZE)) + printk(KERN_WARNING "failed to reserve 0x%08lx\n", + bad_pages[i]); + } +} + +/* + * Here we put platform-specific memory range workarounds, i.e. + * memory known to be corrupt or otherwise in need to be reserved on + * specific platforms. + * + * If this gets used more widely it could use a real dispatch mechanism. + */ +static void __init trim_platform_memory_ranges(void) +{ + trim_snb_memory(); +} static void __init trim_bios_range(void) { @@ -625,8 +738,7 @@ static void __init trim_bios_range(void) * since some BIOSes are known to corrupt low memory. See the * Kconfig help text for X86_RESERVE_LOW. */ - e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE), - E820_RAM, E820_RESERVED); + e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); /* * special case: Some BIOSen report the PC BIOS @@ -634,9 +746,33 @@ static void __init trim_bios_range(void) * take them out. */ e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } +/* called before trim_bios_range() to spare extra sanitize */ +static void __init e820_add_kernel_range(void) +{ + u64 start = __pa_symbol(_text); + u64 size = __pa_symbol(_end) - start; + + /* + * Complain if .text .data and .bss are not marked as E820_RAM and + * attempt to fix it by adding the range. We may have a confused BIOS, + * or the user may have used memmap=exactmap or memmap=xxM$yyM to + * exclude kernel range. If we really are running on top non-RAM, + * we will crash later anyways. + */ + if (e820_all_mapped(start, start + size, E820_RAM)) + return; + + pr_warn(".text .data .bss are not marked as E820_RAM!\n"); + e820_remove_range(start, size, E820_RAM, 0); + e820_add_region(start, size, E820_RAM); +} + +static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; + static int __init parse_reservelow(char *p) { unsigned long long size; @@ -659,6 +795,11 @@ static int __init parse_reservelow(char *p) early_param("reservelow", parse_reservelow); +static void __init trim_low_memory_range(void) +{ + memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -674,6 +815,17 @@ early_param("reservelow", parse_reservelow); void __init setup_arch(char **cmdline_p) { + memblock_reserve(__pa_symbol(_text), + (unsigned long)__bss_stop - (unsigned long)_text); + + early_reserve_initrd(); + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); @@ -733,15 +885,15 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, "EL32", 4)) { - efi_enabled = 1; - efi_64bit = false; + set_bit(EFI_BOOT, &x86_efi_facility); } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, "EL64", 4)) { - efi_enabled = 1; - efi_64bit = true; + set_bit(EFI_BOOT, &x86_efi_facility); + set_bit(EFI_64BIT, &x86_efi_facility); } - if (efi_enabled && efi_memblock_x86_reserve_range()) - efi_enabled = 0; + + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); #endif x86_init.oem.arch_setup(); @@ -761,12 +913,12 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = _brk_end; - code_resource.start = virt_to_phys(_text); - code_resource.end = virt_to_phys(_etext)-1; - data_resource.start = virt_to_phys(_etext); - data_resource.end = virt_to_phys(_edata)-1; - bss_resource.start = virt_to_phys(&__bss_start); - bss_resource.end = virt_to_phys(&__bss_stop)-1; + code_resource.start = __pa_symbol(_text); + code_resource.end = __pa_symbol(_etext)-1; + data_resource.start = __pa_symbol(_etext); + data_resource.end = __pa_symbol(_edata)-1; + bss_resource.start = __pa_symbol(__bss_start); + bss_resource.end = __pa_symbol(__bss_stop)-1; #ifdef CONFIG_CMDLINE_BOOL #ifdef CONFIG_CMDLINE_OVERRIDE @@ -814,7 +966,7 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); - if (efi_enabled) + if (efi_enabled(EFI_BOOT)) efi_init(); dmi_scan_machine(); @@ -832,6 +984,7 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); + e820_add_kernel_range(); trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { @@ -881,6 +1034,8 @@ void __init setup_arch(char **cmdline_p) reserve_ibft_region(); + early_alloc_pgt_buf(); + /* * Need to conclude brk, before memblock_x86_fill() * it could use memblock_find_in_range, could overlap with @@ -890,14 +1045,14 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); - memblock.current_limit = get_max_mapped(); + memblock.current_limit = ISA_END_ADDRESS; memblock_x86_fill(); /* * The EFI specification says that boot service code won't be called * after ExitBootServices(). This is, in fact, a lie. */ - if (efi_enabled) + if (efi_enabled(EFI_MEMMAP)) efi_reserve_boot_services(); /* preallocate 4k for mptable mpc */ @@ -907,39 +1062,22 @@ void __init setup_arch(char **cmdline_p) setup_bios_corruption_check(); #endif +#ifdef CONFIG_X86_32 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", (max_pfn_mapped<<PAGE_SHIFT) - 1); +#endif - setup_real_mode(); - - init_gbpages(); - - /* max_pfn_mapped is updated here */ - max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); - max_pfn_mapped = max_low_pfn_mapped; + reserve_real_mode(); -#ifdef CONFIG_X86_64 - if (max_pfn > max_low_pfn) { - int i; - unsigned long start, end; - unsigned long start_pfn, end_pfn; + trim_platform_memory_ranges(); + trim_low_memory_range(); - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, - NULL) { + init_mem_mapping(); - end = PFN_PHYS(end_pfn); - if (end <= (1UL<<32)) - continue; + early_trap_pf_init(); - start = PFN_PHYS(start_pfn); - max_pfn_mapped = init_memory_mapping( - max((1UL<<32), start), end); - } + setup_real_mode(); - /* can we preseve max_low_pfn ?*/ - max_low_pfn = max_pfn; - } -#endif memblock.current_limit = get_max_mapped(); dma_contiguous_reserve(0); @@ -956,6 +1094,10 @@ void __init setup_arch(char **cmdline_p) reserve_initrd(); +#if defined(CONFIG_ACPI) && defined(CONFIG_BLK_DEV_INITRD) + acpi_initrd_override((void *)initrd_start, initrd_end - initrd_start); +#endif + reserve_crashkernel(); vsmp_init(); @@ -1034,7 +1176,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) - if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) + if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) conswitchp = &vga_con; #elif defined(CONFIG_DUMMY_CONSOLE) conswitchp = &dummy_con; @@ -1051,14 +1193,13 @@ void __init setup_arch(char **cmdline_p) register_refined_jiffies(CLOCK_TICK_RATE); #ifdef CONFIG_EFI - /* Once setup is done above, disable efi_enabled on mismatched - * firmware/kernel archtectures since there is no support for - * runtime services. + /* Once setup is done above, unmap the EFI memory map on + * mismatched firmware/kernel archtectures since there is no + * support for runtime services. */ - if (efi_enabled && IS_ENABLED(CONFIG_X86_64) != efi_64bit) { + if (efi_enabled(EFI_BOOT) && !efi_is_native()) { pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); efi_unmap_memmap(); - efi_enabled = 0; } #endif } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 70b27ee6118..69562992e45 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -22,6 +22,7 @@ #include <linux/uaccess.h> #include <linux/user-return-notifier.h> #include <linux/uprobes.h> +#include <linux/context_tracking.h> #include <asm/processor.h> #include <asm/ucontext.h> @@ -277,7 +278,7 @@ static const struct { }; static int -__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, +__setup_frame(int sig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct sigframe __user *frame; @@ -285,7 +286,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, int err = 0; void __user *fpstate = NULL; - frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return -EFAULT; @@ -306,8 +307,8 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); else restorer = &frame->retcode; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; /* Set up to return from userspace. */ err |= __put_user(restorer, &frame->pretcode); @@ -326,7 +327,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; - regs->ip = (unsigned long)ka->sa.sa_handler; + regs->ip = (unsigned long)ksig->ka.sa.sa_handler; regs->ax = (unsigned long)sig; regs->dx = 0; regs->cx = 0; @@ -339,7 +340,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, return 0; } -static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +static int __setup_rt_frame(int sig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; @@ -347,7 +348,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, int err = 0; void __user *fpstate = NULL; - frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return -EFAULT; @@ -363,15 +364,12 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - put_user_ex(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= __save_altstack(&frame->uc.uc_stack, regs->sp); /* Set up to return from userspace. */ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; put_user_ex(restorer, &frame->pretcode); /* @@ -384,7 +382,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); } put_user_catch(err); - err |= copy_siginfo_to_user(&frame->info, info); + err |= copy_siginfo_to_user(&frame->info, &ksig->info); err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); @@ -394,7 +392,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; - regs->ip = (unsigned long)ka->sa.sa_handler; + regs->ip = (unsigned long)ksig->ka.sa.sa_handler; regs->ax = (unsigned long)sig; regs->dx = (unsigned long)&frame->info; regs->cx = (unsigned long)&frame->uc; @@ -407,21 +405,20 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, return 0; } #else /* !CONFIG_X86_32 */ -static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +static int __setup_rt_frame(int sig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; void __user *fp = NULL; int err = 0; - struct task_struct *me = current; - frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp); + frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return -EFAULT; - if (ka->sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user(&frame->info, info)) + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } @@ -432,16 +429,13 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - put_user_ex(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= __save_altstack(&frame->uc.uc_stack, regs->sp); /* Set up to return from userspace. If provided, use a stub already in userspace. */ /* x86-64 should always use SA_RESTORER. */ - if (ka->sa.sa_flags & SA_RESTORER) { - put_user_ex(ka->sa.sa_restorer, &frame->pretcode); + if (ksig->ka.sa.sa_flags & SA_RESTORER) { + put_user_ex(ksig->ka.sa.sa_restorer, &frame->pretcode); } else { /* could use a vstub here */ err |= -EFAULT; @@ -463,7 +457,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, next argument after the signal number on the stack. */ regs->si = (unsigned long)&frame->info; regs->dx = (unsigned long)&frame->uc; - regs->ip = (unsigned long) ka->sa.sa_handler; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; regs->sp = (unsigned long)frame; @@ -475,8 +469,8 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, } #endif /* CONFIG_X86_32 */ -static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, - siginfo_t *info, compat_sigset_t *set, +static int x32_setup_rt_frame(struct ksignal *ksig, + compat_sigset_t *set, struct pt_regs *regs) { #ifdef CONFIG_X86_X32_ABI @@ -485,13 +479,13 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, int err = 0; void __user *fpstate = NULL; - frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return -EFAULT; - if (ka->sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user32(&frame->info, info)) + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user32(&frame->info, &ksig->info)) return -EFAULT; } @@ -502,14 +496,11 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, else put_user_ex(0, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); - put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - put_user_ex(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); - if (ka->sa.sa_flags & SA_RESTORER) { - restorer = ka->sa.sa_restorer; + if (ksig->ka.sa.sa_flags & SA_RESTORER) { + restorer = ksig->ka.sa.sa_restorer; } else { /* could use a vstub here */ restorer = NULL; @@ -527,10 +518,10 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; - regs->ip = (unsigned long) ka->sa.sa_handler; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; /* We use the x32 calling convention here... */ - regs->di = sig; + regs->di = ksig->sig; regs->si = (unsigned long) &frame->info; regs->dx = (unsigned long) &frame->uc; @@ -544,77 +535,13 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, return 0; } -#ifdef CONFIG_X86_32 -/* - * Atomically swap in the new signal mask, and wait for a signal. - */ -asmlinkage int -sys_sigsuspend(int history0, int history1, old_sigset_t mask) -{ - sigset_t blocked; - siginitset(&blocked, mask); - return sigsuspend(&blocked); -} - -asmlinkage int -sys_sigaction(int sig, const struct old_sigaction __user *act, - struct old_sigaction __user *oact) -{ - struct k_sigaction new_ka, old_ka; - int ret = 0; - - if (act) { - old_sigset_t mask; - - if (!access_ok(VERIFY_READ, act, sizeof(*act))) - return -EFAULT; - - get_user_try { - get_user_ex(new_ka.sa.sa_handler, &act->sa_handler); - get_user_ex(new_ka.sa.sa_flags, &act->sa_flags); - get_user_ex(mask, &act->sa_mask); - get_user_ex(new_ka.sa.sa_restorer, &act->sa_restorer); - } get_user_catch(ret); - - if (ret) - return -EFAULT; - siginitset(&new_ka.sa.sa_mask, mask); - } - - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - - if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact))) - return -EFAULT; - - put_user_try { - put_user_ex(old_ka.sa.sa_handler, &oact->sa_handler); - put_user_ex(old_ka.sa.sa_flags, &oact->sa_flags); - put_user_ex(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); - put_user_ex(old_ka.sa.sa_restorer, &oact->sa_restorer); - } put_user_catch(ret); - - if (ret) - return -EFAULT; - } - - return ret; -} -#endif /* CONFIG_X86_32 */ - -long -sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, - struct pt_regs *regs) -{ - return do_sigaltstack(uss, uoss, regs->sp); -} - /* * Do a signal return; undo the signal stack. */ #ifdef CONFIG_X86_32 -unsigned long sys_sigreturn(struct pt_regs *regs) +unsigned long sys_sigreturn(void) { + struct pt_regs *regs = current_pt_regs(); struct sigframe __user *frame; unsigned long ax; sigset_t set; @@ -641,8 +568,9 @@ badframe: } #endif /* CONFIG_X86_32 */ -long sys_rt_sigreturn(struct pt_regs *regs) +long sys_rt_sigreturn(void) { + struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; unsigned long ax; sigset_t set; @@ -658,7 +586,7 @@ long sys_rt_sigreturn(struct pt_regs *regs) if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) + if (restore_altstack(&frame->uc.uc_stack)) goto badframe; return ax; @@ -683,30 +611,29 @@ static int signr_convert(int sig) } static int -setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - struct pt_regs *regs) +setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { - int usig = signr_convert(sig); + int usig = signr_convert(ksig->sig); sigset_t *set = sigmask_to_save(); compat_sigset_t *cset = (compat_sigset_t *) set; /* Set up the stack frame */ if (is_ia32_frame()) { - if (ka->sa.sa_flags & SA_SIGINFO) - return ia32_setup_rt_frame(usig, ka, info, cset, regs); + if (ksig->ka.sa.sa_flags & SA_SIGINFO) + return ia32_setup_rt_frame(usig, ksig, cset, regs); else - return ia32_setup_frame(usig, ka, cset, regs); + return ia32_setup_frame(usig, ksig, cset, regs); } else if (is_x32_frame()) { - return x32_setup_rt_frame(usig, ka, info, cset, regs); + return x32_setup_rt_frame(ksig, cset, regs); } else { - return __setup_rt_frame(sig, ka, info, set, regs); + return __setup_rt_frame(ksig->sig, ksig, set, regs); } } static void -handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - struct pt_regs *regs) +handle_signal(struct ksignal *ksig, struct pt_regs *regs) { + bool failed; /* Are we from a system call? */ if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ @@ -717,7 +644,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, break; case -ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { + if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { regs->ax = -EINTR; break; } @@ -737,26 +664,21 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; - if (setup_rt_frame(sig, ka, info, regs) < 0) { - force_sigsegv(sig, current); - return; + failed = (setup_rt_frame(ksig, regs) < 0); + if (!failed) { + /* + * Clear the direction flag as per the ABI for function entry. + */ + regs->flags &= ~X86_EFLAGS_DF; + /* + * Clear TF when entering the signal handler, but + * notify any tracer that was single-stepping it. + * The tracer may want to single-step inside the + * handler too. + */ + regs->flags &= ~X86_EFLAGS_TF; } - - /* - * Clear the direction flag as per the ABI for function entry. - */ - regs->flags &= ~X86_EFLAGS_DF; - - /* - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - regs->flags &= ~X86_EFLAGS_TF; - - signal_delivered(sig, info, ka, regs, - test_thread_flag(TIF_SINGLESTEP)); + signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP)); } #ifdef CONFIG_X86_32 @@ -773,14 +695,11 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ static void do_signal(struct pt_regs *regs) { - struct k_sigaction ka; - siginfo_t info; - int signr; + struct ksignal ksig; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); - if (signr > 0) { + if (get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ - handle_signal(signr, &info, &ka, regs); + handle_signal(&ksig, regs); return; } @@ -816,7 +735,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { - rcu_user_exit(); + user_exit(); #ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ @@ -838,7 +757,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) fire_user_return_notifiers(); - rcu_user_enter(); + user_enter(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) @@ -859,12 +778,12 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) } #ifdef CONFIG_X86_X32_ABI -asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) +asmlinkage long sys32_x32_rt_sigreturn(void) { + struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; sigset_t set; unsigned long ax; - struct pt_regs tregs; frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); @@ -878,8 +797,7 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; - tregs = *regs; - if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) + if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; return ax; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c80a33bc528..9f190a2a00e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -68,6 +68,8 @@ #include <asm/mwait.h> #include <asm/apic.h> #include <asm/io_apic.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h> #include <asm/setup.h> #include <asm/uv/uv.h> #include <linux/mc146818rtc.h> @@ -125,8 +127,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; /* - * Report back to the Boot Processor. - * Running on AP. + * Report back to the Boot Processor during boot time or to the caller processor + * during CPU online. */ static void __cpuinit smp_callin(void) { @@ -138,15 +140,17 @@ static void __cpuinit smp_callin(void) * we may get here before an INIT-deassert IPI reaches * our local APIC. We have to wait for the IPI or we'll * lock up on an APIC access. + * + * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI. */ - if (apic->wait_for_init_deassert) + cpuid = smp_processor_id(); + if (apic->wait_for_init_deassert && cpuid != 0) apic->wait_for_init_deassert(&init_deasserted); /* * (This works even if the APIC is not enabled.) */ phys_id = read_apic_id(); - cpuid = smp_processor_id(); if (cpumask_test_cpu(cpuid, cpu_callin_mask)) { panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, phys_id, cpuid); @@ -228,6 +232,8 @@ static void __cpuinit smp_callin(void) cpumask_set_cpu(cpuid, cpu_callin_mask); } +static int cpu0_logical_apicid; +static int enable_start_cpu0; /* * Activate a secondary processor. */ @@ -243,6 +249,8 @@ notrace static void __cpuinit start_secondary(void *unused) preempt_disable(); smp_callin(); + enable_start_cpu0 = 0; + #ifdef CONFIG_X86_32 /* switch away from the initial page table */ load_cr3(swapper_pg_dir); @@ -279,19 +287,30 @@ notrace static void __cpuinit start_secondary(void *unused) cpu_idle(); } +void __init smp_store_boot_cpu_info(void) +{ + int id = 0; /* CPU 0 */ + struct cpuinfo_x86 *c = &cpu_data(id); + + *c = boot_cpu_data; + c->cpu_index = id; +} + /* * The bootstrap kernel entry code has set these up. Save them for * a given CPU */ - void __cpuinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = &cpu_data(id); *c = boot_cpu_data; c->cpu_index = id; - if (id != 0) - identify_secondary_cpu(c); + /* + * During boot time, CPU0 has this setup already. Save the info when + * bringing up AP or offlined CPU0. + */ + identify_secondary_cpu(c); } static bool __cpuinit @@ -313,7 +332,7 @@ do { \ static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + if (cpu_has_topoext) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; if (c->phys_proc_id == o->phys_proc_id && @@ -481,7 +500,7 @@ void __inquire_remote_apic(int apicid) * won't ... remember to clear down the APIC, etc later. */ int __cpuinit -wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) +wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; int maxlvt; @@ -489,7 +508,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) /* Target chip */ /* Boot on the stack */ /* Kick the second */ - apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid); + apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -649,6 +668,63 @@ static void __cpuinit announce_cpu(int cpu, int apicid) node, cpu, apicid); } +static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs) +{ + int cpu; + + cpu = smp_processor_id(); + if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0) + return NMI_HANDLED; + + return NMI_DONE; +} + +/* + * Wake up AP by INIT, INIT, STARTUP sequence. + * + * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS + * boot-strap code which is not a desired behavior for waking up BSP. To + * void the boot-strap code, wake up CPU0 by NMI instead. + * + * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined + * (i.e. physically hot removed and then hot added), NMI won't wake it up. + * We'll change this code in the future to wake up hard offlined CPU0 if + * real platform and request are available. + */ +static int __cpuinit +wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, + int *cpu0_nmi_registered) +{ + int id; + int boot_error; + + /* + * Wake up AP by INIT, INIT, STARTUP sequence. + */ + if (cpu) + return wakeup_secondary_cpu_via_init(apicid, start_ip); + + /* + * Wake up BSP by nmi. + * + * Register a NMI handler to help wake up CPU0. + */ + boot_error = register_nmi_handler(NMI_LOCAL, + wakeup_cpu0_nmi, 0, "wake_cpu0"); + + if (!boot_error) { + enable_start_cpu0 = 1; + *cpu0_nmi_registered = 1; + if (apic->dest_logical == APIC_DEST_LOGICAL) + id = cpu0_logical_apicid; + else + id = apicid; + boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); + } + + return boot_error; +} + /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. @@ -664,6 +740,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) unsigned long boot_error = 0; int timeout; + int cpu0_nmi_registered = 0; /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); @@ -711,13 +788,16 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) } /* - * Kick the secondary CPU. Use the method in the APIC driver - * if it's defined - or use an INIT boot APIC message otherwise: + * Wake up a CPU in difference cases: + * - Use the method in the APIC driver if it's defined + * Otherwise, + * - Use an INIT boot APIC message for APs or NMI for BSP. */ if (apic->wakeup_secondary_cpu) boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); else - boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip); + boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, + &cpu0_nmi_registered); if (!boot_error) { /* @@ -782,6 +862,13 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) */ smpboot_restore_warm_reset_vector(); } + /* + * Clean up the nmi handler. Do this after the callin and callout sync + * to avoid impact of possible long unregister time. + */ + if (cpu0_nmi_registered) + unregister_nmi_handler(NMI_LOCAL, "wake_cpu0"); + return boot_error; } @@ -795,7 +882,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); - if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || + if (apicid == BAD_APICID || !physid_isset(apicid, phys_cpu_present_map) || !apic->apic_id_valid(apicid)) { pr_err("%s: bad cpu %d\n", __func__, cpu); @@ -818,6 +905,9 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + /* the FPU context is blank, nobody can own it */ + __cpu_disable_lazy_restore(cpu); + err = do_boot_cpu(apicid, cpu, tidle); if (err) { pr_debug("do_boot_cpu failed %d\n", err); @@ -990,7 +1080,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) /* * Setup boot CPU information */ - smp_store_cpu_info(0); /* Final full version of the data */ + smp_store_boot_cpu_info(); /* Final full version of the data */ cpumask_copy(cpu_callin_mask, cpumask_of(0)); mb(); @@ -1026,6 +1116,11 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) */ setup_local_APIC(); + if (x2apic_mode) + cpu0_logical_apicid = apic_read(APIC_LDR); + else + cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); + /* * Enable IO APIC before setting up error vector */ @@ -1214,19 +1309,6 @@ void cpu_disable_common(void) int native_cpu_disable(void) { - int cpu = smp_processor_id(); - - /* - * Perhaps use cpufreq to drop frequency, but that could go - * into generic code. - * - * We won't take down the boot processor on i386 due to some - * interrupts only being able to be serviced by the BSP. - * Especially so if we're not using an IOAPIC -zwane - */ - if (cpu == 0) - return -EBUSY; - clear_local_APIC(); cpu_disable_common(); @@ -1266,6 +1348,14 @@ void play_dead_common(void) local_irq_disable(); } +static bool wakeup_cpu0(void) +{ + if (smp_processor_id() == 0 && enable_start_cpu0) + return true; + + return false; +} + /* * We need to flush the caches before going to sleep, lest we have * dirty data in our caches when we come back up. @@ -1275,11 +1365,10 @@ static inline void mwait_play_dead(void) unsigned int eax, ebx, ecx, edx; unsigned int highest_cstate = 0; unsigned int highest_subcstate = 0; - int i; void *mwait_ptr; - struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info); + int i; - if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c))) + if (!this_cpu_has(X86_FEATURE_MWAIT)) return; if (!this_cpu_has(X86_FEATURE_CLFLSH)) return; @@ -1329,6 +1418,11 @@ static inline void mwait_play_dead(void) __monitor(mwait_ptr, 0, 0); mb(); __mwait(eax, 0); + /* + * If NMI wants to wake up CPU0, start CPU0. + */ + if (wakeup_cpu0()) + start_cpu0(); } } @@ -1339,6 +1433,11 @@ static inline void hlt_play_dead(void) while (1) { native_halt(); + /* + * If NMI wants to wake up CPU0, start CPU0. + */ + if (wakeup_cpu0()) + start_cpu0(); } } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index cd3b2438a98..9b4d51d0c0d 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -165,10 +165,11 @@ void set_task_blockstep(struct task_struct *task, bool on) * Ensure irq/preemption can't change debugctl in between. * Note also that both TIF_BLOCKSTEP and debugctl should * be changed atomically wrt preemption. - * FIXME: this means that set/clear TIF_BLOCKSTEP is simply - * wrong if task != current, SIGKILL can wakeup the stopped - * tracee and set/clear can play with the running task, this - * can confuse the next __switch_to_xtra(). + * + * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if + * task is current or it can't be running, otherwise we can race + * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but + * PTRACE_KILL is not safe. */ local_irq_disable(); debugctl = get_debugctlmsr(); diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index b4d3c3927dd..dbded5aedb8 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -21,37 +21,23 @@ /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. - * - * @flags denotes the allocation direction - bottomup or topdown - - * or vDSO; see call sites below. */ -unsigned long align_addr(unsigned long addr, struct file *filp, - enum align_flags flags) +static unsigned long get_align_mask(void) { - unsigned long tmp_addr; - /* handle 32- and 64-bit case with a single conditional */ if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) - return addr; + return 0; if (!(current->flags & PF_RANDOMIZE)) - return addr; - - if (!((flags & ALIGN_VDSO) || filp)) - return addr; - - tmp_addr = addr; - - /* - * We need an address which is <= than the original - * one only when in topdown direction. - */ - if (!(flags & ALIGN_TOPDOWN)) - tmp_addr += va_align.mask; + return 0; - tmp_addr &= ~va_align.mask; + return va_align.mask; +} - return tmp_addr; +unsigned long align_vdso_addr(unsigned long addr) +{ + unsigned long align_mask = get_align_mask(); + return (addr + align_mask) & ~align_mask; } static int __init control_va_addr_alignment(char *str) @@ -126,7 +112,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long start_addr; + struct vm_unmapped_area_info info; unsigned long begin, end; if (flags & MAP_FIXED) @@ -144,50 +130,16 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, (!vma || addr + len <= vma->vm_start)) return addr; } - if (((flags & MAP_32BIT) || test_thread_flag(TIF_ADDR32)) - && len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; - mm->free_area_cache = begin; - } - addr = mm->free_area_cache; - if (addr < begin) - addr = begin; - start_addr = addr; - -full_search: - - addr = align_addr(addr, filp, 0); - - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (end - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. - */ - if (start_addr != begin) { - start_addr = addr = begin; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - /* - * Remember the place where we stopped the search: - */ - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = vma->vm_end; - addr = align_addr(addr, filp, 0); - } + info.flags = 0; + info.length = len; + info.low_limit = begin; + info.high_limit = end; + info.align_mask = filp ? get_align_mask() : 0; + info.align_offset = pgoff << PAGE_SHIFT; + return vm_unmapped_area(&info); } - unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, @@ -195,7 +147,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0, start_addr; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; /* requested length too big for entire address space */ if (len > TASK_SIZE) @@ -204,7 +157,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (flags & MAP_FIXED) return addr; - /* for MAP_32BIT mappings we force the legact mmap base */ + /* for MAP_32BIT mappings we force the legacy mmap base */ if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) goto bottomup; @@ -217,51 +170,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } - /* check if free_area_cache is useful for us */ - if (len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; - mm->free_area_cache = mm->mmap_base; - } - -try_again: - /* either no address requested or can't fit in requested address hole */ - start_addr = addr = mm->free_area_cache; - - if (addr < len) - goto fail; - - addr -= len; - do { - addr = align_addr(addr, filp, ALIGN_TOPDOWN); - - /* - * Lookup failure means no vma is above this address, - * else if new region fits below vma->vm_start, - * return with success: - */ - vma = find_vma(mm, addr); - if (!vma || addr+len <= vma->vm_start) - /* remember the address as a hint for next time */ - return mm->free_area_cache = addr; - - /* remember the largest hole we saw so far */ - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - - /* try just below the current vma->vm_start */ - addr = vma->vm_start-len; - } while (len < vma->vm_start); - -fail: - /* - * if hint left us with no space for the requested - * mapping then try again: - */ - if (start_addr != mm->mmap_base) { - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = 0; - goto try_again; - } + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = mm->mmap_base; + info.align_mask = filp ? get_align_mask() : 0; + info.align_offset = pgoff << PAGE_SHIFT; + addr = vm_unmapped_area(&info); + if (!(addr & ~PAGE_MASK)) + return addr; + VM_BUG_ON(addr != -ENOMEM); bottomup: /* @@ -270,14 +188,5 @@ bottomup: * can happen with large stack limits and large mmap() * allocations. */ - mm->cached_hole_size = ~0UL; - mm->free_area_cache = TASK_UNMAPPED_BASE; - addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); - /* - * Restore the topdown base: - */ - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = ~0UL; - - return addr; + return arch_get_unmapped_area(filp, addr0, len, pgoff, flags); } diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 76ee97709a0..6e60b5fe224 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -30,23 +30,110 @@ #include <linux/mmzone.h> #include <linux/init.h> #include <linux/smp.h> +#include <linux/irq.h> #include <asm/cpu.h> static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); #ifdef CONFIG_HOTPLUG_CPU + +#ifdef CONFIG_BOOTPARAM_HOTPLUG_CPU0 +static int cpu0_hotpluggable = 1; +#else +static int cpu0_hotpluggable; +static int __init enable_cpu0_hotplug(char *str) +{ + cpu0_hotpluggable = 1; + return 1; +} + +__setup("cpu0_hotplug", enable_cpu0_hotplug); +#endif + +#ifdef CONFIG_DEBUG_HOTPLUG_CPU0 +/* + * This function offlines a CPU as early as possible and allows userspace to + * boot up without the CPU. The CPU can be onlined back by user after boot. + * + * This is only called for debugging CPU offline/online feature. + */ +int __ref _debug_hotplug_cpu(int cpu, int action) +{ + struct device *dev = get_cpu_device(cpu); + int ret; + + if (!cpu_is_hotpluggable(cpu)) + return -EINVAL; + + cpu_hotplug_driver_lock(); + + switch (action) { + case 0: + ret = cpu_down(cpu); + if (!ret) { + pr_info("CPU %u is now offline\n", cpu); + kobject_uevent(&dev->kobj, KOBJ_OFFLINE); + } else + pr_debug("Can't offline CPU%d.\n", cpu); + break; + case 1: + ret = cpu_up(cpu); + if (!ret) + kobject_uevent(&dev->kobj, KOBJ_ONLINE); + else + pr_debug("Can't online CPU%d.\n", cpu); + break; + default: + ret = -EINVAL; + } + + cpu_hotplug_driver_unlock(); + + return ret; +} + +static int __init debug_hotplug_cpu(void) +{ + _debug_hotplug_cpu(0, 0); + return 0; +} + +late_initcall_sync(debug_hotplug_cpu); +#endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */ + int __ref arch_register_cpu(int num) { + struct cpuinfo_x86 *c = &cpu_data(num); + + /* + * Currently CPU0 is only hotpluggable on Intel platforms. Other + * vendors can add hotplug support later. + */ + if (c->x86_vendor != X86_VENDOR_INTEL) + cpu0_hotpluggable = 0; + /* - * CPU0 cannot be offlined due to several - * restrictions and assumptions in kernel. This basically - * doesn't add a control file, one cannot attempt to offline - * BSP. + * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate + * depends on BSP. PIC interrupts depend on BSP. * - * Also certain PCI quirks require not to enable hotplug control - * for all CPU's. + * If the BSP depencies are under control, one can tell kernel to + * enable BSP hotplug. This basically adds a control file and + * one can attempt to offline BSP. */ - if (num) + if (num == 0 && cpu0_hotpluggable) { + unsigned int irq; + /* + * We won't take down the boot processor on i386 if some + * interrupts only are able to be serviced by the BSP in PIC. + */ + for_each_active_irq(irq) { + if (!IO_APIC_IRQ(irq) && irq_has_action(irq)) { + cpu0_hotpluggable = 0; + break; + } + } + } + if (num || cpu0_hotpluggable) per_cpu(cpu_devices, num).cpu.hotpluggable = 1; return register_cpu(&per_cpu(cpu_devices, num).cpu, num); diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c new file mode 100644 index 00000000000..25b993729f9 --- /dev/null +++ b/arch/x86/kernel/trace_clock.c @@ -0,0 +1,21 @@ +/* + * X86 trace clocks + */ +#include <asm/trace_clock.h> +#include <asm/barrier.h> +#include <asm/msr.h> + +/* + * trace_clock_x86_tsc(): A clock that is just the cycle counter. + * + * Unlike the other clocks, this is not in nanoseconds. + */ +u64 notrace trace_clock_x86_tsc(void) +{ + u64 ret; + + rdtsc_barrier(); + rdtscll(ret); + + return ret; +} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8276dc6794c..68bda7a8415 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -55,7 +55,7 @@ #include <asm/i387.h> #include <asm/fpu-internal.h> #include <asm/mce.h> -#include <asm/rcu.h> +#include <asm/context_tracking.h> #include <asm/mach_traps.h> @@ -69,9 +69,6 @@ asmlinkage int system_call(void); -/* Do we ignore FPU interrupts ? */ -char ignore_fpu_irq; - /* * The IDT has to be page-aligned to simplify the Pentium * F0 0F bug workaround. @@ -564,9 +561,6 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { -#ifdef CONFIG_X86_32 - ignore_fpu_irq = 1; -#endif exception_enter(regs); math_error(regs, error_code, X86_TRAP_MF); exception_exit(regs); @@ -694,10 +688,19 @@ void __init early_trap_init(void) set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); /* int3 can be called from all */ set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); +#ifdef CONFIG_X86_32 set_intr_gate(X86_TRAP_PF, &page_fault); +#endif load_idt(&idt_descr); } +void __init early_trap_pf_init(void) +{ +#ifdef CONFIG_X86_64 + set_intr_gate(X86_TRAP_PF, &page_fault); +#endif +} + void __init trap_init(void) { int i; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cfa5d4f7ca5..4b9ea101fe3 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -77,6 +77,12 @@ unsigned long long sched_clock(void) __attribute__((alias("native_sched_clock"))); #endif +unsigned long long native_read_tsc(void) +{ + return __native_read_tsc(); +} +EXPORT_SYMBOL(native_read_tsc); + int check_tsc_unstable(void) { return tsc_unstable; @@ -617,7 +623,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) ns_now = __cycles_2_ns(tsc_now); if (cpu_khz) { - *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + + cpu_khz / 2) / cpu_khz; *offset = ns_now - mult_frac(tsc_now, *scale, (1UL << CYC2NS_SCALE_FACTOR)); } diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index aafa5557b39..0ba4cfb4f41 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -478,6 +478,11 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) regs->ip = current->utask->xol_vaddr; pre_xol_rip_insn(auprobe, regs, autask); + autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); + regs->flags |= X86_EFLAGS_TF; + if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) + set_task_blockstep(current, false); + return 0; } @@ -603,6 +608,16 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) if (auprobe->fixups & UPROBE_FIX_CALL) result = adjust_ret_addr(regs->sp, correction); + /* + * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP + * so we can get an extra SIGTRAP if we do not clear TF. We need + * to examine the opcode to make it right. + */ + if (utask->autask.saved_tf) + send_sig(SIGTRAP, current, 0); + else if (!(auprobe->fixups & UPROBE_FIX_SETF)) + regs->flags &= ~X86_EFLAGS_TF; + return result; } @@ -647,6 +662,10 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) current->thread.trap_nr = utask->autask.saved_trap_nr; handle_riprel_post_xol(auprobe, regs, NULL); instruction_pointer_set(regs, utask->vaddr); + + /* clear TF if it was set by us in arch_uprobe_pre_xol() */ + if (!utask->autask.saved_tf) + regs->flags &= ~X86_EFLAGS_TF; } /* @@ -661,8 +680,10 @@ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) if (auprobe->insn[i] == 0x66) continue; - if (auprobe->insn[i] == 0x90) + if (auprobe->insn[i] == 0x90) { + regs->ip += i + 1; return true; + } break; } @@ -676,38 +697,3 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) send_sig(SIGTRAP, current, 0); return ret; } - -void arch_uprobe_enable_step(struct arch_uprobe *auprobe) -{ - struct task_struct *task = current; - struct arch_uprobe_task *autask = &task->utask->autask; - struct pt_regs *regs = task_pt_regs(task); - - autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); - - regs->flags |= X86_EFLAGS_TF; - if (test_tsk_thread_flag(task, TIF_BLOCKSTEP)) - set_task_blockstep(task, false); -} - -void arch_uprobe_disable_step(struct arch_uprobe *auprobe) -{ - struct task_struct *task = current; - struct arch_uprobe_task *autask = &task->utask->autask; - bool trapped = (task->utask->state == UTASK_SSTEP_TRAPPED); - struct pt_regs *regs = task_pt_regs(task); - /* - * The state of TIF_BLOCKSTEP was not saved so we can get an extra - * SIGTRAP if we do not clear TF. We need to examine the opcode to - * make it right. - */ - if (unlikely(trapped)) { - if (!autask->saved_tf) - regs->flags &= ~X86_EFLAGS_TF; - } else { - if (autask->saved_tf) - send_sig(SIGTRAP, task, 0); - else if (!(auprobe->fixups & UPROBE_FIX_SETF)) - regs->flags &= ~X86_EFLAGS_TF; - } -} diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5c9687b1bde..1cf5766dde1 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -182,7 +182,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); - split_huge_page_pmd(mm, pmd); + split_huge_page_pmd_mm(mm, 0xA0000, pmd); if (pmd_none_or_clear_bad(pmd)) goto out; pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); @@ -202,7 +202,7 @@ out: static int do_vm86_irq_handling(int subfunction, int irqnumber); static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); -int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs) +int sys_vm86old(struct vm86_struct __user *v86) { struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. @@ -222,7 +222,7 @@ int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs) if (tmp) goto out; memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); - info.regs32 = regs; + info.regs32 = current_pt_regs(); tsk->thread.vm86_info = v86; do_sys_vm86(&info, tsk); ret = 0; /* we never return here */ @@ -231,7 +231,7 @@ out: } -int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs) +int sys_vm86(unsigned long cmd, unsigned long arg) { struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. @@ -272,7 +272,7 @@ int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs) ret = -EFAULT; if (tmp) goto out; - info.regs32 = regs; + info.regs32 = current_pt_regs(); info.vm86plus.is_vm86pus = 1; tsk->thread.vm86_info = (struct vm86_struct __user *)v86; do_sys_vm86(&info, tsk); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 3a3e8c9e280..9a907a67be8 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -145,19 +145,6 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } -#ifdef CONFIG_SECCOMP -static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr) -{ - if (!seccomp_mode(&tsk->seccomp)) - return 0; - task_pt_regs(tsk)->orig_ax = syscall_nr; - task_pt_regs(tsk)->ax = syscall_nr; - return __secure_computing(syscall_nr); -} -#else -#define vsyscall_seccomp(_tsk, _nr) 0 -#endif - static bool write_ok_or_segv(unsigned long ptr, size_t size) { /* @@ -190,10 +177,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; - int vsyscall_nr; + int vsyscall_nr, syscall_nr, tmp; int prev_sig_on_uaccess_error; long ret; - int skip; /* * No point in checking CS -- the only way to get here is a user mode @@ -225,56 +211,84 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) } tsk = current; - /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. - */ - prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; - current_thread_info()->sig_on_uaccess_error = 1; /* + * Check for access_ok violations and find the syscall nr. + * * NULL is a valid user pointer (in the access_ok sense) on 32-bit and * 64-bit, so we don't need to special-case it here. For all the * vsyscalls, NULL means "don't write anything" not "write it at * address 0". */ - ret = -EFAULT; - skip = 0; switch (vsyscall_nr) { case 0: - skip = vsyscall_seccomp(tsk, __NR_gettimeofday); - if (skip) - break; - if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || - !write_ok_or_segv(regs->si, sizeof(struct timezone))) - break; + !write_ok_or_segv(regs->si, sizeof(struct timezone))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_gettimeofday; + break; + + case 1: + if (!write_ok_or_segv(regs->di, sizeof(time_t))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_time; + break; + + case 2: + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_getcpu; + break; + } + + /* + * Handle seccomp. regs->ip must be the original value. + * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. + * + * We could optimize the seccomp disabled case, but performance + * here doesn't matter. + */ + regs->orig_ax = syscall_nr; + regs->ax = -ENOSYS; + tmp = secure_computing(syscall_nr); + if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { + warn_bad_vsyscall(KERN_DEBUG, regs, + "seccomp tried to change syscall nr or ip"); + do_exit(SIGSYS); + } + if (tmp) + goto do_ret; /* skip requested */ + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; + + ret = -EFAULT; + switch (vsyscall_nr) { + case 0: ret = sys_gettimeofday( (struct timeval __user *)regs->di, (struct timezone __user *)regs->si); break; case 1: - skip = vsyscall_seccomp(tsk, __NR_time); - if (skip) - break; - - if (!write_ok_or_segv(regs->di, sizeof(time_t))) - break; - ret = sys_time((time_t __user *)regs->di); break; case 2: - skip = vsyscall_seccomp(tsk, __NR_getcpu); - if (skip) - break; - - if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || - !write_ok_or_segv(regs->si, sizeof(unsigned))) - break; - ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, NULL); @@ -283,12 +297,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; - if (skip) { - if ((long)regs->ax <= 0L) /* seccomp errno emulation */ - goto do_ret; - goto done; /* seccomp trace/trap */ - } - +check_fault: if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, @@ -311,7 +320,6 @@ do_ret: /* Emulate a ret instruction. */ regs->ip = caller; regs->sp += 8; -done: return true; sigsegv: diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 1330dd10295..b014d9414d0 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -59,6 +59,9 @@ EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(memmove); +#ifndef CONFIG_DEBUG_VIRTUAL +EXPORT_SYMBOL(phys_base); +#endif EXPORT_SYMBOL(empty_zero_page); #ifndef CONFIG_PARAVIRT EXPORT_SYMBOL(native_load_gs_index); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 7a3d075a814..45a14dbbdda 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -19,6 +19,7 @@ #include <asm/time.h> #include <asm/irq.h> #include <asm/io_apic.h> +#include <asm/hpet.h> #include <asm/pat.h> #include <asm/tsc.h> #include <asm/iommu.h> @@ -62,10 +63,6 @@ struct x86_init_ops x86_init __initdata = { .banner = default_banner, }, - .mapping = { - .pagetable_reserve = native_pagetable_reserve, - }, - .paging = { .pagetable_init = native_pagetable_init, }, @@ -111,15 +108,22 @@ struct x86_platform_ops x86_platform = { EXPORT_SYMBOL_GPL(x86_platform); struct x86_msi_ops x86_msi = { - .setup_msi_irqs = native_setup_msi_irqs, - .teardown_msi_irq = native_teardown_msi_irq, - .teardown_msi_irqs = default_teardown_msi_irqs, - .restore_msi_irqs = default_restore_msi_irqs, + .setup_msi_irqs = native_setup_msi_irqs, + .compose_msi_msg = native_compose_msi_msg, + .teardown_msi_irq = native_teardown_msi_irq, + .teardown_msi_irqs = default_teardown_msi_irqs, + .restore_msi_irqs = default_restore_msi_irqs, + .setup_hpet_msi = default_setup_hpet_msi, }; struct x86_io_apic_ops x86_io_apic_ops = { - .init = native_io_apic_init_mappings, - .read = native_io_apic_read, - .write = native_io_apic_write, - .modify = native_io_apic_modify, + .init = native_io_apic_init_mappings, + .read = native_io_apic_read, + .write = native_io_apic_write, + .modify = native_io_apic_modify, + .disable = native_disable_io_apic, + .print_entries = native_io_apic_print_entries, + .set_affinity = native_ioapic_set_affinity, + .setup_entry = native_setup_ioapic_entry, + .eoi_ioapic_pin = native_eoi_ioapic_pin, }; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ec79e773342..a20ecb5b6cb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -320,6 +320,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, if (index == 0) { entry->ebx &= kvm_supported_word9_x86_features; cpuid_mask(&entry->ebx, 9); + // TSC_ADJUST is emulated + entry->ebx |= F(TSC_ADJUST); } else entry->ebx = 0; entry->eax = 0; @@ -659,6 +661,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) } else *eax = *ebx = *ecx = *edx = 0; } +EXPORT_SYMBOL_GPL(kvm_cpuid); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a10e4601685..b7fd0798488 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -24,10 +24,21 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; + if (!static_cpu_has(X86_FEATURE_XSAVE)) + return 0; + best = kvm_find_cpuid_entry(vcpu, 1, 0); return best && (best->ecx & bit(X86_FEATURE_XSAVE)); } +static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST)); +} + static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 39171cb307e..a335cc6cde7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -24,6 +24,7 @@ #include "kvm_cache_regs.h" #include <linux/module.h> #include <asm/kvm_emulate.h> +#include <linux/stringify.h> #include "x86.h" #include "tss.h" @@ -43,7 +44,7 @@ #define OpCL 9ull /* CL register (for shifts) */ #define OpImmByte 10ull /* 8-bit sign extended immediate */ #define OpOne 11ull /* Implied 1 */ -#define OpImm 12ull /* Sign extended immediate */ +#define OpImm 12ull /* Sign extended up to 32-bit immediate */ #define OpMem16 13ull /* Memory operand (16-bit). */ #define OpMem32 14ull /* Memory operand (32-bit). */ #define OpImmU 15ull /* Immediate operand, zero extended */ @@ -58,6 +59,7 @@ #define OpFS 24ull /* FS */ #define OpGS 25ull /* GS */ #define OpMem8 26ull /* 8-bit zero extended memory operand */ +#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */ #define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) @@ -101,6 +103,7 @@ #define SrcMemFAddr (OpMemFAddr << SrcShift) #define SrcAcc (OpAcc << SrcShift) #define SrcImmU16 (OpImmU16 << SrcShift) +#define SrcImm64 (OpImm64 << SrcShift) #define SrcDX (OpDX << SrcShift) #define SrcMem8 (OpMem8 << SrcShift) #define SrcMask (OpMask << SrcShift) @@ -113,6 +116,7 @@ #define GroupDual (2<<15) /* Alternate decoding of mod == 3 */ #define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ +#define Escape (5<<15) /* Escape to coprocessor instruction */ #define Sse (1<<18) /* SSE Vector instruction */ /* Generic ModRM decode. */ #define ModRM (1<<19) @@ -146,6 +150,8 @@ #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ #define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */ #define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ +#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ +#define NoWrite ((u64)1 << 45) /* No writeback */ #define X2(x...) x, x #define X3(x...) X2(x), x @@ -156,6 +162,27 @@ #define X8(x...) X4(x), X4(x) #define X16(x...) X8(x), X8(x) +#define NR_FASTOP (ilog2(sizeof(ulong)) + 1) +#define FASTOP_SIZE 8 + +/* + * fastop functions have a special calling convention: + * + * dst: [rdx]:rax (in/out) + * src: rbx (in/out) + * src2: rcx (in) + * flags: rflags (in/out) + * + * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for + * different operand sizes can be reached by calculation, rather than a jump + * table (which would be bigger than the code). + * + * fastop functions are declared as taking a never-defined fastop parameter, + * so they can't be called from C directly. + */ + +struct fastop; + struct opcode { u64 flags : 56; u64 intercept : 8; @@ -164,6 +191,8 @@ struct opcode { const struct opcode *group; const struct group_dual *gdual; const struct gprefix *gprefix; + const struct escape *esc; + void (*fastop)(struct fastop *fake); } u; int (*check_perm)(struct x86_emulate_ctxt *ctxt); }; @@ -180,6 +209,11 @@ struct gprefix { struct opcode pfx_f3; }; +struct escape { + struct opcode op[8]; + struct opcode high[64]; +}; + /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) @@ -407,6 +441,97 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) } \ } while (0) +static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); + +#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t" +#define FOP_RET "ret \n\t" + +#define FOP_START(op) \ + extern void em_##op(struct fastop *fake); \ + asm(".pushsection .text, \"ax\" \n\t" \ + ".global em_" #op " \n\t" \ + FOP_ALIGN \ + "em_" #op ": \n\t" + +#define FOP_END \ + ".popsection") + +#define FOPNOP() FOP_ALIGN FOP_RET + +#define FOP1E(op, dst) \ + FOP_ALIGN #op " %" #dst " \n\t" FOP_RET + +#define FASTOP1(op) \ + FOP_START(op) \ + FOP1E(op##b, al) \ + FOP1E(op##w, ax) \ + FOP1E(op##l, eax) \ + ON64(FOP1E(op##q, rax)) \ + FOP_END + +#define FOP2E(op, dst, src) \ + FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET + +#define FASTOP2(op) \ + FOP_START(op) \ + FOP2E(op##b, al, bl) \ + FOP2E(op##w, ax, bx) \ + FOP2E(op##l, eax, ebx) \ + ON64(FOP2E(op##q, rax, rbx)) \ + FOP_END + +/* 2 operand, word only */ +#define FASTOP2W(op) \ + FOP_START(op) \ + FOPNOP() \ + FOP2E(op##w, ax, bx) \ + FOP2E(op##l, eax, ebx) \ + ON64(FOP2E(op##q, rax, rbx)) \ + FOP_END + +/* 2 operand, src is CL */ +#define FASTOP2CL(op) \ + FOP_START(op) \ + FOP2E(op##b, al, cl) \ + FOP2E(op##w, ax, cl) \ + FOP2E(op##l, eax, cl) \ + ON64(FOP2E(op##q, rax, cl)) \ + FOP_END + +#define FOP3E(op, dst, src, src2) \ + FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET + +/* 3-operand, word-only, src2=cl */ +#define FASTOP3WCL(op) \ + FOP_START(op) \ + FOPNOP() \ + FOP3E(op##w, ax, bx, cl) \ + FOP3E(op##l, eax, ebx, cl) \ + ON64(FOP3E(op##q, rax, rbx, cl)) \ + FOP_END + +/* Special case for SETcc - 1 instruction per cc */ +#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t" + +FOP_START(setcc) +FOP_SETCC(seto) +FOP_SETCC(setno) +FOP_SETCC(setc) +FOP_SETCC(setnc) +FOP_SETCC(setz) +FOP_SETCC(setnz) +FOP_SETCC(setbe) +FOP_SETCC(setnbe) +FOP_SETCC(sets) +FOP_SETCC(setns) +FOP_SETCC(setp) +FOP_SETCC(setnp) +FOP_SETCC(setl) +FOP_SETCC(setnl) +FOP_SETCC(setle) +FOP_SETCC(setnle) +FOP_END; + #define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ do { \ unsigned long _tmp; \ @@ -426,8 +551,7 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) _ASM_EXTABLE(1b, 3b) \ : "=m" ((ctxt)->eflags), "=&r" (_tmp), \ "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \ - : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val), \ - "a" (*rax), "d" (*rdx)); \ + : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val)); \ } while (0) /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ @@ -664,7 +788,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, ulong la; u32 lim; u16 sel; - unsigned cpl, rpl; + unsigned cpl; la = seg_base(ctxt, addr.seg) + addr.ea; switch (ctxt->mode) { @@ -677,8 +801,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, addr.seg); if (!usable) goto bad; - /* code segment or read-only data segment */ - if (((desc.type & 8) || !(desc.type & 2)) && write) + /* code segment in protected mode or read-only data segment */ + if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8)) + || !(desc.type & 2)) && write) goto bad; /* unreadable code segment */ if (!fetch && (desc.type & 8) && !(desc.type & 2)) @@ -697,11 +822,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, goto bad; } cpl = ctxt->ops->cpl(ctxt); - if (ctxt->mode == X86EMUL_MODE_REAL) - rpl = 0; - else - rpl = sel & 3; - cpl = max(cpl, rpl); if (!(desc.type & 8)) { /* data segment */ if (cpl > desc.dpl) @@ -852,39 +972,50 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, return rc; } -static int test_cc(unsigned int condition, unsigned int flags) -{ - int rc = 0; - - switch ((condition & 15) >> 1) { - case 0: /* o */ - rc |= (flags & EFLG_OF); - break; - case 1: /* b/c/nae */ - rc |= (flags & EFLG_CF); - break; - case 2: /* z/e */ - rc |= (flags & EFLG_ZF); - break; - case 3: /* be/na */ - rc |= (flags & (EFLG_CF|EFLG_ZF)); - break; - case 4: /* s */ - rc |= (flags & EFLG_SF); - break; - case 5: /* p/pe */ - rc |= (flags & EFLG_PF); - break; - case 7: /* le/ng */ - rc |= (flags & EFLG_ZF); - /* fall through */ - case 6: /* l/nge */ - rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); - break; - } - - /* Odd condition identifiers (lsb == 1) have inverted sense. */ - return (!!rc ^ (condition & 1)); +FASTOP2(add); +FASTOP2(or); +FASTOP2(adc); +FASTOP2(sbb); +FASTOP2(and); +FASTOP2(sub); +FASTOP2(xor); +FASTOP2(cmp); +FASTOP2(test); + +FASTOP3WCL(shld); +FASTOP3WCL(shrd); + +FASTOP2W(imul); + +FASTOP1(not); +FASTOP1(neg); +FASTOP1(inc); +FASTOP1(dec); + +FASTOP2CL(rol); +FASTOP2CL(ror); +FASTOP2CL(rcl); +FASTOP2CL(rcr); +FASTOP2CL(shl); +FASTOP2CL(shr); +FASTOP2CL(sar); + +FASTOP2W(bsf); +FASTOP2W(bsr); +FASTOP2W(bt); +FASTOP2W(bts); +FASTOP2W(btr); +FASTOP2W(btc); + +static u8 test_cc(unsigned int condition, unsigned long flags) +{ + u8 rc; + void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); + + flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; + asm("push %[flags]; popf; call *%[fastop]" + : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags)); + return rc; } static void fetch_register_operand(struct operand *op) @@ -994,6 +1125,53 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) ctxt->ops->put_fpu(ctxt); } +static int em_fninit(struct x86_emulate_ctxt *ctxt) +{ + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + + ctxt->ops->get_fpu(ctxt); + asm volatile("fninit"); + ctxt->ops->put_fpu(ctxt); + return X86EMUL_CONTINUE; +} + +static int em_fnstcw(struct x86_emulate_ctxt *ctxt) +{ + u16 fcw; + + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + + ctxt->ops->get_fpu(ctxt); + asm volatile("fnstcw %0": "+m"(fcw)); + ctxt->ops->put_fpu(ctxt); + + /* force 2 byte destination */ + ctxt->dst.bytes = 2; + ctxt->dst.val = fcw; + + return X86EMUL_CONTINUE; +} + +static int em_fnstsw(struct x86_emulate_ctxt *ctxt) +{ + u16 fsw; + + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + + ctxt->ops->get_fpu(ctxt); + asm volatile("fnstsw %0": "+m"(fsw)); + ctxt->ops->put_fpu(ctxt); + + /* force 2 byte destination */ + ctxt->dst.bytes = 2; + ctxt->dst.val = fsw; + + return X86EMUL_CONTINUE; +} + static void decode_register_operand(struct x86_emulate_ctxt *ctxt, struct operand *op) { @@ -1534,6 +1712,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt) { int rc; + if (ctxt->d & NoWrite) + return X86EMUL_CONTINUE; + switch (ctxt->dst.type) { case OP_REG: write_register_operand(&ctxt->dst); @@ -1918,47 +2099,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_grp2(struct x86_emulate_ctxt *ctxt) -{ - switch (ctxt->modrm_reg) { - case 0: /* rol */ - emulate_2op_SrcB(ctxt, "rol"); - break; - case 1: /* ror */ - emulate_2op_SrcB(ctxt, "ror"); - break; - case 2: /* rcl */ - emulate_2op_SrcB(ctxt, "rcl"); - break; - case 3: /* rcr */ - emulate_2op_SrcB(ctxt, "rcr"); - break; - case 4: /* sal/shl */ - case 6: /* sal/shl */ - emulate_2op_SrcB(ctxt, "sal"); - break; - case 5: /* shr */ - emulate_2op_SrcB(ctxt, "shr"); - break; - case 7: /* sar */ - emulate_2op_SrcB(ctxt, "sar"); - break; - } - return X86EMUL_CONTINUE; -} - -static int em_not(struct x86_emulate_ctxt *ctxt) -{ - ctxt->dst.val = ~ctxt->dst.val; - return X86EMUL_CONTINUE; -} - -static int em_neg(struct x86_emulate_ctxt *ctxt) -{ - emulate_1op(ctxt, "neg"); - return X86EMUL_CONTINUE; -} - static int em_mul_ex(struct x86_emulate_ctxt *ctxt) { u8 ex = 0; @@ -2000,12 +2140,6 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) int rc = X86EMUL_CONTINUE; switch (ctxt->modrm_reg) { - case 0: /* inc */ - emulate_1op(ctxt, "inc"); - break; - case 1: /* dec */ - emulate_1op(ctxt, "dec"); - break; case 2: /* call near abs */ { long int old_eip; old_eip = ctxt->_eip; @@ -2075,7 +2209,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) /* Save real source value, then compare EAX against destination. */ ctxt->src.orig_val = ctxt->src.val; ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX); - emulate_2op_SrcV(ctxt, "cmp"); + fastop(ctxt, em_cmp); if (ctxt->eflags & EFLG_ZF) { /* Success: write back to memory. */ @@ -2843,7 +2977,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt) ctxt->src.type = OP_IMM; ctxt->src.val = 0; ctxt->src.bytes = 1; - emulate_2op_SrcV(ctxt, "or"); + fastop(ctxt, em_or); ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); if (cf) ctxt->eflags |= X86_EFLAGS_CF; @@ -2852,6 +2986,24 @@ static int em_das(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_aad(struct x86_emulate_ctxt *ctxt) +{ + u8 al = ctxt->dst.val & 0xff; + u8 ah = (ctxt->dst.val >> 8) & 0xff; + + al = (al + (ah * ctxt->src.val)) & 0xff; + + ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al; + + /* Set PF, ZF, SF */ + ctxt->src.type = OP_IMM; + ctxt->src.val = 0; + ctxt->src.bytes = 1; + fastop(ctxt, em_or); + + return X86EMUL_CONTINUE; +} + static int em_call(struct x86_emulate_ctxt *ctxt) { long rel = ctxt->src.val; @@ -2900,64 +3052,6 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_add(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "add"); - return X86EMUL_CONTINUE; -} - -static int em_or(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "or"); - return X86EMUL_CONTINUE; -} - -static int em_adc(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "adc"); - return X86EMUL_CONTINUE; -} - -static int em_sbb(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "sbb"); - return X86EMUL_CONTINUE; -} - -static int em_and(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "and"); - return X86EMUL_CONTINUE; -} - -static int em_sub(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "sub"); - return X86EMUL_CONTINUE; -} - -static int em_xor(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "xor"); - return X86EMUL_CONTINUE; -} - -static int em_cmp(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "cmp"); - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; -} - -static int em_test(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "test"); - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; -} - static int em_xchg(struct x86_emulate_ctxt *ctxt) { /* Write back the register source. */ @@ -2970,16 +3064,10 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_imul(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "imul"); - return X86EMUL_CONTINUE; -} - static int em_imul_3op(struct x86_emulate_ctxt *ctxt) { ctxt->dst.val = ctxt->src2.val; - return em_imul(ctxt); + return fastop(ctxt, em_imul); } static int em_cwd(struct x86_emulate_ctxt *ctxt) @@ -3300,47 +3388,6 @@ static int em_sti(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_bt(struct x86_emulate_ctxt *ctxt) -{ - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - /* only subword offset */ - ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; - - emulate_2op_SrcV_nobyte(ctxt, "bt"); - return X86EMUL_CONTINUE; -} - -static int em_bts(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "bts"); - return X86EMUL_CONTINUE; -} - -static int em_btr(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "btr"); - return X86EMUL_CONTINUE; -} - -static int em_btc(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "btc"); - return X86EMUL_CONTINUE; -} - -static int em_bsf(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "bsf"); - return X86EMUL_CONTINUE; -} - -static int em_bsr(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "bsr"); - return X86EMUL_CONTINUE; -} - static int em_cpuid(struct x86_emulate_ctxt *ctxt) { u32 eax, ebx, ecx, edx; @@ -3572,7 +3619,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } +#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } +#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } #define II(_f, _e, _i) \ { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } #define IIP(_f, _e, _i, _p) \ @@ -3583,12 +3632,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define D2bv(_f) D((_f) | ByteOp), D(_f) #define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) +#define F2bv(_f, _e) F((_f) | ByteOp, _e), F(_f, _e) #define I2bvIP(_f, _e, _i, _p) \ IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p) -#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ - I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ - I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) +#define F6ALU(_f, _e) F2bv((_f) | DstMem | SrcReg | ModRM, _e), \ + F2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ + F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) static const struct opcode group7_rm1[] = { DI(SrcNone | Priv, monitor), @@ -3614,25 +3664,36 @@ static const struct opcode group7_rm7[] = { }; static const struct opcode group1[] = { - I(Lock, em_add), - I(Lock | PageTable, em_or), - I(Lock, em_adc), - I(Lock, em_sbb), - I(Lock | PageTable, em_and), - I(Lock, em_sub), - I(Lock, em_xor), - I(0, em_cmp), + F(Lock, em_add), + F(Lock | PageTable, em_or), + F(Lock, em_adc), + F(Lock, em_sbb), + F(Lock | PageTable, em_and), + F(Lock, em_sub), + F(Lock, em_xor), + F(NoWrite, em_cmp), }; static const struct opcode group1A[] = { I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, }; +static const struct opcode group2[] = { + F(DstMem | ModRM, em_rol), + F(DstMem | ModRM, em_ror), + F(DstMem | ModRM, em_rcl), + F(DstMem | ModRM, em_rcr), + F(DstMem | ModRM, em_shl), + F(DstMem | ModRM, em_shr), + F(DstMem | ModRM, em_shl), + F(DstMem | ModRM, em_sar), +}; + static const struct opcode group3[] = { - I(DstMem | SrcImm, em_test), - I(DstMem | SrcImm, em_test), - I(DstMem | SrcNone | Lock, em_not), - I(DstMem | SrcNone | Lock, em_neg), + F(DstMem | SrcImm | NoWrite, em_test), + F(DstMem | SrcImm | NoWrite, em_test), + F(DstMem | SrcNone | Lock, em_not), + F(DstMem | SrcNone | Lock, em_neg), I(SrcMem, em_mul_ex), I(SrcMem, em_imul_ex), I(SrcMem, em_div_ex), @@ -3640,14 +3701,14 @@ static const struct opcode group3[] = { }; static const struct opcode group4[] = { - I(ByteOp | DstMem | SrcNone | Lock, em_grp45), - I(ByteOp | DstMem | SrcNone | Lock, em_grp45), + F(ByteOp | DstMem | SrcNone | Lock, em_inc), + F(ByteOp | DstMem | SrcNone | Lock, em_dec), N, N, N, N, N, N, }; static const struct opcode group5[] = { - I(DstMem | SrcNone | Lock, em_grp45), - I(DstMem | SrcNone | Lock, em_grp45), + F(DstMem | SrcNone | Lock, em_inc), + F(DstMem | SrcNone | Lock, em_dec), I(SrcMem | Stack, em_grp45), I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), I(SrcMem | Stack, em_grp45), @@ -3682,10 +3743,10 @@ static const struct group_dual group7 = { { static const struct opcode group8[] = { N, N, N, N, - I(DstMem | SrcImmByte, em_bt), - I(DstMem | SrcImmByte | Lock | PageTable, em_bts), - I(DstMem | SrcImmByte | Lock, em_btr), - I(DstMem | SrcImmByte | Lock | PageTable, em_btc), + F(DstMem | SrcImmByte | NoWrite, em_bt), + F(DstMem | SrcImmByte | Lock | PageTable, em_bts), + F(DstMem | SrcImmByte | Lock, em_btr), + F(DstMem | SrcImmByte | Lock | PageTable, em_btc), }; static const struct group_dual group9 = { { @@ -3707,33 +3768,96 @@ static const struct gprefix pfx_vmovntpx = { I(0, em_mov), N, N, N, }; +static const struct escape escape_d9 = { { + N, N, N, N, N, N, N, I(DstMem, em_fnstcw), +}, { + /* 0xC0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xC8 - 0xCF */ + N, N, N, N, N, N, N, N, + /* 0xD0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xD8 - 0xDF */ + N, N, N, N, N, N, N, N, + /* 0xE0 - 0xE7 */ + N, N, N, N, N, N, N, N, + /* 0xE8 - 0xEF */ + N, N, N, N, N, N, N, N, + /* 0xF0 - 0xF7 */ + N, N, N, N, N, N, N, N, + /* 0xF8 - 0xFF */ + N, N, N, N, N, N, N, N, +} }; + +static const struct escape escape_db = { { + N, N, N, N, N, N, N, N, +}, { + /* 0xC0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xC8 - 0xCF */ + N, N, N, N, N, N, N, N, + /* 0xD0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xD8 - 0xDF */ + N, N, N, N, N, N, N, N, + /* 0xE0 - 0xE7 */ + N, N, N, I(ImplicitOps, em_fninit), N, N, N, N, + /* 0xE8 - 0xEF */ + N, N, N, N, N, N, N, N, + /* 0xF0 - 0xF7 */ + N, N, N, N, N, N, N, N, + /* 0xF8 - 0xFF */ + N, N, N, N, N, N, N, N, +} }; + +static const struct escape escape_dd = { { + N, N, N, N, N, N, N, I(DstMem, em_fnstsw), +}, { + /* 0xC0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xC8 - 0xCF */ + N, N, N, N, N, N, N, N, + /* 0xD0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xD8 - 0xDF */ + N, N, N, N, N, N, N, N, + /* 0xE0 - 0xE7 */ + N, N, N, N, N, N, N, N, + /* 0xE8 - 0xEF */ + N, N, N, N, N, N, N, N, + /* 0xF0 - 0xF7 */ + N, N, N, N, N, N, N, N, + /* 0xF8 - 0xFF */ + N, N, N, N, N, N, N, N, +} }; + static const struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ - I6ALU(Lock, em_add), + F6ALU(Lock, em_add), I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), /* 0x08 - 0x0F */ - I6ALU(Lock | PageTable, em_or), + F6ALU(Lock | PageTable, em_or), I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), N, /* 0x10 - 0x17 */ - I6ALU(Lock, em_adc), + F6ALU(Lock, em_adc), I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg), /* 0x18 - 0x1F */ - I6ALU(Lock, em_sbb), + F6ALU(Lock, em_sbb), I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), /* 0x20 - 0x27 */ - I6ALU(Lock | PageTable, em_and), N, N, + F6ALU(Lock | PageTable, em_and), N, N, /* 0x28 - 0x2F */ - I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), + F6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ - I6ALU(Lock, em_xor), N, N, + F6ALU(Lock, em_xor), N, N, /* 0x38 - 0x3F */ - I6ALU(0, em_cmp), N, N, + F6ALU(NoWrite, em_cmp), N, N, /* 0x40 - 0x4F */ - X16(D(DstReg)), + X8(F(DstReg, em_inc)), X8(F(DstReg, em_dec)), /* 0x50 - 0x57 */ X8(I(SrcReg | Stack, em_push)), /* 0x58 - 0x5F */ @@ -3757,7 +3881,7 @@ static const struct opcode opcode_table[256] = { G(DstMem | SrcImm, group1), G(ByteOp | DstMem | SrcImm | No64, group1), G(DstMem | SrcImmByte, group1), - I2bv(DstMem | SrcReg | ModRM, em_test), + F2bv(DstMem | SrcReg | ModRM | NoWrite, em_test), I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), /* 0x88 - 0x8F */ I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov), @@ -3777,18 +3901,18 @@ static const struct opcode opcode_table[256] = { I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), I2bv(SrcSI | DstDI | Mov | String, em_mov), - I2bv(SrcSI | DstDI | String, em_cmp), + F2bv(SrcSI | DstDI | String | NoWrite, em_cmp), /* 0xA8 - 0xAF */ - I2bv(DstAcc | SrcImm, em_test), + F2bv(DstAcc | SrcImm | NoWrite, em_test), I2bv(SrcAcc | DstDI | Mov | String, em_mov), I2bv(SrcSI | DstAcc | Mov | String, em_mov), - I2bv(SrcAcc | DstDI | String, em_cmp), + F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp), /* 0xB0 - 0xB7 */ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), /* 0xB8 - 0xBF */ - X8(I(DstReg | SrcImm | Mov, em_mov)), + X8(I(DstReg | SrcImm64 | Mov, em_mov)), /* 0xC0 - 0xC7 */ - D2bv(DstMem | SrcImmByte | ModRM), + G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2), I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), I(ImplicitOps | Stack, em_ret), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), @@ -3800,10 +3924,11 @@ static const struct opcode opcode_table[256] = { D(ImplicitOps), DI(SrcImmByte, intn), D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), /* 0xD0 - 0xD7 */ - D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), - N, N, N, N, + G(Src2One | ByteOp, group2), G(Src2One, group2), + G(Src2CL | ByteOp, group2), G(Src2CL, group2), + N, I(DstAcc | SrcImmByte | No64, em_aad), N, N, /* 0xD8 - 0xDF */ - N, N, N, N, N, N, N, N, + N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N, /* 0xE0 - 0xE7 */ X3(I(SrcImmByte, em_loop)), I(SrcImmByte, em_jcxz), @@ -3870,28 +3995,29 @@ static const struct opcode twobyte_table[256] = { X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), - II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), - D(DstMem | SrcReg | Src2ImmByte | ModRM), - D(DstMem | SrcReg | Src2CL | ModRM), N, N, + II(ImplicitOps, em_cpuid, cpuid), + F(DstMem | SrcReg | ModRM | BitOp | NoWrite, em_bt), + F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shld), + F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), DI(ImplicitOps, rsm), - I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), - D(DstMem | SrcReg | Src2ImmByte | ModRM), - D(DstMem | SrcReg | Src2CL | ModRM), - D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), + F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), + F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), + F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), + D(ModRM), F(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), - I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), + F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, G(BitOp, group8), - I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), - I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), + F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), + F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr), D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xC7 */ D2bv(DstMem | SrcReg | ModRM | Lock), @@ -3950,6 +4076,9 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, case 4: op->val = insn_fetch(s32, ctxt); break; + case 8: + op->val = insn_fetch(s64, ctxt); + break; } if (!sign_extension) { switch (op->bytes) { @@ -4028,6 +4157,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpImm: rc = decode_imm(ctxt, op, imm_size(ctxt), true); break; + case OpImm64: + rc = decode_imm(ctxt, op, ctxt->op_bytes, true); + break; case OpMem8: ctxt->memop.bytes = 1; goto mem_common; @@ -4222,6 +4354,12 @@ done_prefixes: case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; } break; + case Escape: + if (ctxt->modrm > 0xbf) + opcode = opcode.u.esc->high[ctxt->modrm - 0xc0]; + else + opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7]; + break; default: return EMULATION_FAILED; } @@ -4354,6 +4492,16 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); } +static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) +{ + ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; + fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; + asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" + : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags) + : "c"(ctxt->src2.val), [fastop]"S"(fop)); + ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); + return X86EMUL_CONTINUE; +} int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { @@ -4483,6 +4631,13 @@ special_insn: } if (ctxt->execute) { + if (ctxt->d & Fastop) { + void (*fop)(struct fastop *) = (void *)ctxt->execute; + rc = fastop(ctxt, fop); + if (rc != X86EMUL_CONTINUE) + goto done; + goto writeback; + } rc = ctxt->execute(ctxt); if (rc != X86EMUL_CONTINUE) goto done; @@ -4493,12 +4648,6 @@ special_insn: goto twobyte_insn; switch (ctxt->b) { - case 0x40 ... 0x47: /* inc r16/r32 */ - emulate_1op(ctxt, "inc"); - break; - case 0x48 ... 0x4f: /* dec r16/r32 */ - emulate_1op(ctxt, "dec"); - break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) goto cannot_emulate; @@ -4523,9 +4672,6 @@ special_insn: case 8: ctxt->dst.val = (s32)ctxt->dst.val; break; } break; - case 0xc0 ... 0xc1: - rc = em_grp2(ctxt); - break; case 0xcc: /* int3 */ rc = emulate_int(ctxt, 3); break; @@ -4536,13 +4682,6 @@ special_insn: if (ctxt->eflags & EFLG_OF) rc = emulate_int(ctxt, 4); break; - case 0xd0 ... 0xd1: /* Grp2 */ - rc = em_grp2(ctxt); - break; - case 0xd2 ... 0xd3: /* Grp2 */ - ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX); - rc = em_grp2(ctxt); - break; case 0xe9: /* jmp rel */ case 0xeb: /* jmp rel short */ jmp_rel(ctxt, ctxt->src.val); @@ -4661,14 +4800,6 @@ twobyte_insn: case 0x90 ... 0x9f: /* setcc r/m8 */ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; - case 0xa4: /* shld imm8, r, r/m */ - case 0xa5: /* shld cl, r, r/m */ - emulate_2op_cl(ctxt, "shld"); - break; - case 0xac: /* shrd imm8, r, r/m */ - case 0xad: /* shrd cl, r, r/m */ - emulate_2op_cl(ctxt, "shrd"); - break; case 0xae: /* clflush */ break; case 0xb6 ... 0xb7: /* movzx */ @@ -4682,7 +4813,7 @@ twobyte_insn: (s16) ctxt->src.val; break; case 0xc0 ... 0xc1: /* xadd */ - emulate_2op_SrcV(ctxt, "add"); + fastop(ctxt, em_add); /* Write back the register source. */ ctxt->src.val = ctxt->dst.orig_val; write_register_operand(&ctxt->src); diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 11300d2fa71..c1d30b2fc9b 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -122,7 +122,6 @@ static s64 __kpit_elapsed(struct kvm *kvm) */ remaining = hrtimer_get_remaining(&ps->timer); elapsed = ps->period - ktime_to_ns(remaining); - elapsed = mod_64(elapsed, ps->period); return elapsed; } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 848206df096..cc31f7c06d3 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -241,6 +241,8 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); + s->output = 0; + pic_lock(s); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 7e06ba1618b..484bc874688 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -38,49 +38,81 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) EXPORT_SYMBOL(kvm_cpu_has_pending_timer); /* + * check if there is pending interrupt from + * non-APIC source without intack. + */ +static int kvm_cpu_has_extint(struct kvm_vcpu *v) +{ + if (kvm_apic_accept_pic_intr(v)) + return pic_irqchip(v->kvm)->output; /* PIC */ + else + return 0; +} + +/* + * check if there is injectable interrupt: + * when virtual interrupt delivery enabled, + * interrupt from apic will handled by hardware, + * we don't need to check it here. + */ +int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) +{ + if (!irqchip_in_kernel(v->kvm)) + return v->arch.interrupt.pending; + + if (kvm_cpu_has_extint(v)) + return 1; + + if (kvm_apic_vid_enabled(v->kvm)) + return 0; + + return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ +} + +/* * check if there is pending interrupt without * intack. */ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { - struct kvm_pic *s; - if (!irqchip_in_kernel(v->kvm)) return v->arch.interrupt.pending; - if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ - if (kvm_apic_accept_pic_intr(v)) { - s = pic_irqchip(v->kvm); /* PIC */ - return s->output; - } else - return 0; - } - return 1; + if (kvm_cpu_has_extint(v)) + return 1; + + return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); /* + * Read pending interrupt(from non-APIC source) + * vector and intack. + */ +static int kvm_cpu_get_extint(struct kvm_vcpu *v) +{ + if (kvm_cpu_has_extint(v)) + return kvm_pic_read_irq(v->kvm); /* PIC */ + return -1; +} + +/* * Read pending interrupt vector and intack. */ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) { - struct kvm_pic *s; int vector; if (!irqchip_in_kernel(v->kvm)) return v->arch.interrupt.nr; - vector = kvm_get_apic_interrupt(v); /* APIC */ - if (vector == -1) { - if (kvm_apic_accept_pic_intr(v)) { - s = pic_irqchip(v->kvm); - s->output = 0; /* PIC */ - vector = kvm_pic_read_irq(v->kvm); - } - } - return vector; + vector = kvm_cpu_get_extint(v); + + if (kvm_apic_vid_enabled(v->kvm) || vector != -1) + return vector; /* PIC */ + + return kvm_get_apic_interrupt(v); /* APIC */ } -EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 43e9fadca5d..02b51dd4e4a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -140,31 +140,56 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) -static inline int apic_x2apic_mode(struct kvm_lapic *apic) -{ - return apic->vcpu->arch.apic_base & X2APIC_ENABLE; -} - static inline int kvm_apic_id(struct kvm_lapic *apic) { return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; } -static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) +void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, + struct kvm_lapic_irq *irq, + u64 *eoi_exit_bitmap) { - u16 cid; - ldr >>= 32 - map->ldr_bits; - cid = (ldr >> map->cid_shift) & map->cid_mask; + struct kvm_lapic **dst; + struct kvm_apic_map *map; + unsigned long bitmap = 1; + int i; - BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); + rcu_read_lock(); + map = rcu_dereference(vcpu->kvm->arch.apic_map); - return cid; -} + if (unlikely(!map)) { + __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap); + goto out; + } -static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) -{ - ldr >>= (32 - map->ldr_bits); - return ldr & map->lid_mask; + if (irq->dest_mode == 0) { /* physical mode */ + if (irq->delivery_mode == APIC_DM_LOWEST || + irq->dest_id == 0xff) { + __set_bit(irq->vector, + (unsigned long *)eoi_exit_bitmap); + goto out; + } + dst = &map->phys_map[irq->dest_id & 0xff]; + } else { + u32 mda = irq->dest_id << (32 - map->ldr_bits); + + dst = map->logical_map[apic_cluster_id(map, mda)]; + + bitmap = apic_logical_id(map, mda); + } + + for_each_set_bit(i, &bitmap, 16) { + if (!dst[i]) + continue; + if (dst[i]->vcpu == vcpu) { + __set_bit(irq->vector, + (unsigned long *)eoi_exit_bitmap); + break; + } + } + +out: + rcu_read_unlock(); } static void recalculate_apic_map(struct kvm *kvm) @@ -230,6 +255,8 @@ out: if (old) kfree_rcu(old, rcu); + + kvm_ioapic_make_eoibitmap_request(kvm); } static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) @@ -345,6 +372,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) { int result; + /* + * Note that irr_pending is just a hint. It will be always + * true with virtual interrupt delivery enabled. + */ if (!apic->irr_pending) return -1; @@ -461,6 +492,8 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) static inline int apic_find_highest_isr(struct kvm_lapic *apic) { int result; + + /* Note that isr_count is always 1 with vid enabled */ if (!apic->isr_count) return -1; if (likely(apic->highest_isr_cache != -1)) @@ -740,6 +773,19 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } +static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) +{ + if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && + kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + int trigger_mode; + if (apic_test_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + } +} + static int apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); @@ -756,19 +802,26 @@ static int apic_set_eoi(struct kvm_lapic *apic) apic_clear_isr(vector, apic); apic_update_ppr(apic); - if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && - kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { - int trigger_mode; - if (apic_test_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); - } + kvm_ioapic_send_eoi(apic, vector); kvm_make_request(KVM_REQ_EVENT, apic->vcpu); return vector; } +/* + * this interface assumes a trap-like exit, which has already finished + * desired side effect including vISR and vPPR update. + */ +void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + trace_kvm_eoi(apic, vector); + + kvm_ioapic_send_eoi(apic, vector); + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); +} +EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); + static void apic_send_ipi(struct kvm_lapic *apic) { u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR); @@ -1011,7 +1064,7 @@ static void start_apic_timer(struct kvm_lapic *apic) local_irq_save(flags); now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); @@ -1212,6 +1265,21 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); +/* emulate APIC access in a trap manner */ +void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) +{ + u32 val = 0; + + /* hw has done the conditional check and inst decode */ + offset &= 0xff0; + + apic_reg_read(vcpu->arch.apic, offset, 4, &val); + + /* TODO: optimize to just emulate side effect w/o one more write */ + apic_reg_write(vcpu->arch.apic, offset, val); +} +EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); + void kvm_free_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -1288,6 +1356,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) { + u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; if (!apic) { @@ -1309,11 +1378,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) value &= ~MSR_IA32_APICBASE_BSP; vcpu->arch.apic_base = value; - if (apic_x2apic_mode(apic)) { - u32 id = kvm_apic_id(apic); - u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); - kvm_apic_set_ldr(apic, ldr); + if ((old_value ^ value) & X2APIC_ENABLE) { + if (value & X2APIC_ENABLE) { + u32 id = kvm_apic_id(apic); + u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + kvm_apic_set_ldr(apic, ldr); + kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true); + } else + kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false); } + apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -1359,8 +1433,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } - apic->irr_pending = false; - apic->isr_count = 0; + apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); + apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm); apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -1575,8 +1649,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; - apic->isr_count = count_vectors(apic->regs + APIC_ISR); + apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ? + 1 : count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index e5ebf9f3571..1676d34ddb4 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -64,6 +64,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); +void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset); +void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector); + void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); @@ -124,4 +127,35 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu) return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic); } +static inline int apic_x2apic_mode(struct kvm_lapic *apic) +{ + return apic->vcpu->arch.apic_base & X2APIC_ENABLE; +} + +static inline bool kvm_apic_vid_enabled(struct kvm *kvm) +{ + return kvm_x86_ops->vm_has_apicv(kvm); +} + +static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) +{ + u16 cid; + ldr >>= 32 - map->ldr_bits; + cid = (ldr >> map->cid_shift) & map->cid_mask; + + BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); + + return cid; +} + +static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) +{ + ldr >>= (32 - map->ldr_bits); + return ldr & map->lid_mask; +} + +void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, + struct kvm_lapic_irq *irq, + u64 *eoi_bitmap); + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6f85fe0bf95..956ca358108 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -448,7 +448,8 @@ static bool __check_direct_spte_mmio_pf(u64 spte) static bool spte_is_locklessly_modifiable(u64 spte) { - return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); + return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == + (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); } static bool spte_has_volatile_bits(u64 spte) @@ -831,8 +832,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) if (host_level == PT_PAGE_TABLE_LEVEL) return host_level; - max_level = kvm_x86_ops->get_lpage_level() < host_level ? - kvm_x86_ops->get_lpage_level() : host_level; + max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) @@ -1142,7 +1142,7 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) } static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, - int level, bool pt_protect) + bool pt_protect) { u64 *sptep; struct rmap_iterator iter; @@ -1180,7 +1180,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, while (mask) { rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PT_PAGE_TABLE_LEVEL, slot); - __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); + __rmap_write_protect(kvm, rmapp, false); /* clear the first set bit */ mask &= mask - 1; @@ -1199,7 +1199,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) for (i = PT_PAGE_TABLE_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmapp, i, true); + write_protected |= __rmap_write_protect(kvm, rmapp, true); } return write_protected; @@ -1460,28 +1460,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) percpu_counter_add(&kvm_total_used_mmu_pages, nr); } -/* - * Remove the sp from shadow page cache, after call it, - * we can not find this sp from the cache, and the shadow - * page table is still valid. - * It should be under the protection of mmu lock. - */ -static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp) +static void kvm_mmu_free_page(struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); - if (!sp->role.direct) - free_page((unsigned long)sp->gfns); -} - -/* - * Free the shadow page table and the sp, we can do it - * out of the protection of mmu lock. - */ -static void kvm_mmu_free_page(struct kvm_mmu_page *sp) -{ list_del(&sp->link); free_page((unsigned long)sp->spt); + if (!sp->role.direct) + free_page((unsigned long)sp->gfns); kmem_cache_free(mmu_page_header_cache, sp); } @@ -1522,7 +1508,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); sp->parent_ptes = 0; mmu_page_add_parent_pte(vcpu, sp, parent_pte); kvm_mod_used_mmu_pages(vcpu->kvm, +1); @@ -1659,13 +1644,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); -#define for_each_gfn_sp(kvm, sp, gfn, pos) \ - hlist_for_each_entry(sp, pos, \ +#define for_each_gfn_sp(kvm, sp, gfn) \ + hlist_for_each_entry(sp, \ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ if ((sp)->gfn != (gfn)) {} else -#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \ - hlist_for_each_entry(sp, pos, \ +#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn) \ + hlist_for_each_entry(sp, \ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ if ((sp)->gfn != (gfn) || (sp)->role.direct || \ (sp)->role.invalid) {} else @@ -1721,11 +1706,10 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; - struct hlist_node *node; LIST_HEAD(invalid_list); bool flush = false; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (!s->unsync) continue; @@ -1863,7 +1847,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, union kvm_mmu_page_role role; unsigned quadrant; struct kvm_mmu_page *sp; - struct hlist_node *node; bool need_sync = false; role = vcpu->arch.mmu.base_role; @@ -1878,7 +1861,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { + for_each_gfn_sp(vcpu->kvm, sp, gfn) { if (!need_sync && sp->unsync) need_sync = true; @@ -1973,9 +1956,9 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) { u64 spte; - spte = __pa(sp->spt) - | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; + spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | + shadow_user_mask | shadow_x_mask | shadow_accessed_mask; + mmu_spte_set(sptep, spte); } @@ -2126,7 +2109,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, do { sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); WARN_ON(!sp->role.invalid || sp->root_count); - kvm_mmu_isolate_page(sp); kvm_mmu_free_page(sp); } while (!list_empty(invalid_list)); } @@ -2144,6 +2126,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) * change the value */ + spin_lock(&kvm->mmu_lock); + if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && !list_empty(&kvm->arch.active_mmu_pages)) { @@ -2158,19 +2142,20 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) } kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; + + spin_unlock(&kvm->mmu_lock); } int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; - struct hlist_node *node; LIST_HEAD(invalid_list); int r; pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; spin_lock(&kvm->mmu_lock); - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { + for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); r = 1; @@ -2183,14 +2168,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); -static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) -{ - int slot = memslot_id(kvm, gfn); - struct kvm_mmu_page *sp = page_header(__pa(pte)); - - __set_bit(slot, sp->slot_bitmap); -} - /* * The function is based on mtrr_type_lookup() in * arch/x86/kernel/cpu/mtrr/generic.c @@ -2308,9 +2285,8 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; - struct hlist_node *node; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); @@ -2322,19 +2298,17 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *s; - struct hlist_node *node; bool need_unsync = false; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (!can_unsync) return 1; if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; - if (!need_unsync && !s->unsync) { + if (!s->unsync) need_unsync = true; - } } if (need_unsync) kvm_unsync_pages(vcpu, gfn); @@ -2342,8 +2316,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, } static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned pte_access, int user_fault, - int write_fault, int level, + unsigned pte_access, int level, gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { @@ -2378,32 +2351,20 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= (u64)pfn << PAGE_SHIFT; - if ((pte_access & ACC_WRITE_MASK) - || (!vcpu->arch.mmu.direct_map && write_fault - && !is_write_protection(vcpu) && !user_fault)) { + if (pte_access & ACC_WRITE_MASK) { + /* + * Other vcpu creates new sp in the window between + * mapping_level() and acquiring mmu-lock. We can + * allow guest to retry the access, the mapping can + * be fixed if guest refault. + */ if (level > PT_PAGE_TABLE_LEVEL && - has_wrprotected_page(vcpu->kvm, gfn, level)) { - ret = 1; - drop_spte(vcpu->kvm, sptep); + has_wrprotected_page(vcpu->kvm, gfn, level)) goto done; - } spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; - if (!vcpu->arch.mmu.direct_map - && !(pte_access & ACC_WRITE_MASK)) { - spte &= ~PT_USER_MASK; - /* - * If we converted a user page to a kernel page, - * so that the kernel can write to it when cr0.wp=0, - * then we should prevent the kernel from executing it - * if SMEP is enabled. - */ - if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) - spte |= PT64_NX_MASK; - } - /* * Optimization: for pte sync, if spte was writable the hash * lookup is unnecessary (and expensive). Write protection @@ -2433,19 +2394,15 @@ done: } static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned pt_access, unsigned pte_access, - int user_fault, int write_fault, - int *emulate, int level, gfn_t gfn, - pfn_t pfn, bool speculative, + unsigned pte_access, int write_fault, int *emulate, + int level, gfn_t gfn, pfn_t pfn, bool speculative, bool host_writable) { int was_rmapped = 0; int rmap_count; - pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %llx\n", - __func__, *sptep, pt_access, - write_fault, user_fault, gfn); + pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, + *sptep, write_fault, gfn); if (is_rmap_spte(*sptep)) { /* @@ -2469,9 +2426,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, was_rmapped = 1; } - if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, - level, gfn, pfn, speculative, true, - host_writable)) { + if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, + true, host_writable)) { if (write_fault) *emulate = 1; kvm_mmu_flush_tlb(vcpu); @@ -2489,7 +2445,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, ++vcpu->kvm->stat.lpages; if (is_shadow_present_pte(*sptep)) { - page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { rmap_count = rmap_add(vcpu, sptep, gfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) @@ -2505,6 +2460,14 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +{ + int bit7; + + bit7 = (gpte >> 7) & 1; + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; +} + static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { @@ -2517,6 +2480,26 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, return gfn_to_pfn_memslot_atomic(slot, gfn); } +static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + u64 gpte) +{ + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + + if (!is_present_gpte(gpte)) + goto no_present; + + if (!(gpte & PT_ACCESSED_MASK)) + goto no_present; + + return false; + +no_present: + drop_spte(vcpu->kvm, spte); + return true; +} + static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -2535,10 +2518,9 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) - mmu_set_spte(vcpu, start, ACC_ALL, - access, 0, 0, NULL, - sp->role.level, gfn, - page_to_pfn(pages[i]), true, true); + mmu_set_spte(vcpu, start, access, 0, NULL, + sp->role.level, gfn, page_to_pfn(pages[i]), + true, true); return 0; } @@ -2597,11 +2579,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { if (iterator.level == level) { - unsigned pte_access = ACC_ALL; - - mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, - 0, write, &emulate, - level, gfn, pfn, prefault, map_writable); + mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, + write, &emulate, level, gfn, pfn, + prefault, map_writable); direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; break; @@ -2616,11 +2596,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, iterator.level - 1, 1, ACC_ALL, iterator.sptep); - mmu_spte_set(iterator.sptep, - __pa(sp->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask - | shadow_accessed_mask); + link_shadow_page(iterator.sptep, sp); } } return emulate; @@ -2671,7 +2647,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * PT_PAGE_TABLE_LEVEL and there would be no adjustment done * here. */ - if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && + if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { @@ -2699,18 +2675,13 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, } } -static bool mmu_invalid_pfn(pfn_t pfn) -{ - return unlikely(is_invalid_pfn(pfn)); -} - static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, pfn_t pfn, unsigned access, int *ret_val) { bool ret = true; /* The pfn is invalid, report the error! */ - if (unlikely(is_invalid_pfn(pfn))) { + if (unlikely(is_error_pfn(pfn))) { *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); goto exit; } @@ -2862,7 +2833,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); if (likely(!force_pt_level)) @@ -3331,7 +3302,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); if (likely(!force_pt_level)) @@ -3399,14 +3370,6 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7; - - bit7 = (gpte >> 7) & 1; - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; -} - static inline void protect_clean_gpte(unsigned *access, unsigned gpte) { unsigned mask; @@ -3696,6 +3659,7 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) else r = paging32_init_context(vcpu, context); + vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); vcpu->arch.mmu.base_role.smep_andnot_wp @@ -3862,7 +3826,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ *gpa &= ~(gpa_t)7; *bytes = 8; - r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8)); + r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8); if (r) gentry = 0; new = (const u8 *)&gentry; @@ -3964,7 +3928,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn_t gfn = gpa >> PAGE_SHIFT; union kvm_mmu_page_role mask = { .word = 0 }; struct kvm_mmu_page *sp; - struct hlist_node *node; LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; @@ -3995,7 +3958,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; - for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, @@ -4016,7 +3979,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, !((sp->role.word ^ vcpu->arch.mmu.base_role.word) & mask.word) && rmap_can_add(vcpu)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); - if (!remote_flush && need_remote_flush(entry, *spte)) + if (need_remote_flush(entry, *spte)) remote_flush = true; ++spte; } @@ -4175,26 +4138,36 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { - struct kvm_mmu_page *sp; - bool flush = false; + struct kvm_memory_slot *memslot; + gfn_t last_gfn; + int i; - list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { - int i; - u64 *pt; + memslot = id_to_memslot(kvm->memslots, slot); + last_gfn = memslot->base_gfn + memslot->npages - 1; - if (!test_bit(slot, sp->slot_bitmap)) - continue; + spin_lock(&kvm->mmu_lock); - pt = sp->spt; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (!is_shadow_present_pte(pt[i]) || - !is_last_spte(pt[i], sp->role.level)) - continue; + for (i = PT_PAGE_TABLE_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + unsigned long *rmapp; + unsigned long last_index, index; + + rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); - spte_write_protect(kvm, &pt[i], &flush, false); + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + __rmap_write_protect(kvm, rmapp, false); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_flush_remote_tlbs(kvm); + cond_resched_lock(&kvm->mmu_lock); + } } } + kvm_flush_remote_tlbs(kvm); + spin_unlock(&kvm->mmu_lock); } void kvm_mmu_zap_all(struct kvm *kvm) diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index cd6e98333ba..b8f6172f417 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -195,12 +195,6 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TP_ARGS(sp) ); -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages, - TP_PROTO(struct kvm_mmu_page *sp), - - TP_ARGS(sp) -); - TRACE_EVENT( mark_mmio_spte, TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 714e2c01a6f..105dd5bd550 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -151,7 +151,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, pte_access, accessed_dirty, shift; + unsigned index, pt_access, pte_access, accessed_dirty; gpa_t pte_gpa; int offset; const int write_fault = access & PFERR_WRITE_MASK; @@ -249,16 +249,12 @@ retry_walk: if (!write_fault) protect_clean_gpte(&pte_access, pte); - - /* - * On a write fault, fold the dirty bit into accessed_dirty by shifting it one - * place right. - * - * On a read fault, do nothing. - */ - shift = write_fault >> ilog2(PFERR_WRITE_MASK); - shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT; - accessed_dirty &= pte >> shift; + else + /* + * On a write fault, fold the dirty bit into accessed_dirty by + * shifting it one place right. + */ + accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); if (unlikely(!accessed_dirty)) { ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); @@ -305,51 +301,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, addr, access); } -static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - pt_element_t gpte) +static bool +FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *spte, pt_element_t gpte, bool no_dirty_log) { - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; + unsigned pte_access; + gfn_t gfn; + pfn_t pfn; + + if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) + return false; - if (!is_present_gpte(gpte)) - goto no_present; + pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - if (!(gpte & PT_ACCESSED_MASK)) - goto no_present; + gfn = gpte_to_gfn(gpte); + pte_access = sp->role.access & gpte_access(vcpu, gpte); + protect_clean_gpte(&pte_access, gpte); + pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + no_dirty_log && (pte_access & ACC_WRITE_MASK)); + if (is_error_pfn(pfn)) + return false; - return false; + /* + * we call mmu_set_spte() with host_writable = true because + * pte_prefetch_gfn_to_pfn always gets a writable pfn. + */ + mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL, + gfn, pfn, true, true); -no_present: - drop_spte(vcpu->kvm, spte); return true; } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) { - pt_element_t gpte; - unsigned pte_access; - pfn_t pfn; - - gpte = *(const pt_element_t *)pte; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) - return; - - pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = sp->role.access & gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); - pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); - if (mmu_invalid_pfn(pfn)) - return; + pt_element_t gpte = *(const pt_element_t *)pte; - /* - * we call mmu_set_spte() with host_writable = true because that - * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). - */ - mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - NULL, PT_PAGE_TABLE_LEVEL, - gpte_to_gfn(gpte), pfn, true, true); + FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false); } static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, @@ -395,53 +383,31 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { - pt_element_t gpte; - unsigned pte_access; - gfn_t gfn; - pfn_t pfn; - if (spte == sptep) continue; if (is_shadow_present_pte(*spte)) continue; - gpte = gptep[i]; - - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) - continue; - - pte_access = sp->role.access & gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); - gfn = gpte_to_gfn(gpte); - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, - pte_access & ACC_WRITE_MASK); - if (mmu_invalid_pfn(pfn)) + if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true)) break; - - mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - NULL, PT_PAGE_TABLE_LEVEL, gfn, - pfn, true, true); } } /* * Fetch a shadow pte for a specific level in the paging hierarchy. + * If the guest tries to write a write-protected page, we need to + * emulate this operation, return 1 to indicate this case. */ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, +static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, - int user_fault, int write_fault, int hlevel, - int *emulate, pfn_t pfn, bool map_writable, - bool prefault) + int write_fault, int hlevel, + pfn_t pfn, bool map_writable, bool prefault) { - unsigned access = gw->pt_access; struct kvm_mmu_page *sp = NULL; - int top_level; - unsigned direct_access; struct kvm_shadow_walk_iterator it; - - if (!is_present_gpte(gw->ptes[gw->level - 1])) - return NULL; + unsigned direct_access, access = gw->pt_access; + int top_level, emulate = 0; direct_access = gw->pte_access; @@ -504,18 +470,57 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } clear_sp_write_flooding_count(it.sptep); - mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, - user_fault, write_fault, emulate, it.level, - gw->gfn, pfn, prefault, map_writable); + mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate, + it.level, gw->gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); - return it.sptep; + return emulate; out_gpte_changed: if (sp) kvm_mmu_put_page(sp, it.sptep); kvm_release_pfn_clean(pfn); - return NULL; + return 0; +} + + /* + * To see whether the mapped gfn can write its page table in the current + * mapping. + * + * It is the helper function of FNAME(page_fault). When guest uses large page + * size to map the writable gfn which is used as current page table, we should + * force kvm to use small page size to map it because new shadow page will be + * created when kvm establishes shadow page table that stop kvm using large + * page size. Do it early can avoid unnecessary #PF and emulation. + * + * @write_fault_to_shadow_pgtable will return true if the fault gfn is + * currently used as its page table. + * + * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok + * since the PDPT is always shadowed, that means, we can not use large page + * size to map the gfn which is used as PDPT. + */ +static bool +FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, + struct guest_walker *walker, int user_fault, + bool *write_fault_to_shadow_pgtable) +{ + int level; + gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1); + bool self_changed = false; + + if (!(walker->pte_access & ACC_WRITE_MASK || + (!is_write_protection(vcpu) && !user_fault))) + return false; + + for (level = walker->level; level <= walker->max_level; level++) { + gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1]; + + self_changed |= !(gfn & mask); + *write_fault_to_shadow_pgtable |= !gfn; + } + + return self_changed; } /* @@ -538,14 +543,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int write_fault = error_code & PFERR_WRITE_MASK; int user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; - u64 *sptep; - int emulate = 0; int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; int force_pt_level; unsigned long mmu_seq; - bool map_writable; + bool map_writable, is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -573,8 +576,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } + vcpu->arch.write_fault_to_shadow_pgtable = false; + + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, + &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); + if (walker.level >= PT_DIRECTORY_LEVEL) - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); + force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) + || is_self_change_mapping; else force_pt_level = 1; if (!force_pt_level) { @@ -593,25 +602,41 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, walker.gfn, pfn, walker.pte_access, &r)) return r; + /* + * Do not change pte_access if the pfn is a mmio page, otherwise + * we will cache the incorrect access into mmio spte. + */ + if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) && + !is_write_protection(vcpu) && !user_fault && + !is_noslot_pfn(pfn)) { + walker.pte_access |= ACC_WRITE_MASK; + walker.pte_access &= ~ACC_USER_MASK; + + /* + * If we converted a user page to a kernel page, + * so that the kernel can write to it when cr0.wp=0, + * then we should prevent the kernel from executing it + * if SMEP is enabled. + */ + if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) + walker.pte_access &= ~ACC_EXEC_MASK; + } + spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); if (!force_pt_level) transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); - sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - level, &emulate, pfn, map_writable, prefault); - (void)sptep; - pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, - sptep, *sptep, emulate); - + r = FNAME(fetch)(vcpu, addr, &walker, write_fault, + level, pfn, map_writable, prefault); ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); spin_unlock(&vcpu->kvm->mmu_lock); - return emulate; + return r; out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); @@ -757,7 +782,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { + if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { vcpu->kvm->tlbs_dirty++; continue; } @@ -780,7 +805,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; - set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, + set_spte(vcpu, &sp->spt[i], pte_access, PT_PAGE_TABLE_LEVEL, gfn, spte_to_pfn(sp->spt[i]), true, false, host_writable); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d017df3899e..e1b1ce21bc0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -20,6 +20,7 @@ #include "mmu.h" #include "kvm_cache_regs.h" #include "x86.h" +#include "cpuid.h" #include <linux/module.h> #include <linux/mod_devicetable.h> @@ -630,15 +631,12 @@ static int svm_hardware_enable(void *garbage) return -EBUSY; if (!has_svm()) { - printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", - me); + pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); return -EINVAL; } sd = per_cpu(svm_data, me); - if (!sd) { - printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", - me); + pr_err("%s: svm_data is NULL on %d\n", __func__, me); return -EINVAL; } @@ -1012,6 +1010,13 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) svm->tsc_ratio = ratio; } +static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->vmcb->control.tsc_offset; +} + static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1189,6 +1194,8 @@ static void init_vmcb(struct vcpu_svm *svm) static int svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + u32 dummy; + u32 eax = 1; init_vmcb(svm); @@ -1197,8 +1204,9 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; } - vcpu->arch.regs_avail = ~0; - vcpu->arch.regs_dirty = ~0; + + kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); + kvm_register_write(vcpu, VCPU_REGS_RDX, eax); return 0; } @@ -1254,11 +1262,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); - kvm_write_tsc(&svm->vcpu, 0); - - err = fx_init(&svm->vcpu); - if (err) - goto free_page4; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) @@ -1268,8 +1271,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) return &svm->vcpu; -free_page4: - __free_page(hsave_page); free_page3: __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); free_page2: @@ -3008,11 +3009,11 @@ static int cr8_write_interception(struct vcpu_svm *svm) return 0; } -u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu) +u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); return vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, native_read_tsc()); + svm_scale_tsc(vcpu, host_tsc); } static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) @@ -3131,13 +3132,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) return 0; } -static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) +static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct vcpu_svm *svm = to_svm(vcpu); + u32 ecx = msr->index; + u64 data = msr->data; switch (ecx) { case MSR_IA32_TSC: - kvm_write_tsc(vcpu, data); + kvm_write_tsc(vcpu, msr); break; case MSR_STAR: svm->vmcb->save.star = data; @@ -3192,20 +3195,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; default: - return kvm_set_msr_common(vcpu, ecx, data); + return kvm_set_msr_common(vcpu, msr); } return 0; } static int wrmsr_interception(struct vcpu_svm *svm) { + struct msr_data msr; u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); + msr.data = data; + msr.index = ecx; + msr.host_initiated = false; svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) { + if (svm_set_msr(&svm->vcpu, &msr)) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); } else { @@ -3564,6 +3571,26 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) set_cr_intercept(svm, INTERCEPT_CR8_WRITE); } +static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) +{ + return; +} + +static int svm_vm_has_apicv(struct kvm *kvm) +{ + return 0; +} + +static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + return; +} + +static void svm_hwapic_isr_update(struct kvm *kvm, int isr) +{ + return; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4283,6 +4310,10 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, + .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, + .vm_has_apicv = svm_vm_has_apicv, + .load_eoi_exitmap = svm_load_eoi_exitmap, + .hwapic_isr_update = svm_hwapic_isr_update, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, @@ -4302,6 +4333,7 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, .set_tsc_khz = svm_set_tsc_khz, + .read_tsc_offset = svm_read_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, .compute_tsc_offset = svm_compute_tsc_offset, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index bca63f04dcc..fe5e00ed703 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -4,6 +4,7 @@ #include <linux/tracepoint.h> #include <asm/vmx.h> #include <asm/svm.h> +#include <asm/clocksource.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm @@ -754,6 +755,68 @@ TRACE_EVENT( __entry->write ? "Write" : "Read", __entry->gpa_match ? "GPA" : "GVA") ); + +#ifdef CONFIG_X86_64 + +#define host_clocks \ + {VCLOCK_NONE, "none"}, \ + {VCLOCK_TSC, "tsc"}, \ + {VCLOCK_HPET, "hpet"} \ + +TRACE_EVENT(kvm_update_master_clock, + TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), + TP_ARGS(use_master_clock, host_clock, offset_matched), + + TP_STRUCT__entry( + __field( bool, use_master_clock ) + __field( unsigned int, host_clock ) + __field( bool, offset_matched ) + ), + + TP_fast_assign( + __entry->use_master_clock = use_master_clock; + __entry->host_clock = host_clock; + __entry->offset_matched = offset_matched; + ), + + TP_printk("masterclock %d hostclock %s offsetmatched %u", + __entry->use_master_clock, + __print_symbolic(__entry->host_clock, host_clocks), + __entry->offset_matched) +); + +TRACE_EVENT(kvm_track_tsc, + TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched, + unsigned int online_vcpus, bool use_master_clock, + unsigned int host_clock), + TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock, + host_clock), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( unsigned int, nr_vcpus_matched_tsc ) + __field( unsigned int, online_vcpus ) + __field( bool, use_master_clock ) + __field( unsigned int, host_clock ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->nr_vcpus_matched_tsc = nr_matched; + __entry->online_vcpus = online_vcpus; + __entry->use_master_clock = use_master_clock; + __entry->host_clock = host_clock; + ), + + TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u" + " hostclock %s", + __entry->vcpu_id, __entry->use_master_clock, + __entry->nr_vcpus_matched_tsc, __entry->online_vcpus, + __print_symbolic(__entry->host_clock, host_clocks)) +); + +#endif /* CONFIG_X86_64 */ + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ad6b1dd06f8..6667042714c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -42,6 +42,7 @@ #include <asm/i387.h> #include <asm/xcr.h> #include <asm/perf_event.h> +#include <asm/kexec.h> #include "trace.h" @@ -83,6 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); +static bool __read_mostly enable_apicv_reg_vid; + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -91,12 +94,8 @@ module_param(fasteoi, bool, S_IRUGO); static bool __read_mostly nested = 0; module_param(nested, bool, S_IRUGO); -#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) -#define KVM_GUEST_CR0_MASK \ - (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE) +#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_CR4_GUEST_OWNED_BITS \ @@ -623,6 +622,8 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +static bool guest_state_valid(struct kvm_vcpu *vcpu); +static u32 vmx_segment_access_rights(struct kvm_segment *var); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -637,6 +638,8 @@ static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; +static unsigned long *vmx_msr_bitmap_legacy_x2apic; +static unsigned long *vmx_msr_bitmap_longmode_x2apic; static bool cpu_has_load_ia32_efer; static bool cpu_has_load_perf_global_ctrl; @@ -761,6 +764,24 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; } +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +} + +static inline bool cpu_has_vmx_apic_register_virt(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_APIC_REGISTER_VIRT; +} + +static inline bool cpu_has_vmx_virtual_intr_delivery(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && @@ -802,11 +823,6 @@ static inline bool cpu_has_vmx_ept_ad_bits(void) return vmx_capability.ept & VMX_EPT_AD_BIT; } -static inline bool cpu_has_vmx_invept_individual_addr(void) -{ - return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; -} - static inline bool cpu_has_vmx_invept_context(void) { return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; @@ -992,6 +1008,46 @@ static void vmcs_load(struct vmcs *vmcs) vmcs, phys_addr); } +#ifdef CONFIG_KEXEC +/* + * This bitmap is used to indicate whether the vmclear + * operation is enabled on all cpus. All disabled by + * default. + */ +static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; + +static inline void crash_enable_local_vmclear(int cpu) +{ + cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static inline void crash_disable_local_vmclear(int cpu) +{ + cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static inline int crash_local_vmclear_enabled(int cpu) +{ + return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static void crash_vmclear_local_loaded_vmcss(void) +{ + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v; + + if (!crash_local_vmclear_enabled(cpu)) + return; + + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + vmcs_clear(v->vmcs); +} +#else +static inline void crash_enable_local_vmclear(int cpu) { } +static inline void crash_disable_local_vmclear(int cpu) { } +#endif /* CONFIG_KEXEC */ + static void __loaded_vmcs_clear(void *arg) { struct loaded_vmcs *loaded_vmcs = arg; @@ -1001,15 +1057,28 @@ static void __loaded_vmcs_clear(void *arg) return; /* vcpu migration can race with cpu offline */ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) per_cpu(current_vmcs, cpu) = NULL; + crash_disable_local_vmclear(cpu); list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); + + /* + * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link + * is before setting loaded_vmcs->vcpu to -1 which is done in + * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist + * then adds the vmcs into percpu list before it is deleted. + */ + smp_wmb(); + loaded_vmcs_init(loaded_vmcs); + crash_enable_local_vmclear(cpu); } static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) { - if (loaded_vmcs->cpu != -1) - smp_call_function_single( - loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); + int cpu = loaded_vmcs->cpu; + + if (cpu != -1) + smp_call_function_single(cpu, + __loaded_vmcs_clear, loaded_vmcs, 1); } static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) @@ -1051,17 +1120,6 @@ static inline void ept_sync_context(u64 eptp) } } -static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) -{ - if (enable_ept) { - if (cpu_has_vmx_invept_individual_addr()) - __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, - eptp, gpa); - else - ept_sync_context(eptp); - } -} - static __always_inline unsigned long vmcs_readl(unsigned long field) { unsigned long value; @@ -1535,8 +1593,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); + crash_disable_local_vmclear(cpu); + + /* + * Read loaded_vmcs->cpu should be before fetching + * loaded_vmcs->loaded_vmcss_on_cpu_link. + * See the comments in __loaded_vmcs_clear(). + */ + smp_rmb(); + list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, &per_cpu(loaded_vmcss_on_cpu, cpu)); + crash_enable_local_vmclear(cpu); local_irq_enable(); /* @@ -1646,7 +1714,6 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); to_vmx(vcpu)->rflags = rflags; if (to_vmx(vcpu)->rmode.vm86_active) { to_vmx(vcpu)->rmode.save_rflags = rflags; @@ -1772,6 +1839,25 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) vmx->guest_msrs[from] = tmp; } +static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) +{ + unsigned long *msr_bitmap; + + if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) { + if (is_long_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode_x2apic; + else + msr_bitmap = vmx_msr_bitmap_legacy_x2apic; + } else { + if (is_long_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode; + else + msr_bitmap = vmx_msr_bitmap_legacy; + } + + vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); +} + /* * Set up the vmcs to automatically save and restore system * msrs. Don't touch the 64-bit msrs if the guest is in legacy @@ -1780,7 +1866,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) static void setup_msrs(struct vcpu_vmx *vmx) { int save_nmsrs, index; - unsigned long *msr_bitmap; save_nmsrs = 0; #ifdef CONFIG_X86_64 @@ -1812,14 +1897,8 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx->save_nmsrs = save_nmsrs; - if (cpu_has_vmx_msr_bitmap()) { - if (is_long_mode(&vmx->vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode; - else - msr_bitmap = vmx_msr_bitmap_legacy; - - vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); - } + if (cpu_has_vmx_msr_bitmap()) + vmx_set_msr_bitmap(&vmx->vcpu); } /* @@ -1839,11 +1918,10 @@ static u64 guest_read_tsc(void) * Like guest_read_tsc, but always returns L1's notion of the timestamp * counter, even if a nested guest (L2) is currently running. */ -u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) +u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - u64 host_tsc, tsc_offset; + u64 tsc_offset; - rdtscll(host_tsc); tsc_offset = is_guest_mode(vcpu) ? to_vmx(vcpu)->nested.vmcs01_tsc_offset : vmcs_read64(TSC_OFFSET); @@ -1866,6 +1944,11 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) WARN(1, "user requested TSC rate below hardware speed\n"); } +static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) +{ + return vmcs_read64(TSC_OFFSET); +} + /* * writes 'offset' into guest's timestamp counter offset register */ @@ -2202,15 +2285,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; int ret = 0; + u32 msr_index = msr_info->index; + u64 data = msr_info->data; switch (msr_index) { case MSR_EFER: - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); break; #ifdef CONFIG_X86_64 case MSR_FS_BASE: @@ -2236,7 +2321,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_TSC: - kvm_write_tsc(vcpu, data); + kvm_write_tsc(vcpu, msr_info); break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { @@ -2244,7 +2329,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vcpu->arch.pat = data; break; } - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); + break; + case MSR_IA32_TSC_ADJUST: + ret = kvm_set_msr_common(vcpu, msr_info); break; case MSR_TSC_AUX: if (!vmx->rdtscp_enabled) @@ -2267,7 +2355,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) } break; } - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); } return ret; @@ -2341,6 +2429,18 @@ static int hardware_enable(void *garbage) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + + /* + * Now we can enable the vmclear operation in kdump + * since the loaded_vmcss_on_cpu list on this cpu + * has been initialized. + * + * Though the cpu is not in VMX operation now, there + * is no problem to enable the vmclear operation + * for the loaded_vmcss_on_cpu list is empty! + */ + crash_enable_local_vmclear(cpu); + rdmsrl(MSR_IA32_FEATURE_CONTROL, old); test_bits = FEATURE_CONTROL_LOCKED; @@ -2464,13 +2564,16 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { min2 = 0; opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | SECONDARY_EXEC_RDTSCP | - SECONDARY_EXEC_ENABLE_INVPCID; + SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -2481,6 +2584,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; #endif + + if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) + _cpu_based_2nd_exec_control &= ~( + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { /* CR3 accesses and invlpg don't need to cause VM Exits when EPT enabled */ @@ -2678,6 +2788,15 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ple()) ple_gap = 0; + if (!cpu_has_vmx_apic_register_virt() || + !cpu_has_vmx_virtual_intr_delivery()) + enable_apicv_reg_vid = 0; + + if (enable_apicv_reg_vid) + kvm_x86_ops->update_cr8_intercept = NULL; + else + kvm_x86_ops->hwapic_irr_update = NULL; + if (nested) nested_vmx_setup_ctls_msrs(); @@ -2689,17 +2808,28 @@ static __exit void hardware_unsetup(void) free_kvm_area(); } -static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) +static bool emulation_required(struct kvm_vcpu *vcpu) { - const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - struct kvm_segment tmp = *save; + return emulate_invalid_guest_state && !guest_state_valid(vcpu); +} - if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { - tmp.base = vmcs_readl(sf->base); - tmp.selector = vmcs_read16(sf->selector); - tmp.s = 1; +static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, + struct kvm_segment *save) +{ + if (!emulate_invalid_guest_state) { + /* + * CS and SS RPL should be equal during guest entry according + * to VMX spec, but in reality it is not always so. Since vcpu + * is in the middle of the transition from real mode to + * protected mode it is safe to assume that RPL 0 is a good + * default value. + */ + if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) + save->selector &= ~SELECTOR_RPL_MASK; + save->dpl = save->selector & SELECTOR_RPL_MASK; + save->s = 1; } - vmx_set_segment(vcpu, &tmp, seg); + vmx_set_segment(vcpu, save, seg); } static void enter_pmode(struct kvm_vcpu *vcpu) @@ -2707,7 +2837,17 @@ static void enter_pmode(struct kvm_vcpu *vcpu) unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->emulation_required = 1; + /* + * Update real mode segment cache. It may be not up-to-date if sement + * register was written while vcpu was in a guest mode. + */ + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); + vmx->rmode.vm86_active = 0; vmx_segment_cache_clear(vmx); @@ -2724,22 +2864,16 @@ static void enter_pmode(struct kvm_vcpu *vcpu) update_exception_bitmap(vcpu); - if (emulate_invalid_guest_state) - return; + fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); + fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); + fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); + fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); + fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); + fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); - fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); - fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); - fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); - fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); - - vmx_segment_cache_clear(vmx); - - vmcs_write16(GUEST_SS_SELECTOR, 0); - vmcs_write32(GUEST_SS_AR_BYTES, 0x93); - - vmcs_write16(GUEST_CS_SELECTOR, - vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); - vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); + /* CPL is always 0 when CPU enters protected mode */ + __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + vmx->cpl = 0; } static gva_t rmode_tss_base(struct kvm *kvm) @@ -2761,36 +2895,51 @@ static gva_t rmode_tss_base(struct kvm *kvm) static void fix_rmode_seg(int seg, struct kvm_segment *save) { const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - vmcs_write16(sf->selector, save->base >> 4); - vmcs_write32(sf->base, save->base & 0xffff0); - vmcs_write32(sf->limit, 0xffff); - vmcs_write32(sf->ar_bytes, 0xf3); - if (save->base & 0xf) - printk_once(KERN_WARNING "kvm: segment base is not paragraph" - " aligned when entering protected mode (seg=%d)", - seg); + struct kvm_segment var = *save; + + var.dpl = 0x3; + if (seg == VCPU_SREG_CS) + var.type = 0x3; + + if (!emulate_invalid_guest_state) { + var.selector = var.base >> 4; + var.base = var.base & 0xffff0; + var.limit = 0xffff; + var.g = 0; + var.db = 0; + var.present = 1; + var.s = 1; + var.l = 0; + var.unusable = 0; + var.type = 0x3; + var.avl = 0; + if (save->base & 0xf) + printk_once(KERN_WARNING "kvm: segment base is not " + "paragraph aligned when entering " + "protected mode (seg=%d)", seg); + } + + vmcs_write16(sf->selector, var.selector); + vmcs_write32(sf->base, var.base); + vmcs_write32(sf->limit, var.limit); + vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); } static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_segment var; - - if (enable_unrestricted_guest) - return; vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); - vmx->emulation_required = 1; vmx->rmode.vm86_active = 1; - /* * Very old userspace does not call KVM_SET_TSS_ADDR before entering * vcpu. Call it here with phys address pointing 16M below 4G. @@ -2818,28 +2967,13 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); update_exception_bitmap(vcpu); - if (emulate_invalid_guest_state) - goto continue_rmode; + fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); + fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); - vmx_get_segment(vcpu, &var, VCPU_SREG_SS); - vmx_set_segment(vcpu, &var, VCPU_SREG_SS); - - vmx_get_segment(vcpu, &var, VCPU_SREG_CS); - vmx_set_segment(vcpu, &var, VCPU_SREG_CS); - - vmx_get_segment(vcpu, &var, VCPU_SREG_ES); - vmx_set_segment(vcpu, &var, VCPU_SREG_ES); - - vmx_get_segment(vcpu, &var, VCPU_SREG_DS); - vmx_set_segment(vcpu, &var, VCPU_SREG_DS); - - vmx_get_segment(vcpu, &var, VCPU_SREG_GS); - vmx_set_segment(vcpu, &var, VCPU_SREG_GS); - - vmx_get_segment(vcpu, &var, VCPU_SREG_FS); - vmx_set_segment(vcpu, &var, VCPU_SREG_FS); - -continue_rmode: kvm_mmu_reset_context(vcpu); } @@ -2998,17 +3132,18 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long hw_cr0; + hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK); if (enable_unrestricted_guest) - hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST) - | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; - else - hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; + hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; + else { + hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; - if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) - enter_pmode(vcpu); + if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) + enter_pmode(vcpu); - if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) - enter_rmode(vcpu); + if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) + enter_rmode(vcpu); + } #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { @@ -3028,7 +3163,9 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + + /* depends on vcpu->arch.cr0 to be set to a new value */ + vmx->emulation_required = emulation_required(vcpu); } static u64 construct_eptp(unsigned long root_hpa) @@ -3085,6 +3222,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; + /* + * SMEP is disabled if CPU is in non-paging mode in + * hardware. However KVM always uses paging mode to + * emulate guest non-paging mode with TDP. + * To emulate this behavior, SMEP needs to be manually + * disabled when guest switches to non-paging mode. + */ + hw_cr4 &= ~X86_CR4_SMEP; } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } @@ -3101,10 +3246,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); u32 ar; - if (vmx->rmode.vm86_active - && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES - || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS - || seg == VCPU_SREG_GS)) { + if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { *var = vmx->rmode.segs[seg]; if (seg == VCPU_SREG_TR || var->selector == vmx_read_guest_seg_selector(vmx, seg)) @@ -3117,8 +3259,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->limit = vmx_read_guest_seg_limit(vmx, seg); var->selector = vmx_read_guest_seg_selector(vmx, seg); ar = vmx_read_guest_seg_ar(vmx, seg); - if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) - ar = 0; var->type = ar & 15; var->s = (ar >> 4) & 1; var->dpl = (ar >> 5) & 3; @@ -3141,8 +3281,10 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) return vmx_read_guest_seg_base(to_vmx(vcpu), seg); } -static int __vmx_get_cpl(struct kvm_vcpu *vcpu) +static int vmx_get_cpl(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + if (!is_protmode(vcpu)) return 0; @@ -3150,24 +3292,9 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu) && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ return 3; - return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3; -} - -static int vmx_get_cpl(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations - * fail; use the cache instead. - */ - if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) { - return vmx->cpl; - } - if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - vmx->cpl = __vmx_get_cpl(vcpu); + vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3; } return vmx->cpl; @@ -3199,28 +3326,23 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - u32 ar; vmx_segment_cache_clear(vmx); + if (seg == VCPU_SREG_CS) + __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { - vmcs_write16(sf->selector, var->selector); - vmx->rmode.segs[VCPU_SREG_TR] = *var; - return; + if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { + vmx->rmode.segs[seg] = *var; + if (seg == VCPU_SREG_TR) + vmcs_write16(sf->selector, var->selector); + else if (var->s) + fix_rmode_seg(seg, &vmx->rmode.segs[seg]); + goto out; } + vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (vmx->rmode.vm86_active && var->s) { - vmx->rmode.segs[seg] = *var; - /* - * Hack real-mode segments into vm86 compatibility. - */ - if (var->base == 0xffff0000 && var->selector == 0xf000) - vmcs_writel(sf->base, 0xf0000); - ar = 0xf3; - } else - ar = vmx_segment_access_rights(var); /* * Fix the "Accessed" bit in AR field of segment registers for older @@ -3234,42 +3356,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, * kvm hack. */ if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) - ar |= 0x1; /* Accessed */ + var->type |= 0x1; /* Accessed */ - vmcs_write32(sf->ar_bytes, ar); - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); - /* - * Fix segments for real mode guest in hosts that don't have - * "unrestricted_mode" or it was disabled. - * This is done to allow migration of the guests from hosts with - * unrestricted guest like Westmere to older host that don't have - * unrestricted guest like Nehelem. - */ - if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { - switch (seg) { - case VCPU_SREG_CS: - vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) - vmcs_writel(GUEST_CS_BASE, 0xf0000); - vmcs_write16(GUEST_CS_SELECTOR, - vmcs_readl(GUEST_CS_BASE) >> 4); - break; - case VCPU_SREG_ES: - case VCPU_SREG_DS: - case VCPU_SREG_GS: - case VCPU_SREG_FS: - fix_rmode_seg(seg, &vmx->rmode.segs[seg]); - break; - case VCPU_SREG_SS: - vmcs_write16(GUEST_SS_SELECTOR, - vmcs_readl(GUEST_SS_BASE) >> 4); - vmcs_write32(GUEST_SS_LIMIT, 0xffff); - vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); - break; - } - } +out: + vmx->emulation_required |= emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -3310,13 +3402,16 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) u32 ar; vmx_get_segment(vcpu, &var, seg); + var.dpl = 0x3; + if (seg == VCPU_SREG_CS) + var.type = 0x3; ar = vmx_segment_access_rights(&var); if (var.base != (var.selector << 4)) return false; - if (var.limit < 0xffff) + if (var.limit != 0xffff) return false; - if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3) + if (ar != 0xf3) return false; return true; @@ -3451,6 +3546,9 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) */ static bool guest_state_valid(struct kvm_vcpu *vcpu) { + if (enable_unrestricted_guest) + return true; + /* real mode guest state checks */ if (!is_protmode(vcpu)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) @@ -3574,12 +3672,9 @@ static void seg_setup(int seg) vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); - if (enable_unrestricted_guest) { - ar = 0x93; - if (seg == VCPU_SREG_CS) - ar |= 0x08; /* code segment */ - } else - ar = 0xf3; + ar = 0x93; + if (seg == VCPU_SREG_CS) + ar |= 0x08; /* code segment */ vmcs_write32(sf->ar_bytes, ar); } @@ -3597,7 +3692,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm_userspace_mem.flags = 0; kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); if (r) goto out; @@ -3627,7 +3722,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) kvm_userspace_mem.guest_phys_addr = kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); if (r) goto out; @@ -3669,7 +3764,45 @@ static void free_vpid(struct vcpu_vmx *vmx) spin_unlock(&vmx_vpid_lock); } -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) +#define MSR_TYPE_R 1 +#define MSR_TYPE_W 2 +static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type) +{ + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return; + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + if (msr <= 0x1fff) { + if (type & MSR_TYPE_R) + /* read-low */ + __clear_bit(msr, msr_bitmap + 0x000 / f); + + if (type & MSR_TYPE_W) + /* write-low */ + __clear_bit(msr, msr_bitmap + 0x800 / f); + + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if (type & MSR_TYPE_R) + /* read-high */ + __clear_bit(msr, msr_bitmap + 0x400 / f); + + if (type & MSR_TYPE_W) + /* write-high */ + __clear_bit(msr, msr_bitmap + 0xc00 / f); + + } +} + +static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type) { int f = sizeof(unsigned long); @@ -3682,20 +3815,58 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. */ if (msr <= 0x1fff) { - __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ - __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ + if (type & MSR_TYPE_R) + /* read-low */ + __set_bit(msr, msr_bitmap + 0x000 / f); + + if (type & MSR_TYPE_W) + /* write-low */ + __set_bit(msr, msr_bitmap + 0x800 / f); + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { msr &= 0x1fff; - __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ - __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ + if (type & MSR_TYPE_R) + /* read-high */ + __set_bit(msr, msr_bitmap + 0x400 / f); + + if (type & MSR_TYPE_W) + /* write-high */ + __set_bit(msr, msr_bitmap + 0xc00 / f); + } } static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) { if (!longmode_only) - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, + msr, MSR_TYPE_R | MSR_TYPE_W); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, + msr, MSR_TYPE_R | MSR_TYPE_W); +} + +static void vmx_enable_intercept_msr_read_x2apic(u32 msr) +{ + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_R); + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_R); +} + +static void vmx_disable_intercept_msr_read_x2apic(u32 msr) +{ + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_R); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_R); +} + +static void vmx_disable_intercept_msr_write_x2apic(u32 msr) +{ + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_W); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_W); } /* @@ -3774,6 +3945,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static int vmx_vm_has_apicv(struct kvm *kvm) +{ + return enable_apicv_reg_vid && irqchip_in_kernel(kvm); +} + static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; @@ -3791,6 +3967,10 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (!ple_gap) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; + if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; return exec_control; } @@ -3835,6 +4015,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx_secondary_exec_control(vmx)); } + if (enable_apicv_reg_vid) { + vmcs_write64(EOI_EXIT_BITMAP0, 0); + vmcs_write64(EOI_EXIT_BITMAP1, 0); + vmcs_write64(EOI_EXIT_BITMAP2, 0); + vmcs_write64(EOI_EXIT_BITMAP3, 0); + + vmcs_write16(GUEST_INTR_STATUS, 0); + } + if (ple_gap) { vmcs_write32(PLE_GAP, ple_gap); vmcs_write32(PLE_WINDOW, ple_window); @@ -3897,8 +4086,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); set_cr4_guest_host_mask(vmx); - kvm_write_tsc(&vmx->vcpu, 0); - return 0; } @@ -3908,8 +4095,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) u64 msr; int ret; - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - vmx->rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; @@ -3921,21 +4106,12 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) msr |= MSR_IA32_APICBASE_BSP; kvm_set_apic_base(&vmx->vcpu, msr); - ret = fx_init(&vmx->vcpu); - if (ret != 0) - goto out; - vmx_segment_cache_clear(vmx); seg_setup(VCPU_SREG_CS); - /* - * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode - * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. - */ - if (kvm_vcpu_is_bsp(&vmx->vcpu)) { + if (kvm_vcpu_is_bsp(&vmx->vcpu)) vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - vmcs_writel(GUEST_CS_BASE, 0x000f0000); - } else { + else { vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); } @@ -3965,7 +4141,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) kvm_rip_write(vcpu, 0xfff0); else kvm_rip_write(vcpu, 0); - kvm_register_write(vcpu, VCPU_REGS_RSP, 0); vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); @@ -4012,10 +4187,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) ret = 0; - /* HACK: Don't enable emulation on guest boot/reset */ - vmx->emulation_required = 0; - -out: return ret; } @@ -4191,7 +4362,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) .flags = 0, }; - ret = kvm_set_memory_region(kvm, &tss_mem, 0); + ret = kvm_set_memory_region(kvm, &tss_mem, false); if (ret) return ret; kvm->arch.tss_addr = addr; @@ -4201,28 +4372,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) return 0; } -static int handle_rmode_exception(struct kvm_vcpu *vcpu, - int vec, u32 err_code) +static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) { - /* - * Instruction with address size override prefix opcode 0x67 - * Cause the #SS fault with 0 error code in VM86 mode. - */ - if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) - if (emulate_instruction(vcpu, 0) == EMULATE_DONE) - return 1; - /* - * Forward all other exceptions that are valid in real mode. - * FIXME: Breaks guest debugging in real mode, needs to be fixed with - * the required debugging infrastructure rework. - */ switch (vec) { - case DB_VECTOR: - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - return 0; - kvm_queue_exception(vcpu, vec); - return 1; case BP_VECTOR: /* * Update instruction length as we may reinject the exception @@ -4231,7 +4383,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - return 0; + return false; + /* fall through */ + case DB_VECTOR: + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return false; /* fall through */ case DE_VECTOR: case OF_VECTOR: @@ -4241,10 +4398,37 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, case SS_VECTOR: case GP_VECTOR: case MF_VECTOR: - kvm_queue_exception(vcpu, vec); - return 1; + return true; + break; } - return 0; + return false; +} + +static int handle_rmode_exception(struct kvm_vcpu *vcpu, + int vec, u32 err_code) +{ + /* + * Instruction with address size override prefix opcode 0x67 + * Cause the #SS fault with 0 error code in VM86 mode. + */ + if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { + if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { + if (vcpu->arch.halt_request) { + vcpu->arch.halt_request = 0; + return kvm_emulate_halt(vcpu); + } + return 1; + } + return 0; + } + + /* + * Forward all other exceptions that are valid in real mode. + * FIXME: Breaks guest debugging in real mode, needs to be fixed with + * the required debugging infrastructure rework. + */ + kvm_queue_exception(vcpu, vec); + return 1; } /* @@ -4287,16 +4471,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (is_machine_check(intr_info)) return handle_machine_check(vcpu); - if ((vect_info & VECTORING_INFO_VALID_MASK) && - !is_page_fault(intr_info)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = vect_info; - vcpu->run->internal.data[1] = intr_info; - return 0; - } - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ @@ -4315,6 +4489,22 @@ static int handle_exception(struct kvm_vcpu *vcpu) error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + + /* + * The #PF with PFEC.RSVD = 1 indicates the guest is accessing + * MMIO, it is better to report an internal error. + * See the comments in vmx_handle_exit. + */ + if ((vect_info & VECTORING_INFO_VALID_MASK) && + !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vect_info; + vcpu->run->internal.data[1] = intr_info; + return 0; + } + if (is_page_fault(intr_info)) { /* EPT won't cause page fault directly */ BUG_ON(enable_ept); @@ -4326,17 +4516,11 @@ static int handle_exception(struct kvm_vcpu *vcpu) return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); } - if (vmx->rmode.vm86_active && - handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, - error_code)) { - if (vcpu->arch.halt_request) { - vcpu->arch.halt_request = 0; - return kvm_emulate_halt(vcpu); - } - return 1; - } - ex_no = intr_info & INTR_INFO_VECTOR_MASK; + + if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) + return handle_rmode_exception(vcpu, ex_no, error_code); + switch (ex_no) { case DB_VECTOR: dr6 = vmcs_readl(EXIT_QUALIFICATION); @@ -4626,11 +4810,15 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) static int handle_wrmsr(struct kvm_vcpu *vcpu) { + struct msr_data msr; u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - if (vmx_set_msr(vcpu, ecx, data) != 0) { + msr.data = data; + msr.index = ecx; + msr.host_initiated = false; + if (vmx_set_msr(vcpu, &msr) != 0) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; @@ -4750,6 +4938,26 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) return emulate_instruction(vcpu, 0) == EMULATE_DONE; } +static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + int vector = exit_qualification & 0xff; + + /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ + kvm_apic_set_eoi_accelerated(vcpu, vector); + return 1; +} + +static int handle_apic_write(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 offset = exit_qualification & 0xfff; + + /* APIC-write VM exit is trap-like and thus no need to adjust IP */ + kvm_apic_write_nodecode(vcpu, offset); + return 1; +} + static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4827,11 +5035,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - if (exit_qualification & (1 << 6)) { - printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); - return -EINVAL; - } - gla_validity = (exit_qualification >> 7) & 0x3; if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); @@ -5000,7 +5203,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) schedule(); } - vmx->emulation_required = !guest_state_valid(vcpu); + vmx->emulation_required = emulation_required(vcpu); out: return ret; } @@ -5689,6 +5892,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_VMON] = handle_vmon, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, + [EXIT_REASON_APIC_WRITE] = handle_apic_write, + [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, [EXIT_REASON_WBINVD] = handle_wbinvd, [EXIT_REASON_XSETBV] = handle_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, @@ -5715,7 +5920,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; gpa_t bitmap; - if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS)) + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return 1; /* @@ -5943,7 +6148,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 vectoring_info = vmx->idt_vectoring_info; /* If guest state is invalid, start emulating */ - if (vmx->emulation_required && emulate_invalid_guest_state) + if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); /* @@ -5979,13 +6184,24 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) return 0; } + /* + * Note: + * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by + * delivery event since it indicates guest is accessing MMIO. + * The vm-exit can be triggered again after return to guest that + * will cause infinite loop. + */ if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && - exit_reason != EXIT_REASON_TASK_SWITCH)) - printk(KERN_WARNING "%s: unexpected, valid vectoring info " - "(0x%x) and exit reason is 0x%x\n", - __func__, vectoring_info, exit_reason); + exit_reason != EXIT_REASON_TASK_SWITCH)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vectoring_info; + vcpu->run->internal.data[1] = exit_reason; + return 0; + } if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( @@ -6027,6 +6243,85 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) vmcs_write32(TPR_THRESHOLD, irr); } +static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) +{ + u32 sec_exec_control; + + /* + * There is not point to enable virtualize x2apic without enable + * apicv + */ + if (!cpu_has_vmx_virtualize_x2apic_mode() || + !vmx_vm_has_apicv(vcpu->kvm)) + return; + + if (!vm_need_tpr_shadow(vcpu->kvm)) + return; + + sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + if (set) { + sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + } else { + sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); + + vmx_set_msr_bitmap(vcpu); +} + +static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) +{ + u16 status; + u8 old; + + if (!vmx_vm_has_apicv(kvm)) + return; + + if (isr == -1) + isr = 0; + + status = vmcs_read16(GUEST_INTR_STATUS); + old = status >> 8; + if (isr != old) { + status &= 0xff; + status |= isr << 8; + vmcs_write16(GUEST_INTR_STATUS, status); + } +} + +static void vmx_set_rvi(int vector) +{ + u16 status; + u8 old; + + status = vmcs_read16(GUEST_INTR_STATUS); + old = (u8)status & 0xff; + if ((u8)vector != old) { + status &= ~0xff; + status |= (u8)vector; + vmcs_write16(GUEST_INTR_STATUS, status); + } +} + +static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) +{ + if (max_irr == -1) + return; + + vmx_set_rvi(max_irr); +} + +static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); + vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); + vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); + vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); +} + static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -6215,7 +6510,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ - if (vmx->emulation_required && emulate_invalid_guest_state) + if (vmx->emulation_required) return; if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) @@ -6549,19 +6844,22 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) } } - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); /* Exposing INVPCID only when PCID is exposed */ best = kvm_find_cpuid_entry(vcpu, 0x7, 0); if (vmx_invpcid_supported() && best && (best->ebx & bit(X86_FEATURE_INVPCID)) && guest_cpuid_has_pcid(vcpu)) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } else { - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); + if (cpu_has_secondary_exec_ctrls()) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } @@ -7287,6 +7585,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, + .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, + .vm_has_apicv = vmx_vm_has_apicv, + .load_eoi_exitmap = vmx_load_eoi_exitmap, + .hwapic_irr_update = vmx_hwapic_irr_update, + .hwapic_isr_update = vmx_hwapic_isr_update, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, @@ -7306,6 +7609,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .set_tsc_khz = vmx_set_tsc_khz, + .read_tsc_offset = vmx_read_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, .compute_tsc_offset = vmx_compute_tsc_offset, @@ -7318,7 +7622,7 @@ static struct kvm_x86_ops vmx_x86_ops = { static int __init vmx_init(void) { - int r, i; + int r, i, msr; rdmsrl_safe(MSR_EFER, &host_efer); @@ -7339,11 +7643,19 @@ static int __init vmx_init(void) if (!vmx_msr_bitmap_legacy) goto out1; + vmx_msr_bitmap_legacy_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy_x2apic) + goto out2; vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode) - goto out2; + goto out3; + vmx_msr_bitmap_longmode_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode_x2apic) + goto out4; /* * Allow direct access to the PC debug port (it is often used for I/O @@ -7364,12 +7676,39 @@ static int __init vmx_init(void) if (r) goto out3; +#ifdef CONFIG_KEXEC + rcu_assign_pointer(crash_vmclear_loaded_vmcss, + crash_vmclear_local_loaded_vmcss); +#endif + vmx_disable_intercept_for_msr(MSR_FS_BASE, false); vmx_disable_intercept_for_msr(MSR_GS_BASE, false); vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + memcpy(vmx_msr_bitmap_legacy_x2apic, + vmx_msr_bitmap_legacy, PAGE_SIZE); + memcpy(vmx_msr_bitmap_longmode_x2apic, + vmx_msr_bitmap_longmode, PAGE_SIZE); + + if (enable_apicv_reg_vid) { + for (msr = 0x800; msr <= 0x8ff; msr++) + vmx_disable_intercept_msr_read_x2apic(msr); + + /* According SDM, in x2apic mode, the whole id reg is used. + * But in KVM, it only use the highest eight bits. Need to + * intercept it */ + vmx_enable_intercept_msr_read_x2apic(0x802); + /* TMCCT */ + vmx_enable_intercept_msr_read_x2apic(0x839); + /* TPR */ + vmx_disable_intercept_msr_write_x2apic(0x808); + /* EOI */ + vmx_disable_intercept_msr_write_x2apic(0x80b); + /* SELF-IPI */ + vmx_disable_intercept_msr_write_x2apic(0x83f); + } if (enable_ept) { kvm_mmu_set_mask_ptes(0ull, @@ -7383,8 +7722,10 @@ static int __init vmx_init(void) return 0; -out3: +out4: free_page((unsigned long)vmx_msr_bitmap_longmode); +out3: + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); out2: free_page((unsigned long)vmx_msr_bitmap_legacy); out1: @@ -7396,11 +7737,18 @@ out: static void __exit vmx_exit(void) { + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); free_page((unsigned long)vmx_msr_bitmap_legacy); free_page((unsigned long)vmx_msr_bitmap_longmode); free_page((unsigned long)vmx_io_bitmap_b); free_page((unsigned long)vmx_io_bitmap_a); +#ifdef CONFIG_KEXEC + rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL); + synchronize_rcu(); +#endif + kvm_exit(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 224a7e78cb6..f71500af1f8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -46,6 +46,8 @@ #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/pci.h> +#include <linux/timekeeper_internal.h> +#include <linux/pvclock_gtod.h> #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS @@ -118,7 +120,7 @@ struct kvm_shared_msrs { }; static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; -static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs); +static struct kvm_shared_msrs __percpu *shared_msrs; struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, @@ -158,7 +160,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { u64 __read_mostly host_xcr0; -int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); +static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); + +static int kvm_vcpu_reset(struct kvm_vcpu *vcpu); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { @@ -187,10 +191,10 @@ static void kvm_on_user_return(struct user_return_notifier *urn) static void shared_msr_update(unsigned slot, u32 msr) { - struct kvm_shared_msrs *smsr; u64 value; + unsigned int cpu = smp_processor_id(); + struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); - smsr = &__get_cpu_var(shared_msrs); /* only read, and nobody should modify it at this time, * so don't need lock */ if (slot >= shared_msrs_global.nr) { @@ -222,7 +226,8 @@ static void kvm_shared_msr_cpu_online(void) void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) { - struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); + unsigned int cpu = smp_processor_id(); + struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); if (((value ^ smsr->values[slot].curr) & mask) == 0) return; @@ -238,7 +243,8 @@ EXPORT_SYMBOL_GPL(kvm_set_shared_msr); static void drop_user_return_notifiers(void *ignore) { - struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); + unsigned int cpu = smp_processor_id(); + struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); if (smsr->registered) kvm_on_user_return(&smsr->urn); @@ -633,7 +639,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) } if (is_long_mode(vcpu)) { - if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { + if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) { if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) return 1; } else @@ -827,6 +833,7 @@ static u32 msrs_to_save[] = { static unsigned num_msrs_to_save; static const u32 emulated_msrs[] = { + MSR_IA32_TSC_ADJUST, MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, @@ -865,8 +872,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) kvm_x86_ops->set_efer(vcpu, efer); - vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; - /* Update reserved bits */ if ((efer ^ old_efer) & EFER_NX) kvm_mmu_reset_context(vcpu); @@ -886,9 +891,9 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { - return kvm_x86_ops->set_msr(vcpu, msr_index, data); + return kvm_x86_ops->set_msr(vcpu, msr); } /* @@ -896,8 +901,62 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) */ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - return kvm_set_msr(vcpu, index, *data); + struct msr_data msr; + + msr.data = *data; + msr.index = index; + msr.host_initiated = true; + return kvm_set_msr(vcpu, &msr); +} + +#ifdef CONFIG_X86_64 +struct pvclock_gtod_data { + seqcount_t seq; + + struct { /* extract of a clocksource struct */ + int vclock_mode; + cycle_t cycle_last; + cycle_t mask; + u32 mult; + u32 shift; + } clock; + + /* open coded 'struct timespec' */ + u64 monotonic_time_snsec; + time_t monotonic_time_sec; +}; + +static struct pvclock_gtod_data pvclock_gtod_data; + +static void update_pvclock_gtod(struct timekeeper *tk) +{ + struct pvclock_gtod_data *vdata = &pvclock_gtod_data; + + write_seqcount_begin(&vdata->seq); + + /* copy pvclock gtod data */ + vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; + vdata->clock.cycle_last = tk->clock->cycle_last; + vdata->clock.mask = tk->clock->mask; + vdata->clock.mult = tk->mult; + vdata->clock.shift = tk->shift; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->xtime_nsec + + (tk->wall_to_monotonic.tv_nsec + << tk->shift); + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->shift; + vdata->monotonic_time_sec++; + } + + write_seqcount_end(&vdata->seq); } +#endif + static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { @@ -995,6 +1054,10 @@ static inline u64 get_kernel_ns(void) return timespec_to_ns(&ts); } +#ifdef CONFIG_X86_64 +static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); +#endif + static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); unsigned long max_tsc_khz; @@ -1046,12 +1109,47 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) return tsc; } -void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) +void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + bool vcpus_matched; + bool do_request = false; + struct kvm_arch *ka = &vcpu->kvm->arch; + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + + vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == + atomic_read(&vcpu->kvm->online_vcpus)); + + if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC) + if (!ka->use_master_clock) + do_request = 1; + + if (!vcpus_matched && ka->use_master_clock) + do_request = 1; + + if (do_request) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + + trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, + atomic_read(&vcpu->kvm->online_vcpus), + ka->use_master_clock, gtod->clock.vclock_mode); +#endif +} + +static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) +{ + u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu); + vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; +} + +void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; s64 usdiff; + bool matched; + u64 data = msr->data; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); @@ -1094,6 +1192,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } + matched = true; } else { /* * We split periods of matched TSC writes into generations. @@ -1108,6 +1207,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) kvm->arch.cur_tsc_nsec = ns; kvm->arch.cur_tsc_write = data; kvm->arch.cur_tsc_offset = offset; + matched = false; pr_debug("kvm: new tsc generation %u, clock %llu\n", kvm->arch.cur_tsc_generation, data); } @@ -1129,26 +1229,195 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; + if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) + update_ia32_tsc_adjust_msr(vcpu, offset); kvm_x86_ops->write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + spin_lock(&kvm->arch.pvclock_gtod_sync_lock); + if (matched) + kvm->arch.nr_vcpus_matched_tsc++; + else + kvm->arch.nr_vcpus_matched_tsc = 0; + + kvm_track_tsc_matching(vcpu); + spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); } EXPORT_SYMBOL_GPL(kvm_write_tsc); +#ifdef CONFIG_X86_64 + +static cycle_t read_tsc(void) +{ + cycle_t ret; + u64 last; + + /* + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + + last = pvclock_gtod_data.clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a funciton of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; +} + +static inline u64 vgettsc(cycle_t *cycle_now) +{ + long v; + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + + *cycle_now = read_tsc(); + + v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask; + return v * gtod->clock.mult; +} + +static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) +{ + unsigned long seq; + u64 ns; + int mode; + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + + ts->tv_nsec = 0; + do { + seq = read_seqcount_begin(>od->seq); + mode = gtod->clock.vclock_mode; + ts->tv_sec = gtod->monotonic_time_sec; + ns = gtod->monotonic_time_snsec; + ns += vgettsc(cycle_now); + ns >>= gtod->clock.shift; + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + timespec_add_ns(ts, ns); + + return mode; +} + +/* returns true if host is using tsc clocksource */ +static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) +{ + struct timespec ts; + + /* checked again under seqlock below */ + if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + return false; + + if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC) + return false; + + monotonic_to_bootbased(&ts); + *kernel_ns = timespec_to_ns(&ts); + + return true; +} +#endif + +/* + * + * Assuming a stable TSC across physical CPUS, and a stable TSC + * across virtual CPUs, the following condition is possible. + * Each numbered line represents an event visible to both + * CPUs at the next numbered event. + * + * "timespecX" represents host monotonic time. "tscX" represents + * RDTSC value. + * + * VCPU0 on CPU0 | VCPU1 on CPU1 + * + * 1. read timespec0,tsc0 + * 2. | timespec1 = timespec0 + N + * | tsc1 = tsc0 + M + * 3. transition to guest | transition to guest + * 4. ret0 = timespec0 + (rdtsc - tsc0) | + * 5. | ret1 = timespec1 + (rdtsc - tsc1) + * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) + * + * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: + * + * - ret0 < ret1 + * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) + * ... + * - 0 < N - M => M < N + * + * That is, when timespec0 != timespec1, M < N. Unfortunately that is not + * always the case (the difference between two distinct xtime instances + * might be smaller then the difference between corresponding TSC reads, + * when updating guest vcpus pvclock areas). + * + * To avoid that problem, do not allow visibility of distinct + * system_timestamp/tsc_timestamp values simultaneously: use a master + * copy of host monotonic time values. Update that master copy + * in lockstep. + * + * Rely on synchronization of host TSCs and guest TSCs for monotonicity. + * + */ + +static void pvclock_update_vm_gtod_copy(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + struct kvm_arch *ka = &kvm->arch; + int vclock_mode; + bool host_tsc_clocksource, vcpus_matched; + + vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == + atomic_read(&kvm->online_vcpus)); + + /* + * If the host uses TSC clock, then passthrough TSC as stable + * to the guest. + */ + host_tsc_clocksource = kvm_get_time_and_clockread( + &ka->master_kernel_ns, + &ka->master_cycle_now); + + ka->use_master_clock = host_tsc_clocksource & vcpus_matched; + + if (ka->use_master_clock) + atomic_set(&kvm_guest_has_master_clock, 1); + + vclock_mode = pvclock_gtod_data.clock.vclock_mode; + trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode, + vcpus_matched); +#endif +} + static int kvm_guest_time_update(struct kvm_vcpu *v) { - unsigned long flags; + unsigned long flags, this_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; + struct kvm_arch *ka = &v->kvm->arch; void *shared_kaddr; - unsigned long this_tsc_khz; s64 kernel_ns, max_kernel_ns; - u64 tsc_timestamp; + u64 tsc_timestamp, host_tsc; + struct pvclock_vcpu_time_info *guest_hv_clock; u8 pvclock_flags; + bool use_master_clock; + + kernel_ns = 0; + host_tsc = 0; /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); - kernel_ns = get_kernel_ns(); this_tsc_khz = __get_cpu_var(cpu_tsc_khz); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); @@ -1157,6 +1426,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) } /* + * If the host uses TSC clock, then passthrough TSC as stable + * to the guest. + */ + spin_lock(&ka->pvclock_gtod_sync_lock); + use_master_clock = ka->use_master_clock; + if (use_master_clock) { + host_tsc = ka->master_cycle_now; + kernel_ns = ka->master_kernel_ns; + } + spin_unlock(&ka->pvclock_gtod_sync_lock); + if (!use_master_clock) { + host_tsc = native_read_tsc(); + kernel_ns = get_kernel_ns(); + } + + tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc); + + /* * We may have to catch up the TSC to match elapsed wall clock * time for two reasons, even if kvmclock is used. * 1) CPU could have been running below the maximum TSC rate @@ -1217,23 +1504,20 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hw_tsc_khz = this_tsc_khz; } - if (max_kernel_ns > kernel_ns) - kernel_ns = max_kernel_ns; - + /* with a master <monotonic time, tsc value> tuple, + * pvclock clock reads always increase at the (scaled) rate + * of guest TSC - no need to deal with sampling errors. + */ + if (!use_master_clock) { + if (max_kernel_ns > kernel_ns) + kernel_ns = max_kernel_ns; + } /* With all the info we got, fill in the values */ vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_kernel_ns = kernel_ns; vcpu->last_guest_tsc = tsc_timestamp; - pvclock_flags = 0; - if (vcpu->pvclock_set_guest_stopped_request) { - pvclock_flags |= PVCLOCK_GUEST_STOPPED; - vcpu->pvclock_set_guest_stopped_request = false; - } - - vcpu->hv_clock.flags = pvclock_flags; - /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate @@ -1243,6 +1527,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) shared_kaddr = kmap_atomic(vcpu->time_page); + guest_hv_clock = shared_kaddr + vcpu->time_offset; + + /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ + pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); + + if (vcpu->pvclock_set_guest_stopped_request) { + pvclock_flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + + /* If the host uses TSC clocksource, then it is stable */ + if (use_master_clock) + pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; + + vcpu->hv_clock.flags = pvclock_flags; + memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, sizeof(vcpu->hv_clock)); @@ -1572,11 +1872,21 @@ static void record_steal_time(struct kvm_vcpu *vcpu) &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) +int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { bool pr = false; + u32 msr = msr_info->index; + u64 data = msr_info->data; switch (msr) { + case MSR_AMD64_NB_CFG: + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case MSR_VM_HSAVE_PA: + case MSR_AMD64_PATCH_LOADER: + case MSR_AMD64_BU_CFG2: + break; + case MSR_EFER: return set_efer(vcpu, data); case MSR_K7_HWCR: @@ -1596,8 +1906,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 1; } break; - case MSR_AMD64_NB_CFG: - break; case MSR_IA32_DEBUGCTLMSR: if (!data) { /* We support the non-activated case already */ @@ -1610,11 +1918,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", __func__, data); break; - case MSR_IA32_UCODE_REV: - case MSR_IA32_UCODE_WRITE: - case MSR_VM_HSAVE_PA: - case MSR_AMD64_PATCH_LOADER: - break; case 0x200 ... 0x2ff: return set_msr_mtrr(vcpu, msr, data); case MSR_IA32_APICBASE: @@ -1625,6 +1928,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_TSCDEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); break; + case MSR_IA32_TSC_ADJUST: + if (guest_cpuid_has_tsc_adjust(vcpu)) { + if (!msr_info->host_initiated) { + u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; + kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true); + } + vcpu->arch.ia32_tsc_adjust_msr = data; + } + break; case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; @@ -1940,6 +2252,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: case MSR_FAM10H_MMIO_CONF_BASE: + case MSR_AMD64_BU_CFG2: data = 0; break; case MSR_P6_PERFCTR0: @@ -1984,6 +2297,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_TSCDEADLINE: data = kvm_get_lapic_tscdeadline_msr(vcpu); break; + case MSR_IA32_TSC_ADJUST: + data = (u64)vcpu->arch.ia32_tsc_adjust_msr; + break; case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; @@ -2204,7 +2520,7 @@ int kvm_dev_ioctl_check_extension(long ext) r = KVM_MAX_VCPUS; break; case KVM_CAP_NR_MEMSLOTS: - r = KVM_MEMORY_SLOTS; + r = KVM_USER_MEM_SLOTS; break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; @@ -2342,7 +2658,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + /* + * On a host with synchronized TSC, there is no need to update + * kvmclock on vcpu->cpu migration + */ + if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; @@ -2691,15 +3012,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (!vcpu->arch.apic) goto out; u.lapic = memdup_user(argp, sizeof(*u.lapic)); - if (IS_ERR(u.lapic)) { - r = PTR_ERR(u.lapic); - goto out; - } + if (IS_ERR(u.lapic)) + return PTR_ERR(u.lapic); r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); - if (r) - goto out; - r = 0; break; } case KVM_INTERRUPT: { @@ -2709,16 +3025,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&irq, argp, sizeof irq)) goto out; r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); - if (r) - goto out; - r = 0; break; } case KVM_NMI: { r = kvm_vcpu_ioctl_nmi(vcpu); - if (r) - goto out; - r = 0; break; } case KVM_SET_CPUID: { @@ -2729,8 +3039,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); - if (r) - goto out; break; } case KVM_SET_CPUID2: { @@ -2742,8 +3050,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, goto out; r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, cpuid_arg->entries); - if (r) - goto out; break; } case KVM_GET_CPUID2: { @@ -2875,10 +3181,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_XSAVE: { u.xsave = memdup_user(argp, sizeof(*u.xsave)); - if (IS_ERR(u.xsave)) { - r = PTR_ERR(u.xsave); - goto out; - } + if (IS_ERR(u.xsave)) + return PTR_ERR(u.xsave); r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; @@ -2900,10 +3204,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_XCRS: { u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); - if (IS_ERR(u.xcrs)) { - r = PTR_ERR(u.xcrs); - goto out; - } + if (IS_ERR(u.xcrs)) + return PTR_ERR(u.xcrs); r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; @@ -2951,7 +3253,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) int ret; if (addr > (unsigned int)(-3 * PAGE_SIZE)) - return -1; + return -EINVAL; ret = kvm_x86_ops->set_tss_addr(kvm, addr); return ret; } @@ -2970,12 +3272,10 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return -EINVAL; mutex_lock(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; - spin_unlock(&kvm->mmu_lock); mutex_unlock(&kvm->slots_lock); return 0; } @@ -3135,7 +3435,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) mutex_lock(&kvm->slots_lock); r = -EINVAL; - if (log->slot >= KVM_MEMORY_SLOTS) + if (log->slot >= KVM_USER_MEM_SLOTS) goto out; memslot = id_to_memslot(kvm->memslots, log->slot); @@ -3212,8 +3512,6 @@ long kvm_arch_vm_ioctl(struct file *filp, switch (ioctl) { case KVM_SET_TSS_ADDR: r = kvm_vm_ioctl_set_tss_addr(kvm, arg); - if (r < 0) - goto out; break; case KVM_SET_IDENTITY_MAP_ADDR: { u64 ident_addr; @@ -3222,14 +3520,10 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) goto out; r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); - if (r < 0) - goto out; break; } case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); - if (r) - goto out; break; case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); @@ -3320,8 +3614,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; get_irqchip_out: kfree(chip); - if (r) - goto out; break; } case KVM_SET_IRQCHIP: { @@ -3343,8 +3635,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; set_irqchip_out: kfree(chip); - if (r) - goto out; break; } case KVM_GET_PIT: { @@ -3371,9 +3661,6 @@ long kvm_arch_vm_ioctl(struct file *filp, if (!kvm->arch.vpit) goto out; r = kvm_vm_ioctl_set_pit(kvm, &u.ps); - if (r) - goto out; - r = 0; break; } case KVM_GET_PIT2: { @@ -3397,9 +3684,6 @@ long kvm_arch_vm_ioctl(struct file *filp, if (!kvm->arch.vpit) goto out; r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); - if (r) - goto out; - r = 0; break; } case KVM_REINJECT_CONTROL: { @@ -3408,9 +3692,6 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&control, argp, sizeof(control))) goto out; r = kvm_vm_ioctl_reinject(kvm, &control); - if (r) - goto out; - r = 0; break; } case KVM_XEN_HVM_CONFIG: { @@ -4210,8 +4491,10 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); *selector = var.selector; - if (var.unusable) + if (var.unusable) { + memset(desc, 0, sizeof(*desc)); return false; + } if (var.g) var.limit >>= 12; @@ -4273,7 +4556,12 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); + struct msr_data msr; + + msr.data = data; + msr.index = msr_index; + msr.host_initiated = false; + return kvm_set_msr(emul_to_vcpu(ctxt), &msr); } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -4467,26 +4755,26 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) return r; } -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) +static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, + bool write_fault_to_shadow_pgtable) { - gpa_t gpa; + gpa_t gpa = cr2; pfn_t pfn; - if (tdp_enabled) - return false; - - /* - * if emulation was due to access to shadowed page table - * and it failed try to unshadow page and re-enter the - * guest to let CPU execute the instruction. - */ - if (kvm_mmu_unprotect_page_virt(vcpu, gva)) - return true; - - gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); + if (!vcpu->arch.mmu.direct_map) { + /* + * Write permission should be allowed since only + * write access need to be emulated. + */ + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); - if (gpa == UNMAPPED_GVA) - return true; /* let cpu generate fault */ + /* + * If the mapping is invalid in guest, let cpu retry + * it to generate fault. + */ + if (gpa == UNMAPPED_GVA) + return true; + } /* * Do not retry the unhandleable instruction if it faults on the @@ -4495,12 +4783,43 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) * instruction -> ... */ pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); - if (!is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); + + /* + * If the instruction failed on the error pfn, it can not be fixed, + * report the error to userspace. + */ + if (is_error_noslot_pfn(pfn)) + return false; + + kvm_release_pfn_clean(pfn); + + /* The instructions are well-emulated on direct mmu. */ + if (vcpu->arch.mmu.direct_map) { + unsigned int indirect_shadow_pages; + + spin_lock(&vcpu->kvm->mmu_lock); + indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages; + spin_unlock(&vcpu->kvm->mmu_lock); + + if (indirect_shadow_pages) + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + return true; } - return false; + /* + * if emulation was due to access to shadowed page table + * and it failed try to unshadow page and re-enter the + * guest to let CPU execute the instruction. + */ + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + + /* + * If the access faults on its page table, it can not + * be fixed by unprotecting shadow page and it should + * be reported to userspace. + */ + return !write_fault_to_shadow_pgtable; } static bool retry_instruction(struct x86_emulate_ctxt *ctxt, @@ -4542,7 +4861,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (!vcpu->arch.mmu.direct_map) gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); - kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); return true; } @@ -4559,7 +4878,13 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, int r; struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; bool writeback = true; + bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; + /* + * Clear write_fault_to_shadow_pgtable here to ensure it is + * never reused. + */ + vcpu->arch.write_fault_to_shadow_pgtable = false; kvm_clear_exception_queue(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { @@ -4578,7 +4903,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (r != EMULATION_OK) { if (emulation_type & EMULTYPE_TRAP_UD) return EMULATE_FAIL; - if (reexecute_instruction(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2, + write_fault_to_spt)) return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) return EMULATE_FAIL; @@ -4608,7 +4934,7 @@ restart: return EMULATE_DONE; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2, write_fault_to_spt)) return EMULATE_DONE; return handle_emulation_failure(vcpu); @@ -4881,6 +5207,50 @@ static void kvm_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask(mask); } +#ifdef CONFIG_X86_64 +static void pvclock_gtod_update_fn(struct work_struct *work) +{ + struct kvm *kvm; + + struct kvm_vcpu *vcpu; + int i; + + raw_spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); + atomic_set(&kvm_guest_has_master_clock, 0); + raw_spin_unlock(&kvm_lock); +} + +static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); + +/* + * Notification about pvclock gtod data update. + */ +static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, + void *priv) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + struct timekeeper *tk = priv; + + update_pvclock_gtod(tk); + + /* disable master clock if host does not trust, or does not + * use, TSC clocksource + */ + if (gtod->clock.vclock_mode != VCLOCK_TSC && + atomic_read(&kvm_guest_has_master_clock) != 0) + queue_work(system_long_wq, &pvclock_gtod_work); + + return 0; +} + +static struct notifier_block pvclock_gtod_notifier = { + .notifier_call = pvclock_gtod_notify, +}; +#endif + int kvm_arch_init(void *opaque) { int r; @@ -4903,9 +5273,16 @@ int kvm_arch_init(void *opaque) goto out; } + r = -ENOMEM; + shared_msrs = alloc_percpu(struct kvm_shared_msrs); + if (!shared_msrs) { + printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); + goto out; + } + r = kvm_mmu_module_init(); if (r) - goto out; + goto out_free_percpu; kvm_set_mmio_spte_mask(); kvm_init_msr_list(); @@ -4922,8 +5299,14 @@ int kvm_arch_init(void *opaque) host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_lapic_init(); +#ifdef CONFIG_X86_64 + pvclock_gtod_register_notifier(&pvclock_gtod_notifier); +#endif + return 0; +out_free_percpu: + free_percpu(shared_msrs); out: return r; } @@ -4936,8 +5319,12 @@ void kvm_arch_exit(void) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); +#ifdef CONFIG_X86_64 + pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); +#endif kvm_x86_ops = NULL; kvm_mmu_module_exit(); + free_percpu(shared_msrs); } int kvm_emulate_halt(struct kvm_vcpu *vcpu) @@ -5059,7 +5446,7 @@ out: } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); -int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) +static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); char instruction[3]; @@ -5190,7 +5577,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) vcpu->arch.nmi_injected = true; kvm_x86_ops->set_nmi(vcpu); } - } else if (kvm_cpu_has_interrupt(vcpu)) { + } else if (kvm_cpu_has_injectable_intr(vcpu)) { if (kvm_x86_ops->interrupt_allowed(vcpu)) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); @@ -5235,6 +5622,39 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } +static void kvm_gen_update_masterclock(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + int i; + struct kvm_vcpu *vcpu; + struct kvm_arch *ka = &kvm->arch; + + spin_lock(&ka->pvclock_gtod_sync_lock); + kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ + pvclock_update_vm_gtod_copy(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + + /* guest entries allowed */ + kvm_for_each_vcpu(i, vcpu, kvm) + clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + + spin_unlock(&ka->pvclock_gtod_sync_lock); +#endif +} + +static void update_eoi_exitmap(struct kvm_vcpu *vcpu) +{ + u64 eoi_exit_bitmap[4]; + + memset(eoi_exit_bitmap, 0, 32); + + kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap); + kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; @@ -5247,6 +5667,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); + if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) + kvm_gen_update_masterclock(vcpu->kvm); if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { r = kvm_guest_time_update(vcpu); if (unlikely(r)) @@ -5286,6 +5708,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_handle_pmu_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) kvm_deliver_pmi(vcpu); + if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu)) + update_eoi_exitmap(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -5294,10 +5718,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* enable NMI/IRQ window open exits if needed */ if (vcpu->arch.nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { + /* + * Update architecture specific hints for APIC + * virtual interrupt delivery. + */ + if (kvm_x86_ops->hwapic_irr_update) + kvm_x86_ops->hwapic_irr_update(vcpu, + kvm_lapic_find_highest_irr(vcpu)); update_cr8_intercept(vcpu); kvm_lapic_sync_to_vapic(vcpu); } @@ -5362,7 +5793,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, + native_read_tsc()); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); @@ -5419,7 +5851,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) pr_debug("vcpu %d received sipi with vector # %x\n", vcpu->vcpu_id, vcpu->arch.sipi_vector); kvm_lapic_reset(vcpu); - r = kvm_arch_vcpu_reset(vcpu); + r = kvm_vcpu_reset(vcpu); if (r) return r; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -5781,6 +6213,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int pending_vec, max_bits, idx; struct desc_ptr dt; + if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE)) + return -EINVAL; + dt.size = sregs->idt.limit; dt.address = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); @@ -6044,7 +6479,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) r = vcpu_load(vcpu); if (r) return r; - r = kvm_arch_vcpu_reset(vcpu); + r = kvm_vcpu_reset(vcpu); if (r == 0) r = kvm_mmu_setup(vcpu); vcpu_put(vcpu); @@ -6052,6 +6487,23 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return r; } +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + int r; + struct msr_data msr; + + r = vcpu_load(vcpu); + if (r) + return r; + msr.data = 0x0; + msr.index = MSR_IA32_TSC; + msr.host_initiated = true; + kvm_write_tsc(vcpu, &msr); + vcpu_put(vcpu); + + return r; +} + void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { int r; @@ -6066,7 +6518,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_free(vcpu); } -int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) +static int kvm_vcpu_reset(struct kvm_vcpu *vcpu) { atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; @@ -6089,6 +6541,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) kvm_pmu_reset(vcpu); + memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); + vcpu->arch.regs_avail = ~0; + vcpu->arch.regs_dirty = ~0; + return kvm_x86_ops->vcpu_reset(vcpu); } @@ -6165,6 +6621,8 @@ int kvm_arch_hardware_enable(void *garbage) kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.tsc_offset_adjustment += delta_cyc; vcpu->arch.last_host_tsc = local_tsc; + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, + &vcpu->requests); } /* @@ -6255,10 +6713,17 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) goto fail_free_mce_banks; + r = fx_init(vcpu); + if (r) + goto fail_free_wbinvd_dirty_mask; + + vcpu->arch.ia32_tsc_adjust_msr = 0x0; kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); return 0; +fail_free_wbinvd_dirty_mask: + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: kfree(vcpu->arch.mce_banks); fail_free_lapic: @@ -6302,6 +6767,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); + spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); + + pvclock_update_vm_gtod_copy(kvm); return 0; } @@ -6440,48 +6908,43 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, struct kvm_userspace_memory_region *mem, - int user_alloc) + bool user_alloc) { int npages = memslot->npages; - int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; - - /* Prevent internal slot pages from being moved by fork()/COW. */ - if (memslot->id >= KVM_MEMORY_SLOTS) - map_flags = MAP_SHARED | MAP_ANONYMOUS; - /*To keep backward compatibility with older userspace, - *x86 needs to handle !user_alloc case. + /* + * Only private memory slots need to be mapped here since + * KVM_SET_MEMORY_REGION ioctl is no longer supported. */ - if (!user_alloc) { - if (npages && !old.npages) { - unsigned long userspace_addr; + if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) { + unsigned long userspace_addr; - userspace_addr = vm_mmap(NULL, 0, - npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - map_flags, - 0); + /* + * MAP_SHARED to prevent internal slot pages from being moved + * by fork()/COW. + */ + userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, 0); - if (IS_ERR((void *)userspace_addr)) - return PTR_ERR((void *)userspace_addr); + if (IS_ERR((void *)userspace_addr)) + return PTR_ERR((void *)userspace_addr); - memslot->userspace_addr = userspace_addr; - } + memslot->userspace_addr = userspace_addr; } - return 0; } void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old, - int user_alloc) + bool user_alloc) { int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; - if (!user_alloc && !old.user_alloc && old.npages && !npages) { + if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) { int ret; ret = vm_munmap(old.userspace_addr, @@ -6495,11 +6958,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, if (!kvm->arch.n_requested_mmu_pages) nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); - spin_lock(&kvm->mmu_lock); if (nr_mmu_pages) kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - kvm_mmu_slot_remove_write_access(kvm, mem->slot); - spin_unlock(&kvm->mmu_lock); + /* + * Write protect all pages for dirty logging. + * Existing largepage mappings are destroyed here and new ones will + * not be created until the end of the logging. + */ + if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_slot_remove_write_access(kvm, mem->slot); /* * If memory slot is created, or moved, we need to clear all * mmio sptes. diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2b5219c12ac..e224f7a671b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -112,7 +112,7 @@ void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); -void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); +void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 7872a3330fb..29043d2048a 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -2,6 +2,7 @@ config LGUEST_GUEST bool "Lguest guest support" select PARAVIRT depends on X86_32 + select TTY select VIRTUALIZATION select VIRTIO select VIRTIO_CONSOLE diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 642d8805bc1..1cbd89ca556 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -552,7 +552,8 @@ static void lguest_write_cr3(unsigned long cr3) current_cr3 = cr3; /* These two page tables are simple, linear, and used during boot */ - if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table)) + if (cr3 != __pa_symbol(swapper_pg_dir) && + cr3 != __pa_symbol(initial_page_table)) cr3_changed = true; } @@ -1412,7 +1413,7 @@ __init void lguest_init(void) /* We don't have features. We have puppies! Puppies! */ #ifdef CONFIG_X86_MCE - mce_disabled = 1; + mca_cfg.disabled = true; #endif #ifdef CONFIG_ACPI acpi_disabled = 1; diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index b00f6785da7..96b2c6697c9 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -32,7 +32,6 @@ ifeq ($(CONFIG_X86_32),y) lib-y += checksum_32.o lib-y += strstr_32.o lib-y += string_32.o - lib-y += cmpxchg.o ifneq ($(CONFIG_X86_CMPXCHG64),y) lib-y += cmpxchg8b_emu.o atomic64_386_32.o endif diff --git a/arch/x86/lib/cmpxchg.c b/arch/x86/lib/cmpxchg.c deleted file mode 100644 index 5d619f6df3e..00000000000 --- a/arch/x86/lib/cmpxchg.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * cmpxchg*() fallbacks for CPU not supporting these instructions - */ - -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/module.h> - -#ifndef CONFIG_X86_CMPXCHG -unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) -{ - u8 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u8 *)ptr; - if (prev == old) - *(u8 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u8); - -unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) -{ - u16 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u16 *)ptr; - if (prev == old) - *(u16 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u16); - -unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) -{ - u32 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u32 *)ptr; - if (prev == old) - *(u32 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u32); -#endif diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 6b34d04d096..176cca67212 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -5,91 +5,89 @@ #include <asm/alternative-asm.h> ALIGN -copy_page_c: +copy_page_rep: CFI_STARTPROC - movl $4096/8,%ecx - rep movsq + movl $4096/8, %ecx + rep movsq ret CFI_ENDPROC -ENDPROC(copy_page_c) +ENDPROC(copy_page_rep) -/* Don't use streaming store because it's better when the target - ends up in cache. */ - -/* Could vary the prefetch distance based on SMP/UP */ +/* + * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD. + * Could vary the prefetch distance based on SMP/UP. +*/ ENTRY(copy_page) CFI_STARTPROC - subq $2*8,%rsp + subq $2*8, %rsp CFI_ADJUST_CFA_OFFSET 2*8 - movq %rbx,(%rsp) + movq %rbx, (%rsp) CFI_REL_OFFSET rbx, 0 - movq %r12,1*8(%rsp) + movq %r12, 1*8(%rsp) CFI_REL_OFFSET r12, 1*8 - movl $(4096/64)-5,%ecx + movl $(4096/64)-5, %ecx .p2align 4 .Loop64: - dec %rcx - - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %rdx - movq 24 (%rsi), %r8 - movq 32 (%rsi), %r9 - movq 40 (%rsi), %r10 - movq 48 (%rsi), %r11 - movq 56 (%rsi), %r12 + dec %rcx + movq 0x8*0(%rsi), %rax + movq 0x8*1(%rsi), %rbx + movq 0x8*2(%rsi), %rdx + movq 0x8*3(%rsi), %r8 + movq 0x8*4(%rsi), %r9 + movq 0x8*5(%rsi), %r10 + movq 0x8*6(%rsi), %r11 + movq 0x8*7(%rsi), %r12 prefetcht0 5*64(%rsi) - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %rdx, 16 (%rdi) - movq %r8, 24 (%rdi) - movq %r9, 32 (%rdi) - movq %r10, 40 (%rdi) - movq %r11, 48 (%rdi) - movq %r12, 56 (%rdi) + movq %rax, 0x8*0(%rdi) + movq %rbx, 0x8*1(%rdi) + movq %rdx, 0x8*2(%rdi) + movq %r8, 0x8*3(%rdi) + movq %r9, 0x8*4(%rdi) + movq %r10, 0x8*5(%rdi) + movq %r11, 0x8*6(%rdi) + movq %r12, 0x8*7(%rdi) - leaq 64 (%rsi), %rsi - leaq 64 (%rdi), %rdi + leaq 64 (%rsi), %rsi + leaq 64 (%rdi), %rdi - jnz .Loop64 + jnz .Loop64 - movl $5,%ecx + movl $5, %ecx .p2align 4 .Loop2: - decl %ecx - - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %rdx - movq 24 (%rsi), %r8 - movq 32 (%rsi), %r9 - movq 40 (%rsi), %r10 - movq 48 (%rsi), %r11 - movq 56 (%rsi), %r12 - - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %rdx, 16 (%rdi) - movq %r8, 24 (%rdi) - movq %r9, 32 (%rdi) - movq %r10, 40 (%rdi) - movq %r11, 48 (%rdi) - movq %r12, 56 (%rdi) - - leaq 64(%rdi),%rdi - leaq 64(%rsi),%rsi - + decl %ecx + + movq 0x8*0(%rsi), %rax + movq 0x8*1(%rsi), %rbx + movq 0x8*2(%rsi), %rdx + movq 0x8*3(%rsi), %r8 + movq 0x8*4(%rsi), %r9 + movq 0x8*5(%rsi), %r10 + movq 0x8*6(%rsi), %r11 + movq 0x8*7(%rsi), %r12 + + movq %rax, 0x8*0(%rdi) + movq %rbx, 0x8*1(%rdi) + movq %rdx, 0x8*2(%rdi) + movq %r8, 0x8*3(%rdi) + movq %r9, 0x8*4(%rdi) + movq %r10, 0x8*5(%rdi) + movq %r11, 0x8*6(%rdi) + movq %r12, 0x8*7(%rdi) + + leaq 64(%rdi), %rdi + leaq 64(%rsi), %rsi jnz .Loop2 - movq (%rsp),%rbx + movq (%rsp), %rbx CFI_RESTORE rbx - movq 1*8(%rsp),%r12 + movq 1*8(%rsp), %r12 CFI_RESTORE r12 - addq $2*8,%rsp + addq $2*8, %rsp CFI_ADJUST_CFA_OFFSET -2*8 ret .Lcopy_page_end: @@ -103,7 +101,7 @@ ENDPROC(copy_page) .section .altinstr_replacement,"ax" 1: .byte 0xeb /* jmp <disp8> */ - .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */ + .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */ 2: .previous .section .altinstructions,"a" diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index e395693abdb..7c3bee636e2 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -98,7 +98,7 @@ void use_tsc_delay(void) delay_fn = delay_tsc; } -int __devinit read_current_timer(unsigned long *timer_val) +int read_current_timer(unsigned long *timer_val) { if (delay_fn == delay_tsc) { rdtscll(*timer_val); diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 156b9c80467..a4512359656 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -15,11 +15,10 @@ * __get_user_X * * Inputs: %[r|e]ax contains the address. - * The register is modified, but all changes are undone - * before returning because the C code doesn't know about it. * * Outputs: %[r|e]ax is error code (0 or -EFAULT) * %[r|e]dx contains zero-extended value + * %ecx contains the high half for 32-bit __get_user_8 * * * These functions should not modify any other registers, @@ -42,7 +41,7 @@ ENTRY(__get_user_1) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC -1: movzb (%_ASM_AX),%edx +1: movzbl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC ret @@ -72,29 +71,42 @@ ENTRY(__get_user_4) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC -3: mov -3(%_ASM_AX),%edx +3: movl -3(%_ASM_AX),%edx xor %eax,%eax ASM_CLAC ret CFI_ENDPROC ENDPROC(__get_user_4) -#ifdef CONFIG_X86_64 ENTRY(__get_user_8) CFI_STARTPROC +#ifdef CONFIG_X86_64 add $7,%_ASM_AX jc bad_get_user GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX - jae bad_get_user + jae bad_get_user ASM_STAC -4: movq -7(%_ASM_AX),%_ASM_DX +4: movq -7(%_ASM_AX),%rdx xor %eax,%eax ASM_CLAC ret +#else + add $7,%_ASM_AX + jc bad_get_user_8 + GET_THREAD_INFO(%_ASM_DX) + cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user_8 + ASM_STAC +4: movl -7(%_ASM_AX),%edx +5: movl -3(%_ASM_AX),%ecx + xor %eax,%eax + ASM_CLAC + ret +#endif CFI_ENDPROC ENDPROC(__get_user_8) -#endif + bad_get_user: CFI_STARTPROC @@ -105,9 +117,24 @@ bad_get_user: CFI_ENDPROC END(bad_get_user) +#ifdef CONFIG_X86_32 +bad_get_user_8: + CFI_STARTPROC + xor %edx,%edx + xor %ecx,%ecx + mov $(-EFAULT),%_ASM_AX + ASM_CLAC + ret + CFI_ENDPROC +END(bad_get_user_8) +#endif + _ASM_EXTABLE(1b,bad_get_user) _ASM_EXTABLE(2b,bad_get_user) _ASM_EXTABLE(3b,bad_get_user) #ifdef CONFIG_X86_64 _ASM_EXTABLE(4b,bad_get_user) +#else + _ASM_EXTABLE(4b,bad_get_user_8) + _ASM_EXTABLE(5b,bad_get_user_8) #endif diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 98f6d6b68f5..f0312d74640 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -570,63 +570,6 @@ do { \ unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned long n) { -#ifndef CONFIG_X86_WP_WORKS_OK - if (unlikely(boot_cpu_data.wp_works_ok == 0) && - ((unsigned long)to) < TASK_SIZE) { - /* - * When we are in an atomic section (see - * mm/filemap.c:file_read_actor), return the full - * length to take the slow path. - */ - if (in_atomic()) - return n; - - /* - * CPU does not honor the WP bit when writing - * from supervisory mode, and due to preemption or SMP, - * the page tables can change at any time. - * Do it manually. Manfred <manfred@colorfullife.com> - */ - while (n) { - unsigned long offset = ((unsigned long)to)%PAGE_SIZE; - unsigned long len = PAGE_SIZE - offset; - int retval; - struct page *pg; - void *maddr; - - if (len > n) - len = n; - -survive: - down_read(¤t->mm->mmap_sem); - retval = get_user_pages(current, current->mm, - (unsigned long)to, 1, 1, 0, &pg, NULL); - - if (retval == -ENOMEM && is_global_init(current)) { - up_read(¤t->mm->mmap_sem); - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto survive; - } - - if (retval != 1) { - up_read(¤t->mm->mmap_sem); - break; - } - - maddr = kmap_atomic(pg); - memcpy(maddr + offset, from, len); - kunmap_atomic(maddr); - set_page_dirty_lock(pg); - put_page(pg); - up_read(¤t->mm->mmap_sem); - - from += len; - to += len; - n -= len; - } - return n; - } -#endif stac(); if (movsl_is_ok(to, from, n)) __copy_user(to, from, n); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8e13ecb41be..2b97525246d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -18,7 +18,7 @@ #include <asm/pgalloc.h> /* pgd_*(), ... */ #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ #include <asm/fixmap.h> /* VSYSCALL_START */ -#include <asm/rcu.h> /* exception_enter(), ... */ +#include <asm/context_tracking.h> /* exception_enter(), ... */ /* * Page fault error code bits: @@ -748,13 +748,15 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, return; } #endif + /* Kernel addresses are always protection faults: */ + if (address >= TASK_SIZE) + error_code |= PF_PROT; - if (unlikely(show_unhandled_signals)) + if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); - /* Kernel addresses are always protection faults: */ tsk->thread.cr2 = address; - tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_PF; force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); @@ -803,20 +805,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, __bad_area(regs, error_code, address, SEGV_ACCERR); } -/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ -static void -out_of_memory(struct pt_regs *regs, unsigned long error_code, - unsigned long address) -{ - /* - * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed): - */ - up_read(¤t->mm->mmap_sem); - - pagefault_out_of_memory(); -} - static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) @@ -879,7 +867,14 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, return 1; } - out_of_memory(regs, error_code, address); + up_read(¤t->mm->mmap_sem); + + /* + * We ran out of memory, call the OOM killer, and return the + * userspace (which will retry the fault, or kill us if we got + * oom-killed): + */ + pagefault_out_of_memory(); } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) @@ -944,14 +939,8 @@ spurious_fault(unsigned long error_code, unsigned long address) if (pmd_large(*pmd)) return spurious_fault_check(error_code, (pte_t *) pmd); - /* - * Note: don't use pte_present() here, since it returns true - * if the _PAGE_PROTNONE bit is set. However, this aliases the - * _PAGE_GLOBAL bit, which for kernel pages give false positives - * when CONFIG_DEBUG_PAGEALLOC is used. - */ pte = pte_offset_kernel(pmd, address); - if (!(pte_flags(*pte) & _PAGE_PRESENT)) + if (!pte_present(*pte)) return 0; ret = spurious_fault_check(error_code, pte); diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 937bff5cdaa..ae1aa71d011 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -274,42 +274,15 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long start_addr; - - if (len > mm->cached_hole_size) { - start_addr = mm->free_area_cache; - } else { - start_addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - } - -full_search: - addr = ALIGN(start_addr, huge_page_size(h)); - - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (TASK_SIZE - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. - */ - if (start_addr != TASK_UNMAPPED_BASE) { - start_addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = ALIGN(vma->vm_end, huge_page_size(h)); - } + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); } static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, @@ -317,83 +290,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long base = mm->mmap_base; - unsigned long addr = addr0; - unsigned long largest_hole = mm->cached_hole_size; - unsigned long start_addr; - - /* don't allow allocations above current base */ - if (mm->free_area_cache > base) - mm->free_area_cache = base; - - if (len <= largest_hole) { - largest_hole = 0; - mm->free_area_cache = base; - } -try_again: - start_addr = mm->free_area_cache; - - /* make sure it can fit in the remaining address space */ - if (mm->free_area_cache < len) - goto fail; - - /* either no address requested or can't fit in requested address hole */ - addr = (mm->free_area_cache - len) & huge_page_mask(h); - do { - /* - * Lookup failure means no vma is above this address, - * i.e. return with success: - */ - vma = find_vma(mm, addr); - if (!vma) - return addr; + struct vm_unmapped_area_info info; + unsigned long addr; - if (addr + len <= vma->vm_start) { - /* remember the address as a hint for next time */ - mm->cached_hole_size = largest_hole; - return (mm->free_area_cache = addr); - } else if (mm->free_area_cache == vma->vm_end) { - /* pull free_area_cache down to the first hole */ - mm->free_area_cache = vma->vm_start; - mm->cached_hole_size = largest_hole; - } + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = current->mm->mmap_base; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + addr = vm_unmapped_area(&info); - /* remember the largest hole we saw so far */ - if (addr + largest_hole < vma->vm_start) - largest_hole = vma->vm_start - addr; - - /* try just below the current vma->vm_start */ - addr = (vma->vm_start - len) & huge_page_mask(h); - } while (len <= vma->vm_start); - -fail: - /* - * if hint left us with no space for the requested - * mapping then try again: - */ - if (start_addr != base) { - mm->free_area_cache = base; - largest_hole = 0; - goto try_again; - } /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. */ - mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; - addr = hugetlb_get_unmapped_area_bottomup(file, addr0, - len, pgoff, flags); - - /* - * Restore the topdown base: - */ - mm->free_area_cache = base; - mm->cached_hole_size = ~0UL; + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + } return addr; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d7aea41563b..59b7fc45327 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -16,87 +16,134 @@ #include <asm/tlb.h> #include <asm/proto.h> #include <asm/dma.h> /* for MAX_DMA_PFN */ +#include <asm/microcode.h> -unsigned long __initdata pgt_buf_start; -unsigned long __meminitdata pgt_buf_end; -unsigned long __meminitdata pgt_buf_top; +#include "mm_internal.h" -int after_bootmem; +static unsigned long __initdata pgt_buf_start; +static unsigned long __initdata pgt_buf_end; +static unsigned long __initdata pgt_buf_top; -int direct_gbpages -#ifdef CONFIG_DIRECT_GBPAGES - = 1 -#endif -; +static unsigned long min_pfn_mapped; -struct map_range { - unsigned long start; - unsigned long end; - unsigned page_size_mask; -}; +static bool __initdata can_use_brk_pgt = true; /* - * First calculate space needed for kernel direct mapping page tables to cover - * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB - * pages. Then find enough contiguous space for those page tables. + * Pages returned are already directly mapped. + * + * Changing that is likely to break Xen, see commit: + * + * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve + * + * for detailed information. */ -static void __init find_early_table_space(struct map_range *mr, int nr_range) +__ref void *alloc_low_pages(unsigned int num) { + unsigned long pfn; int i; - unsigned long puds = 0, pmds = 0, ptes = 0, tables; - unsigned long start = 0, good_end; - phys_addr_t base; - for (i = 0; i < nr_range; i++) { - unsigned long range, extra; + if (after_bootmem) { + unsigned int order; - range = mr[i].end - mr[i].start; - puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; + order = get_order((unsigned long)num << PAGE_SHIFT); + return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | + __GFP_ZERO, order); + } - if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { - extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); - pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else { - pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; - } + if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { + unsigned long ret; + if (min_pfn_mapped >= max_pfn_mapped) + panic("alloc_low_page: ran out of memory"); + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT, + PAGE_SIZE * num , PAGE_SIZE); + if (!ret) + panic("alloc_low_page: can not alloc memory"); + memblock_reserve(ret, PAGE_SIZE * num); + pfn = ret >> PAGE_SHIFT; + } else { + pfn = pgt_buf_end; + pgt_buf_end += num; + printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n", + pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1); + } - if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { - extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); -#ifdef CONFIG_X86_32 - extra += PMD_SIZE; -#endif - ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else { - ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; - } + for (i = 0; i < num; i++) { + void *adr; + + adr = __va((pfn + i) << PAGE_SHIFT); + clear_page(adr); } - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); + return __va(pfn << PAGE_SHIFT); +} -#ifdef CONFIG_X86_32 - /* for fixmap */ - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - good_end = max_pfn_mapped << PAGE_SHIFT; +/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ +#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE) +RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); +void __init early_alloc_pgt_buf(void) +{ + unsigned long tables = INIT_PGT_BUF_SIZE; + phys_addr_t base; - base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); - if (!base) - panic("Cannot find space for the kernel page tables"); + base = __pa(extend_brk(tables, PAGE_SIZE)); pgt_buf_start = base >> PAGE_SHIFT; pgt_buf_end = pgt_buf_start; pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); +} - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", - mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_top << PAGE_SHIFT) - 1); +int after_bootmem; + +int direct_gbpages +#ifdef CONFIG_DIRECT_GBPAGES + = 1 +#endif +; + +static void __init init_gbpages(void) +{ +#ifdef CONFIG_X86_64 + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +#endif } -void __init native_pagetable_reserve(u64 start, u64 end) +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +static int page_size_mask; + +static void __init probe_page_size_mask(void) { - memblock_reserve(start, end - start); + init_gbpages(); + +#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. + */ + if (direct_gbpages) + page_size_mask |= 1 << PG_LEVEL_1G; + if (cpu_has_pse) + page_size_mask |= 1 << PG_LEVEL_2M; +#endif + + /* Enable PSE if available */ + if (cpu_has_pse) + set_in_cr4(X86_CR4_PSE); + + /* Enable PGE if available */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); + __supported_pte_mask |= _PAGE_GLOBAL; + } } #ifdef CONFIG_X86_32 @@ -122,58 +169,51 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, } /* - * Setup the direct mapping of the physical memory at PAGE_OFFSET. - * This runs before bootmem is initialized and gets pages directly from - * the physical memory. To access them they are temporarily mapped. + * adjust the page_size_mask for small range to go with + * big page size instead small one if nearby are ram too. */ -unsigned long __init_refok init_memory_mapping(unsigned long start, - unsigned long end) +static void __init_refok adjust_range_page_size_mask(struct map_range *mr, + int nr_range) { - unsigned long page_size_mask = 0; - unsigned long start_pfn, end_pfn; - unsigned long ret = 0; - unsigned long pos; - - struct map_range mr[NR_RANGE_MR]; - int nr_range, i; - int use_pse, use_gbpages; + int i; - printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n", - start, end - 1); + for (i = 0; i < nr_range; i++) { + if ((page_size_mask & (1<<PG_LEVEL_2M)) && + !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) { + unsigned long start = round_down(mr[i].start, PMD_SIZE); + unsigned long end = round_up(mr[i].end, PMD_SIZE); -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) - /* - * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. - * This will simplify cpa(), which otherwise needs to support splitting - * large pages into small in interrupt context, etc. - */ - use_pse = use_gbpages = 0; -#else - use_pse = cpu_has_pse; - use_gbpages = direct_gbpages; +#ifdef CONFIG_X86_32 + if ((end >> PAGE_SHIFT) > max_low_pfn) + continue; #endif - /* Enable PSE if available */ - if (cpu_has_pse) - set_in_cr4(X86_CR4_PSE); + if (memblock_is_region_memory(start, end - start)) + mr[i].page_size_mask |= 1<<PG_LEVEL_2M; + } + if ((page_size_mask & (1<<PG_LEVEL_1G)) && + !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) { + unsigned long start = round_down(mr[i].start, PUD_SIZE); + unsigned long end = round_up(mr[i].end, PUD_SIZE); - /* Enable PGE if available */ - if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); - __supported_pte_mask |= _PAGE_GLOBAL; + if (memblock_is_region_memory(start, end - start)) + mr[i].page_size_mask |= 1<<PG_LEVEL_1G; + } } +} - if (use_gbpages) - page_size_mask |= 1 << PG_LEVEL_1G; - if (use_pse) - page_size_mask |= 1 << PG_LEVEL_2M; +static int __meminit split_mem_range(struct map_range *mr, int nr_range, + unsigned long start, + unsigned long end) +{ + unsigned long start_pfn, end_pfn, limit_pfn; + unsigned long pfn; + int i; - memset(mr, 0, sizeof(mr)); - nr_range = 0; + limit_pfn = PFN_DOWN(end); /* head if not big page alignment ? */ - start_pfn = start >> PAGE_SHIFT; - pos = start_pfn << PAGE_SHIFT; + pfn = start_pfn = PFN_DOWN(start); #ifdef CONFIG_X86_32 /* * Don't use a large page for the first 2/4MB of memory @@ -181,66 +221,60 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * and overlapping MTRRs into large pages can cause * slowdowns. */ - if (pos == 0) - end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + if (pfn == 0) + end_pfn = PFN_DOWN(PMD_SIZE); else - end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #endif - if (end_pfn > (end >> PAGE_SHIFT)) - end_pfn = end >> PAGE_SHIFT; + if (end_pfn > limit_pfn) + end_pfn = limit_pfn; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - pos = end_pfn << PAGE_SHIFT; + pfn = end_pfn; } /* big page (2M) range */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #ifdef CONFIG_X86_32 - end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) - end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); + end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); + if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE))) + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #endif if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<<PG_LEVEL_2M)); - pos = end_pfn << PAGE_SHIFT; + pfn = end_pfn; } #ifdef CONFIG_X86_64 /* big page (1G) range */ - start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); - pos = end_pfn << PAGE_SHIFT; + pfn = end_pfn; } /* tail is not big page (1G) alignment */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<<PG_LEVEL_2M)); - pos = end_pfn << PAGE_SHIFT; + pfn = end_pfn; } #endif /* tail is not big page (2M) alignment */ - start_pfn = pos>>PAGE_SHIFT; - end_pfn = end>>PAGE_SHIFT; + start_pfn = pfn; + end_pfn = limit_pfn; nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); /* try to merge same page size and continuous */ @@ -257,59 +291,168 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, nr_range--; } + if (!after_bootmem) + adjust_range_page_size_mask(mr, nr_range); + for (i = 0; i < nr_range; i++) printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", mr[i].start, mr[i].end - 1, (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); - /* - * Find space for the kernel direct mapping tables. - * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ - if (!after_bootmem) - find_early_table_space(mr, nr_range); + return nr_range; +} + +struct range pfn_mapped[E820_X_MAX]; +int nr_pfn_mapped; + +static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX, + nr_pfn_mapped, start_pfn, end_pfn); + nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX); + + max_pfn_mapped = max(max_pfn_mapped, end_pfn); + + if (start_pfn < (1UL<<(32-PAGE_SHIFT))) + max_low_pfn_mapped = max(max_low_pfn_mapped, + min(end_pfn, 1UL<<(32-PAGE_SHIFT))); +} + +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + + for (i = 0; i < nr_pfn_mapped; i++) + if ((start_pfn >= pfn_mapped[i].start) && + (end_pfn <= pfn_mapped[i].end)) + return true; + + return false; +} + +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. + */ +unsigned long __init_refok init_memory_mapping(unsigned long start, + unsigned long end) +{ + struct map_range mr[NR_RANGE_MR]; + unsigned long ret = 0; + int nr_range, i; + + pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n", + start, end - 1); + + memset(mr, 0, sizeof(mr)); + nr_range = split_mem_range(mr, 0, start, end); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask); -#ifdef CONFIG_X86_32 - early_ioremap_page_table_range_init(); + add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); - load_cr3(swapper_pg_dir); -#endif + return ret >> PAGE_SHIFT; +} - __flush_tlb_all(); +/* + * would have hole in the middle or ends, and only ram parts will be mapped. + */ +static unsigned long __init init_range_memory_mapping( + unsigned long r_start, + unsigned long r_end) +{ + unsigned long start_pfn, end_pfn; + unsigned long mapped_ram_size = 0; + int i; - /* - * Reserve the kernel pagetable pages we used (pgt_buf_start - - * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) - * so that they can be reused for other purposes. - * - * On native it just means calling memblock_reserve, on Xen it also - * means marking RW the pagetable pages that we allocated before - * but that haven't been used. - * - * In fact on xen we mark RO the whole range pgt_buf_start - - * pgt_buf_top, because we have to make sure that when - * init_memory_mapping reaches the pagetable pages area, it maps - * RO all the pagetable pages, including the ones that are beyond - * pgt_buf_end at that time. - */ - if (!after_bootmem && pgt_buf_end > pgt_buf_start) - x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), - PFN_PHYS(pgt_buf_end)); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end); + u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end); + if (start >= end) + continue; - if (!after_bootmem) - early_memtest(start, end); + /* + * if it is overlapping with brk pgt, we need to + * alloc pgt buf from memblock instead. + */ + can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >= + min(end, (u64)pgt_buf_top<<PAGE_SHIFT); + init_memory_mapping(start, end); + mapped_ram_size += end - start; + can_use_brk_pgt = true; + } - return ret >> PAGE_SHIFT; + return mapped_ram_size; } +/* (PUD_SHIFT-PMD_SHIFT)/2 */ +#define STEP_SIZE_SHIFT 5 +void __init init_mem_mapping(void) +{ + unsigned long end, real_end, start, last_start; + unsigned long step_size; + unsigned long addr; + unsigned long mapped_ram_size = 0; + unsigned long new_mapped_ram_size; + + probe_page_size_mask(); + +#ifdef CONFIG_X86_64 + end = max_pfn << PAGE_SHIFT; +#else + end = max_low_pfn << PAGE_SHIFT; +#endif + + /* the ISA range is always mapped regardless of memory holes */ + init_memory_mapping(0, ISA_END_ADDRESS); + + /* xen has big range in reserved near end of ram, skip it at first.*/ + addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, PMD_SIZE); + real_end = addr + PMD_SIZE; + + /* step_size need to be small so pgt_buf from BRK could cover it */ + step_size = PMD_SIZE; + max_pfn_mapped = 0; /* will get exact value next */ + min_pfn_mapped = real_end >> PAGE_SHIFT; + last_start = start = real_end; + while (last_start > ISA_END_ADDRESS) { + if (last_start > step_size) { + start = round_down(last_start - 1, step_size); + if (start < ISA_END_ADDRESS) + start = ISA_END_ADDRESS; + } else + start = ISA_END_ADDRESS; + new_mapped_ram_size = init_range_memory_mapping(start, + last_start); + last_start = start; + min_pfn_mapped = last_start >> PAGE_SHIFT; + /* only increase step_size after big range get mapped */ + if (new_mapped_ram_size > mapped_ram_size) + step_size <<= STEP_SIZE_SHIFT; + mapped_ram_size += new_mapped_ram_size; + } + + if (real_end < end) + init_range_memory_mapping(real_end, end); + +#ifdef CONFIG_X86_64 + if (max_pfn > max_low_pfn) { + /* can we preseve max_low_pfn ?*/ + max_low_pfn = max_pfn; + } +#else + early_ioremap_page_table_range_init(); +#endif + + load_cr3(swapper_pg_dir); + __flush_tlb_all(); + + early_memtest(0, max_pfn_mapped << PAGE_SHIFT); +} /* * devmem_is_allowed() checks to see if /dev/mem access to a certain address @@ -391,6 +534,15 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { +#ifdef CONFIG_MICROCODE_EARLY + /* + * Remember, initrd memory may contain microcode or other useful things. + * Before we lose initrd mem, we need to find a place to hold them + * now that normal virtual memory is enabled. + */ + save_microcode_in_initrd(); +#endif + /* * end could be not aligned, and We can not align that, * decompresser could be confused by aligned initrd_end diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 11a58001b4c..2d19001151d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -53,25 +53,14 @@ #include <asm/page_types.h> #include <asm/init.h> +#include "mm_internal.h" + unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); bool __read_mostly __vmalloc_start_set = false; -static __init void *alloc_low_page(void) -{ - unsigned long pfn = pgt_buf_end++; - void *adr; - - if (pfn >= pgt_buf_top) - panic("alloc_low_page: ran out of memory"); - - adr = __va(pfn * PAGE_SIZE); - clear_page(adr); - return adr; -} - /* * Creates a middle page table and puts a pointer to it in the * given global directory entry. This only returns the gd entry @@ -84,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { - if (after_bootmem) - pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); - else - pmd_table = (pmd_t *)alloc_low_page(); + pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); @@ -109,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) static pte_t * __init one_page_table_init(pmd_t *pmd) { if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { - pte_t *page_table = NULL; - - if (after_bootmem) { -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) - page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); -#endif - if (!page_table) - page_table = - (pte_t *)alloc_bootmem_pages(PAGE_SIZE); - } else - page_table = (pte_t *)alloc_low_page(); + pte_t *page_table = (pte_t *)alloc_low_page(); paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); @@ -146,8 +122,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr) return one_page_table_init(pmd) + pte_idx; } +static unsigned long __init +page_table_range_init_count(unsigned long start, unsigned long end) +{ + unsigned long count = 0; +#ifdef CONFIG_HIGHMEM + int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; + int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; + int pgd_idx, pmd_idx; + unsigned long vaddr; + + if (pmd_idx_kmap_begin == pmd_idx_kmap_end) + return 0; + + vaddr = start; + pgd_idx = pgd_index(vaddr); + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) { + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd_idx++) { + if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && + (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) + count++; + vaddr += PMD_SIZE; + } + pmd_idx = 0; + } +#endif + return count; +} + static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, - unsigned long vaddr, pte_t *lastpte) + unsigned long vaddr, pte_t *lastpte, + void **adr) { #ifdef CONFIG_HIGHMEM /* @@ -161,16 +168,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, if (pmd_idx_kmap_begin != pmd_idx_kmap_end && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin - && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end - && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start - || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) { + && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) { pte_t *newpte; int i; BUG_ON(after_bootmem); - newpte = alloc_low_page(); + newpte = *adr; for (i = 0; i < PTRS_PER_PTE; i++) set_pte(newpte + i, pte[i]); + *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE); paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); @@ -204,6 +210,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) pgd_t *pgd; pmd_t *pmd; pte_t *pte = NULL; + unsigned long count = page_table_range_init_count(start, end); + void *adr = NULL; + + if (count) + adr = alloc_low_pages(count); vaddr = start; pgd_idx = pgd_index(vaddr); @@ -216,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { pte = page_table_kmap_check(one_page_table_init(pmd), - pmd, vaddr, pte); + pmd, vaddr, pte, &adr); vaddr += PMD_SIZE; } @@ -310,6 +321,7 @@ repeat: __pgprot(PTE_IDENT_ATTR | _PAGE_PSE); + pfn &= PMD_MASK >> PAGE_SHIFT; addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; @@ -455,9 +467,14 @@ void __init native_pagetable_init(void) /* * Remove any mappings which extend past the end of physical - * memory from the boot time page table: + * memory from the boot time page table. + * In virtual address space, we should have at least two pages + * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END + * definition. And max_low_pfn is set to VMALLOC_END physical + * address. If initial memory mapping is doing right job, we + * should have pte used near max_low_pfn or one pmd is not present. */ - for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) { + for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) { va = PAGE_OFFSET + (pfn<<PAGE_SHIFT); pgd = base + pgd_index(va); if (!pgd_present(*pgd)) @@ -468,10 +485,19 @@ void __init native_pagetable_init(void) if (!pmd_present(*pmd)) break; + /* should not be large page here */ + if (pmd_large(*pmd)) { + pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n", + pfn, pmd, __pa(pmd)); + BUG_ON(1); + } + pte = pte_offset_kernel(pmd, va); if (!pte_present(*pte)) break; + printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n", + pfn, pmd, __pa(pmd), pte, __pa(pte)); pte_clear(NULL, va, pte); } paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); @@ -550,7 +576,7 @@ early_param("highmem", parse_highmem); * artificially via the highmem=x boot parameter then create * it: */ -void __init lowmem_pfn_init(void) +static void __init lowmem_pfn_init(void) { /* max_low_pfn is 0, we already have early_res support */ max_low_pfn = max_pfn; @@ -586,7 +612,7 @@ void __init lowmem_pfn_init(void) * We have more RAM than fits into lowmem - we try to put it into * highmem, also taking the highmem=x boot parameter into account: */ -void __init highmem_pfn_init(void) +static void __init highmem_pfn_init(void) { max_low_pfn = MAXMEM_PFN; @@ -669,8 +695,6 @@ void __init setup_bootmem_allocator(void) printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<<PAGE_SHIFT); printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); - - after_bootmem = 1; } /* @@ -715,10 +739,7 @@ static void __init test_wp_bit(void) if (!boot_cpu_data.wp_works_ok) { printk(KERN_CONT "No.\n"); -#ifdef CONFIG_X86_WP_WORKS_OK - panic( - "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); -#endif + panic("Linux doesn't support CPUs with broken WP."); } else { printk(KERN_CONT "Ok.\n"); } @@ -756,6 +777,8 @@ void __init mem_init(void) if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) reservedpages++; + after_bootmem = 1; + codesize = (unsigned long) &_etext - (unsigned long) &_text; datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; @@ -839,6 +862,18 @@ int arch_add_memory(int nid, u64 start, u64 size) return __add_pages(nid, zone, start_pfn, nr_pages); } + +#ifdef CONFIG_MEMORY_HOTREMOVE +int arch_remove_memory(u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + struct zone *zone; + + zone = page_zone(pfn_to_page(start_pfn)); + return __remove_pages(zone, start_pfn, nr_pages); +} +#endif #endif /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3baff255ada..474e28f1081 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -54,6 +54,82 @@ #include <asm/uv/uv.h> #include <asm/setup.h> +#include "mm_internal.h" + +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, + unsigned long addr, unsigned long end) +{ + addr &= PMD_MASK; + for (; addr < end; addr += PMD_SIZE) { + pmd_t *pmd = pmd_page + pmd_index(addr); + + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | pmd_flag)); + } +} +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (next > end) + next = end; + + if (pud_present(*pud)) { + pmd = pmd_offset(pud, 0); + ident_pmd_init(info->pmd_flag, pmd, addr, next); + continue; + } + pmd = (pmd_t *)info->alloc_pgt_page(info->context); + if (!pmd) + return -ENOMEM; + ident_pmd_init(info->pmd_flag, pmd, addr, next); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + + return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + int result; + int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pud_t *pud; + + next = (addr & PGDIR_MASK) + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, 0); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + continue; + } + + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} + static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; @@ -302,10 +378,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) void __init cleanup_highmap(void) { unsigned long vaddr = __START_KERNEL_map; - unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); + unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE; unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; pmd_t *pmd = level2_kernel_pgt; + /* + * Native path, max_pfn_mapped is not set yet. + * Xen has valid max_pfn_mapped set in + * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable(). + */ + if (max_pfn_mapped) + vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); + for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { if (pmd_none(*pmd)) continue; @@ -314,69 +398,24 @@ void __init cleanup_highmap(void) } } -static __ref void *alloc_low_page(unsigned long *phys) -{ - unsigned long pfn = pgt_buf_end++; - void *adr; - - if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); - *phys = __pa(adr); - - return adr; - } - - if (pfn >= pgt_buf_top) - panic("alloc_low_page: ran out of memory"); - - adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); - clear_page(adr); - *phys = pfn * PAGE_SIZE; - return adr; -} - -static __ref void *map_low_page(void *virt) -{ - void *adr; - unsigned long phys, left; - - if (after_bootmem) - return virt; - - phys = __pa(virt); - left = phys & (PAGE_SIZE - 1); - adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE); - adr = (void *)(((unsigned long)adr) | left); - - return adr; -} - -static __ref void unmap_low_page(void *adr) -{ - if (after_bootmem) - return; - - early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE); -} - static unsigned long __meminit phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, pgprot_t prot) { - unsigned pages = 0; + unsigned long pages = 0, next; unsigned long last_map_addr = end; int i; pte_t *pte = pte_page + pte_index(addr); - for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { - + for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) { + next = (addr & PAGE_MASK) + PAGE_SIZE; if (addr >= end) { - if (!after_bootmem) { - for(; i < PTRS_PER_PTE; i++, pte++) - set_pte(pte, __pte(0)); - } - break; + if (!after_bootmem && + !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) && + !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN)) + set_pte(pte, __pte(0)); + continue; } /* @@ -414,28 +453,25 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, int i = pmd_index(address); for (; i < PTRS_PER_PMD; i++, address = next) { - unsigned long pte_phys; pmd_t *pmd = pmd_page + pmd_index(address); pte_t *pte; pgprot_t new_prot = prot; + next = (address & PMD_MASK) + PMD_SIZE; if (address >= end) { - if (!after_bootmem) { - for (; i < PTRS_PER_PMD; i++, pmd++) - set_pmd(pmd, __pmd(0)); - } - break; + if (!after_bootmem && + !e820_any_mapped(address & PMD_MASK, next, E820_RAM) && + !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN)) + set_pmd(pmd, __pmd(0)); + continue; } - next = (address & PMD_MASK) + PMD_SIZE; - if (pmd_val(*pmd)) { if (!pmd_large(*pmd)) { spin_lock(&init_mm.page_table_lock); - pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd)); + pte = (pte_t *)pmd_page_vaddr(*pmd); last_map_addr = phys_pte_init(pte, address, end, prot); - unmap_low_page(pte); spin_unlock(&init_mm.page_table_lock); continue; } @@ -464,19 +500,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pmd, - pfn_pte(address >> PAGE_SHIFT, + pfn_pte((address & PMD_MASK) >> PAGE_SHIFT, __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); last_map_addr = next; continue; } - pte = alloc_low_page(&pte_phys); + pte = alloc_low_page(); last_map_addr = phys_pte_init(pte, address, end, new_prot); - unmap_low_page(pte); spin_lock(&init_mm.page_table_lock); - pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); + pmd_populate_kernel(&init_mm, pmd, pte); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); @@ -492,27 +527,24 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, int i = pud_index(addr); for (; i < PTRS_PER_PUD; i++, addr = next) { - unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; pgprot_t prot = PAGE_KERNEL; - if (addr >= end) - break; - next = (addr & PUD_MASK) + PUD_SIZE; - - if (!after_bootmem && !e820_any_mapped(addr, next, 0)) { - set_pud(pud, __pud(0)); + if (addr >= end) { + if (!after_bootmem && + !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) && + !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN)) + set_pud(pud, __pud(0)); continue; } if (pud_val(*pud)) { if (!pud_large(*pud)) { - pmd = map_low_page(pmd_offset(pud, 0)); + pmd = pmd_offset(pud, 0); last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, prot); - unmap_low_page(pmd); __flush_tlb_all(); continue; } @@ -541,19 +573,19 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pud, - pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE)); spin_unlock(&init_mm.page_table_lock); last_map_addr = next; continue; } - pmd = alloc_low_page(&pmd_phys); + pmd = alloc_low_page(); last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, prot); - unmap_low_page(pmd); spin_lock(&init_mm.page_table_lock); - pud_populate(&init_mm, pud, __va(pmd_phys)); + pud_populate(&init_mm, pud, pmd); spin_unlock(&init_mm.page_table_lock); } __flush_tlb_all(); @@ -578,34 +610,29 @@ kernel_physical_mapping_init(unsigned long start, for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); - unsigned long pud_phys; pud_t *pud; - next = (start + PGDIR_SIZE) & PGDIR_MASK; - if (next > end) - next = end; + next = (start & PGDIR_MASK) + PGDIR_SIZE; if (pgd_val(*pgd)) { - pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); + pud = (pud_t *)pgd_page_vaddr(*pgd); last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), page_size_mask); - unmap_low_page(pud); continue; } - pud = alloc_low_page(&pud_phys); - last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), + pud = alloc_low_page(); + last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), page_size_mask); - unmap_low_page(pud); spin_lock(&init_mm.page_table_lock); - pgd_populate(&init_mm, pgd, __va(pud_phys)); + pgd_populate(&init_mm, pgd, pud); spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } if (pgd_changed) - sync_global_pgds(addr, end); + sync_global_pgds(addr, end - 1); __flush_tlb_all(); @@ -630,7 +657,9 @@ void __init paging_init(void) * numa support is not compiled in, and later node_set_state * will not set it back. */ - node_clear_state(0, N_NORMAL_MEMORY); + node_clear_state(0, N_MEMORY); + if (N_MEMORY != N_NORMAL_MEMORY) + node_clear_state(0, N_NORMAL_MEMORY); zone_sizes_init(); } @@ -662,13 +691,11 @@ int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = pgdat->node_zones + ZONE_NORMAL; - unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; + unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - last_mapped_pfn = init_memory_mapping(start, start + size); - if (last_mapped_pfn > max_pfn_mapped) - max_pfn_mapped = last_mapped_pfn; + init_memory_mapping(start, start + size); ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); @@ -680,10 +707,357 @@ int arch_add_memory(int nid, u64 start, u64 size) } EXPORT_SYMBOL_GPL(arch_add_memory); +#define PAGE_INUSE 0xFD + +static void __meminit free_pagetable(struct page *page, int order) +{ + struct zone *zone; + bool bootmem = false; + unsigned long magic; + unsigned int nr_pages = 1 << order; + + /* bootmem page has reserved flag */ + if (PageReserved(page)) { + __ClearPageReserved(page); + bootmem = true; + + magic = (unsigned long)page->lru.next; + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { + while (nr_pages--) + put_page_bootmem(page++); + } else + __free_pages_bootmem(page, order); + } else + free_pages((unsigned long)page_address(page), order); + + /* + * SECTION_INFO pages and MIX_SECTION_INFO pages + * are all allocated by bootmem. + */ + if (bootmem) { + zone = page_zone(page); + zone_span_writelock(zone); + zone->present_pages += nr_pages; + zone_span_writeunlock(zone); + totalram_pages += nr_pages; + } +} + +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (pte_val(*pte)) + return; + } + + /* free a pte talbe */ + free_pagetable(pmd_page(*pmd), 0); + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); +} + +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (pmd_val(*pmd)) + return; + } + + /* free a pmd talbe */ + free_pagetable(pud_page(*pud), 0); + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); +} + +/* Return true if pgd is changed, otherwise return false. */ +static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (pud_val(*pud)) + return false; + } + + /* free a pud table */ + free_pagetable(pgd_page(*pgd), 0); + spin_lock(&init_mm.page_table_lock); + pgd_clear(pgd); + spin_unlock(&init_mm.page_table_lock); + + return true; +} + +static void __meminit +remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pte_t *pte; + void *page_addr; + phys_addr_t phys_addr; + + pte = pte_start + pte_index(addr); + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + /* + * We mapped [0,1G) memory as identity mapping when + * initializing, in arch/x86/kernel/head_64.S. These + * pagetables cannot be removed. + */ + phys_addr = pte_val(*pte) + (addr & PAGE_MASK); + if (phys_addr < (phys_addr_t)0x40000000) + return; + + if (IS_ALIGNED(addr, PAGE_SIZE) && + IS_ALIGNED(next, PAGE_SIZE)) { + /* + * Do not free direct mapping pages since they were + * freed when offlining, or simplely not in use. + */ + if (!direct) + free_pagetable(pte_page(*pte), 0); + + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + + /* For non-direct mapping, pages means nothing. */ + pages++; + } else { + /* + * If we are here, we are freeing vmemmap pages since + * direct mapped memory ranges to be freed are aligned. + * + * If we are not removing the whole page, it means + * other page structs in this page are being used and + * we canot remove them. So fill the unused page_structs + * with 0xFD, and remove the page when it is wholly + * filled with 0xFD. + */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pte_page(*pte)); + if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { + free_pagetable(pte_page(*pte), 0); + + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + } + } + } + + /* Call free_pte_table() in remove_pmd_table(). */ + flush_tlb_all(); + if (direct) + update_page_count(PG_LEVEL_4K, -pages); +} + +static void __meminit +remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pte_t *pte_base; + pmd_t *pmd; + void *page_addr; + + pmd = pmd_start + pmd_index(addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (pmd_large(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + free_pagetable(pmd_page(*pmd), + get_order(PMD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); + pages++; + } else { + /* If here, we are freeing vmemmap pages. */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pmd_page(*pmd)); + if (!memchr_inv(page_addr, PAGE_INUSE, + PMD_SIZE)) { + free_pagetable(pmd_page(*pmd), + get_order(PMD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); + } + } + + continue; + } + + pte_base = (pte_t *)pmd_page_vaddr(*pmd); + remove_pte_table(pte_base, addr, next, direct); + free_pte_table(pte_base, pmd); + } + + /* Call free_pmd_table() in remove_pud_table(). */ + if (direct) + update_page_count(PG_LEVEL_2M, -pages); +} + +static void __meminit +remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pmd_t *pmd_base; + pud_t *pud; + void *page_addr; + + pud = pud_start + pud_index(addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (pud_large(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + if (!direct) + free_pagetable(pud_page(*pud), + get_order(PUD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); + pages++; + } else { + /* If here, we are freeing vmemmap pages. */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pud_page(*pud)); + if (!memchr_inv(page_addr, PAGE_INUSE, + PUD_SIZE)) { + free_pagetable(pud_page(*pud), + get_order(PUD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); + } + } + + continue; + } + + pmd_base = (pmd_t *)pud_page_vaddr(*pud); + remove_pmd_table(pmd_base, addr, next, direct); + free_pmd_table(pmd_base, pud); + } + + if (direct) + update_page_count(PG_LEVEL_1G, -pages); +} + +/* start and end are both virtual address. */ +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct) +{ + unsigned long next; + pgd_t *pgd; + pud_t *pud; + bool pgd_changed = false; + + for (; start < end; start = next) { + next = pgd_addr_end(start, end); + + pgd = pgd_offset_k(start); + if (!pgd_present(*pgd)) + continue; + + pud = (pud_t *)pgd_page_vaddr(*pgd); + remove_pud_table(pud, start, next, direct); + if (free_pud_table(pud, pgd)) + pgd_changed = true; + } + + if (pgd_changed) + sync_global_pgds(start, end - 1); + + flush_tlb_all(); +} + +void __ref vmemmap_free(struct page *memmap, unsigned long nr_pages) +{ + unsigned long start = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + nr_pages); + + remove_pagetable(start, end, false); +} + +static void __meminit +kernel_physical_mapping_remove(unsigned long start, unsigned long end) +{ + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); + + remove_pagetable(start, end, true); +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +int __ref arch_remove_memory(u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + struct zone *zone; + int ret; + + zone = page_zone(pfn_to_page(start_pfn)); + kernel_physical_mapping_remove(start, start + size); + ret = __remove_pages(zone, start_pfn, nr_pages); + WARN_ON_ONCE(ret); + + return ret; +} +#endif #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_vsyscall; +static void __init register_page_bootmem_info(void) +{ +#ifdef CONFIG_NUMA + int i; + + for_each_online_node(i) + register_page_bootmem_info_node(NODE_DATA(i)); +#endif +} + void __init mem_init(void) { long codesize, reservedpages, datasize, initsize; @@ -696,11 +1070,8 @@ void __init mem_init(void) reservedpages = 0; /* this will put all low memory onto the freelists */ -#ifdef CONFIG_NUMA - totalram_pages = numa_free_all_bootmem(); -#else + register_page_bootmem_info(); totalram_pages = free_all_bootmem(); -#endif absent_pages = absent_pages_in_range(0, max_pfn); reservedpages = max_pfn - totalram_pages - absent_pages; @@ -770,12 +1141,11 @@ void set_kernel_text_ro(void) void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); - unsigned long rodata_start = - ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + unsigned long rodata_start = PFN_ALIGN(__start_rodata); unsigned long end = (unsigned long) &__end_rodata_hpage_align; - unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); - unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); - unsigned long data_start = (unsigned long) &_sdata; + unsigned long text_end = PFN_ALIGN(&__stop___ex_table); + unsigned long rodata_end = PFN_ALIGN(&__end_rodata); + unsigned long all_end = PFN_ALIGN(&_end); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -784,10 +1154,10 @@ void mark_rodata_ro(void) kernel_set_to_readonly = 1; /* - * The rodata section (but not the kernel text!) should also be - * not-executable. + * The rodata/data/bss/brk section (but not the kernel text!) + * should also be not-executable. */ - set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); rodata_test(); @@ -800,12 +1170,12 @@ void mark_rodata_ro(void) #endif free_init_pages("unused kernel memory", - (unsigned long) page_address(virt_to_page(text_end)), - (unsigned long) - page_address(virt_to_page(rodata_start))); + (unsigned long) __va(__pa_symbol(text_end)), + (unsigned long) __va(__pa_symbol(rodata_start))); + free_init_pages("unused kernel memory", - (unsigned long) page_address(virt_to_page(rodata_end)), - (unsigned long) page_address(virt_to_page(data_start))); + (unsigned long) __va(__pa_symbol(rodata_end)), + (unsigned long) __va(__pa_symbol(_sdata))); } #endif @@ -829,6 +1199,9 @@ int kern_addr_valid(unsigned long addr) if (pud_none(*pud)) return 0; + if (pud_large(*pud)) + return pfn_valid(pud_pfn(*pud)); + pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return 0; @@ -979,10 +1352,70 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) } } - sync_global_pgds((unsigned long)start_page, end); + sync_global_pgds((unsigned long)start_page, end - 1); return 0; } +#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) +void register_page_bootmem_memmap(unsigned long section_nr, + struct page *start_page, unsigned long size) +{ + unsigned long addr = (unsigned long)start_page; + unsigned long end = (unsigned long)(start_page + size); + unsigned long next; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + unsigned int nr_pages; + struct page *page; + + for (; addr < end; addr = next) { + pte_t *pte = NULL; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); + + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); + + if (!cpu_has_pse) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + get_page_bootmem(section_nr, pmd_page(*pmd), + MIX_SECTION_INFO); + + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) + continue; + get_page_bootmem(section_nr, pte_page(*pte), + SECTION_INFO); + } else { + next = pmd_addr_end(addr, end); + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + + nr_pages = 1 << (get_order(PMD_SIZE)); + page = pmd_page(*pmd); + while (nr_pages--) + get_page_bootmem(section_nr, page++, + SECTION_INFO); + } + } +} +#endif + void __meminit vmemmap_populate_print_last(void) { if (p_start) { diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index c80b9fb9573..8dabbed409e 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -9,6 +9,7 @@ #include <linux/memblock.h> static u64 patterns[] __initdata = { + /* The first entry has to be 0 to leave memtest with zeroed memory */ 0, 0xffffffffffffffffULL, 0x5555555555555555ULL, @@ -110,15 +111,8 @@ void __init early_memtest(unsigned long start, unsigned long end) return; printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); - for (i = 0; i < memtest_pattern; i++) { + for (i = memtest_pattern-1; i < UINT_MAX; --i) { idx = i % ARRAY_SIZE(patterns); do_one_pass(patterns[idx], start, end); } - - if (idx > 0) { - printk(KERN_INFO "early_memtest: wipe out " - "test pattern from memory\n"); - /* additional test with pattern 0 will do this */ - do_one_pass(0, start, end); - } } diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h new file mode 100644 index 00000000000..6b563a11889 --- /dev/null +++ b/arch/x86/mm/mm_internal.h @@ -0,0 +1,19 @@ +#ifndef __X86_MM_INTERNAL_H +#define __X86_MM_INTERNAL_H + +void *alloc_low_pages(unsigned int num); +static inline void *alloc_low_page(void) +{ + return alloc_low_pages(1); +} + +void early_ioremap_page_table_range_init(void); + +unsigned long kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask); +void zone_sizes_init(void); + +extern int after_bootmem; + +#endif /* __X86_MM_INTERNAL_H */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 2d125be1bae..72fe01e9e41 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -56,7 +56,7 @@ early_param("numa", numa_setup); /* * apicid, cpu, node mappings */ -s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { +s16 __apicid_to_node[MAX_LOCAL_APIC] = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; @@ -78,7 +78,7 @@ EXPORT_SYMBOL(node_to_cpumask_map); DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); -void __cpuinit numa_set_node(int cpu, int node) +void numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); @@ -97,11 +97,10 @@ void __cpuinit numa_set_node(int cpu, int node) #endif per_cpu(x86_cpu_to_node_map, cpu) = node; - if (node != NUMA_NO_NODE) - set_cpu_numa_node(cpu, node); + set_cpu_numa_node(cpu, node); } -void __cpuinit numa_clear_node(int cpu) +void numa_clear_node(int cpu) { numa_set_node(cpu, NUMA_NO_NODE); } @@ -193,7 +192,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) static void __init setup_node_data(int nid, u64 start, u64 end) { const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - bool remapped = false; u64 nd_pa; void *nd; int tnid; @@ -205,37 +203,28 @@ static void __init setup_node_data(int nid, u64 start, u64 end) if (end && (end - start) < NODE_MIN_SIZE) return; - /* initialize remap allocator before aligning to ZONE_ALIGN */ - init_alloc_remap(nid, start, end); - start = roundup(start, ZONE_ALIGN); printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, start, end - 1); /* - * Allocate node data. Try remap allocator first, node-local - * memory and then any node. Never allocate in DMA zone. + * Allocate node data. Try node-local memory and then any node. + * Never allocate in DMA zone. */ - nd = alloc_remap(nid, nd_size); - if (nd) { - nd_pa = __pa(nd); - remapped = true; - } else { - nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); - if (!nd_pa) { - pr_err("Cannot find %zu bytes in node %d\n", - nd_size, nid); - return; - } - nd = __va(nd_pa); + nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); + if (!nd_pa) { + pr_err("Cannot find %zu bytes in node %d\n", + nd_size, nid); + return; } + nd = __va(nd_pa); /* report and initialize */ - printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]%s\n", - nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : ""); + printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n", + nd_pa, nd_pa + nd_size - 1); tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (!remapped && tnid != nid) + if (tnid != nid) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); node_data[nid] = nd; diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 534255a36b6..73a6d7395bd 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -73,167 +73,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, extern unsigned long highend_pfn, highstart_pfn; -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - -static void *node_remap_start_vaddr[MAX_NUMNODES]; -void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -/* - * Remap memory allocator - */ -static unsigned long node_remap_start_pfn[MAX_NUMNODES]; -static void *node_remap_end_vaddr[MAX_NUMNODES]; -static void *node_remap_alloc_vaddr[MAX_NUMNODES]; - -/** - * alloc_remap - Allocate remapped memory - * @nid: NUMA node to allocate memory from - * @size: The size of allocation - * - * Allocate @size bytes from the remap area of NUMA node @nid. The - * size of the remap area is predetermined by init_alloc_remap() and - * only the callers considered there should call this function. For - * more info, please read the comment on top of init_alloc_remap(). - * - * The caller must be ready to handle allocation failure from this - * function and fall back to regular memory allocator in such cases. - * - * CONTEXT: - * Single CPU early boot context. - * - * RETURNS: - * Pointer to the allocated memory on success, %NULL on failure. - */ -void *alloc_remap(int nid, unsigned long size) -{ - void *allocation = node_remap_alloc_vaddr[nid]; - - size = ALIGN(size, L1_CACHE_BYTES); - - if (!allocation || (allocation + size) > node_remap_end_vaddr[nid]) - return NULL; - - node_remap_alloc_vaddr[nid] += size; - memset(allocation, 0, size); - - return allocation; -} - -#ifdef CONFIG_HIBERNATION -/** - * resume_map_numa_kva - add KVA mapping to the temporary page tables created - * during resume from hibernation - * @pgd_base - temporary resume page directory - */ -void resume_map_numa_kva(pgd_t *pgd_base) -{ - int node; - - for_each_online_node(node) { - unsigned long start_va, start_pfn, nr_pages, pfn; - - start_va = (unsigned long)node_remap_start_vaddr[node]; - start_pfn = node_remap_start_pfn[node]; - nr_pages = (node_remap_end_vaddr[node] - - node_remap_start_vaddr[node]) >> PAGE_SHIFT; - - printk(KERN_DEBUG "%s: node %d\n", __func__, node); - - for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) { - unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); - pgd_t *pgd = pgd_base + pgd_index(vaddr); - pud_t *pud = pud_offset(pgd, vaddr); - pmd_t *pmd = pmd_offset(pud, vaddr); - - set_pmd(pmd, pfn_pmd(start_pfn + pfn, - PAGE_KERNEL_LARGE_EXEC)); - - printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n", - __func__, vaddr, start_pfn + pfn); - } - } -} -#endif - -/** - * init_alloc_remap - Initialize remap allocator for a NUMA node - * @nid: NUMA node to initizlie remap allocator for - * - * NUMA nodes may end up without any lowmem. As allocating pgdat and - * memmap on a different node with lowmem is inefficient, a special - * remap allocator is implemented which can be used by alloc_remap(). - * - * For each node, the amount of memory which will be necessary for - * pgdat and memmap is calculated and two memory areas of the size are - * allocated - one in the node and the other in lowmem; then, the area - * in the node is remapped to the lowmem area. - * - * As pgdat and memmap must be allocated in lowmem anyway, this - * doesn't waste lowmem address space; however, the actual lowmem - * which gets remapped over is wasted. The amount shouldn't be - * problematic on machines this feature will be used. - * - * Initialization failure isn't fatal. alloc_remap() is used - * opportunistically and the callers will fall back to other memory - * allocation mechanisms on failure. - */ -void __init init_alloc_remap(int nid, u64 start, u64 end) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long end_pfn = end >> PAGE_SHIFT; - unsigned long size, pfn; - u64 node_pa, remap_pa; - void *remap_va; - - /* - * The acpi/srat node info can show hot-add memroy zones where - * memory could be added but not currently present. - */ - printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", - nid, start_pfn, end_pfn); - - /* calculate the necessary space aligned to large page size */ - size = node_memmap_size_bytes(nid, start_pfn, end_pfn); - size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); - size = ALIGN(size, LARGE_PAGE_BYTES); - - /* allocate node memory and the lowmem remap area */ - node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES); - if (!node_pa) { - pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", - size, nid); - return; - } - memblock_reserve(node_pa, size); - - remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, - max_low_pfn << PAGE_SHIFT, - size, LARGE_PAGE_BYTES); - if (!remap_pa) { - pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", - size, nid); - memblock_free(node_pa, size); - return; - } - memblock_reserve(remap_pa, size); - remap_va = phys_to_virt(remap_pa); - - /* perform actual remap */ - for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE) - set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT), - (node_pa >> PAGE_SHIFT) + pfn, - PAGE_KERNEL_LARGE); - - /* initialize remap allocator parameters */ - node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; - node_remap_start_vaddr[nid] = remap_va; - node_remap_end_vaddr[nid] = remap_va + size; - node_remap_alloc_vaddr[nid] = remap_va; - - printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", - nid, node_pa, node_pa + size, remap_va, remap_va + size); -} - void __init initmem_init(void) { x86_numa_init(); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 92e27119ee1..9405ffc9150 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -10,16 +10,3 @@ void __init initmem_init(void) { x86_numa_init(); } - -unsigned long __init numa_free_all_bootmem(void) -{ - unsigned long pages = 0; - int i; - - for_each_online_node(i) - pages += free_all_bootmem_node(NODE_DATA(i)); - - pages += free_low_memory_core_early(MAX_NUMNODES); - - return pages; -} diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index 7178c3afe05..ad86ec91e64 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -21,12 +21,6 @@ void __init numa_reset_distance(void); void __init x86_numa_init(void); -#ifdef CONFIG_X86_64 -static inline void init_alloc_remap(int nid, u64 start, u64 end) { } -#else -void __init init_alloc_remap(int nid, u64 start, u64 end); -#endif - #ifdef CONFIG_NUMA_EMU void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a718e0d2350..091934e1d0d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -94,12 +94,12 @@ static inline void split_page_count(int level) { } static inline unsigned long highmap_start_pfn(void) { - return __pa(_text) >> PAGE_SHIFT; + return __pa_symbol(_text) >> PAGE_SHIFT; } static inline unsigned long highmap_end_pfn(void) { - return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; + return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; } #endif @@ -276,8 +276,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, * The .rodata section needs to be read-only. Using the pfn * catches all aliases. */ - if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, - __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) + if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, + __pa_symbol(__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) @@ -364,6 +364,37 @@ pte_t *lookup_address(unsigned long address, unsigned int *level) EXPORT_SYMBOL_GPL(lookup_address); /* + * This is necessary because __pa() does not work on some + * kinds of memory, like vmalloc() or the alloc_remap() + * areas on 32-bit NUMA systems. The percpu areas can + * end up in this kind of memory, for instance. + * + * This could be optimized, but it is only intended to be + * used at inititalization time, and keeping it + * unoptimized should increase the testing coverage for + * the more obscure platforms. + */ +phys_addr_t slow_virt_to_phys(void *__virt_addr) +{ + unsigned long virt_addr = (unsigned long)__virt_addr; + phys_addr_t phys_addr; + unsigned long offset; + enum pg_level level; + unsigned long psize; + unsigned long pmask; + pte_t *pte; + + pte = lookup_address(virt_addr, &level); + BUG_ON(!pte); + psize = page_level_size(level); + pmask = page_level_mask(level); + offset = virt_addr & ~pmask; + phys_addr = pte_pfn(*pte) << PAGE_SHIFT; + return (phys_addr | offset); +} +EXPORT_SYMBOL_GPL(slow_virt_to_phys); + +/* * Set the new pmd in all the pgds we know about: */ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) @@ -396,7 +427,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot, req_prot; int i, do_split = 1; - unsigned int level; + enum pg_level level; if (cpa->force_split) return 1; @@ -412,15 +443,12 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, switch (level) { case PG_LEVEL_2M: - psize = PMD_PAGE_SIZE; - pmask = PMD_PAGE_MASK; - break; #ifdef CONFIG_X86_64 case PG_LEVEL_1G: - psize = PUD_PAGE_SIZE; - pmask = PUD_PAGE_MASK; - break; #endif + psize = page_level_size(level); + pmask = page_level_mask(level); + break; default: do_split = -EINVAL; goto out_unlock; @@ -445,6 +473,19 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); /* + * Set the PSE and GLOBAL flags only if the PRESENT flag is + * set otherwise pmd_present/pmd_huge will return true even on + * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL + * for the ancient hardware that doesn't support it. + */ + if (pgprot_val(new_prot) & _PAGE_PRESENT) + pgprot_val(new_prot) |= _PAGE_PSE | _PAGE_GLOBAL; + else + pgprot_val(new_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); + + new_prot = canon_pgprot(new_prot); + + /* * old_pte points to the large page base address. So we need * to add the offset of the virtual address: */ @@ -489,7 +530,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * The address is aligned and the number of pages * covers the full page. */ - new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); + new_pte = pfn_pte(pte_pfn(old_pte), new_prot); __set_pmd_pte(kpte, address, new_pte); cpa->flags |= CPA_FLUSHTLB; do_split = 0; @@ -501,21 +542,13 @@ out_unlock: return do_split; } -static int split_large_page(pte_t *kpte, unsigned long address) +int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase) { unsigned long pfn, pfninc = 1; unsigned int i, level; - pte_t *pbase, *tmp; + pte_t *tmp; pgprot_t ref_prot; - struct page *base; - - if (!debug_pagealloc) - spin_unlock(&cpa_lock); - base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); - if (!debug_pagealloc) - spin_lock(&cpa_lock); - if (!base) - return -ENOMEM; + struct page *base = virt_to_page(pbase); spin_lock(&pgd_lock); /* @@ -523,10 +556,11 @@ static int split_large_page(pte_t *kpte, unsigned long address) * up for us already: */ tmp = lookup_address(address, &level); - if (tmp != kpte) - goto out_unlock; + if (tmp != kpte) { + spin_unlock(&pgd_lock); + return 1; + } - pbase = (pte_t *)page_address(base); paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); /* @@ -540,27 +574,40 @@ static int split_large_page(pte_t *kpte, unsigned long address) #ifdef CONFIG_X86_64 if (level == PG_LEVEL_1G) { pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; - pgprot_val(ref_prot) |= _PAGE_PSE; + /* + * Set the PSE flags only if the PRESENT flag is set + * otherwise pmd_present/pmd_huge will return true + * even on a non present pmd. + */ + if (pgprot_val(ref_prot) & _PAGE_PRESENT) + pgprot_val(ref_prot) |= _PAGE_PSE; + else + pgprot_val(ref_prot) &= ~_PAGE_PSE; } #endif /* + * Set the GLOBAL flags only if the PRESENT flag is set + * otherwise pmd/pte_present will return true even on a non + * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL + * for the ancient hardware that doesn't support it. + */ + if (pgprot_val(ref_prot) & _PAGE_PRESENT) + pgprot_val(ref_prot) |= _PAGE_GLOBAL; + else + pgprot_val(ref_prot) &= ~_PAGE_GLOBAL; + + /* * Get the target pfn from the original entry: */ pfn = pte_pfn(*kpte); for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) - set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); + set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); - if (address >= (unsigned long)__va(0) && - address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT)) + if (pfn_range_is_mapped(PFN_DOWN(__pa(address)), + PFN_DOWN(__pa(address)) + 1)) split_page_count(level); -#ifdef CONFIG_X86_64 - if (address >= (unsigned long)__va(1UL<<32) && - address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) - split_page_count(level); -#endif - /* * Install the new, split up pagetable. * @@ -579,17 +626,27 @@ static int split_large_page(pte_t *kpte, unsigned long address) * going on. */ __flush_tlb_all(); + spin_unlock(&pgd_lock); - base = NULL; + return 0; +} -out_unlock: - /* - * If we dropped out via the lookup_address check under - * pgd_lock then stick the page back into the pool: - */ - if (base) +static int split_large_page(pte_t *kpte, unsigned long address) +{ + pte_t *pbase; + struct page *base; + + if (!debug_pagealloc) + spin_unlock(&cpa_lock); + base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); + if (!debug_pagealloc) + spin_lock(&cpa_lock); + if (!base) + return -ENOMEM; + + pbase = (pte_t *)page_address(base); + if (__split_large_page(kpte, address, pbase)) __free_page(base); - spin_unlock(&pgd_lock); return 0; } @@ -660,6 +717,18 @@ repeat: new_prot = static_protections(new_prot, address, pfn); /* + * Set the GLOBAL flags only if the PRESENT flag is + * set otherwise pte_present will return true even on + * a non present pte. The canon_pgprot will clear + * _PAGE_GLOBAL for the ancient hardware that doesn't + * support it. + */ + if (pgprot_val(new_prot) & _PAGE_PRESENT) + pgprot_val(new_prot) |= _PAGE_GLOBAL; + else + pgprot_val(new_prot) &= ~_PAGE_GLOBAL; + + /* * We need to keep the pfn from the existing PTE, * after all we're only going to change it's attributes * not the memory it points to @@ -729,13 +798,9 @@ static int cpa_process_alias(struct cpa_data *cpa) unsigned long vaddr; int ret; - if (cpa->pfn >= max_pfn_mapped) + if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) return 0; -#ifdef CONFIG_X86_64 - if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT))) - return 0; -#endif /* * No need to redo, when the primary call touched the direct * mapping already: diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 0eb572eda40..657438858e8 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -560,10 +560,17 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) { unsigned long id_sz; - if (base >= __pa(high_memory)) + if (base > __pa(high_memory-1)) return 0; - id_sz = (__pa(high_memory) < base + size) ? + /* + * some areas in the middle of the kernel identity range + * are not mapped, like the PCI space. + */ + if (!page_is_ram(base >> PAGE_SHIFT)) + return 0; + + id_sz = (__pa(high_memory-1) <= base + size) ? __pa(high_memory) - base : size; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8573b83a63d..193350b51f9 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -137,7 +137,7 @@ static void pgd_dtor(pgd_t *pgd) * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. - * -- wli + * -- nyc */ #ifdef CONFIG_X86_PAE @@ -301,6 +301,13 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long)pgd); } +/* + * Used to set accessed or dirty bits in the page table entries + * on other architectures. On x86, the accessed and dirty bits + * are tracked by hardware. However, do_wp_page calls this function + * to also make the pte writeable at the same time the dirty bit is + * set. In that case we do actually need to write the PTE. + */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) @@ -310,7 +317,6 @@ int ptep_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *ptep = entry; pte_update_defer(vma->vm_mm, address, ptep); - flush_tlb_page(vma, address); } return changed; @@ -328,7 +334,12 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *pmdp = entry; pmd_update_defer(vma->vm_mm, address, pmdp); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + /* + * We had a write-protection fault here and changed the pmd + * to to more permissive. No need to flush the TLB for that, + * #PF is architecturally guaranteed to do that and in the + * worst-case we'll generate a spurious fault. + */ } return changed; diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index d2e2735327b..e666cbbb926 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ -1,3 +1,4 @@ +#include <linux/bootmem.h> #include <linux/mmdebug.h> #include <linux/module.h> #include <linux/mm.h> @@ -8,33 +9,54 @@ #ifdef CONFIG_X86_64 +#ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { - if (x >= __START_KERNEL_map) { - x -= __START_KERNEL_map; - VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); - x += phys_base; + unsigned long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + if (unlikely(x > y)) { + x = y + phys_base; + + VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); } else { - VIRTUAL_BUG_ON(x < PAGE_OFFSET); - x -= PAGE_OFFSET; - VIRTUAL_BUG_ON(!phys_addr_valid(x)); + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* carry flag will be set if starting x was >= PAGE_OFFSET */ + VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x)); } + return x; } EXPORT_SYMBOL(__phys_addr); +unsigned long __phys_addr_symbol(unsigned long x) +{ + unsigned long y = x - __START_KERNEL_map; + + /* only check upper bounds since lower bounds will trigger carry */ + VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); + + return y + phys_base; +} +EXPORT_SYMBOL(__phys_addr_symbol); +#endif + bool __virt_addr_valid(unsigned long x) { - if (x >= __START_KERNEL_map) { - x -= __START_KERNEL_map; - if (x >= KERNEL_IMAGE_SIZE) + unsigned long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + if (unlikely(x > y)) { + x = y + phys_base; + + if (y >= KERNEL_IMAGE_SIZE) return false; - x += phys_base; } else { - if (x < PAGE_OFFSET) - return false; - x -= PAGE_OFFSET; - if (!phys_addr_valid(x)) + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* carry flag will be set if starting x was >= PAGE_OFFSET */ + if ((x > y) || !phys_addr_valid(x)) return false; } @@ -47,10 +69,16 @@ EXPORT_SYMBOL(__virt_addr_valid); #ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { + unsigned long phys_addr = x - PAGE_OFFSET; /* VMALLOC_* aren't constants */ VIRTUAL_BUG_ON(x < PAGE_OFFSET); VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); - return x - PAGE_OFFSET; + /* max_low_pfn is set early, but not _that_ early */ + if (max_low_pfn) { + VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn); + BUG_ON(slow_virt_to_phys((void *)x) != phys_addr); + } + return phys_addr; } EXPORT_SYMBOL(__phys_addr); #endif diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 4ddf497ca65..cdd0da9dd53 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -149,39 +149,40 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) int node, pxm; if (srat_disabled()) - return -1; - if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { - bad_srat(); - return -1; - } + goto out_err; + if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) + goto out_err_bad_srat; if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return -1; - + goto out_err; if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) - return -1; + goto out_err; + start = ma->base_address; end = start + ma->length; pxm = ma->proximity_domain; if (acpi_srat_revision <= 1) pxm &= 0xff; + node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains.\n"); - bad_srat(); - return -1; + goto out_err_bad_srat; } - if (numa_add_memblk(node, start, end) < 0) { - bad_srat(); - return -1; - } + if (numa_add_memblk(node, start, end) < 0) + goto out_err_bad_srat; node_set(node, numa_nodes_parsed); printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", node, pxm, (unsigned long long) start, (unsigned long long) end - 1); + return 0; +out_err_bad_srat: + bad_srat(); +out_err: + return -1; } void __init acpi_numa_arch_fixup(void) {} diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 0777f042e40..282375f13c7 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -104,7 +104,7 @@ static void flush_tlb_func(void *info) return; if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { - if (f->flush_end == TLB_FLUSH_ALL || !cpu_has_invlpg) + if (f->flush_end == TLB_FLUSH_ALL) local_flush_tlb(); else if (!f->flush_end) __flush_tlb_single(f->flush_start); @@ -197,7 +197,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, } if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 - || vmflag == VM_HUGETLB) { + || vmflag & VM_HUGETLB) { local_flush_tlb(); goto flush_all; } @@ -335,12 +335,10 @@ static const struct file_operations fops_tlbflush = { .llseek = default_llseek, }; -static int __cpuinit create_tlb_flushall_shift(void) +static int __init create_tlb_flushall_shift(void) { - if (cpu_has_invlpg) { - debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, - arch_debugfs_dir, NULL, &fops_tlbflush); - } + debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_tlbflush); return 0; } late_initcall(create_tlb_flushall_shift); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 520d2bd0b9c..3cbe45381bb 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1,6 +1,6 @@ /* bpf_jit_comp.c : BPF JIT compiler * - * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com) + * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -11,6 +11,7 @@ #include <asm/cacheflush.h> #include <linux/netdevice.h> #include <linux/filter.h> +#include <linux/if_vlan.h> /* * Conventions : @@ -123,6 +124,26 @@ static inline void bpf_flush_icache(void *start, void *end) #define CHOOSE_LOAD_FUNC(K, func) \ ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) +/* Helper to find the offset of pkt_type in sk_buff + * We want to make sure its still a 3bit field starting at a byte boundary. + */ +#define PKT_TYPE_MAX 7 +static int pkt_type_offset(void) +{ + struct sk_buff skb_probe = { + .pkt_type = ~0, + }; + char *ct = (char *)&skb_probe; + unsigned int off; + + for (off = 0; off < sizeof(struct sk_buff); off++) { + if (ct[off] == PKT_TYPE_MAX) + return off; + } + pr_err_once("Please fix pkt_type_offset(), as pkt_type couldn't be found\n"); + return -1; +} + void bpf_jit_compile(struct sk_filter *fp) { u8 temp[64]; @@ -212,7 +233,10 @@ void bpf_jit_compile(struct sk_filter *fp) case BPF_S_ANC_MARK: case BPF_S_ANC_RXHASH: case BPF_S_ANC_CPU: + case BPF_S_ANC_VLAN_TAG: + case BPF_S_ANC_VLAN_TAG_PRESENT: case BPF_S_ANC_QUEUE: + case BPF_S_ANC_PKTTYPE: case BPF_S_LD_W_ABS: case BPF_S_LD_H_ABS: case BPF_S_LD_B_ABS: @@ -515,6 +539,41 @@ void bpf_jit_compile(struct sk_filter *fp) CLEAR_A(); #endif break; + case BPF_S_ANC_VLAN_TAG: + case BPF_S_ANC_VLAN_TAG_PRESENT: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); + if (is_imm8(offsetof(struct sk_buff, vlan_tci))) { + /* movzwl off8(%rdi),%eax */ + EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, vlan_tci)); + } else { + EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ + EMIT(offsetof(struct sk_buff, vlan_tci), 4); + } + BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + if (filter[i].code == BPF_S_ANC_VLAN_TAG) { + EMIT3(0x80, 0xe4, 0xef); /* and $0xef,%ah */ + } else { + EMIT3(0xc1, 0xe8, 0x0c); /* shr $0xc,%eax */ + EMIT3(0x83, 0xe0, 0x01); /* and $0x1,%eax */ + } + break; + case BPF_S_ANC_PKTTYPE: + { + int off = pkt_type_offset(); + + if (off < 0) + goto out; + if (is_imm8(off)) { + /* movzbl off8(%rdi),%eax */ + EMIT4(0x0f, 0xb6, 0x47, off); + } else { + /* movbl off32(%rdi),%eax */ + EMIT3(0x0f, 0xb6, 0x87); + EMIT(off, 4); + } + EMIT3(0x83, 0xe0, PKT_TYPE_MAX); /* and $0x7,%eax */ + break; + } case BPF_S_LD_W_ABS: func = CHOOSE_LOAD_FUNC(K, sk_load_word); common_load: seen |= SEEN_DATAREF; diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 3af5a1e79c9..ee0af58ca5b 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_STA2X11) += sta2x11-fixup.o obj-$(CONFIG_X86_VISWS) += visws.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o +obj-$(CONFIG_X86_NUMACHIP) += numachip.o obj-$(CONFIG_X86_INTEL_MID) += mrst.o diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 192397c9860..3e724256dbe 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -12,6 +12,7 @@ struct pci_root_info { char name[16]; unsigned int res_num; struct resource *res; + resource_size_t *res_offset; struct pci_sysdata sd; #ifdef CONFIG_PCI_MMCONFIG bool mcfg_added; @@ -22,6 +23,7 @@ struct pci_root_info { }; static bool pci_use_crs = true; +static bool pci_ignore_seg = false; static int __init set_use_crs(const struct dmi_system_id *id) { @@ -35,7 +37,14 @@ static int __init set_nouse_crs(const struct dmi_system_id *id) return 0; } -static const struct dmi_system_id pci_use_crs_table[] __initconst = { +static int __init set_ignore_seg(const struct dmi_system_id *id) +{ + printk(KERN_INFO "PCI: %s detected: ignoring ACPI _SEG\n", id->ident); + pci_ignore_seg = true; + return 0; +} + +static const struct dmi_system_id pci_crs_quirks[] __initconst = { /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */ { .callback = set_use_crs, @@ -98,6 +107,16 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { DMI_MATCH(DMI_BIOS_VERSION, "6JET85WW (1.43 )"), }, }, + + /* https://bugzilla.kernel.org/show_bug.cgi?id=15362 */ + { + .callback = set_ignore_seg, + .ident = "HP xw9300", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP xw9300 Workstation"), + }, + }, {} }; @@ -108,7 +127,7 @@ void __init pci_acpi_crs_quirks(void) if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008) pci_use_crs = false; - dmi_check_system(pci_use_crs_table); + dmi_check_system(pci_crs_quirks); /* * If the user specifies "pci=use_crs" or "pci=nocrs" explicitly, that @@ -126,7 +145,7 @@ void __init pci_acpi_crs_quirks(void) } #ifdef CONFIG_PCI_MMCONFIG -static int __devinit check_segment(u16 seg, struct device *dev, char *estr) +static int check_segment(u16 seg, struct device *dev, char *estr) { if (seg) { dev_err(dev, @@ -149,9 +168,8 @@ static int __devinit check_segment(u16 seg, struct device *dev, char *estr) return 0; } -static int __devinit setup_mcfg_map(struct pci_root_info *info, - u16 seg, u8 start, u8 end, - phys_addr_t addr) +static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start, + u8 end, phys_addr_t addr) { int result; struct device *dev = &info->bridge->dev; @@ -189,7 +207,7 @@ static void teardown_mcfg_map(struct pci_root_info *info) } } #else -static int __devinit setup_mcfg_map(struct pci_root_info *info, +static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start, u8 end, phys_addr_t addr) { @@ -305,6 +323,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data) res->flags = flags; res->start = start; res->end = end; + info->res_offset[info->res_num] = addr.translation_offset; if (!pci_use_crs) { dev_printk(KERN_DEBUG, &info->bridge->dev, @@ -374,7 +393,8 @@ static void add_resources(struct pci_root_info *info, "ignoring host bridge window %pR (conflicts with %s %pR)\n", res, conflict->name, conflict); else - pci_add_resource(resources, res); + pci_add_resource_offset(resources, res, + info->res_offset[i]); } } @@ -382,6 +402,8 @@ static void free_pci_root_info_res(struct pci_root_info *info) { kfree(info->res); info->res = NULL; + kfree(info->res_offset); + info->res_offset = NULL; info->res_num = 0; } @@ -432,16 +454,26 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, return; size = sizeof(*info->res) * info->res_num; - info->res_num = 0; info->res = kzalloc(size, GFP_KERNEL); - if (!info->res) + if (!info->res) { + info->res_num = 0; return; + } + + size = sizeof(*info->res_offset) * info->res_num; + info->res_num = 0; + info->res_offset = kzalloc(size, GFP_KERNEL); + if (!info->res_offset) { + kfree(info->res); + info->res = NULL; + return; + } acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, info); } -struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) +struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) { struct acpi_device *device = root->device; struct pci_root_info *info = NULL; @@ -455,6 +487,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) int pxm; #endif + if (pci_ignore_seg) + domain = 0; + if (domain && !pci_domains_supported) { printk(KERN_WARNING "pci_bus %04x:%02x: " "ignored (multiple domains not supported)\n", @@ -486,6 +521,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) sd = &info->sd; sd->domain = domain; sd->node = node; + sd->acpi = device->handle; /* * Maybe the desired pci bus has been already scanned. In such case * it is unnecessary to scan the pci bus with the given domain,busnum. @@ -557,6 +593,14 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) return bus; } +int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) +{ + struct pci_sysdata *sd = bridge->bus->sysdata; + + ACPI_HANDLE_SET(&bridge->dev, sd->acpi); + return 0; +} + int __init pci_acpi_init(void) { struct pci_dev *dev = NULL; diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index d37e2fec97e..c2735feb250 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -93,8 +93,8 @@ struct pci_root_info __init *alloc_pci_root_info(int bus_min, int bus_max, return info; } -void __devinit update_res(struct pci_root_info *info, resource_size_t start, - resource_size_t end, unsigned long flags, int merge) +void update_res(struct pci_root_info *info, resource_size_t start, + resource_size_t end, unsigned long flags, int merge) { struct resource *res; struct pci_root_res *root_res; diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c index 41bd2a2d2c5..b914e20b5a0 100644 --- a/arch/x86/pci/ce4100.c +++ b/arch/x86/pci/ce4100.c @@ -115,6 +115,16 @@ static void sata_revid_read(struct sim_dev_reg *reg, u32 *value) reg_read(reg, value); } +static void reg_noirq_read(struct sim_dev_reg *reg, u32 *value) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&pci_config_lock, flags); + /* force interrupt pin value to 0 */ + *value = reg->sim_reg.value & 0xfff00ff; + raw_spin_unlock_irqrestore(&pci_config_lock, flags); +} + static struct sim_dev_reg bus1_fixups[] = { DEFINE_REG(2, 0, 0x10, (16*MB), reg_init, reg_read, reg_write) DEFINE_REG(2, 0, 0x14, (256), reg_init, reg_read, reg_write) @@ -144,6 +154,7 @@ static struct sim_dev_reg bus1_fixups[] = { DEFINE_REG(11, 5, 0x10, (64*KB), reg_init, reg_read, reg_write) DEFINE_REG(11, 6, 0x10, (256), reg_init, reg_read, reg_write) DEFINE_REG(11, 7, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(11, 7, 0x3c, 256, reg_init, reg_noirq_read, reg_write) DEFINE_REG(12, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) DEFINE_REG(12, 0, 0x14, (256), reg_init, reg_read, reg_write) DEFINE_REG(12, 1, 0x10, (1024), reg_init, reg_read, reg_write) @@ -161,8 +172,10 @@ static struct sim_dev_reg bus1_fixups[] = { DEFINE_REG(16, 0, 0x10, (64*KB), reg_init, reg_read, reg_write) DEFINE_REG(16, 0, 0x14, (64*MB), reg_init, reg_read, reg_write) DEFINE_REG(16, 0, 0x18, (64*MB), reg_init, reg_read, reg_write) + DEFINE_REG(16, 0, 0x3c, 256, reg_init, reg_noirq_read, reg_write) DEFINE_REG(17, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) DEFINE_REG(18, 0, 0x10, (1*KB), reg_init, reg_read, reg_write) + DEFINE_REG(18, 0, 0x3c, 256, reg_init, reg_noirq_read, reg_write) }; static void __init init_sim_regs(void) diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 720e973fc34..901177d75ff 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -17,6 +17,7 @@ #include <asm/io.h> #include <asm/smp.h> #include <asm/pci_x86.h> +#include <asm/setup.h> unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | PCI_PROBE_MMCONF; @@ -33,7 +34,6 @@ int noioapicreroute = 1; #endif int pcibios_last_bus = -1; unsigned long pirq_table_addr; -struct pci_bus *pci_root_bus; const struct pci_raw_ops *__read_mostly raw_pci_ops; const struct pci_raw_ops *__read_mostly raw_pci_ext_ops; @@ -80,14 +80,14 @@ struct pci_ops pci_root_ops = { */ DEFINE_RAW_SPINLOCK(pci_config_lock); -static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d) +static int can_skip_ioresource_align(const struct dmi_system_id *d) { pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; printk(KERN_INFO "PCI: %s detected, can skip ISA alignment\n", d->ident); return 0; } -static const struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitconst = { +static const struct dmi_system_id can_skip_pciprobe_dmi_table[] = { /* * Systems where PCI IO resource ISA alignment can be skipped * when the ISA enable bit in the bridge control is not set @@ -124,7 +124,7 @@ void __init dmi_check_skip_isa_align(void) dmi_check_system(can_skip_pciprobe_dmi_table); } -static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) +static void pcibios_fixup_device_resources(struct pci_dev *dev) { struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; struct resource *bar_r; @@ -161,7 +161,7 @@ static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) * are examined. */ -void __devinit pcibios_fixup_bus(struct pci_bus *b) +void pcibios_fixup_bus(struct pci_bus *b) { struct pci_dev *dev; @@ -175,7 +175,7 @@ void __devinit pcibios_fixup_bus(struct pci_bus *b) * on the kernel command line (which was parsed earlier). */ -static int __devinit set_bf_sort(const struct dmi_system_id *d) +static int set_bf_sort(const struct dmi_system_id *d) { if (pci_bf_sort == pci_bf_sort_default) { pci_bf_sort = pci_dmi_bf; @@ -184,7 +184,7 @@ static int __devinit set_bf_sort(const struct dmi_system_id *d) return 0; } -static void __devinit read_dmi_type_b1(const struct dmi_header *dm, +static void read_dmi_type_b1(const struct dmi_header *dm, void *private_data) { u8 *d = (u8 *)dm + 4; @@ -206,7 +206,7 @@ static void __devinit read_dmi_type_b1(const struct dmi_header *dm, } } -static int __devinit find_sort_method(const struct dmi_system_id *d) +static int find_sort_method(const struct dmi_system_id *d) { dmi_walk(read_dmi_type_b1, NULL); @@ -221,7 +221,7 @@ static int __devinit find_sort_method(const struct dmi_system_id *d) * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus) */ #ifdef __i386__ -static int __devinit assign_all_busses(const struct dmi_system_id *d) +static int assign_all_busses(const struct dmi_system_id *d) { pci_probe |= PCI_ASSIGN_ALL_BUSSES; printk(KERN_INFO "%s detected: enabling PCI bus# renumbering" @@ -230,7 +230,7 @@ static int __devinit assign_all_busses(const struct dmi_system_id *d) } #endif -static int __devinit set_scan_all(const struct dmi_system_id *d) +static int set_scan_all(const struct dmi_system_id *d) { printk(KERN_INFO "PCI: %s detected, enabling pci=pcie_scan_all\n", d->ident); @@ -238,7 +238,7 @@ static int __devinit set_scan_all(const struct dmi_system_id *d) return 0; } -static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = { +static const struct dmi_system_id pciprobe_dmi_table[] = { #ifdef __i386__ /* * Laptops which need pci=assign-busses to see Cardbus cards @@ -433,7 +433,8 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = { .callback = set_scan_all, .ident = "Stratus/NEC ftServer", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "ftServer"), + DMI_MATCH(DMI_SYS_VENDOR, "Stratus"), + DMI_MATCH(DMI_PRODUCT_NAME, "ftServer"), }, }, {} @@ -444,7 +445,7 @@ void __init dmi_check_pciprobe(void) dmi_check_system(pciprobe_dmi_table); } -struct pci_bus * __devinit pcibios_scan_root(int busnum) +struct pci_bus *pcibios_scan_root(int busnum) { struct pci_bus *bus = NULL; @@ -608,6 +609,35 @@ unsigned int pcibios_assign_all_busses(void) return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 1 : 0; } +int pcibios_add_device(struct pci_dev *dev) +{ + struct setup_data *data; + struct pci_setup_rom *rom; + u64 pa_data; + + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = phys_to_virt(pa_data); + + if (data->type == SETUP_PCI) { + rom = (struct pci_setup_rom *)data; + + if ((pci_domain_nr(dev->bus) == rom->segment) && + (dev->bus->number == rom->bus) && + (PCI_SLOT(dev->devfn) == rom->device) && + (PCI_FUNC(dev->devfn) == rom->function) && + (dev->vendor == rom->vendor) && + (dev->device == rom->devid)) { + dev->rom = pa_data + + offsetof(struct pci_setup_rom, romdata); + dev->romlen = rom->pcilen; + } + } + pa_data = data->next; + } + return 0; +} + int pcibios_enable_device(struct pci_dev *dev, int mask) { int err; @@ -626,7 +656,7 @@ void pcibios_disable_device (struct pci_dev *dev) pcibios_disable_irq(dev); } -int pci_ext_cfg_avail(struct pci_dev *dev) +int pci_ext_cfg_avail(void) { if (raw_pci_ext_ops) return 1; @@ -634,7 +664,7 @@ int pci_ext_cfg_avail(struct pci_dev *dev) return 0; } -struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node) +struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node) { LIST_HEAD(resources); struct pci_bus *bus = NULL; @@ -662,7 +692,7 @@ struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, return bus; } -struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno) +struct pci_bus *pci_scan_bus_with_sysdata(int busno) { return pci_scan_bus_on_node(busno, &pci_root_ops, -1); } diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index af8a224db21..f5809fa2753 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -9,7 +9,7 @@ #include <linux/vgaarb.h> #include <asm/pci_x86.h> -static void __devinit pci_fixup_i450nx(struct pci_dev *d) +static void pci_fixup_i450nx(struct pci_dev *d) { /* * i450NX -- Find and scan all secondary buses on all PXB's. @@ -34,7 +34,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); -static void __devinit pci_fixup_i450gx(struct pci_dev *d) +static void pci_fixup_i450gx(struct pci_dev *d) { /* * i450GX and i450KX -- Find and scan all secondary buses. @@ -48,7 +48,7 @@ static void __devinit pci_fixup_i450gx(struct pci_dev *d) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82454GX, pci_fixup_i450gx); -static void __devinit pci_fixup_umc_ide(struct pci_dev *d) +static void pci_fixup_umc_ide(struct pci_dev *d) { /* * UM8886BF IDE controller sets region type bits incorrectly, @@ -62,7 +62,7 @@ static void __devinit pci_fixup_umc_ide(struct pci_dev *d) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide); -static void __devinit pci_fixup_ncr53c810(struct pci_dev *d) +static void pci_fixup_ncr53c810(struct pci_dev *d) { /* * NCR 53C810 returns class code 0 (at least on some systems). @@ -75,7 +75,7 @@ static void __devinit pci_fixup_ncr53c810(struct pci_dev *d) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C810, pci_fixup_ncr53c810); -static void __devinit pci_fixup_latency(struct pci_dev *d) +static void pci_fixup_latency(struct pci_dev *d) { /* * SiS 5597 and 5598 chipsets require latency timer set to @@ -87,7 +87,7 @@ static void __devinit pci_fixup_latency(struct pci_dev *d) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5597, pci_fixup_latency); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5598, pci_fixup_latency); -static void __devinit pci_fixup_piix4_acpi(struct pci_dev *d) +static void pci_fixup_piix4_acpi(struct pci_dev *d) { /* * PIIX4 ACPI device: hardwired IRQ9 @@ -163,7 +163,7 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8367_0, pci_fixup_ * system to PCI bus no matter what are their window settings, so they are * "transparent" (or subtractive decoding) from programmers point of view. */ -static void __devinit pci_fixup_transparent_bridge(struct pci_dev *dev) +static void pci_fixup_transparent_bridge(struct pci_dev *dev) { if ((dev->device & 0xff00) == 0x2400) dev->transparent = 1; @@ -317,7 +317,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_r * video device at this point. */ -static void __devinit pci_fixup_video(struct pci_dev *pdev) +static void pci_fixup_video(struct pci_dev *pdev) { struct pci_dev *bridge; struct pci_bus *bus; @@ -357,7 +357,7 @@ DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video); -static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = { +static const struct dmi_system_id msi_k8t_dmi_table[] = { { .ident = "MSI-K8T-Neo2Fir", .matches = { @@ -378,7 +378,7 @@ static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = { * The soundcard is only enabled, if the mainborad is identified * via DMI-tables and the soundcard is detected to be off. */ -static void __devinit pci_fixup_msi_k8t_onboard_sound(struct pci_dev *dev) +static void pci_fixup_msi_k8t_onboard_sound(struct pci_dev *dev) { unsigned char val; if (!dmi_check_system(msi_k8t_dmi_table)) @@ -414,7 +414,7 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, */ static u16 toshiba_line_size; -static const struct dmi_system_id __devinitconst toshiba_ohci1394_dmi_table[] = { +static const struct dmi_system_id toshiba_ohci1394_dmi_table[] = { { .ident = "Toshiba PS5 based laptop", .matches = { @@ -439,7 +439,7 @@ static const struct dmi_system_id __devinitconst toshiba_ohci1394_dmi_table[] = { } }; -static void __devinit pci_pre_fixup_toshiba_ohci1394(struct pci_dev *dev) +static void pci_pre_fixup_toshiba_ohci1394(struct pci_dev *dev) { if (!dmi_check_system(toshiba_ohci1394_dmi_table)) return; /* only applies to certain Toshibas (so far) */ @@ -450,7 +450,7 @@ static void __devinit pci_pre_fixup_toshiba_ohci1394(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_TI, 0x8032, pci_pre_fixup_toshiba_ohci1394); -static void __devinit pci_post_fixup_toshiba_ohci1394(struct pci_dev *dev) +static void pci_post_fixup_toshiba_ohci1394(struct pci_dev *dev) { if (!dmi_check_system(toshiba_ohci1394_dmi_table)) return; /* only applies to certain Toshibas (so far) */ @@ -488,7 +488,7 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, * Siemens Nixdorf AG FSC Multiprocessor Interrupt Controller: * prevent update of the BAR0, which doesn't look like a normal BAR. */ -static void __devinit pci_siemens_interrupt_controller(struct pci_dev *dev) +static void pci_siemens_interrupt_controller(struct pci_dev *dev) { dev->resource[0].flags |= IORESOURCE_PCI_FIXED; } @@ -531,7 +531,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar); * * Match off the LPC and svid/sdid (older kernels lose the bridge subvendor) */ -static void __devinit twinhead_reserve_killing_zone(struct pci_dev *dev) +static void twinhead_reserve_killing_zone(struct pci_dev *dev) { if (dev->subsystem_vendor == 0x14FF && dev->subsystem_device == 0xA003) { pr_info("Reserving memory on Twinhead H12Y\n"); diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index dd8ca6f7223..94919e307f8 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -51,6 +51,7 @@ struct pcibios_fwaddrmap { static LIST_HEAD(pcibios_fwaddrmappings); static DEFINE_SPINLOCK(pcibios_fwaddrmap_lock); +static bool pcibios_fw_addr_done; /* Must be called with 'pcibios_fwaddrmap_lock' lock held. */ static struct pcibios_fwaddrmap *pcibios_fwaddrmap_lookup(struct pci_dev *dev) @@ -72,6 +73,9 @@ pcibios_save_fw_addr(struct pci_dev *dev, int idx, resource_size_t fw_addr) unsigned long flags; struct pcibios_fwaddrmap *map; + if (pcibios_fw_addr_done) + return; + spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags); map = pcibios_fwaddrmap_lookup(dev); if (!map) { @@ -97,6 +101,9 @@ resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx) struct pcibios_fwaddrmap *map; resource_size_t fw_addr = 0; + if (pcibios_fw_addr_done) + return 0; + spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags); map = pcibios_fwaddrmap_lookup(dev); if (map) @@ -106,7 +113,7 @@ resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx) return fw_addr; } -static void pcibios_fw_addr_list_del(void) +static void __init pcibios_fw_addr_list_del(void) { unsigned long flags; struct pcibios_fwaddrmap *entry, *next; @@ -118,6 +125,7 @@ static void pcibios_fw_addr_list_del(void) kfree(entry); } spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags); + pcibios_fw_addr_done = true; } static int @@ -193,46 +201,46 @@ EXPORT_SYMBOL(pcibios_align_resource); * as well. */ -static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) +static void pcibios_allocate_bridge_resources(struct pci_dev *dev) { - struct pci_bus *bus; - struct pci_dev *dev; int idx; struct resource *r; - /* Depth-First Search on bus tree */ - list_for_each_entry(bus, bus_list, node) { - if ((dev = bus->self)) { - for (idx = PCI_BRIDGE_RESOURCES; - idx < PCI_NUM_RESOURCES; idx++) { - r = &dev->resource[idx]; - if (!r->flags) - continue; - if (!r->start || - pci_claim_resource(dev, idx) < 0) { - /* - * Something is wrong with the region. - * Invalidate the resource to prevent - * child resource allocations in this - * range. - */ - r->start = r->end = 0; - r->flags = 0; - } - } + for (idx = PCI_BRIDGE_RESOURCES; idx < PCI_NUM_RESOURCES; idx++) { + r = &dev->resource[idx]; + if (!r->flags) + continue; + if (!r->start || pci_claim_resource(dev, idx) < 0) { + /* + * Something is wrong with the region. + * Invalidate the resource to prevent + * child resource allocations in this + * range. + */ + r->start = r->end = 0; + r->flags = 0; } - pcibios_allocate_bus_resources(&bus->children); } } +static void pcibios_allocate_bus_resources(struct pci_bus *bus) +{ + struct pci_bus *child; + + /* Depth-First Search on bus tree */ + if (bus->self) + pcibios_allocate_bridge_resources(bus->self); + list_for_each_entry(child, &bus->children, node) + pcibios_allocate_bus_resources(child); +} + struct pci_check_idx_range { int start; int end; }; -static void __init pcibios_allocate_resources(int pass) +static void pcibios_allocate_dev_resources(struct pci_dev *dev, int pass) { - struct pci_dev *dev = NULL; int idx, disabled, i; u16 command; struct resource *r; @@ -244,14 +252,13 @@ static void __init pcibios_allocate_resources(int pass) #endif }; - for_each_pci_dev(dev) { - pci_read_config_word(dev, PCI_COMMAND, &command); - for (i = 0; i < ARRAY_SIZE(idx_range); i++) + pci_read_config_word(dev, PCI_COMMAND, &command); + for (i = 0; i < ARRAY_SIZE(idx_range); i++) for (idx = idx_range[i].start; idx <= idx_range[i].end; idx++) { r = &dev->resource[idx]; - if (r->parent) /* Already allocated */ + if (r->parent) /* Already allocated */ continue; - if (!r->start) /* Address not assigned at all */ + if (!r->start) /* Address not assigned at all */ continue; if (r->flags & IORESOURCE_IO) disabled = !(command & PCI_COMMAND_IO); @@ -270,44 +277,74 @@ static void __init pcibios_allocate_resources(int pass) } } } - if (!pass) { - r = &dev->resource[PCI_ROM_RESOURCE]; - if (r->flags & IORESOURCE_ROM_ENABLE) { - /* Turn the ROM off, leave the resource region, - * but keep it unregistered. */ - u32 reg; - dev_dbg(&dev->dev, "disabling ROM %pR\n", r); - r->flags &= ~IORESOURCE_ROM_ENABLE; - pci_read_config_dword(dev, - dev->rom_base_reg, ®); - pci_write_config_dword(dev, dev->rom_base_reg, + if (!pass) { + r = &dev->resource[PCI_ROM_RESOURCE]; + if (r->flags & IORESOURCE_ROM_ENABLE) { + /* Turn the ROM off, leave the resource region, + * but keep it unregistered. */ + u32 reg; + dev_dbg(&dev->dev, "disabling ROM %pR\n", r); + r->flags &= ~IORESOURCE_ROM_ENABLE; + pci_read_config_dword(dev, dev->rom_base_reg, ®); + pci_write_config_dword(dev, dev->rom_base_reg, reg & ~PCI_ROM_ADDRESS_ENABLE); - } } } } -static int __init pcibios_assign_resources(void) +static void pcibios_allocate_resources(struct pci_bus *bus, int pass) +{ + struct pci_dev *dev; + struct pci_bus *child; + + list_for_each_entry(dev, &bus->devices, bus_list) { + pcibios_allocate_dev_resources(dev, pass); + + child = dev->subordinate; + if (child) + pcibios_allocate_resources(child, pass); + } +} + +static void pcibios_allocate_dev_rom_resource(struct pci_dev *dev) { - struct pci_dev *dev = NULL; struct resource *r; - if (!(pci_probe & PCI_ASSIGN_ROMS)) { - /* - * Try to use BIOS settings for ROMs, otherwise let - * pci_assign_unassigned_resources() allocate the new - * addresses. - */ - for_each_pci_dev(dev) { - r = &dev->resource[PCI_ROM_RESOURCE]; - if (!r->flags || !r->start) - continue; - if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) { - r->end -= r->start; - r->start = 0; - } - } + /* + * Try to use BIOS settings for ROMs, otherwise let + * pci_assign_unassigned_resources() allocate the new + * addresses. + */ + r = &dev->resource[PCI_ROM_RESOURCE]; + if (!r->flags || !r->start) + return; + + if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) { + r->end -= r->start; + r->start = 0; } +} +static void pcibios_allocate_rom_resources(struct pci_bus *bus) +{ + struct pci_dev *dev; + struct pci_bus *child; + + list_for_each_entry(dev, &bus->devices, bus_list) { + pcibios_allocate_dev_rom_resource(dev); + + child = dev->subordinate; + if (child) + pcibios_allocate_rom_resources(child); + } +} + +static int __init pcibios_assign_resources(void) +{ + struct pci_bus *bus; + + if (!(pci_probe & PCI_ASSIGN_ROMS)) + list_for_each_entry(bus, &pci_root_buses, node) + pcibios_allocate_rom_resources(bus); pci_assign_unassigned_resources(); pcibios_fw_addr_list_del(); @@ -315,12 +352,32 @@ static int __init pcibios_assign_resources(void) return 0; } +void pcibios_resource_survey_bus(struct pci_bus *bus) +{ + dev_printk(KERN_DEBUG, &bus->dev, "Allocating resources\n"); + + pcibios_allocate_bus_resources(bus); + + pcibios_allocate_resources(bus, 0); + pcibios_allocate_resources(bus, 1); + + if (!(pci_probe & PCI_ASSIGN_ROMS)) + pcibios_allocate_rom_resources(bus); +} + void __init pcibios_resource_survey(void) { + struct pci_bus *bus; + DBG("PCI: Allocating resources\n"); - pcibios_allocate_bus_resources(&pci_root_buses); - pcibios_allocate_resources(0); - pcibios_allocate_resources(1); + + list_for_each_entry(bus, &pci_root_buses, node) + pcibios_allocate_bus_resources(bus); + + list_for_each_entry(bus, &pci_root_buses, node) + pcibios_allocate_resources(bus, 0); + list_for_each_entry(bus, &pci_root_buses, node) + pcibios_allocate_resources(bus, 1); e820_reserve_resources_late(); /* diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index a1df191129d..4db96fb1c23 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -10,7 +10,7 @@ * Discover remaining PCI buses in case there are peer host bridges. * We use the number of last PCI bus provided by the PCI BIOS. */ -static void __devinit pcibios_fixup_peer_bridges(void) +static void pcibios_fixup_peer_bridges(void) { int n; @@ -30,11 +30,11 @@ int __init pci_legacy_init(void) } printk("PCI: Probing PCI hardware\n"); - pci_root_bus = pcibios_scan_root(0); + pcibios_scan_root(0); return 0; } -void __devinit pcibios_scan_specific_bus(int busn) +void pcibios_scan_specific_bus(int busn) { int devfn; long node; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 704b9ec043d..082e8812971 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -49,7 +49,7 @@ static __init void free_all_mmcfg(void) pci_mmconfig_remove(cfg); } -static __devinit void list_add_sorted(struct pci_mmcfg_region *new) +static void list_add_sorted(struct pci_mmcfg_region *new) { struct pci_mmcfg_region *cfg; @@ -65,9 +65,8 @@ static __devinit void list_add_sorted(struct pci_mmcfg_region *new) list_add_tail_rcu(&new->list, &pci_mmcfg_list); } -static __devinit struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, - int start, - int end, u64 addr) +static struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, int start, + int end, u64 addr) { struct pci_mmcfg_region *new; struct resource *res; @@ -371,8 +370,7 @@ static int __init pci_mmcfg_check_hostbridge(void) return !list_empty(&pci_mmcfg_list); } -static acpi_status __devinit check_mcfg_resource(struct acpi_resource *res, - void *data) +static acpi_status check_mcfg_resource(struct acpi_resource *res, void *data) { struct resource *mcfg_res = data; struct acpi_resource_address64 address; @@ -408,8 +406,8 @@ static acpi_status __devinit check_mcfg_resource(struct acpi_resource *res, return AE_OK; } -static acpi_status __devinit find_mboard_resource(acpi_handle handle, u32 lvl, - void *context, void **rv) +static acpi_status find_mboard_resource(acpi_handle handle, u32 lvl, + void *context, void **rv) { struct resource *mcfg_res = context; @@ -422,7 +420,7 @@ static acpi_status __devinit find_mboard_resource(acpi_handle handle, u32 lvl, return AE_OK; } -static int __devinit is_acpi_reserved(u64 start, u64 end, unsigned not_used) +static int is_acpi_reserved(u64 start, u64 end, unsigned not_used) { struct resource mcfg_res; @@ -550,8 +548,7 @@ static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, if (cfg->address < 0xFFFFFFFF) return 0; - if (!strcmp(mcfg->header.oem_id, "SGI") || - !strcmp(mcfg->header.oem_id, "SGI2")) + if (!strncmp(mcfg->header.oem_id, "SGI", 3)) return 0; if (mcfg->header.revision >= 1) { @@ -693,9 +690,8 @@ static int __init pci_mmcfg_late_insert_resources(void) late_initcall(pci_mmcfg_late_insert_resources); /* Add MMCFG information for host bridges */ -int __devinit pci_mmconfig_insert(struct device *dev, - u16 seg, u8 start, u8 end, - phys_addr_t addr) +int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end, + phys_addr_t addr) { int rc; struct resource *tmp = NULL; diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index db63ac23e3d..5c90975cdf0 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -142,7 +142,7 @@ void __init pci_mmcfg_arch_free(void) { } -int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg) +int pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg) { return 0; } diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index d4ebd07c306..bea52496aea 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -95,7 +95,7 @@ const struct pci_raw_ops pci_mmcfg = { .write = pci_mmcfg_write, }; -static void __iomem * __devinit mcfg_ioremap(struct pci_mmcfg_region *cfg) +static void __iomem *mcfg_ioremap(struct pci_mmcfg_region *cfg) { void __iomem *addr; u64 start, size; @@ -133,7 +133,7 @@ void __init pci_mmcfg_arch_free(void) pci_mmcfg_arch_unmap(cfg); } -int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg) +int pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg) { cfg->virt = mcfg_ioremap(cfg); if (!cfg->virt) { diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c index e14a2ff708b..6eb18c42a28 100644 --- a/arch/x86/pci/mrst.c +++ b/arch/x86/pci/mrst.c @@ -247,7 +247,7 @@ int __init pci_mrst_init(void) /* Langwell devices are not true pci devices, they are not subject to 10 ms * d3 to d0 delay required by pci spec. */ -static void __devinit pci_d3delay_fixup(struct pci_dev *dev) +static void pci_d3delay_fixup(struct pci_dev *dev) { /* PCI fixups are effectively decided compile time. If we have a dual SoC/non-SoC kernel we don't want to mangle d3 on non SoC devices */ @@ -262,7 +262,7 @@ static void __devinit pci_d3delay_fixup(struct pci_dev *dev) } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup); -static void __devinit mrst_power_off_unused_dev(struct pci_dev *dev) +static void mrst_power_off_unused_dev(struct pci_dev *dev) { pci_set_power_state(dev, PCI_D3hot); } @@ -275,7 +275,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0815, mrst_power_off_unused_dev); /* * Langwell devices reside at fixed offsets, don't try to move them. */ -static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev) +static void pci_fixed_bar_fixup(struct pci_dev *dev) { unsigned long offset; u32 size; diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c new file mode 100644 index 00000000000..7307d9d12d1 --- /dev/null +++ b/arch/x86/pci/numachip.c @@ -0,0 +1,129 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-specific PCI code + * + * Copyright (C) 2012 Numascale AS. All rights reserved. + * + * Send feedback to <support@numascale.com> + * + * PCI accessor functions derived from mmconfig_64.c + * + */ + +#include <linux/pci.h> +#include <asm/pci_x86.h> + +static u8 limit __read_mostly; + +static inline char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) +{ + struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus); + + if (cfg && cfg->virt) + return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12)); + return NULL; +} + +static int pci_mmcfg_read_numachip(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 *value) +{ + char __iomem *addr; + + /* Why do we have this when nobody checks it. How about a BUG()!? -AK */ + if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) { +err: *value = -1; + return -EINVAL; + } + + /* Ensure AMD Northbridges don't decode reads to other devices */ + if (unlikely(bus == 0 && devfn >= limit)) { + *value = -1; + return 0; + } + + rcu_read_lock(); + addr = pci_dev_base(seg, bus, devfn); + if (!addr) { + rcu_read_unlock(); + goto err; + } + + switch (len) { + case 1: + *value = mmio_config_readb(addr + reg); + break; + case 2: + *value = mmio_config_readw(addr + reg); + break; + case 4: + *value = mmio_config_readl(addr + reg); + break; + } + rcu_read_unlock(); + + return 0; +} + +static int pci_mmcfg_write_numachip(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 value) +{ + char __iomem *addr; + + /* Why do we have this when nobody checks it. How about a BUG()!? -AK */ + if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) + return -EINVAL; + + /* Ensure AMD Northbridges don't decode writes to other devices */ + if (unlikely(bus == 0 && devfn >= limit)) + return 0; + + rcu_read_lock(); + addr = pci_dev_base(seg, bus, devfn); + if (!addr) { + rcu_read_unlock(); + return -EINVAL; + } + + switch (len) { + case 1: + mmio_config_writeb(addr + reg, value); + break; + case 2: + mmio_config_writew(addr + reg, value); + break; + case 4: + mmio_config_writel(addr + reg, value); + break; + } + rcu_read_unlock(); + + return 0; +} + +const struct pci_raw_ops pci_mmcfg_numachip = { + .read = pci_mmcfg_read_numachip, + .write = pci_mmcfg_write_numachip, +}; + +int __init pci_numachip_init(void) +{ + int ret = 0; + u32 val; + + /* For remote I/O, restrict bus 0 access to the actual number of AMD + Northbridges, which starts at device number 0x18 */ + ret = raw_pci_read(0, 0, PCI_DEVFN(0x18, 0), 0x60, sizeof(val), &val); + if (ret) + goto out; + + /* HyperTransport fabric size in bits 6:4 */ + limit = PCI_DEVFN(0x18 + ((val >> 4) & 7) + 1, 0); + + /* Use NumaChip PCI accessors for non-extended and extended access */ + raw_pci_ops = raw_pci_ext_ops = &pci_mmcfg_numachip; +out: + return ret; +} diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 83e125b95ca..72c229f9ebc 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -116,7 +116,7 @@ static const struct pci_raw_ops pci_direct_conf1_mq = { }; -static void __devinit pci_fixup_i450nx(struct pci_dev *d) +static void pci_fixup_i450nx(struct pci_dev *d) { /* * i450NX -- Find and scan all secondary buses on all PXB's. @@ -152,7 +152,7 @@ int __init pci_numaq_init(void) raw_pci_ops = &pci_direct_conf1_mq; - pci_root_bus = pcibios_scan_root(0); + pcibios_scan_root(0); if (num_online_nodes() > 1) for_each_online_node(quad) { if (quad == 0) diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index da8fe0535ff..c77b24a8b2d 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -124,7 +124,7 @@ static struct { static int pci_bios_present; -static int __devinit check_pcibios(void) +static int check_pcibios(void) { u32 signature, eax, ebx, ecx; u8 status, major_ver, minor_ver, hw_mech; @@ -312,7 +312,7 @@ static const struct pci_raw_ops pci_bios_access = { * Try to find PCI BIOS. */ -static const struct pci_raw_ops * __devinit pci_find_bios(void) +static const struct pci_raw_ops *pci_find_bios(void) { union bios32 *check; unsigned char sum; diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 56ab74989cf..94e76620460 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -162,6 +162,9 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) struct msi_desc *msidesc; int *v; + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL); if (!v) return -ENOMEM; @@ -220,6 +223,9 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) struct msi_desc *msidesc; struct msi_msg msg; + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + list_for_each_entry(msidesc, &dev->msi_list, list) { __read_msi_msg(msidesc, &msg); pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | @@ -263,6 +269,9 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int ret = 0; struct msi_desc *msidesc; + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + list_for_each_entry(msidesc, &dev->msi_list, list) { struct physdev_map_pirq map_irq; domid_t domid; diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 8d874396cb2..01e0231a113 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -2,10 +2,12 @@ obj-y += ce4100/ obj-y += efi/ obj-y += geode/ +obj-y += goldfish/ obj-y += iris/ obj-y += mrst/ obj-y += olpc/ obj-y += scx200/ obj-y += sfi/ +obj-y += ts5500/ obj-y += visws/ obj-y += uv/ diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index 4c61b52191e..f8ab4945892 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -21,12 +21,25 @@ #include <asm/i8259.h> #include <asm/io.h> #include <asm/io_apic.h> +#include <asm/emergency-restart.h> static int ce4100_i8042_detect(void) { return 0; } +/* + * The CE4100 platform has an internal 8051 Microcontroller which is + * responsible for signaling to the external Power Management Unit the + * intention to reset, reboot or power off the system. This 8051 device has + * its command register mapped at I/O port 0xcf9 and the value 0x4 is used + * to power off the system. + */ +static void ce4100_power_off(void) +{ + outb(0x4, 0xcf9); +} + #ifdef CONFIG_SERIAL_8250 static unsigned int mem_serial_in(struct uart_port *p, int offset) @@ -92,8 +105,11 @@ static void ce4100_serial_fixup(int port, struct uart_port *up, up->membase = (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE); up->membase += up->mapbase & ~PAGE_MASK; + up->mapbase += port * 0x100; + up->membase += port * 0x100; up->iotype = UPIO_MEM32; up->regshift = 2; + up->irq = 4; } #endif up->iobase = 0; @@ -139,8 +155,19 @@ void __init x86_ce4100_early_setup(void) x86_init.mpparse.find_smp_config = x86_init_noop; x86_init.pci.init = ce4100_pci_init; + /* + * By default, the reboot method is ACPI which is supported by the + * CE4100 bootloader CEFDK using FADT.ResetReg Address and ResetValue + * the bootloader will however issue a system power off instead of + * reboot. By using BOOT_KBD we ensure proper system reboot as + * expected. + */ + reboot_type = BOOT_KBD; + #ifdef CONFIG_X86_IO_APIC x86_init.pci.init_irq = sdv_pci_init; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck; #endif + + pm_power_off = ce4100_power_off; } diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index f6a0c1b8e51..7145ec63c52 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -11,20 +11,21 @@ * published by the Free Software Foundation. */ #include <linux/kernel.h> +#include <linux/init.h> #include <linux/acpi.h> #include <linux/efi.h> #include <linux/efi-bgrt.h> struct acpi_table_bgrt *bgrt_tab; -void *bgrt_image; -size_t bgrt_image_size; +void *__initdata bgrt_image; +size_t __initdata bgrt_image_size; struct bmp_header { u16 id; u32 size; } __packed; -void efi_bgrt_init(void) +void __init efi_bgrt_init(void) { acpi_status status; void __iomem *image; @@ -39,6 +40,8 @@ void efi_bgrt_init(void) if (ACPI_FAILURE(status)) return; + if (bgrt_tab->header.length < sizeof(*bgrt_tab)) + return; if (bgrt_tab->version != 1) return; if (bgrt_tab->image_type != 0 || !bgrt_tab->image_address) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index ad4439145f8..5f2ecaf3f9d 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -51,9 +51,6 @@ #define EFI_DEBUG 1 -int efi_enabled; -EXPORT_SYMBOL(efi_enabled); - struct efi __read_mostly efi = { .mps = EFI_INVALID_TABLE_ADDR, .acpi = EFI_INVALID_TABLE_ADDR, @@ -69,19 +66,24 @@ EXPORT_SYMBOL(efi); struct efi_memory_map memmap; -bool efi_64bit; - static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; -static inline bool efi_is_native(void) +unsigned long x86_efi_facility; + +/* + * Returns 1 if 'facility' is enabled, 0 otherwise. + */ +int efi_enabled(int facility) { - return IS_ENABLED(CONFIG_X86_64) == efi_64bit; + return test_bit(facility, &x86_efi_facility) != 0; } +EXPORT_SYMBOL(efi_enabled); +static bool __initdata disable_runtime = false; static int __init setup_noefi(char *arg) { - efi_enabled = 0; + disable_runtime = true; return 0; } early_param("noefi", setup_noefi); @@ -410,8 +412,8 @@ void __init efi_reserve_boot_services(void) * - Not within any part of the kernel * - Not the bios reserved area */ - if ((start+size >= virt_to_phys(_text) - && start <= virt_to_phys(_end)) || + if ((start+size >= __pa_symbol(_text) + && start <= __pa_symbol(_end)) || !e820_all_mapped(start, start+size, E820_RAM) || memblock_is_region_reserved(start, size)) { /* Could not reserve, skip it */ @@ -426,6 +428,7 @@ void __init efi_reserve_boot_services(void) void __init efi_unmap_memmap(void) { + clear_bit(EFI_MEMMAP, &x86_efi_facility); if (memmap.map) { early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); memmap.map = NULL; @@ -460,7 +463,7 @@ void __init efi_free_boot_services(void) static int __init efi_systab_init(void *phys) { - if (efi_64bit) { + if (efi_enabled(EFI_64BIT)) { efi_system_table_64_t *systab64; u64 tmp = 0; @@ -552,7 +555,7 @@ static int __init efi_config_init(u64 tables, int nr_tables) void *config_tables, *tablep; int i, sz; - if (efi_64bit) + if (efi_enabled(EFI_64BIT)) sz = sizeof(efi_config_table_64_t); else sz = sizeof(efi_config_table_32_t); @@ -572,7 +575,7 @@ static int __init efi_config_init(u64 tables, int nr_tables) efi_guid_t guid; unsigned long table; - if (efi_64bit) { + if (efi_enabled(EFI_64BIT)) { u64 table64; guid = ((efi_config_table_64_t *)tablep)->guid; table64 = ((efi_config_table_64_t *)tablep)->table; @@ -684,7 +687,6 @@ void __init efi_init(void) if (boot_params.efi_info.efi_systab_hi || boot_params.efi_info.efi_memmap_hi) { pr_info("Table located above 4GB, disabling EFI.\n"); - efi_enabled = 0; return; } efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; @@ -694,10 +696,10 @@ void __init efi_init(void) ((__u64)boot_params.efi_info.efi_systab_hi<<32)); #endif - if (efi_systab_init(efi_phys.systab)) { - efi_enabled = 0; + if (efi_systab_init(efi_phys.systab)) return; - } + + set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility); /* * Show what we know for posterity @@ -715,10 +717,10 @@ void __init efi_init(void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); - if (efi_config_init(efi.systab->tables, efi.systab->nr_tables)) { - efi_enabled = 0; + if (efi_config_init(efi.systab->tables, efi.systab->nr_tables)) return; - } + + set_bit(EFI_CONFIG_TABLES, &x86_efi_facility); /* * Note: We currently don't support runtime services on an EFI @@ -727,15 +729,17 @@ void __init efi_init(void) if (!efi_is_native()) pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); - else if (efi_runtime_init()) { - efi_enabled = 0; - return; + else { + if (disable_runtime || efi_runtime_init()) + return; + set_bit(EFI_RUNTIME_SERVICES, &x86_efi_facility); } - if (efi_memmap_init()) { - efi_enabled = 0; + if (efi_memmap_init()) return; - } + + set_bit(EFI_MEMMAP, &x86_efi_facility); + #ifdef CONFIG_X86_32 if (efi_is_native()) { x86_platform.get_wallclock = efi_get_time; @@ -835,7 +839,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md, *prev_md = NULL; efi_status_t status; unsigned long size; - u64 end, systab, end_pfn; + u64 end, systab, start_pfn, end_pfn; void *p, *va, *new_memmap = NULL; int count = 0; @@ -888,10 +892,9 @@ void __init efi_enter_virtual_mode(void) size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; + start_pfn = PFN_DOWN(md->phys_addr); end_pfn = PFN_UP(end); - if (end_pfn <= max_low_pfn_mapped - || (end_pfn > (1UL << (32 - PAGE_SHIFT)) - && end_pfn <= max_pfn_mapped)) { + if (pfn_range_is_mapped(start_pfn, end_pfn)) { va = __va(md->phys_addr); if (!(md->attribute & EFI_MEMORY_WB)) @@ -941,7 +944,7 @@ void __init efi_enter_virtual_mode(void) * * Call EFI services through wrapper functions. */ - efi.runtime_version = efi_systab.fw_revision; + efi.runtime_version = efi_systab.hdr.revision; efi.get_time = virt_efi_get_time; efi.set_time = virt_efi_set_time; efi.get_wakeup_time = virt_efi_get_wakeup_time; @@ -969,6 +972,9 @@ u32 efi_mem_type(unsigned long phys_addr) efi_memory_desc_t *md; void *p; + if (!efi_enabled(EFI_MEMMAP)) + return 0; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; if ((md->phys_addr <= phys_addr) && diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 95fd505dfeb..2b200386061 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -38,7 +38,7 @@ #include <asm/cacheflush.h> #include <asm/fixmap.h> -static pgd_t save_pgd __initdata; +static pgd_t *save_pgd __initdata; static unsigned long efi_flags __initdata; static void __init early_code_mapping_set_exec(int executable) @@ -61,12 +61,20 @@ static void __init early_code_mapping_set_exec(int executable) void __init efi_call_phys_prelog(void) { unsigned long vaddress; + int pgd; + int n_pgds; early_code_mapping_set_exec(1); local_irq_save(efi_flags); - vaddress = (unsigned long)__va(0x0UL); - save_pgd = *pgd_offset_k(0x0UL); - set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress)); + + n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE); + save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL); + + for (pgd = 0; pgd < n_pgds; pgd++) { + save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE); + vaddress = (unsigned long)__va(pgd * PGDIR_SIZE); + set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress)); + } __flush_tlb_all(); } @@ -75,7 +83,11 @@ void __init efi_call_phys_epilog(void) /* * After the lock is released, the original page table is restored. */ - set_pgd(pgd_offset_k(0x0UL), save_pgd); + int pgd; + int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); + for (pgd = 0; pgd < n_pgds; pgd++) + set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]); + kfree(save_pgd); __flush_tlb_all(); local_irq_restore(efi_flags); early_code_mapping_set_exec(0); diff --git a/arch/x86/platform/goldfish/Makefile b/arch/x86/platform/goldfish/Makefile new file mode 100644 index 00000000000..f030b532fdf --- /dev/null +++ b/arch/x86/platform/goldfish/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_GOLDFISH) += goldfish.o diff --git a/arch/x86/platform/goldfish/goldfish.c b/arch/x86/platform/goldfish/goldfish.c new file mode 100644 index 00000000000..1693107a518 --- /dev/null +++ b/arch/x86/platform/goldfish/goldfish.c @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2007 Google, Inc. + * Copyright (C) 2011 Intel, Inc. + * Copyright (C) 2013 Intel, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/kernel.h> +#include <linux/irq.h> +#include <linux/platform_device.h> + +/* + * Where in virtual device memory the IO devices (timers, system controllers + * and so on) + */ + +#define GOLDFISH_PDEV_BUS_BASE (0xff001000) +#define GOLDFISH_PDEV_BUS_END (0xff7fffff) +#define GOLDFISH_PDEV_BUS_IRQ (4) + +#define GOLDFISH_TTY_BASE (0x2000) + +static struct resource goldfish_pdev_bus_resources[] = { + { + .start = GOLDFISH_PDEV_BUS_BASE, + .end = GOLDFISH_PDEV_BUS_END, + .flags = IORESOURCE_MEM, + }, + { + .start = GOLDFISH_PDEV_BUS_IRQ, + .end = GOLDFISH_PDEV_BUS_IRQ, + .flags = IORESOURCE_IRQ, + } +}; + +static int __init goldfish_init(void) +{ + platform_device_register_simple("goldfish_pdev_bus", -1, + goldfish_pdev_bus_resources, 2); + return 0; +} +device_initcall(goldfish_init); diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c index 5917eb56b31..e6cb80f620a 100644 --- a/arch/x86/platform/iris/iris.c +++ b/arch/x86/platform/iris/iris.c @@ -23,6 +23,7 @@ #include <linux/moduleparam.h> #include <linux/module.h> +#include <linux/platform_device.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/delay.h> @@ -62,29 +63,75 @@ static void iris_power_off(void) * by reading its input port and seeing whether the read value is * meaningful. */ -static int iris_init(void) +static int iris_probe(struct platform_device *pdev) { - unsigned char status; - if (force != 1) { - printk(KERN_ERR "The force parameter has not been set to 1 so the Iris poweroff handler will not be installed.\n"); - return -ENODEV; - } - status = inb(IRIS_GIO_INPUT); + unsigned char status = inb(IRIS_GIO_INPUT); if (status == IRIS_GIO_NODEV) { - printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n"); + printk(KERN_ERR "This machine does not seem to be an Iris. " + "Power off handler not installed.\n"); return -ENODEV; } old_pm_power_off = pm_power_off; pm_power_off = &iris_power_off; printk(KERN_INFO "Iris power_off handler installed.\n"); - return 0; } -static void iris_exit(void) +static int iris_remove(struct platform_device *pdev) { pm_power_off = old_pm_power_off; printk(KERN_INFO "Iris power_off handler uninstalled.\n"); + return 0; +} + +static struct platform_driver iris_driver = { + .driver = { + .name = "iris", + .owner = THIS_MODULE, + }, + .probe = iris_probe, + .remove = iris_remove, +}; + +static struct resource iris_resources[] = { + { + .start = IRIS_GIO_BASE, + .end = IRIS_GIO_OUTPUT, + .flags = IORESOURCE_IO, + .name = "address" + } +}; + +static struct platform_device *iris_device; + +static int iris_init(void) +{ + int ret; + if (force != 1) { + printk(KERN_ERR "The force parameter has not been set to 1." + " The Iris poweroff handler will not be installed.\n"); + return -ENODEV; + } + ret = platform_driver_register(&iris_driver); + if (ret < 0) { + printk(KERN_ERR "Failed to register iris platform driver: %d\n", + ret); + return ret; + } + iris_device = platform_device_register_simple("iris", (-1), + iris_resources, ARRAY_SIZE(iris_resources)); + if (IS_ERR(iris_device)) { + printk(KERN_ERR "Failed to register iris platform device\n"); + platform_driver_unregister(&iris_driver); + return PTR_ERR(iris_device); + } + return 0; +} + +static void iris_exit(void) +{ + platform_device_unregister(iris_device); + platform_driver_unregister(&iris_driver); } module_init(iris_init); diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index fd41a9262d6..e31bcd8f2ee 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -782,7 +782,7 @@ BLOCKING_NOTIFIER_HEAD(intel_scu_notifier); EXPORT_SYMBOL_GPL(intel_scu_notifier); /* Called by IPC driver */ -void __devinit intel_scu_devices_create(void) +void intel_scu_devices_create(void) { int i; diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index d75582d1aa5..ff0174dda81 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -121,7 +121,7 @@ static const struct platform_suspend_ops xo1_suspend_ops = { .enter = xo1_power_state_enter, }; -static int __devinit xo1_pm_probe(struct platform_device *pdev) +static int xo1_pm_probe(struct platform_device *pdev) { struct resource *res; int err; @@ -154,7 +154,7 @@ static int __devinit xo1_pm_probe(struct platform_device *pdev) return 0; } -static int __devexit xo1_pm_remove(struct platform_device *pdev) +static int xo1_pm_remove(struct platform_device *pdev) { mfd_cell_disable(pdev); @@ -173,7 +173,7 @@ static struct platform_driver cs5535_pms_driver = { .owner = THIS_MODULE, }, .probe = xo1_pm_probe, - .remove = __devexit_p(xo1_pm_remove), + .remove = xo1_pm_remove, }; static struct platform_driver cs5535_acpi_driver = { @@ -182,7 +182,7 @@ static struct platform_driver cs5535_acpi_driver = { .owner = THIS_MODULE, }, .probe = xo1_pm_probe, - .remove = __devexit_p(xo1_pm_remove), + .remove = xo1_pm_remove, }; static int __init xo1_pm_init(void) diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index 63d4aa40956..74704be7b1f 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -309,7 +309,7 @@ static int xo1_sci_resume(struct platform_device *pdev) return 0; } -static int __devinit setup_sci_interrupt(struct platform_device *pdev) +static int setup_sci_interrupt(struct platform_device *pdev) { u32 lo, hi; u32 sts; @@ -351,7 +351,7 @@ static int __devinit setup_sci_interrupt(struct platform_device *pdev) return r; } -static int __devinit setup_ec_sci(void) +static int setup_ec_sci(void) { int r; @@ -395,7 +395,7 @@ static void free_ec_sci(void) gpio_free(OLPC_GPIO_ECSCI); } -static int __devinit setup_lid_events(void) +static int setup_lid_events(void) { int r; @@ -432,7 +432,7 @@ static void free_lid_events(void) gpio_free(OLPC_GPIO_LID); } -static int __devinit setup_power_button(struct platform_device *pdev) +static int setup_power_button(struct platform_device *pdev) { int r; @@ -463,7 +463,7 @@ static void free_power_button(void) input_free_device(power_button_idev); } -static int __devinit setup_ebook_switch(struct platform_device *pdev) +static int setup_ebook_switch(struct platform_device *pdev) { int r; @@ -494,7 +494,7 @@ static void free_ebook_switch(void) input_free_device(ebook_switch_idev); } -static int __devinit setup_lid_switch(struct platform_device *pdev) +static int setup_lid_switch(struct platform_device *pdev) { int r; @@ -538,7 +538,7 @@ static void free_lid_switch(void) input_free_device(lid_switch_idev); } -static int __devinit xo1_sci_probe(struct platform_device *pdev) +static int xo1_sci_probe(struct platform_device *pdev) { struct resource *res; int r; @@ -613,7 +613,7 @@ err_ebook: return r; } -static int __devexit xo1_sci_remove(struct platform_device *pdev) +static int xo1_sci_remove(struct platform_device *pdev) { mfd_cell_disable(pdev); free_irq(sci_irq, pdev); @@ -632,7 +632,7 @@ static struct platform_driver xo1_sci_driver = { .name = "olpc-xo1-sci-acpi", }, .probe = xo1_sci_probe, - .remove = __devexit_p(xo1_sci_remove), + .remove = xo1_sci_remove, .suspend = xo1_sci_suspend, .resume = xo1_sci_resume, }; diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 2fdca25905a..fef7d0ba7e3 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -195,7 +195,7 @@ err_sysfs: return r; } -static int xo15_sci_remove(struct acpi_device *device, int type) +static int xo15_sci_remove(struct acpi_device *device) { acpi_disable_gpe(NULL, xo15_sci_gpe); acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler); diff --git a/arch/x86/platform/scx200/scx200_32.c b/arch/x86/platform/scx200/scx200_32.c index 7a9ad30d6c9..3dc9aee41d9 100644 --- a/arch/x86/platform/scx200/scx200_32.c +++ b/arch/x86/platform/scx200/scx200_32.c @@ -35,7 +35,7 @@ static struct pci_device_id scx200_tbl[] = { }; MODULE_DEVICE_TABLE(pci,scx200_tbl); -static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *); +static int scx200_probe(struct pci_dev *, const struct pci_device_id *); static struct pci_driver scx200_pci_driver = { .name = "scx200", @@ -45,7 +45,7 @@ static struct pci_driver scx200_pci_driver = { static DEFINE_MUTEX(scx200_gpio_config_lock); -static void __devinit scx200_init_shadow(void) +static void scx200_init_shadow(void) { int bank; @@ -54,7 +54,7 @@ static void __devinit scx200_init_shadow(void) scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank); } -static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +static int scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { unsigned base; diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index 7785b72ecc3..bcd1a703e3e 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -35,7 +35,7 @@ static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; /* All CPUs enumerated by SFI must be present and enabled */ -static void __cpuinit mp_sfi_register_lapic(u8 id) +static void __init mp_sfi_register_lapic(u8 id) { if (MAX_LOCAL_APIC - id <= 0) { pr_warning("Processor #%d invalid (max %d)\n", diff --git a/arch/x86/platform/ts5500/Makefile b/arch/x86/platform/ts5500/Makefile new file mode 100644 index 00000000000..c54e348c96a --- /dev/null +++ b/arch/x86/platform/ts5500/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_TS5500) += ts5500.o diff --git a/arch/x86/platform/ts5500/ts5500.c b/arch/x86/platform/ts5500/ts5500.c new file mode 100644 index 00000000000..39febb214e8 --- /dev/null +++ b/arch/x86/platform/ts5500/ts5500.c @@ -0,0 +1,339 @@ +/* + * Technologic Systems TS-5500 Single Board Computer support + * + * Copyright (C) 2013 Savoir-faire Linux Inc. + * Vivien Didelot <vivien.didelot@savoirfairelinux.com> + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation; either version 2 of the License, or (at your option) any later + * version. + * + * + * This driver registers the Technologic Systems TS-5500 Single Board Computer + * (SBC) and its devices, and exposes information to userspace such as jumpers' + * state or available options. For further information about sysfs entries, see + * Documentation/ABI/testing/sysfs-platform-ts5500. + * + * This code actually supports the TS-5500 platform, but it may be extended to + * support similar Technologic Systems x86-based platforms, such as the TS-5600. + */ + +#include <linux/delay.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/leds.h> +#include <linux/module.h> +#include <linux/platform_data/gpio-ts5500.h> +#include <linux/platform_data/max197.h> +#include <linux/platform_device.h> +#include <linux/slab.h> + +/* Product code register */ +#define TS5500_PRODUCT_CODE_ADDR 0x74 +#define TS5500_PRODUCT_CODE 0x60 /* TS-5500 product code */ + +/* SRAM/RS-485/ADC options, and RS-485 RTS/Automatic RS-485 flags register */ +#define TS5500_SRAM_RS485_ADC_ADDR 0x75 +#define TS5500_SRAM BIT(0) /* SRAM option */ +#define TS5500_RS485 BIT(1) /* RS-485 option */ +#define TS5500_ADC BIT(2) /* A/D converter option */ +#define TS5500_RS485_RTS BIT(6) /* RTS for RS-485 */ +#define TS5500_RS485_AUTO BIT(7) /* Automatic RS-485 */ + +/* External Reset/Industrial Temperature Range options register */ +#define TS5500_ERESET_ITR_ADDR 0x76 +#define TS5500_ERESET BIT(0) /* External Reset option */ +#define TS5500_ITR BIT(1) /* Indust. Temp. Range option */ + +/* LED/Jumpers register */ +#define TS5500_LED_JP_ADDR 0x77 +#define TS5500_LED BIT(0) /* LED flag */ +#define TS5500_JP1 BIT(1) /* Automatic CMOS */ +#define TS5500_JP2 BIT(2) /* Enable Serial Console */ +#define TS5500_JP3 BIT(3) /* Write Enable Drive A */ +#define TS5500_JP4 BIT(4) /* Fast Console (115K baud) */ +#define TS5500_JP5 BIT(5) /* User Jumper */ +#define TS5500_JP6 BIT(6) /* Console on COM1 (req. JP2) */ +#define TS5500_JP7 BIT(7) /* Undocumented (Unused) */ + +/* A/D Converter registers */ +#define TS5500_ADC_CONV_BUSY_ADDR 0x195 /* Conversion state register */ +#define TS5500_ADC_CONV_BUSY BIT(0) +#define TS5500_ADC_CONV_INIT_LSB_ADDR 0x196 /* Start conv. / LSB register */ +#define TS5500_ADC_CONV_MSB_ADDR 0x197 /* MSB register */ +#define TS5500_ADC_CONV_DELAY 12 /* usec */ + +/** + * struct ts5500_sbc - TS-5500 board description + * @id: Board product ID. + * @sram: Flag for SRAM option. + * @rs485: Flag for RS-485 option. + * @adc: Flag for Analog/Digital converter option. + * @ereset: Flag for External Reset option. + * @itr: Flag for Industrial Temperature Range option. + * @jumpers: Bitfield for jumpers' state. + */ +struct ts5500_sbc { + int id; + bool sram; + bool rs485; + bool adc; + bool ereset; + bool itr; + u8 jumpers; +}; + +/* Board signatures in BIOS shadow RAM */ +static const struct { + const char * const string; + const ssize_t offset; +} ts5500_signatures[] __initdata = { + { "TS-5x00 AMD Elan", 0xb14 }, +}; + +static int __init ts5500_check_signature(void) +{ + void __iomem *bios; + int i, ret = -ENODEV; + + bios = ioremap(0xf0000, 0x10000); + if (!bios) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(ts5500_signatures); i++) { + if (check_signature(bios + ts5500_signatures[i].offset, + ts5500_signatures[i].string, + strlen(ts5500_signatures[i].string))) { + ret = 0; + break; + } + } + + iounmap(bios); + return ret; +} + +static int __init ts5500_detect_config(struct ts5500_sbc *sbc) +{ + u8 tmp; + int ret = 0; + + if (!request_region(TS5500_PRODUCT_CODE_ADDR, 4, "ts5500")) + return -EBUSY; + + tmp = inb(TS5500_PRODUCT_CODE_ADDR); + if (tmp != TS5500_PRODUCT_CODE) { + pr_err("This platform is not a TS-5500 (found ID 0x%x)\n", tmp); + ret = -ENODEV; + goto cleanup; + } + sbc->id = tmp; + + tmp = inb(TS5500_SRAM_RS485_ADC_ADDR); + sbc->sram = tmp & TS5500_SRAM; + sbc->rs485 = tmp & TS5500_RS485; + sbc->adc = tmp & TS5500_ADC; + + tmp = inb(TS5500_ERESET_ITR_ADDR); + sbc->ereset = tmp & TS5500_ERESET; + sbc->itr = tmp & TS5500_ITR; + + tmp = inb(TS5500_LED_JP_ADDR); + sbc->jumpers = tmp & ~TS5500_LED; + +cleanup: + release_region(TS5500_PRODUCT_CODE_ADDR, 4); + return ret; +} + +static ssize_t ts5500_show_id(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ts5500_sbc *sbc = dev_get_drvdata(dev); + + return sprintf(buf, "0x%.2x\n", sbc->id); +} + +static ssize_t ts5500_show_jumpers(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ts5500_sbc *sbc = dev_get_drvdata(dev); + + return sprintf(buf, "0x%.2x\n", sbc->jumpers >> 1); +} + +#define TS5500_SHOW(field) \ + static ssize_t ts5500_show_##field(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + struct ts5500_sbc *sbc = dev_get_drvdata(dev); \ + return sprintf(buf, "%d\n", sbc->field); \ + } + +TS5500_SHOW(sram) +TS5500_SHOW(rs485) +TS5500_SHOW(adc) +TS5500_SHOW(ereset) +TS5500_SHOW(itr) + +static DEVICE_ATTR(id, S_IRUGO, ts5500_show_id, NULL); +static DEVICE_ATTR(jumpers, S_IRUGO, ts5500_show_jumpers, NULL); +static DEVICE_ATTR(sram, S_IRUGO, ts5500_show_sram, NULL); +static DEVICE_ATTR(rs485, S_IRUGO, ts5500_show_rs485, NULL); +static DEVICE_ATTR(adc, S_IRUGO, ts5500_show_adc, NULL); +static DEVICE_ATTR(ereset, S_IRUGO, ts5500_show_ereset, NULL); +static DEVICE_ATTR(itr, S_IRUGO, ts5500_show_itr, NULL); + +static struct attribute *ts5500_attributes[] = { + &dev_attr_id.attr, + &dev_attr_jumpers.attr, + &dev_attr_sram.attr, + &dev_attr_rs485.attr, + &dev_attr_adc.attr, + &dev_attr_ereset.attr, + &dev_attr_itr.attr, + NULL +}; + +static const struct attribute_group ts5500_attr_group = { + .attrs = ts5500_attributes, +}; + +static struct resource ts5500_dio1_resource[] = { + DEFINE_RES_IRQ_NAMED(7, "DIO1 interrupt"), +}; + +static struct platform_device ts5500_dio1_pdev = { + .name = "ts5500-dio1", + .id = -1, + .resource = ts5500_dio1_resource, + .num_resources = 1, +}; + +static struct resource ts5500_dio2_resource[] = { + DEFINE_RES_IRQ_NAMED(6, "DIO2 interrupt"), +}; + +static struct platform_device ts5500_dio2_pdev = { + .name = "ts5500-dio2", + .id = -1, + .resource = ts5500_dio2_resource, + .num_resources = 1, +}; + +static void ts5500_led_set(struct led_classdev *led_cdev, + enum led_brightness brightness) +{ + outb(!!brightness, TS5500_LED_JP_ADDR); +} + +static enum led_brightness ts5500_led_get(struct led_classdev *led_cdev) +{ + return (inb(TS5500_LED_JP_ADDR) & TS5500_LED) ? LED_FULL : LED_OFF; +} + +static struct led_classdev ts5500_led_cdev = { + .name = "ts5500:green:", + .brightness_set = ts5500_led_set, + .brightness_get = ts5500_led_get, +}; + +static int ts5500_adc_convert(u8 ctrl) +{ + u8 lsb, msb; + + /* Start conversion (ensure the 3 MSB are set to 0) */ + outb(ctrl & 0x1f, TS5500_ADC_CONV_INIT_LSB_ADDR); + + /* + * The platform has CPLD logic driving the A/D converter. + * The conversion must complete within 11 microseconds, + * otherwise we have to re-initiate a conversion. + */ + udelay(TS5500_ADC_CONV_DELAY); + if (inb(TS5500_ADC_CONV_BUSY_ADDR) & TS5500_ADC_CONV_BUSY) + return -EBUSY; + + /* Read the raw data */ + lsb = inb(TS5500_ADC_CONV_INIT_LSB_ADDR); + msb = inb(TS5500_ADC_CONV_MSB_ADDR); + + return (msb << 8) | lsb; +} + +static struct max197_platform_data ts5500_adc_pdata = { + .convert = ts5500_adc_convert, +}; + +static struct platform_device ts5500_adc_pdev = { + .name = "max197", + .id = -1, + .dev = { + .platform_data = &ts5500_adc_pdata, + }, +}; + +static int __init ts5500_init(void) +{ + struct platform_device *pdev; + struct ts5500_sbc *sbc; + int err; + + /* + * There is no DMI available or PCI bridge subvendor info, + * only the BIOS provides a 16-bit identification call. + * It is safer to find a signature in the BIOS shadow RAM. + */ + err = ts5500_check_signature(); + if (err) + return err; + + pdev = platform_device_register_simple("ts5500", -1, NULL, 0); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + + sbc = devm_kzalloc(&pdev->dev, sizeof(struct ts5500_sbc), GFP_KERNEL); + if (!sbc) { + err = -ENOMEM; + goto error; + } + + err = ts5500_detect_config(sbc); + if (err) + goto error; + + platform_set_drvdata(pdev, sbc); + + err = sysfs_create_group(&pdev->dev.kobj, &ts5500_attr_group); + if (err) + goto error; + + ts5500_dio1_pdev.dev.parent = &pdev->dev; + if (platform_device_register(&ts5500_dio1_pdev)) + dev_warn(&pdev->dev, "DIO1 block registration failed\n"); + ts5500_dio2_pdev.dev.parent = &pdev->dev; + if (platform_device_register(&ts5500_dio2_pdev)) + dev_warn(&pdev->dev, "DIO2 block registration failed\n"); + + if (led_classdev_register(&pdev->dev, &ts5500_led_cdev)) + dev_warn(&pdev->dev, "LED registration failed\n"); + + if (sbc->adc) { + ts5500_adc_pdev.dev.parent = &pdev->dev; + if (platform_device_register(&ts5500_adc_pdev)) + dev_warn(&pdev->dev, "ADC registration failed\n"); + } + + return 0; +error: + platform_device_unregister(pdev); + return err; +} +device_initcall(ts5500_init); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Savoir-faire Linux Inc. <kernel@savoirfairelinux.com>"); +MODULE_DESCRIPTION("Technologic Systems TS-5500 platform driver"); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index b8b3a37c80c..0f92173a12b 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1034,7 +1034,8 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, * globally purge translation cache of a virtual address or all TLB's * @cpumask: mask of all cpu's in which the address is to be removed * @mm: mm_struct containing virtual address range - * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) + * @start: start virtual address to be removed from TLB + * @end: end virtual address to be remove from TLB * @cpu: the current cpu * * This is the entry point for initiating any UV global TLB shootdown. @@ -1056,7 +1057,7 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, */ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, - unsigned end, unsigned int cpu) + unsigned long end, unsigned int cpu) { int locals = 0; int remotes = 0; @@ -1113,7 +1114,10 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, record_send_statistics(stat, locals, hubs, remotes, bau_desc); - bau_desc->payload.address = start; + if (!end || (end - start) <= PAGE_SIZE) + bau_desc->payload.address = start; + else + bau_desc->payload.address = TLB_FLUSH_ALL; bau_desc->payload.sending_cpu = cpu; /* * uv_flush_send_and_wait returns 0 if all cpu's were messaged, @@ -1463,7 +1467,7 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user, } if (input_arg == 0) { - elements = sizeof(stat_description)/sizeof(*stat_description); + elements = ARRAY_SIZE(stat_description); printk(KERN_DEBUG "# cpu: cpu number\n"); printk(KERN_DEBUG "Sender statistics:\n"); for (i = 0; i < elements; i++) @@ -1504,7 +1508,7 @@ static int parse_tunables_write(struct bau_control *bcp, char *instr, char *q; int cnt = 0; int val; - int e = sizeof(tunables) / sizeof(*tunables); + int e = ARRAY_SIZE(tunables); p = instr + strspn(instr, WHITESPACE); q = p; diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 5032e0d19b8..98718f604eb 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -15,7 +15,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009-2013 Silicon Graphics, Inc. All Rights Reserved. * Copyright (c) Dimitri Sivanich */ #include <linux/clockchips.h> @@ -102,9 +102,10 @@ static int uv_intr_pending(int pnode) if (is_uv1_hub()) return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & UV1H_EVENT_OCCURRED0_RTC1_MASK; - else - return uv_read_global_mmr64(pnode, UV2H_EVENT_OCCURRED2) & - UV2H_EVENT_OCCURRED2_RTC_1_MASK; + else if (is_uvx_hub()) + return uv_read_global_mmr64(pnode, UVXH_EVENT_OCCURRED2) & + UVXH_EVENT_OCCURRED2_RTC_1_MASK; + return 0; } /* Setup interrupt and return non-zero if early expiration occurred. */ @@ -122,8 +123,8 @@ static int uv_setup_intr(int cpu, u64 expires) uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, UV1H_EVENT_OCCURRED0_RTC1_MASK); else - uv_write_global_mmr64(pnode, UV2H_EVENT_OCCURRED2_ALIAS, - UV2H_EVENT_OCCURRED2_RTC_1_MASK); + uv_write_global_mmr64(pnode, UVXH_EVENT_OCCURRED2_ALIAS, + UVXH_EVENT_OCCURRED2_RTC_1_MASK); val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 218cdb16163..3c68768d7a7 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -11,6 +11,7 @@ #include <linux/suspend.h> #include <linux/export.h> #include <linux/smp.h> +#include <linux/perf_event.h> #include <asm/pgtable.h> #include <asm/proto.h> @@ -21,6 +22,7 @@ #include <asm/suspend.h> #include <asm/debugreg.h> #include <asm/fpu-internal.h> /* pcntxt_mask */ +#include <asm/cpu.h> #ifdef CONFIG_X86_32 static struct saved_context saved_context; @@ -227,6 +229,7 @@ static void __restore_processor_state(struct saved_context *ctxt) do_fpu_end(); x86_platform.restore_sched_clock_state(); mtrr_bp_restore(); + perf_restore_debug_store(); } /* Needed by apm.c */ @@ -237,3 +240,84 @@ void restore_processor_state(void) #ifdef CONFIG_X86_32 EXPORT_SYMBOL(restore_processor_state); #endif + +/* + * When bsp_check() is called in hibernate and suspend, cpu hotplug + * is disabled already. So it's unnessary to handle race condition between + * cpumask query and cpu hotplug. + */ +static int bsp_check(void) +{ + if (cpumask_first(cpu_online_mask) != 0) { + pr_warn("CPU0 is offline.\n"); + return -ENODEV; + } + + return 0; +} + +static int bsp_pm_callback(struct notifier_block *nb, unsigned long action, + void *ptr) +{ + int ret = 0; + + switch (action) { + case PM_SUSPEND_PREPARE: + case PM_HIBERNATION_PREPARE: + ret = bsp_check(); + break; +#ifdef CONFIG_DEBUG_HOTPLUG_CPU0 + case PM_RESTORE_PREPARE: + /* + * When system resumes from hibernation, online CPU0 because + * 1. it's required for resume and + * 2. the CPU was online before hibernation + */ + if (!cpu_online(0)) + _debug_hotplug_cpu(0, 1); + break; + case PM_POST_RESTORE: + /* + * When a resume really happens, this code won't be called. + * + * This code is called only when user space hibernation software + * prepares for snapshot device during boot time. So we just + * call _debug_hotplug_cpu() to restore to CPU0's state prior to + * preparing the snapshot device. + * + * This works for normal boot case in our CPU0 hotplug debug + * mode, i.e. CPU0 is offline and user mode hibernation + * software initializes during boot time. + * + * If CPU0 is online and user application accesses snapshot + * device after boot time, this will offline CPU0 and user may + * see different CPU0 state before and after accessing + * the snapshot device. But hopefully this is not a case when + * user debugging CPU0 hotplug. Even if users hit this case, + * they can easily online CPU0 back. + * + * To simplify this debug code, we only consider normal boot + * case. Otherwise we need to remember CPU0's state and restore + * to that state and resolve racy conditions etc. + */ + _debug_hotplug_cpu(0, 0); + break; +#endif + default: + break; + } + return notifier_from_errno(ret); +} + +static int __init bsp_pm_check_init(void) +{ + /* + * Set this bsp_pm_callback as lower priority than + * cpu_hotplug_pm_callback. So cpu_hotplug_pm_callback will be called + * earlier to disable cpu hotplug before bsp online check. + */ + pm_notifier(bsp_pm_callback, -INT_MAX); + return 0; +} + +core_initcall(bsp_pm_check_init); diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 74202c1910c..7d28c885d23 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -129,8 +129,6 @@ static int resume_physical_mapping_init(pgd_t *pgd_base) } } - resume_map_numa_kva(pgd_base); - return 0; } diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 460f314d13e..a0fde91c16c 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -11,6 +11,8 @@ #include <linux/gfp.h> #include <linux/smp.h> #include <linux/suspend.h> + +#include <asm/init.h> #include <asm/proto.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -39,41 +41,21 @@ pgd_t *temp_level4_pgt; void *relocated_restore_code; -static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +static void *alloc_pgt_page(void *context) { - long i, j; - - i = pud_index(address); - pud = pud + i; - for (; i < PTRS_PER_PUD; pud++, i++) { - unsigned long paddr; - pmd_t *pmd; - - paddr = address + i*PUD_SIZE; - if (paddr >= end) - break; - - pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); - if (!pmd) - return -ENOMEM; - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { - unsigned long pe; - - if (paddr >= end) - break; - pe = __PAGE_KERNEL_LARGE_EXEC | paddr; - pe &= __supported_pte_mask; - set_pmd(pmd, __pmd(pe)); - } - } - return 0; + return (void *)get_safe_page(GFP_ATOMIC); } static int set_up_temporary_mappings(void) { - unsigned long start, end, next; - int error; + struct x86_mapping_info info = { + .alloc_pgt_page = alloc_pgt_page, + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + .kernel_mapping = true, + }; + unsigned long mstart, mend; + int result; + int i; temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); if (!temp_level4_pgt) @@ -84,21 +66,17 @@ static int set_up_temporary_mappings(void) init_level4_pgt[pgd_index(__START_KERNEL_map)]); /* Set up the direct mapping from scratch */ - start = (unsigned long)pfn_to_kaddr(0); - end = (unsigned long)pfn_to_kaddr(max_pfn); - - for (; start < end; start = next) { - pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); - if (!pud) - return -ENOMEM; - next = start + PGDIR_SIZE; - if (next > end) - next = end; - if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) - return error; - set_pgd(temp_level4_pgt + pgd_index(start), - mk_kernel_pgd(__pa(pud))); + for (i = 0; i < nr_pfn_mapped; i++) { + mstart = pfn_mapped[i].start << PAGE_SHIFT; + mend = pfn_mapped[i].end << PAGE_SHIFT; + + result = kernel_ident_mapping_init(&info, temp_level4_pgt, + mstart, mend); + + if (result) + return result; } + return 0; } diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index cbca565af5b..a44f457e70a 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -8,9 +8,26 @@ struct real_mode_header *real_mode_header; u32 *trampoline_cr4_features; -void __init setup_real_mode(void) +void __init reserve_real_mode(void) { phys_addr_t mem; + unsigned char *base; + size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); + + /* Has to be under 1M so we can execute real-mode AP code. */ + mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); + if (!mem) + panic("Cannot allocate trampoline\n"); + + base = __va(mem); + memblock_reserve(mem, size); + real_mode_header = (struct real_mode_header *) base; + printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", + base, (unsigned long long)mem, size); +} + +void __init setup_real_mode(void) +{ u16 real_mode_seg; u32 *rel; u32 count; @@ -25,16 +42,7 @@ void __init setup_real_mode(void) u64 efer; #endif - /* Has to be in very low memory so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); - if (!mem) - panic("Cannot allocate trampoline\n"); - - base = __va(mem); - memblock_reserve(mem, size); - real_mode_header = (struct real_mode_header *) base; - printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", - base, (unsigned long long)mem, size); + base = (unsigned char *)real_mode_header; memcpy(base, real_mode_blob, size); @@ -62,9 +70,9 @@ void __init setup_real_mode(void) __va(real_mode_header->trampoline_header); #ifdef CONFIG_X86_32 - trampoline_header->start = __pa(startup_32_smp); + trampoline_header->start = __pa_symbol(startup_32_smp); trampoline_header->gdt_limit = __BOOT_DS + 7; - trampoline_header->gdt_base = __pa(boot_gdt); + trampoline_header->gdt_base = __pa_symbol(boot_gdt); #else /* * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR @@ -78,16 +86,18 @@ void __init setup_real_mode(void) *trampoline_cr4_features = read_cr4(); trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); - trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE; - trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE; + trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd; + trampoline_pgd[511] = init_level4_pgt[511].pgd; #endif } /* - * set_real_mode_permissions() gets called very early, to guarantee the - * availability of low memory. This is before the proper kernel page + * reserve_real_mode() gets called very early, to guarantee the + * availability of low memory. This is before the proper kernel page * tables are set up, so we cannot set page permissions in that - * function. Thus, we use an arch_initcall instead. + * function. Also trampoline code will be executed by APs so we + * need to mark it executable at do_pre_smp_initcalls() at least, + * thus run it as a early_initcall(). */ static int __init set_real_mode_permissions(void) { @@ -111,5 +121,4 @@ static int __init set_real_mode_permissions(void) return 0; } - -arch_initcall(set_real_mode_permissions); +early_initcall(set_real_mode_permissions); diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index a47103fbc69..e6d55f0064d 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -8,7 +8,7 @@ # 0 i386 restart_syscall sys_restart_syscall 1 i386 exit sys_exit -2 i386 fork ptregs_fork stub32_fork +2 i386 fork sys_fork stub32_fork 3 i386 read sys_read 4 i386 write sys_write 5 i386 open sys_open compat_sys_open @@ -25,7 +25,7 @@ 16 i386 lchown sys_lchown16 17 i386 break 18 i386 oldstat sys_stat -19 i386 lseek sys_lseek sys32_lseek +19 i386 lseek sys_lseek compat_sys_lseek 20 i386 getpid sys_getpid 21 i386 mount sys_mount compat_sys_mount 22 i386 umount sys_oldumount @@ -73,12 +73,12 @@ 64 i386 getppid sys_getppid 65 i386 getpgrp sys_getpgrp 66 i386 setsid sys_setsid -67 i386 sigaction sys_sigaction sys32_sigaction +67 i386 sigaction sys_sigaction compat_sys_sigaction 68 i386 sgetmask sys_sgetmask 69 i386 ssetmask sys_ssetmask 70 i386 setreuid sys_setreuid16 71 i386 setregid sys_setregid16 -72 i386 sigsuspend sys_sigsuspend sys32_sigsuspend +72 i386 sigsuspend sys_sigsuspend sys_sigsuspend 73 i386 sigpending sys_sigpending compat_sys_sigpending 74 i386 sethostname sys_sethostname 75 i386 setrlimit sys_setrlimit compat_sys_setrlimit @@ -98,8 +98,8 @@ 89 i386 readdir sys_old_readdir compat_sys_old_readdir 90 i386 mmap sys_old_mmap sys32_mmap 91 i386 munmap sys_munmap -92 i386 truncate sys_truncate -93 i386 ftruncate sys_ftruncate +92 i386 truncate sys_truncate compat_sys_truncate +93 i386 ftruncate sys_ftruncate compat_sys_ftruncate 94 i386 fchmod sys_fchmod 95 i386 fchown sys_fchown16 96 i386 getpriority sys_getpriority @@ -116,17 +116,17 @@ 107 i386 lstat sys_newlstat compat_sys_newlstat 108 i386 fstat sys_newfstat compat_sys_newfstat 109 i386 olduname sys_uname -110 i386 iopl ptregs_iopl stub32_iopl +110 i386 iopl sys_iopl 111 i386 vhangup sys_vhangup 112 i386 idle -113 i386 vm86old ptregs_vm86old sys32_vm86_warning +113 i386 vm86old sys_vm86old sys32_vm86_warning 114 i386 wait4 sys_wait4 compat_sys_wait4 115 i386 swapoff sys_swapoff 116 i386 sysinfo sys_sysinfo compat_sys_sysinfo 117 i386 ipc sys_ipc sys32_ipc 118 i386 fsync sys_fsync -119 i386 sigreturn ptregs_sigreturn stub32_sigreturn -120 i386 clone ptregs_clone stub32_clone +119 i386 sigreturn sys_sigreturn stub32_sigreturn +120 i386 clone sys_clone stub32_clone 121 i386 setdomainname sys_setdomainname 122 i386 uname sys_newuname 123 i386 modify_ldt sys_modify_ldt @@ -167,24 +167,24 @@ 158 i386 sched_yield sys_sched_yield 159 i386 sched_get_priority_max sys_sched_get_priority_max 160 i386 sched_get_priority_min sys_sched_get_priority_min -161 i386 sched_rr_get_interval sys_sched_rr_get_interval sys32_sched_rr_get_interval +161 i386 sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval 162 i386 nanosleep sys_nanosleep compat_sys_nanosleep 163 i386 mremap sys_mremap 164 i386 setresuid sys_setresuid16 165 i386 getresuid sys_getresuid16 -166 i386 vm86 ptregs_vm86 sys32_vm86_warning +166 i386 vm86 sys_vm86 sys32_vm86_warning 167 i386 query_module 168 i386 poll sys_poll 169 i386 nfsservctl 170 i386 setresgid sys_setresgid16 171 i386 getresgid sys_getresgid16 172 i386 prctl sys_prctl -173 i386 rt_sigreturn ptregs_rt_sigreturn stub32_rt_sigreturn -174 i386 rt_sigaction sys_rt_sigaction sys32_rt_sigaction +173 i386 rt_sigreturn sys_rt_sigreturn stub32_rt_sigreturn +174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction 175 i386 rt_sigprocmask sys_rt_sigprocmask -176 i386 rt_sigpending sys_rt_sigpending sys32_rt_sigpending +176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending 177 i386 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait -178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo sys32_rt_sigqueueinfo +178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 179 i386 rt_sigsuspend sys_rt_sigsuspend 180 i386 pread64 sys_pread64 sys32_pread 181 i386 pwrite64 sys_pwrite64 sys32_pwrite @@ -192,11 +192,11 @@ 183 i386 getcwd sys_getcwd 184 i386 capget sys_capget 185 i386 capset sys_capset -186 i386 sigaltstack ptregs_sigaltstack stub32_sigaltstack +186 i386 sigaltstack sys_sigaltstack compat_sys_sigaltstack 187 i386 sendfile sys_sendfile sys32_sendfile 188 i386 getpmsg 189 i386 putpmsg -190 i386 vfork ptregs_vfork stub32_vfork +190 i386 vfork sys_vfork stub32_vfork 191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit 192 i386 mmap2 sys_mmap_pgoff 193 i386 truncate64 sys_truncate64 sys32_truncate64 @@ -356,3 +356,4 @@ 347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv 348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev 349 i386 kcmp sys_kcmp +350 i386 finit_module sys_finit_module diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index a582bfed95b..38ae65dfd14 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -137,7 +137,7 @@ 128 64 rt_sigtimedwait sys_rt_sigtimedwait 129 64 rt_sigqueueinfo sys_rt_sigqueueinfo 130 common rt_sigsuspend sys_rt_sigsuspend -131 64 sigaltstack stub_sigaltstack +131 64 sigaltstack sys_sigaltstack 132 common utime sys_utime 133 common mknod sys_mknod 134 64 uselib @@ -319,12 +319,13 @@ 310 64 process_vm_readv sys_process_vm_readv 311 64 process_vm_writev sys_process_vm_writev 312 common kcmp sys_kcmp +313 common finit_module sys_finit_module # # x32-specific system call numbers start at 512 to avoid cache impact # for native 64-bit operation. # -512 x32 rt_sigaction sys32_rt_sigaction +512 x32 rt_sigaction compat_sys_rt_sigaction 513 x32 rt_sigreturn stub_x32_rt_sigreturn 514 x32 ioctl compat_sys_ioctl 515 x32 readv compat_sys_readv @@ -334,10 +335,10 @@ 519 x32 recvmsg compat_sys_recvmsg 520 x32 execve stub_x32_execve 521 x32 ptrace compat_sys_ptrace -522 x32 rt_sigpending sys32_rt_sigpending +522 x32 rt_sigpending compat_sys_rt_sigpending 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait -524 x32 rt_sigqueueinfo sys32_rt_sigqueueinfo -525 x32 sigaltstack stub_x32_sigaltstack +524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo +525 x32 sigaltstack compat_sys_sigaltstack 526 x32 timer_create compat_sys_timer_create 527 x32 mq_notify compat_sys_mq_notify 528 x32 kexec_load compat_sys_kexec_load diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index ddcf39b1a18..e6773dc8ac4 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -356,7 +356,7 @@ END { exit 1 # print escape opcode map's array print "/* Escape opcode map array */" - print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \ + print "const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1]" \ "[INAT_LSTPFX_MAX + 1] = {" for (i = 0; i < geid; i++) for (j = 0; j < max_lprefix; j++) @@ -365,7 +365,7 @@ END { print "};\n" # print group opcode map's array print "/* Group opcode map array */" - print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\ + print "const insn_attr_t * const inat_group_tables[INAT_GRP_MAX + 1]"\ "[INAT_LSTPFX_MAX + 1] = {" for (i = 0; i < ggid; i++) for (j = 0; j < max_lprefix; j++) @@ -374,7 +374,7 @@ END { print "};\n" # print AVX opcode map's array print "/* AVX opcode map array */" - print "const insn_attr_t const *inat_avx_tables[X86_VEX_M_MAX + 1]"\ + print "const insn_attr_t * const inat_avx_tables[X86_VEX_M_MAX + 1]"\ "[INAT_LSTPFX_MAX + 1] = {" for (i = 0; i < gaid; i++) for (j = 0; j < max_lprefix; j++) diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c index cc2f8c13128..872eb60e780 100644 --- a/arch/x86/tools/insn_sanity.c +++ b/arch/x86/tools/insn_sanity.c @@ -55,7 +55,7 @@ static FILE *input_file; /* Input file name */ static void usage(const char *err) { if (err) - fprintf(stderr, "Error: %s\n\n", err); + fprintf(stderr, "%s: Error: %s\n\n", prog, err); fprintf(stderr, "Usage: %s [-y|-n|-v] [-s seed[,no]] [-m max] [-i input]\n", prog); fprintf(stderr, "\t-y 64bit mode\n"); fprintf(stderr, "\t-n 32bit mode\n"); @@ -269,7 +269,13 @@ int main(int argc, char **argv) insns++; } - fprintf(stdout, "%s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", (errors) ? "Failure" : "Success", insns, (input_file) ? "given" : "random", errors, seed); + fprintf(stdout, "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", + prog, + (errors) ? "Failure" : "Success", + insns, + (input_file) ? "given" : "random", + errors, + seed); return errors ? 1 : 0; } diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 5a1847d6193..79d67bd507f 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -814,12 +814,14 @@ int main(int argc, char **argv) read_relocs(fp); if (show_absolute_syms) { print_absolute_symbols(); - return 0; + goto out; } if (show_absolute_relocs) { print_absolute_relocs(); - return 0; + goto out; } emit_relocs(as_text, use_real_mode); +out: + fclose(fp); return 0; } diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 07611759ce3..14ef8d1dbc3 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -13,8 +13,6 @@ endmenu config UML_X86 def_bool y select GENERIC_FIND_FIRST_BIT - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config 64BIT bool "64-bit kernel" if SUBARCH = "x86" @@ -25,21 +23,23 @@ config X86_32 select HAVE_AOUT select ARCH_WANT_IPC_PARSE_VERSION select MODULES_USE_ELF_REL + select CLONE_BACKWARDS + select OLD_SIGSUSPEND3 + select OLD_SIGACTION config X86_64 def_bool 64BIT select MODULES_USE_ELF_RELA config RWSEM_XCHGADD_ALGORITHM - def_bool X86_XADD && 64BIT + def_bool 64BIT config RWSEM_GENERIC_SPINLOCK def_bool !RWSEM_XCHGADD_ALGORITHM config 3_LEVEL_PGTABLES - bool "Three-level pagetables (EXPERIMENTAL)" if !64BIT + bool "Three-level pagetables" if !64BIT default 64BIT - depends on EXPERIMENTAL help Three-level pagetables will let UML have more than 4G of physical memory. All the memory that can't be mapped directly will be treated diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 5d065b2222d..eafa324eb7a 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -10,7 +10,7 @@ endif obj-y = bug.o bugs_$(BITS).o delay.o fault.o ksyms.o ldt.o \ ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \ - stub_$(BITS).o stub_segv.o syscalls_$(BITS).o \ + stub_$(BITS).o stub_segv.o \ sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \ mem_$(BITS).o subarch.o os-$(OS)/ @@ -25,7 +25,7 @@ subarch-$(CONFIG_HIGHMEM) += ../mm/highmem_32.o else -obj-y += vdso/ +obj-y += syscalls_64.o vdso/ subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../lib/thunk_64.o \ ../lib/rwsem.o diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h index 755133258c4..54f8102ccde 100644 --- a/arch/x86/um/asm/ptrace.h +++ b/arch/x86/um/asm/ptrace.h @@ -86,4 +86,5 @@ extern long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr); #endif +#define user_stack_pointer(regs) PT_REGS_SP(regs) #endif /* __UM_X86_PTRACE_H */ diff --git a/arch/x86/um/fault.c b/arch/x86/um/fault.c index 8784ab30d91..84ac7f7b025 100644 --- a/arch/x86/um/fault.c +++ b/arch/x86/um/fault.c @@ -20,7 +20,7 @@ int arch_fixup(unsigned long address, struct uml_pt_regs *regs) const struct exception_table_entry *fixup; fixup = search_exception_tables(address); - if (fixup != 0) { + if (fixup) { UPT_IP(regs) = fixup->fixup; return 1; } diff --git a/arch/x86/um/shared/sysdep/syscalls.h b/arch/x86/um/shared/sysdep/syscalls.h index ca255a805ed..bd9a89b67e4 100644 --- a/arch/x86/um/shared/sysdep/syscalls.h +++ b/arch/x86/um/shared/sysdep/syscalls.h @@ -1,5 +1,3 @@ -extern long sys_clone(unsigned long clone_flags, unsigned long newsp, - void __user *parent_tid, void __user *child_tid); #ifdef __i386__ #include "syscalls_32.h" #else diff --git a/arch/x86/um/shared/sysdep/syscalls_32.h b/arch/x86/um/shared/sysdep/syscalls_32.h index 8436079be91..68fd2cf526f 100644 --- a/arch/x86/um/shared/sysdep/syscalls_32.h +++ b/arch/x86/um/shared/sysdep/syscalls_32.h @@ -8,11 +8,6 @@ typedef long syscall_handler_t(struct pt_regs); -/* Not declared on x86, incompatible declarations on x86_64, so these have - * to go here rather than in sys_call_table.c - */ -extern syscall_handler_t sys_rt_sigaction; - extern syscall_handler_t *sys_call_table[]; #define EXECUTE_SYSCALL(syscall, regs) \ diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index bdaa08cfbcf..ae7319db18e 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -342,9 +342,7 @@ static int copy_ucontext_to_user(struct ucontext __user *uc, { int err = 0; - err |= put_user(current->sas_ss_sp, &uc->uc_stack.ss_sp); - err |= put_user(sas_ss_flags(sp), &uc->uc_stack.ss_flags); - err |= put_user(current->sas_ss_size, &uc->uc_stack.ss_size); + err |= __save_altstack(&uc->uc_stack, sp); err |= copy_sc_to_user(&uc->uc_mcontext, fp, ¤t->thread.regs, 0); err |= copy_to_user(&uc->uc_sigmask, set, sizeof(*set)); return err; @@ -466,7 +464,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, return 0; } -long sys_sigreturn(struct pt_regs *regs) +long sys_sigreturn(void) { unsigned long sp = PT_REGS_SP(¤t->thread.regs); struct sigframe __user *frame = (struct sigframe __user *)(sp - 8); @@ -529,10 +527,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, /* Create the ucontext. */ err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(PT_REGS_SP(regs)), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= __save_altstack(&frame->uc.uc_stack, PT_REGS_SP(regs)); err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs, set->sig[0]); err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate); @@ -582,7 +577,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, } #endif -long sys_rt_sigreturn(struct pt_regs *regs) +long sys_rt_sigreturn(void) { unsigned long sp = PT_REGS_SP(¤t->thread.regs); struct rt_sigframe __user *frame = @@ -606,14 +601,3 @@ long sys_rt_sigreturn(struct pt_regs *regs) force_sig(SIGSEGV, current); return 0; } - -#ifdef CONFIG_X86_32 -long ptregs_sigreturn(void) -{ - return sys_sigreturn(NULL); -} -long ptregs_rt_sigreturn(void) -{ - return sys_rt_sigreturn(NULL); -} -#endif diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 232e60504b3..531d4269e2e 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -24,14 +24,6 @@ #define old_mmap sys_old_mmap -#define ptregs_fork sys_fork -#define ptregs_iopl sys_iopl -#define ptregs_vm86old sys_vm86old -#define ptregs_clone i386_clone -#define ptregs_vm86 sys_vm86 -#define ptregs_sigaltstack sys_sigaltstack -#define ptregs_vfork sys_vfork - #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; #include <asm/syscalls_32.h> diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 170bd926a69..f2f0723070c 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -31,7 +31,6 @@ #define stub_fork sys_fork #define stub_vfork sys_vfork #define stub_execve sys_execve -#define stub_sigaltstack sys_sigaltstack #define stub_rt_sigreturn sys_rt_sigreturn #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c deleted file mode 100644 index db444c7218f..00000000000 --- a/arch/x86/um/syscalls_32.c +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (C) 2000 - 2003 Jeff Dike (jdike@addtoit.com) - * Licensed under the GPL - */ - -#include <linux/syscalls.h> -#include <sysdep/syscalls.h> - -/* - * The prototype on i386 is: - * - * int clone(int flags, void * child_stack, int * parent_tidptr, struct user_desc * newtls - * - * and the "newtls" arg. on i386 is read by copy_thread directly from the - * register saved on the stack. - */ -long i386_clone(unsigned long clone_flags, unsigned long newsp, - int __user *parent_tid, void *newtls, int __user *child_tid) -{ - return sys_clone(clone_flags, newsp, parent_tid, child_tid); -} - - -long sys_sigaction(int sig, const struct old_sigaction __user *act, - struct old_sigaction __user *oact) -{ - struct k_sigaction new_ka, old_ka; - int ret; - - if (act) { - old_sigset_t mask; - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || - __get_user(new_ka.sa.sa_handler, &act->sa_handler) || - __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || - __get_user(new_ka.sa.sa_flags, &act->sa_flags) || - __get_user(mask, &act->sa_mask)) - return -EFAULT; - siginitset(&new_ka.sa.sa_mask, mask); - } - - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - - if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || - __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || - __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || - __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) - return -EFAULT; - } - - return ret; -} diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 4df6c373421..c74436e687b 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -22,6 +22,7 @@ #include <asm/hpet.h> #include <asm/unistd.h> #include <asm/io.h> +#include <asm/pvclock.h> #define gtod (&VVAR(vsyscall_gtod_data)) @@ -59,9 +60,79 @@ notrace static cycle_t vread_tsc(void) static notrace cycle_t vread_hpet(void) { - return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); + return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + HPET_COUNTER); } +#ifdef CONFIG_PARAVIRT_CLOCK + +static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) +{ + const struct pvclock_vsyscall_time_info *pvti_base; + int idx = cpu / (PAGE_SIZE/PVTI_SIZE); + int offset = cpu % (PAGE_SIZE/PVTI_SIZE); + + BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); + + pvti_base = (struct pvclock_vsyscall_time_info *) + __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); + + return &pvti_base[offset]; +} + +static notrace cycle_t vread_pvclock(int *mode) +{ + const struct pvclock_vsyscall_time_info *pvti; + cycle_t ret; + u64 last; + u32 version; + u32 migrate_count; + u8 flags; + unsigned cpu, cpu1; + + + /* + * When looping to get a consistent (time-info, tsc) pair, we + * also need to deal with the possibility we can switch vcpus, + * so make sure we always re-fetch time-info for the current vcpu. + */ + do { + cpu = __getcpu() & VGETCPU_CPU_MASK; + /* TODO: We can put vcpu id into higher bits of pvti.version. + * This will save a couple of cycles by getting rid of + * __getcpu() calls (Gleb). + */ + + pvti = get_pvti(cpu); + + migrate_count = pvti->migrate_count; + + version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); + + /* + * Test we're still on the cpu as well as the version. + * We could have been migrated just after the first + * vgetcpu but before fetching the version, so we + * wouldn't notice a version change. + */ + cpu1 = __getcpu() & VGETCPU_CPU_MASK; + } while (unlikely(cpu != cpu1 || + (pvti->pvti.version & 1) || + pvti->pvti.version != version || + pvti->migrate_count != migrate_count)); + + if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) + *mode = VCLOCK_NONE; + + /* refer to tsc.c read_tsc() comment for rationale */ + last = VVAR(vsyscall_gtod_data).clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + return last; +} +#endif + notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; @@ -80,7 +151,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) } -notrace static inline u64 vgetsns(void) +notrace static inline u64 vgetsns(int *mode) { long v; cycles_t cycles; @@ -88,6 +159,10 @@ notrace static inline u64 vgetsns(void) cycles = vread_tsc(); else if (gtod->clock.vclock_mode == VCLOCK_HPET) cycles = vread_hpet(); +#ifdef CONFIG_PARAVIRT_CLOCK + else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK) + cycles = vread_pvclock(mode); +#endif else return 0; v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; @@ -107,7 +182,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts) mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->wall_time_sec; ns = gtod->wall_time_snsec; - ns += vgetsns(); + ns += vgetsns(&mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(>od->seq, seq))); @@ -127,7 +202,7 @@ notrace static int do_monotonic(struct timespec *ts) mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->monotonic_time_sec; ns = gtod->monotonic_time_snsec; - ns += vgetsns(); + ns += vgetsns(&mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(>od->seq, seq))); timespec_add_ns(ts, ns); diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c index 5463ad55857..2f94b039e55 100644 --- a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/vdso/vgetcpu.c @@ -17,15 +17,10 @@ __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { unsigned int p; - if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { - /* Load per CPU data from RDTSCP */ - native_read_tscp(&p); - } else { - /* Load per CPU data from GDT */ - asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); - } + p = __getcpu(); + if (cpu) - *cpu = p & 0xfff; + *cpu = p & VGETCPU_CPU_MASK; if (node) *node = p >> 12; return 0; diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 00aaf047b39..431e8754441 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -141,7 +141,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) * unaligned here as a result of stack start randomization. */ addr = PAGE_ALIGN(addr); - addr = align_addr(addr, NULL, ALIGN_VDSO); + addr = align_vdso_addr(addr); return addr; } diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index fdce49c7aff..131dacd2748 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,8 +6,9 @@ config XEN bool "Xen guest support" select PARAVIRT select PARAVIRT_CLOCK + select XEN_HAVE_PVMMU depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS) - depends on X86_CMPXCHG && X86_TSC + depends on X86_TSC help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 586d83812b6..c8e1c7b95c3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -67,6 +67,7 @@ #include <asm/hypervisor.h> #include <asm/mwait.h> #include <asm/pci_x86.h> +#include <asm/pat.h> #ifdef CONFIG_ACPI #include <linux/acpi.h> @@ -193,10 +194,11 @@ void xen_vcpu_restore(void) { int cpu; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { bool other_cpu = (cpu != smp_processor_id()); + bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL); - if (other_cpu && + if (other_cpu && is_up && HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) BUG(); @@ -205,7 +207,7 @@ void xen_vcpu_restore(void) if (have_vcpu_info_placement) xen_vcpu_setup(cpu); - if (other_cpu && + if (other_cpu && is_up && HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) BUG(); } @@ -223,6 +225,21 @@ static void __init xen_banner(void) version >> 16, version & 0xffff, extra.extraversion, xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); } +/* Check if running on Xen version (major, minor) or later */ +bool +xen_running_on_version_or_later(unsigned int major, unsigned int minor) +{ + unsigned int version; + + if (!xen_domain()) + return false; + + version = HYPERVISOR_xen_version(XENVER_version, NULL); + if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || + ((version >> 16) > major)) + return true; + return false; +} #define CPUID_THERM_POWER_LEAF 6 #define APERFMPERF_PRESENT 0 @@ -287,8 +304,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, static bool __init xen_check_mwait(void) { -#if defined(CONFIG_ACPI) && !defined(CONFIG_ACPI_PROCESSOR_AGGREGATOR) && \ - !defined(CONFIG_ACPI_PROCESSOR_AGGREGATOR_MODULE) +#ifdef CONFIG_ACPI struct xen_platform_op op = { .cmd = XENPF_set_processor_pminfo, .u.set_pminfo.id = -1, @@ -309,6 +325,13 @@ static bool __init xen_check_mwait(void) if (!xen_initial_domain()) return false; + /* + * When running under platform earlier than Xen4.2, do not expose + * mwait, to avoid the risk of loading native acpi pad driver + */ + if (!xen_running_on_version_or_later(4, 2)) + return false; + ax = 1; cx = 0; @@ -1395,7 +1418,14 @@ asmlinkage void __init xen_start_kernel(void) */ acpi_numa = -1; #endif - +#ifdef CONFIG_X86_PAT + /* + * For right now disable the PAT. We should remove this once + * git commit 8eaffa67b43e99ae581622c5133e20b0f48bcef1 + * (xen/pat: Disable PAT support for now) is reverted. + */ + pat_enabled = 0; +#endif /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; @@ -1615,6 +1645,7 @@ const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = { .name = "Xen HVM", .detect = xen_hvm_platform, .init_platform = xen_hvm_guest_init, + .x2apic_available = xen_x2apic_para_available, }; EXPORT_SYMBOL(x86_hyper_xen_hvm); #endif diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index dcf5f2dd91e..e8e34938c57 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1178,20 +1178,6 @@ static void xen_exit_mmap(struct mm_struct *mm) static void xen_post_allocator_init(void); -static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) -{ - /* reserve the range used */ - native_pagetable_reserve(start, end); - - /* set as RW the rest */ - printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end, - PFN_PHYS(pgt_buf_top)); - while (end < PFN_PHYS(pgt_buf_top)) { - make_lowmem_page_readwrite(__va(end)); - end += PAGE_SIZE; - } -} - #ifdef CONFIG_X86_64 static void __init xen_cleanhighmap(unsigned long vaddr, unsigned long vaddr_end) @@ -1422,7 +1408,6 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3) xen_mc_callback(set_current_cr3, (void *)cr3); } } - static void xen_write_cr3(unsigned long cr3) { BUG_ON(preemptible()); @@ -1448,6 +1433,45 @@ static void xen_write_cr3(unsigned long cr3) xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ } +#ifdef CONFIG_X86_64 +/* + * At the start of the day - when Xen launches a guest, it has already + * built pagetables for the guest. We diligently look over them + * in xen_setup_kernel_pagetable and graft as appropiate them in the + * init_level4_pgt and its friends. Then when we are happy we load + * the new init_level4_pgt - and continue on. + * + * The generic code starts (start_kernel) and 'init_mem_mapping' sets + * up the rest of the pagetables. When it has completed it loads the cr3. + * N.B. that baremetal would start at 'start_kernel' (and the early + * #PF handler would create bootstrap pagetables) - so we are running + * with the same assumptions as what to do when write_cr3 is executed + * at this point. + * + * Since there are no user-page tables at all, we have two variants + * of xen_write_cr3 - the early bootup (this one), and the late one + * (xen_write_cr3). The reason we have to do that is that in 64-bit + * the Linux kernel and user-space are both in ring 3 while the + * hypervisor is in ring 0. + */ +static void __init xen_write_cr3_init(unsigned long cr3) +{ + BUG_ON(preemptible()); + + xen_mc_batch(); /* disables interrupts */ + + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ + this_cpu_write(xen_cr3, cr3); + + __xen_write_cr3(true, cr3); + + xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ + + pv_mmu_ops.write_cr3 = &xen_write_cr3; +} +#endif + static int xen_pgd_alloc(struct mm_struct *mm) { pgd_t *pgd = mm->pgd; @@ -1503,19 +1527,6 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) #else /* CONFIG_X86_64 */ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { - unsigned long pfn = pte_pfn(pte); - - /* - * If the new pfn is within the range of the newly allocated - * kernel pagetable, and it isn't being mapped into an - * early_ioremap fixmap slot as a freshly allocated page, make sure - * it is RO. - */ - if (((!is_early_ioremap_ptep(ptep) && - pfn >= pgt_buf_start && pfn < pgt_buf_top)) || - (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) - pte = pte_wrprotect(pte); - return pte; } #endif /* CONFIG_X86_64 */ @@ -2129,11 +2140,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .write_cr2 = xen_write_cr2, .read_cr3 = xen_read_cr3, -#ifdef CONFIG_X86_32 .write_cr3 = xen_write_cr3_init, -#else - .write_cr3 = xen_write_cr3, -#endif .flush_tlb_user = xen_flush_tlb, .flush_tlb_kernel = xen_flush_tlb, @@ -2197,7 +2204,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { - x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_init = xen_pagetable_init; pv_mmu_ops = xen_mmu_ops; @@ -2497,8 +2503,10 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, int xen_remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long mfn, int nr, - pgprot_t prot, unsigned domid) + xen_pfn_t mfn, int nr, + pgprot_t prot, unsigned domid, + struct page **pages) + { struct remap_data rmd; struct mmu_update mmu_update[REMAP_BATCH_SIZE]; @@ -2542,3 +2550,14 @@ out: return err; } EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); + +/* Returns: 0 success */ +int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, + int numpgs, struct page **pages) +{ + if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 8971a26d21a..94eac5c85cd 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -556,12 +556,9 @@ void __init xen_arch_setup(void) COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); /* Set up idle, making sure it calls safe_halt() pvop */ -#ifdef CONFIG_X86_32 - boot_cpu_data.hlt_works_ok = 1; -#endif disable_cpuidle(); disable_cpufreq(); - WARN_ON(set_pm_idle_to_default()); + WARN_ON(xen_set_default_idle()); fiddle_vdso(); #ifdef CONFIG_NUMA numa_off = 1; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 353c50f1870..09ea61d2e02 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -254,7 +254,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) } xen_init_lock_cpu(0); - smp_store_cpu_info(0); + smp_store_boot_cpu_info(); cpu_data(0).x86_max_cores = 1; for_each_possible_cpu(i) { @@ -300,8 +300,6 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) gdt = get_cpu_gdt_table(cpu); ctxt->flags = VGCF_IN_KERNEL; - ctxt->user_regs.ds = __USER_DS; - ctxt->user_regs.es = __USER_DS; ctxt->user_regs.ss = __KERNEL_DS; #ifdef CONFIG_X86_32 ctxt->user_regs.fs = __KERNEL_PERCPU; @@ -310,35 +308,41 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; - ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - xen_copy_trap_info(ctxt->trap_ctxt); + { + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ + ctxt->user_regs.ds = __USER_DS; + ctxt->user_regs.es = __USER_DS; - ctxt->ldt_ents = 0; + xen_copy_trap_info(ctxt->trap_ctxt); - BUG_ON((unsigned long)gdt & ~PAGE_MASK); + ctxt->ldt_ents = 0; - gdt_mfn = arbitrary_virt_to_mfn(gdt); - make_lowmem_page_readonly(gdt); - make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); + BUG_ON((unsigned long)gdt & ~PAGE_MASK); - ctxt->gdt_frames[0] = gdt_mfn; - ctxt->gdt_ents = GDT_ENTRIES; + gdt_mfn = arbitrary_virt_to_mfn(gdt); + make_lowmem_page_readonly(gdt); + make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); - ctxt->user_regs.cs = __KERNEL_CS; - ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); + ctxt->gdt_frames[0] = gdt_mfn; + ctxt->gdt_ents = GDT_ENTRIES; - ctxt->kernel_ss = __KERNEL_DS; - ctxt->kernel_sp = idle->thread.sp0; + ctxt->kernel_ss = __KERNEL_DS; + ctxt->kernel_sp = idle->thread.sp0; #ifdef CONFIG_X86_32 - ctxt->event_callback_cs = __KERNEL_CS; - ctxt->failsafe_callback_cs = __KERNEL_CS; + ctxt->event_callback_cs = __KERNEL_CS; + ctxt->failsafe_callback_cs = __KERNEL_CS; #endif - ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; - ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; + ctxt->event_callback_eip = + (unsigned long)xen_hypervisor_callback; + ctxt->failsafe_callback_eip = + (unsigned long)xen_failsafe_callback; + } + ctxt->user_regs.cs = __KERNEL_CS; + ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); @@ -432,13 +436,6 @@ static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */ play_dead_common(); HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); cpu_bringup(); - /* - * Balance out the preempt calls - as we are running in cpu_idle - * loop which has been called at bootup from cpu_bringup_and_idle. - * The cpucpu_bringup_and_idle called cpu_bringup which made a - * preempt_disable() So this preempt_enable will balance it out. - */ - preempt_enable(); } #else /* !CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 83e866d714c..f7a080ef035 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -328,7 +328,6 @@ static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) if (per_cpu(lock_spinners, cpu) == xl) { ADD_STATS(released_slow_kicked, 1); xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); - break; } } } diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index f9643fc50de..33ca6e42a4c 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -89,11 +89,11 @@ ENTRY(xen_iret) */ #ifdef CONFIG_SMP GET_THREAD_INFO(%eax) - movl TI_cpu(%eax), %eax - movl __per_cpu_offset(,%eax,4), %eax - mov xen_vcpu(%eax), %eax + movl %ss:TI_cpu(%eax), %eax + movl %ss:__per_cpu_offset(,%eax,4), %eax + mov %ss:xen_vcpu(%eax), %eax #else - movl xen_vcpu, %eax + movl %ss:xen_vcpu, %eax #endif /* check IF state we're restoring */ @@ -106,11 +106,11 @@ ENTRY(xen_iret) * resuming the code, so we don't have to be worried about * being preempted to another CPU. */ - setz XEN_vcpu_info_mask(%eax) + setz %ss:XEN_vcpu_info_mask(%eax) xen_iret_start_crit: /* check for unmasked and pending */ - cmpw $0x0001, XEN_vcpu_info_pending(%eax) + cmpw $0x0001, %ss:XEN_vcpu_info_pending(%eax) /* * If there's something pending, mask events again so we can @@ -118,7 +118,7 @@ xen_iret_start_crit: * touch XEN_vcpu_info_mask. */ jne 1f - movb $1, XEN_vcpu_info_mask(%eax) + movb $1, %ss:XEN_vcpu_info_mask(%eax) 1: popl %eax |